From 8a3aea7fee84085cc49b1ff1bc0c012d73c75978 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Sat, 23 May 2026 16:32:36 +0800 Subject: [PATCH 01/23] Optimize PEXSI stage4 code path --- source/source_hsolver/diago_pexsi.cpp | 47 +++---- source/source_hsolver/diago_pexsi.h | 17 ++- .../module_pexsi/dist_matrix_transformer.cpp | 101 +++++++-------- .../module_pexsi/pexsi_solver.cpp | 10 +- .../module_pexsi/pexsi_solver.h | 18 +-- .../module_pexsi/pexsi_solver_interface.h | 28 ++++ .../module_pexsi/simple_pexsi.cpp | 6 +- .../source_hsolver/test/diago_pexsi_test.cpp | 120 ++++++++++++++++++ tools/pexsi/check_lcao_10case_regression.py | 101 +++++++++++++++ 9 files changed, 349 insertions(+), 99 deletions(-) create mode 100644 source/source_hsolver/module_pexsi/pexsi_solver_interface.h create mode 100755 tools/pexsi/check_lcao_10case_regression.py diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index e85f78b12b9..47a745e276f 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -10,6 +10,8 @@ #include "source_basis/module_ao/parallel_orbitals.h" #include "module_pexsi/pexsi_solver.h" +#include + typedef hamilt::MatrixBlock matd; typedef hamilt::MatrixBlock> matcd; @@ -19,28 +21,34 @@ template std::vector DiagoPexsi::mu_buffer; template -DiagoPexsi::DiagoPexsi(const Parallel_Orbitals* ParaV_in) +DiagoPexsi::DiagoPexsi(const Parallel_Orbitals* ParaV_in, std::unique_ptr solver_in) { + this->ParaV = ParaV_in; + this->ps = std::move(solver_in); + if (this->ps == nullptr) + { + this->ps = std::make_unique(); + } + int nspin = PARAM.inp.nspin; if (PARAM.inp.nspin == 4) { nspin = 1; } - mu_buffer.resize(nspin); - for (int i = 0; i < nspin; i++) - { - mu_buffer[i] = this->ps->pexsi_mu; - } + mu_buffer.assign(nspin, pexsi::PEXSI_Solver::pexsi_mu); - this->ParaV = ParaV_in; - this->ps = std::make_unique(); + const int local_size = ParaV->nrow * ParaV->ncol; this->DM.resize(nspin); this->EDM.resize(nspin); + this->dm_buffer_.resize(nspin); + this->edm_buffer_.resize(nspin); for (int i = 0; i < nspin; i++) { - this->DM[i] = new T[ParaV->nrow * ParaV->ncol]; - this->EDM[i] = new T[ParaV->nrow * ParaV->ncol]; + this->dm_buffer_[i].assign(local_size, T{}); + this->edm_buffer_[i].assign(local_size, T{}); + this->DM[i] = this->dm_buffer_[i].data(); + this->EDM[i] = this->edm_buffer_[i].data(); } } @@ -48,17 +56,6 @@ DiagoPexsi::DiagoPexsi(const Parallel_Orbitals* ParaV_in) template DiagoPexsi::~DiagoPexsi() { - int nspin = PARAM.inp.nspin; - if (PARAM.inp.nspin == 4) - { - nspin = 1; - } - for (int i = 0; i < nspin; i++) - { - delete[] this->DM[i]; - delete[] this->EDM[i]; - } - } template <> @@ -69,6 +66,12 @@ void DiagoPexsi::diag(hamilt::Hamilt* phm_in, psi::Psi& phm_in->matrix(h_mat, s_mat); std::vector eigen(PARAM.globalv.nlocal, 0.0); int ik = psi.get_current_k(); + if (ik < 0 || ik >= static_cast(this->DM.size())) + { + ModuleBase::WARNING_QUIT("DiagoPEXSI", + "PEXSI real path only has density buffers for Gamma/spin channels; multi-k requires " + "the complex expert-routine path"); + } this->ps->prepare(this->ParaV->blacs_ctxt, this->ParaV->nb, this->ParaV->nrow, @@ -97,4 +100,4 @@ template class DiagoPexsi; template class DiagoPexsi >; } // namespace hsolver -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/diago_pexsi.h b/source/source_hsolver/diago_pexsi.h index 9f0e0d1317b..60efdeed664 100644 --- a/source/source_hsolver/diago_pexsi.h +++ b/source/source_hsolver/diago_pexsi.h @@ -6,7 +6,7 @@ #include "source_base/macros.h" // GetRealType #include "source_hamilt/hamilt.h" #include "source_basis/module_ao/parallel_orbitals.h" -#include "module_pexsi/pexsi_solver.h" +#include "module_pexsi/pexsi_solver_interface.h" namespace hsolver { @@ -19,16 +19,21 @@ class DiagoPexsi static std::vector mu_buffer; public: - DiagoPexsi(const Parallel_Orbitals* ParaV_in); + explicit DiagoPexsi(const Parallel_Orbitals* ParaV_in, + std::unique_ptr solver_in = nullptr); void diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* eigenvalue_in); const Parallel_Orbitals* ParaV = nullptr; std::vector DM; std::vector EDM; - double totalEnergyH; - double totalEnergyS; - double totalFreeEnergy; - std::unique_ptr ps; + double totalEnergyH = 0.0; + double totalEnergyS = 0.0; + double totalFreeEnergy = 0.0; + std::unique_ptr ps; ~DiagoPexsi(); + + private: + std::vector> dm_buffer_; + std::vector> edm_buffer_; }; } // namespace hsolver diff --git a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp index 43f069c1238..c482ddc3197 100644 --- a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp +++ b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp @@ -147,7 +147,7 @@ inline void DistMatrixTransformer::countMatrixDistribution(int N, double* A, std for (int i = 0; i < N; ++i) { int key = 0; - if (fabs(A[i] < 1e-31)) + if (fabs(A[i]) < 1e-31) key = -100; else key = floor(log10(fabs(A[i]))); @@ -292,13 +292,13 @@ int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, // transfer index to receiver std::vector receiver_index(receiver_size); - MPI_Alltoallv(&sender_index[0], - &sender_size_process[0], - &sender_displacement_process[0], + MPI_Alltoallv(sender_index.data(), + sender_size_process.data(), + sender_displacement_process.data(), MPI_INT, - &receiver_index[0], - &receiver_size_process[0], - &receiver_displacement_process[0], + receiver_index.data(), + receiver_size_process.data(), + receiver_displacement_process.data(), MPI_INT, COMM_TRANS); @@ -309,9 +309,9 @@ int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, NPROC_TRANS, receiver_size_process, receiver_displacement_process, - &receiver_index[0], + receiver_index.data(), DST_Matrix, - &buffer2ccsIndex[0]); + buffer2ccsIndex.data()); return 0; } @@ -337,10 +337,8 @@ int DistMatrixTransformer::deleteGroupCommTrans(MPI_Group& GROUP_TRANS, MPI_Comm return 0; } -// transform two sparse matrices from block cyclic distribution (BCD) to Compressed Column Storage (CCS) distribution -// two destination matrices share the same non-zero elements positions -// if either of two elements in source matrices is non-zeros, the elements in the destination matrices are non-zero, -// even if one of them is acturely zero All matrices must have same MPI communicator +// Transform two sparse matrices from BCD to CCS with a shared sparse pattern. H and S are packed together so that the +// value redistribution needs one MPI_Alltoallv instead of one collective per matrix. int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, double* H_2d, double* S_2d, @@ -397,68 +395,59 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, receiver_size_process, receiver_displacement_process, buffer2ccsIndex); -// Do transformation - std::vector sender_buffer(sender_size); - std::vector receiver_buffer(receiver_size); - // put H to sender buffer + std::vector sender_buffer(2 * sender_size); if (SRC_Matrix.get_layout() == 'R' || SRC_Matrix.get_layout() == 'r') { for (int i = 0; i < sender_size; ++i) { - sender_buffer[i] = H_2d[rowidx[i] * SRC_Matrix.get_ncol() + colidx[i]]; + const int idx = rowidx[i] * SRC_Matrix.get_ncol() + colidx[i]; + sender_buffer[2 * i] = H_2d[idx]; + sender_buffer[2 * i + 1] = S_2d[idx]; } } else { for (int i = 0; i < sender_size; ++i) { - sender_buffer[i] = H_2d[colidx[i] * SRC_Matrix.get_nrow() + rowidx[i]]; + const int idx = colidx[i] * SRC_Matrix.get_nrow() + rowidx[i]; + sender_buffer[2 * i] = H_2d[idx]; + sender_buffer[2 * i + 1] = S_2d[idx]; } } - // do all2all transformation - MPI_Alltoallv(&sender_buffer[0], - &sender_size_process[0], - &sender_displacement_process[0], - MPI_DOUBLE, - &receiver_buffer[0], - &receiver_size_process[0], - &receiver_displacement_process[0], - MPI_DOUBLE, - COMM_TRANS); -// collect H from receiver buffer - delete[] H_ccs; - H_ccs = new double[receiver_size]; - buffer2CCSvalue(receiver_size, &buffer2ccsIndex[0], &receiver_buffer[0], H_ccs); - // put S to sender buffer - if (SRC_Matrix.get_layout() == 'R' || SRC_Matrix.get_layout() == 'r') - { - for (int i = 0; i < sender_size; ++i) - { - sender_buffer[i] = S_2d[rowidx[i] * SRC_Matrix.get_ncol() + colidx[i]]; - } - } - else + std::vector sender_value_size_process(sender_size_process); + std::vector sender_value_displacement_process(sender_displacement_process); + std::vector receiver_value_size_process(receiver_size_process); + std::vector receiver_value_displacement_process(receiver_displacement_process); + for (int i = 0; i < NPROC_TRANS; ++i) { - for (int i = 0; i < sender_size; ++i) - { - sender_buffer[i] = S_2d[colidx[i] * SRC_Matrix.get_nrow() + rowidx[i]]; - } + sender_value_size_process[i] *= 2; + sender_value_displacement_process[i] *= 2; + receiver_value_size_process[i] *= 2; + receiver_value_displacement_process[i] *= 2; } - // do all2all transformation - MPI_Alltoallv(&sender_buffer[0], - &sender_size_process[0], - &sender_displacement_process[0], + + std::vector receiver_buffer(2 * receiver_size); + MPI_Alltoallv(sender_buffer.data(), + sender_value_size_process.data(), + sender_value_displacement_process.data(), MPI_DOUBLE, - &receiver_buffer[0], - &receiver_size_process[0], - &receiver_displacement_process[0], + receiver_buffer.data(), + receiver_value_size_process.data(), + receiver_value_displacement_process.data(), MPI_DOUBLE, COMM_TRANS); -// collect S from receiver buffer + + delete[] H_ccs; + H_ccs = new double[receiver_size]; delete[] S_ccs; S_ccs = new double[receiver_size]; - buffer2CCSvalue(receiver_size, &buffer2ccsIndex[0], &receiver_buffer[0], S_ccs); + for (int i = 0; i < receiver_size; ++i) + { + const int buffer_index = buffer2ccsIndex[i]; + H_ccs[i] = receiver_buffer[2 * buffer_index]; + S_ccs[i] = receiver_buffer[2 * buffer_index + 1]; + } } // clear and return deleteGroupCommTrans(GROUP_TRANS, COMM_TRANS); @@ -762,4 +751,4 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, } } // namespace pexsi -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/module_pexsi/pexsi_solver.cpp b/source/source_hsolver/module_pexsi/pexsi_solver.cpp index 1019090c483..4b95bc7564a 100644 --- a/source/source_hsolver/module_pexsi/pexsi_solver.cpp +++ b/source/source_hsolver/module_pexsi/pexsi_solver.cpp @@ -98,25 +98,25 @@ int PEXSI_Solver::solve(double mu0) return 0; } -const double PEXSI_Solver::get_totalFreeEnergy() const +double PEXSI_Solver::get_totalFreeEnergy() const { return totalFreeEnergy; } -const double PEXSI_Solver::get_totalEnergyH() const +double PEXSI_Solver::get_totalEnergyH() const { return totalEnergyH; } -const double PEXSI_Solver::get_totalEnergyS() const +double PEXSI_Solver::get_totalEnergyS() const { return totalEnergyS; } -const double PEXSI_Solver::get_mu() const +double PEXSI_Solver::get_mu() const { return mu; } } // namespace pexsi -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/module_pexsi/pexsi_solver.h b/source/source_hsolver/module_pexsi/pexsi_solver.h index 922f1b9fb3d..f47ffe99202 100644 --- a/source/source_hsolver/module_pexsi/pexsi_solver.h +++ b/source/source_hsolver/module_pexsi/pexsi_solver.h @@ -1,11 +1,13 @@ #ifndef PEXSI_Solver_H #define PEXSI_Solver_H +#include "pexsi_solver_interface.h" + #include namespace pexsi { -class PEXSI_Solver +class PEXSI_Solver : public IPexsiSolver { public: void prepare(const int blacs_text, @@ -15,12 +17,12 @@ class PEXSI_Solver const double* h, const double* s, double*& DM, - double*& EDM); - int solve(double mu0); - const double get_totalFreeEnergy() const; - const double get_totalEnergyH() const; - const double get_totalEnergyS() const; - const double get_mu() const; + double*& EDM) override; + int solve(double mu0) override; + double get_totalFreeEnergy() const override; + double get_totalEnergyH() const override; + double get_totalEnergyS() const override; + double get_mu() const override; //========================================================== // PEXSI related variables @@ -140,4 +142,4 @@ class PEXSI_Solver double mu; }; } // namespace pexsi -#endif // PEXSI_Solver_H \ No newline at end of file +#endif // PEXSI_Solver_H diff --git a/source/source_hsolver/module_pexsi/pexsi_solver_interface.h b/source/source_hsolver/module_pexsi/pexsi_solver_interface.h new file mode 100644 index 00000000000..405ea0adb0f --- /dev/null +++ b/source/source_hsolver/module_pexsi/pexsi_solver_interface.h @@ -0,0 +1,28 @@ +#ifndef PEXSI_SOLVER_INTERFACE_H +#define PEXSI_SOLVER_INTERFACE_H + +namespace pexsi +{ +class IPexsiSolver +{ + public: + virtual ~IPexsiSolver() = default; + + virtual void prepare(const int blacs_text, + const int nb, + const int nrow, + const int ncol, + const double* h, + const double* s, + double*& DM, + double*& EDM) + = 0; + virtual int solve(double mu0) = 0; + virtual double get_totalFreeEnergy() const = 0; + virtual double get_totalEnergyH() const = 0; + virtual double get_totalEnergyS() const = 0; + virtual double get_mu() const = 0; +}; +} // namespace pexsi + +#endif // PEXSI_SOLVER_INTERFACE_H diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.cpp b/source/source_hsolver/module_pexsi/simple_pexsi.cpp index 6c52066e373..4d51a928332 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.cpp +++ b/source/source_hsolver/module_pexsi/simple_pexsi.cpp @@ -21,7 +21,7 @@ #include "source_base/timer.h" #include "source_base/tool_quit.h" #include "source_base/global_variable.h" -#include "source_hsolver/diago_pexsi.h" +#include "pexsi_solver.h" namespace pexsi { @@ -271,7 +271,9 @@ int simplePEXSI(MPI_Comm comm_PEXSI, double* FDMnzvalLocal = nullptr; // transform H and S from 2D block cyclic distribution to compressed column sparse matrix // LiuXh modify 2021-03-30, add DONE(ofs_running,"xx") for test + ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT2CCS"); DistMatrixTransformer::transformBCDtoCCS(SRC_Matrix, H, S, ZERO_Limit, DST_Matrix, HnzvalLocal, SnzvalLocal); + ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT2CCS"); // MPI_Barrier(MPI_COMM_WORLD); // LiuXh modify 2021-03-30, add DONE(ofs_running,"xx") for test if (comm_PEXSI != MPI_COMM_NULL) @@ -363,4 +365,4 @@ int simplePEXSI(MPI_Comm comm_PEXSI, return 0; } } // namespace pexsi -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/test/diago_pexsi_test.cpp b/source/source_hsolver/test/diago_pexsi_test.cpp index 0d021166e11..327da4f67d4 100644 --- a/source/source_hsolver/test/diago_pexsi_test.cpp +++ b/source/source_hsolver/test/diago_pexsi_test.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #define PASSTHRESHOLD 5e-4 @@ -382,6 +384,124 @@ class PexsiGammaOnlyTest : public ::testing::TestWithParam> { }; +class FakePexsiSolver : public pexsi::IPexsiSolver +{ + public: + void prepare(const int blacs_text_in, + const int nb_in, + const int nrow_in, + const int ncol_in, + const double* h_in, + const double* s_in, + double*& dm_in, + double*& edm_in) override + { + prepared = true; + blacs_text = blacs_text_in; + nb = nb_in; + nrow = nrow_in; + ncol = ncol_in; + h = h_in; + s = s_in; + dm = dm_in; + edm = edm_in; + for (int i = 0; i < nrow * ncol; ++i) + { + dm[i] = 10.0 + i; + edm[i] = 20.0 + i; + } + } + + int solve(double mu0_in) override + { + solved = true; + mu0 = mu0_in; + return 0; + } + + double get_totalFreeEnergy() const override + { + return total_free_energy; + } + + double get_totalEnergyH() const override + { + return total_energy_h; + } + + double get_totalEnergyS() const override + { + return total_energy_s; + } + + double get_mu() const override + { + return mu; + } + + bool prepared = false; + bool solved = false; + int blacs_text = -1; + int nb = -1; + int nrow = -1; + int ncol = -1; + double mu0 = 0.0; + const double* h = nullptr; + const double* s = nullptr; + double* dm = nullptr; + double* edm = nullptr; + double total_free_energy = -1.5; + double total_energy_h = -2.5; + double total_energy_s = 3.5; + double mu = 0.125; +}; + +TEST(DiagoPexsiRefactorTest, UsesInjectedSolverAndOwnedDensityBuffers) +{ + PARAM.input.nspin = 1; + PARAM.inp.nspin = 1; + pexsi::PEXSI_Solver::pexsi_mu = -0.25; + + Parallel_Orbitals po; + po.blacs_ctxt = 11; + po.nb = 2; + po.nrow = 2; + po.ncol = 2; + + HamiltTEST hmtest; + hmtest.nrow = po.nrow; + hmtest.ncol = po.ncol; + hmtest.h_local = {1.0, 0.0, 0.0, 2.0}; + hmtest.s_local = {1.0, 0.0, 0.0, 1.0}; + + psi::Psi psi; + + auto fake_solver = std::make_unique(); + FakePexsiSolver* fake_solver_ptr = fake_solver.get(); + hsolver::DiagoPexsi diago(&po, std::move(fake_solver)); + + ASSERT_NE(diago.DM[0], nullptr); + ASSERT_NE(diago.EDM[0], nullptr); + diago.diag(&hmtest, psi, nullptr); + + EXPECT_TRUE(fake_solver_ptr->prepared); + EXPECT_TRUE(fake_solver_ptr->solved); + EXPECT_EQ(fake_solver_ptr->blacs_text, po.blacs_ctxt); + EXPECT_EQ(fake_solver_ptr->nb, po.nb); + EXPECT_EQ(fake_solver_ptr->nrow, po.nrow); + EXPECT_EQ(fake_solver_ptr->ncol, po.ncol); + EXPECT_EQ(fake_solver_ptr->h, hmtest.h_local.data()); + EXPECT_EQ(fake_solver_ptr->s, hmtest.s_local.data()); + EXPECT_EQ(fake_solver_ptr->dm, diago.DM[0]); + EXPECT_EQ(fake_solver_ptr->edm, diago.EDM[0]); + EXPECT_DOUBLE_EQ(fake_solver_ptr->mu0, -0.25); + EXPECT_DOUBLE_EQ(diago.DM[0][3], 13.0); + EXPECT_DOUBLE_EQ(diago.EDM[0][3], 23.0); + EXPECT_DOUBLE_EQ(diago.totalFreeEnergy, fake_solver_ptr->total_free_energy); + EXPECT_DOUBLE_EQ(diago.totalEnergyH, fake_solver_ptr->total_energy_h); + EXPECT_DOUBLE_EQ(diago.totalEnergyS, fake_solver_ptr->total_energy_s); +} + TEST_P(PexsiGammaOnlyTest, LCAO) { std::stringstream out_info; diff --git a/tools/pexsi/check_lcao_10case_regression.py b/tools/pexsi/check_lcao_10case_regression.py new file mode 100755 index 00000000000..0fc29152118 --- /dev/null +++ b/tools/pexsi/check_lcao_10case_regression.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Validate ABACUS LCAO 10-case outputs against the official energy baseline.""" + +from __future__ import annotations + +import argparse +import re +import sys +from pathlib import Path + + +REFERENCE_ENERGIES_EV = { + "001": ("001_4GaAs", -7836.15655820), + "002": ("002_C2H6O", -665.55500011), + "003": ("003_4MoS2", -9699.00659511), + "004": ("004_12Pt111", -39600.74431945), + "005": ("005_3BaTiO3", -10749.40029987), + "006": ("006_16Na", -18524.76009654), + "007": ("007_27Fe", -86945.53067515), + "008": ("008_32H2O", -14950.45084341), + "009": ("009_Battery108", -124070.65411079), + "010": ("010_216Si", -23123.69990200), +} + +FINAL_ETOT_RE = re.compile(r"!\s*FINAL_ETOT_IS\s+([-+0-9.eE]+)\s+eV") + + +def find_case_dir(run_root: Path, case_id: str) -> Path | None: + matches = sorted(path for path in run_root.iterdir() if path.is_dir() and path.name.startswith(f"{case_id}_")) + return matches[0] if matches else None + + +def find_running_log(case_dir: Path) -> Path | None: + matches = sorted(case_dir.glob("OUT.*/running_scf.log")) + return matches[0] if matches else None + + +def parse_log(log_path: Path) -> tuple[bool, float | None]: + text = log_path.read_text(errors="replace") + converged = "#SCF IS CONVERGED#" in text + matches = FINAL_ETOT_RE.findall(text) + final_energy = float(matches[-1]) if matches else None + return converged, final_energy + + +def validate(run_root: Path, tolerance_ev: float) -> int: + if not run_root.is_dir(): + print(f"ERROR: run root does not exist: {run_root}", file=sys.stderr) + return 2 + + failures: list[str] = [] + print(f"{'case':<18} {'status':<10} {'energy(eV)':>18} {'ref(eV)':>18} {'|dE|(eV)':>12}") + for case_id, (case_name, reference_energy) in REFERENCE_ENERGIES_EV.items(): + case_dir = find_case_dir(run_root, case_id) + if case_dir is None: + failures.append(f"{case_name}: missing case directory") + print(f"{case_name:<18} {'MISSING':<10}") + continue + + log_path = find_running_log(case_dir) + if log_path is None: + failures.append(f"{case_name}: missing OUT.*/running_scf.log") + print(f"{case_name:<18} {'NO_LOG':<10}") + continue + + converged, final_energy = parse_log(log_path) + if final_energy is None: + failures.append(f"{case_name}: missing FINAL_ETOT_IS") + print(f"{case_name:<18} {'NO_ETOT':<10}") + continue + + delta = abs(final_energy - reference_energy) + status = "PASS" if converged and delta <= tolerance_ev else "FAIL" + if status != "PASS": + reason = "not converged" if not converged else f"|dE|={delta:.3e} > {tolerance_ev:.3e}" + failures.append(f"{case_name}: {reason}") + print(f"{case_name:<18} {status:<10} {final_energy:18.10f} {reference_energy:18.10f} {delta:12.3e}") + + if failures: + print("\nFailures:", file=sys.stderr) + for failure in failures: + print(f" - {failure}", file=sys.stderr) + return 1 + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("run_root", type=Path, help="directory containing the 001_* ... 010_* LCAO case outputs") + parser.add_argument( + "--tolerance-ev", + type=float, + default=5.0e-7, + help="maximum allowed absolute total-energy error in eV", + ) + args = parser.parse_args() + return validate(args.run_root, args.tolerance_ev) + + +if __name__ == "__main__": + raise SystemExit(main()) From 5978ae6a29e92c0a0212a2766b305d4af65cd63d Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Sun, 24 May 2026 20:15:39 +0800 Subject: [PATCH 02/23] Enable complex multi-k PEXSI path --- source/source_estate/elecstate_lcao.cpp | 51 ++- source/source_hsolver/diago_pexsi.cpp | 188 +++++++- source/source_hsolver/diago_pexsi.h | 12 + source/source_hsolver/hsolver_lcao.cpp | 35 +- .../module_pexsi/dist_matrix_transformer.cpp | 432 +++++++++++++++++- .../module_pexsi/dist_matrix_transformer.h | 28 +- .../module_pexsi/simple_pexsi.cpp | 167 ++++++- .../module_pexsi/simple_pexsi.h | 27 +- 8 files changed, 914 insertions(+), 26 deletions(-) diff --git a/source/source_estate/elecstate_lcao.cpp b/source/source_estate/elecstate_lcao.cpp index 2040ee769e1..3ce61930771 100644 --- a/source/source_estate/elecstate_lcao.cpp +++ b/source/source_estate/elecstate_lcao.cpp @@ -80,7 +80,56 @@ void ElecStateLCAO>::dm2rho(std::vector*> pexsi_EDM, DensityMatrix, double>* dm) { - ModuleBase::WARNING_QUIT("ElecStateLCAO", "pexsi is not completed for multi-k case"); + ModuleBase::timer::start("ElecStateLCAO", "dm2rho"); + + const int dmk_size = dm->get_DMK_size(); + if (static_cast(pexsi_DM.size()) < dmk_size) + { + ModuleBase::WARNING_QUIT("ElecStateLCAO", "PEXSI multi-k density matrix buffer is smaller than DMK storage"); + } + + const int local_matrix_size = dm->get_DMK_nrow() * dm->get_DMK_ncol(); + const double pexsi_spin_weight = PARAM.inp.nspin == 1 ? 0.5 : 1.0; + for (int ik = 0; ik < dmk_size; ++ik) + { + const double k_weight = ((this->klist != nullptr && ik < static_cast(this->klist->wk.size())) + ? this->klist->wk[ik] + : 1.0) + * pexsi_spin_weight; + for (int i = 0; i < local_matrix_size; ++i) + { + pexsi_DM[ik][i] *= k_weight; + pexsi_EDM[ik][i] *= k_weight; + } + dm->set_DMK_pointer(ik, pexsi_DM[ik]); + } + +#ifdef __PEXSI + dm->pexsi_EDM = pexsi_EDM; +#endif + + dm->cal_DMR(); + + for (int is = 0; is < PARAM.inp.nspin; is++) + { + ModuleBase::GlobalFunc::ZEROS(this->charge->rho[is], this->charge->nrxx); + } + + ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!"); + ModuleGint::cal_gint_rho(dm->get_DMR_vector(), PARAM.inp.nspin, this->charge->rho); + if (XC_Functional::get_ked_flag()) + { + for (int is = 0; is < PARAM.inp.nspin; is++) + { + ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[0], this->charge->nrxx); + } + ModuleGint::cal_gint_tau(dm->get_DMR_vector(), PARAM.inp.nspin, this->charge->kin_r); + } + + this->charge->renormalize_rho(); + + ModuleBase::timer::end("ElecStateLCAO", "dm2rho"); + return; } diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index 47a745e276f..b9cd1cae2a8 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -6,12 +6,19 @@ #include "diago_pexsi.h" #include "source_base/tool_title.h" #include "source_base/global_variable.h" +#include "source_base/parallel_global.h" #include "source_base/tool_quit.h" #include "source_basis/module_ao/parallel_orbitals.h" +#include "module_pexsi/simple_pexsi.h" #include "module_pexsi/pexsi_solver.h" +#include +#include +#include #include +extern MPI_Comm DIAG_WORLD; + typedef hamilt::MatrixBlock matd; typedef hamilt::MatrixBlock> matcd; @@ -35,27 +42,126 @@ DiagoPexsi::DiagoPexsi(const Parallel_Orbitals* ParaV_in, std::unique_ptr(mu_buffer.size()) < nspin) + { + mu_buffer.resize(nspin, pexsi::PEXSI_Solver::pexsi_mu); + } + this->resize_density_buffers(nspin); - const int local_size = ParaV->nrow * ParaV->ncol; +} - this->DM.resize(nspin); - this->EDM.resize(nspin); - this->dm_buffer_.resize(nspin); - this->edm_buffer_.resize(nspin); - for (int i = 0; i < nspin; i++) +template +DiagoPexsi::~DiagoPexsi() +{ +} + +template +void DiagoPexsi::resize_density_buffers(const int count) +{ + const int local_size = this->ParaV->nrow * this->ParaV->ncol; + this->DM.resize(count); + this->EDM.resize(count); + this->dm_buffer_.resize(count); + this->edm_buffer_.resize(count); + for (int i = 0; i < count; ++i) { - this->dm_buffer_[i].assign(local_size, T{}); - this->edm_buffer_[i].assign(local_size, T{}); + if (static_cast(this->dm_buffer_[i].size()) != local_size) + { + this->dm_buffer_[i].assign(local_size, T{}); + } + if (static_cast(this->edm_buffer_[i].size()) != local_size) + { + this->edm_buffer_[i].assign(local_size, T{}); + } this->DM[i] = this->dm_buffer_[i].data(); this->EDM[i] = this->edm_buffer_[i].data(); } + if (static_cast(mu_buffer.size()) < count) + { + mu_buffer.resize(count, pexsi::PEXSI_Solver::pexsi_mu); + } +} +template +void DiagoPexsi::begin_mu_search() +{ + this->has_mu_lower_ = false; + this->has_mu_upper_ = false; + this->mu_lower_ = 0.0; + this->mu_upper_ = 0.0; } template -DiagoPexsi::~DiagoPexsi() +void DiagoPexsi::begin_k_loop() +{ + this->num_electron_sum_ = 0.0; + this->num_electron_derivative_sum_ = 0.0; + this->totalEnergyH = 0.0; + this->totalEnergyS = 0.0; + this->totalFreeEnergy = 0.0; +} + +template +void DiagoPexsi::set_k_weight(const int ik, const double weight) +{ + if (ik >= static_cast(this->k_weights_.size())) + { + this->k_weights_.resize(ik + 1, 1.0); + } + this->k_weights_[ik] = weight; +} + +template +bool DiagoPexsi::finish_k_loop(const double target_nelec) { + return true; +} + +template <> +bool DiagoPexsi>::finish_k_loop(const double target_nelec) +{ + const double residual = target_nelec - this->num_electron_sum_; + if (std::abs(residual) <= pexsi::PEXSI_Solver::pexsi_elec_thr) + { + return true; + } + if (residual > 0.0) + { + this->has_mu_lower_ = true; + this->mu_lower_ = mu_buffer[0]; + } + else + { + this->has_mu_upper_ = true; + this->mu_upper_ = mu_buffer[0]; + } + + double delta_mu = 0.0; + if (std::abs(this->num_electron_derivative_sum_) <= DBL_MIN) + { + if (this->has_mu_lower_ && this->has_mu_upper_) + { + mu_buffer[0] = 0.5 * (this->mu_lower_ + this->mu_upper_); + return false; + } + const double fallback_step = std::max(pexsi::PEXSI_Solver::pexsi_mu_guard, 0.5); + delta_mu = residual > 0.0 ? fallback_step : -fallback_step; + } + else + { + delta_mu = residual / this->num_electron_derivative_sum_; + } + if (pexsi::PEXSI_Solver::pexsi_mu_guard > 0.0 && std::abs(this->num_electron_derivative_sum_) > DBL_MIN) + { + delta_mu = std::max(-pexsi::PEXSI_Solver::pexsi_mu_guard, + std::min(pexsi::PEXSI_Solver::pexsi_mu_guard, delta_mu)); + } + if (mu_buffer.empty()) + { + mu_buffer.push_back(pexsi::PEXSI_Solver::pexsi_mu); + } + mu_buffer[0] += delta_mu; + return false; } template <> @@ -93,7 +199,67 @@ void DiagoPexsi>::diag(hamilt::Hamilt> double* eigenvalue_in) { ModuleBase::TITLE("DiagoPEXSI", "diag"); - ModuleBase::WARNING_QUIT("DiagoPEXSI", "PEXSI is not completed for multi-k case"); + matcd h_mat, s_mat; + phm_in->matrix(h_mat, s_mat); + const int ik = psi.get_current_k(); + if (ik < 0) + { + ModuleBase::WARNING_QUIT("DiagoPEXSI", "invalid k-point index for complex PEXSI path"); + } + if (ik >= static_cast(this->DM.size())) + { + this->resize_density_buffers(ik + 1); + } + if (mu_buffer.empty()) + { + mu_buffer.push_back(pexsi::PEXSI_Solver::pexsi_mu); + } + + MPI_Group grid_group; + MPI_Group world_group; + int grid_np = 0; + MPI_Comm_size(DIAG_WORLD, &grid_np); + MPI_Comm_group(DIAG_WORLD, &world_group); + int grid_proc_range[3] = {0, (GlobalV::NPROC / grid_np) * grid_np - 1, GlobalV::NPROC / grid_np}; + MPI_Group_range_incl(world_group, 1, &grid_proc_range, &grid_group); + + double num_electron = 0.0; + double num_electron_derivative = 0.0; + double total_energy_h = 0.0; + double total_energy_s = 0.0; + double total_free_energy = 0.0; + pexsi::simplePEXSIComplex(DIAG_WORLD, + DIAG_WORLD, + grid_group, + this->ParaV->blacs_ctxt, + PARAM.globalv.nlocal, + this->ParaV->nb, + this->ParaV->nrow, + this->ParaV->ncol, + 'c', + h_mat.p, + s_mat.p, + PARAM.inp.nelec, + "PEXSIOPTION", + this->DM[ik], + this->EDM[ik], + total_energy_h, + total_energy_s, + total_free_energy, + mu_buffer[0], + mu_buffer[0], + &num_electron, + &num_electron_derivative); + + const double k_weight = ik < static_cast(this->k_weights_.size()) ? this->k_weights_[ik] : 1.0; + this->num_electron_sum_ += k_weight * num_electron; + this->num_electron_derivative_sum_ += k_weight * num_electron_derivative; + this->totalEnergyH += k_weight * total_energy_h; + this->totalEnergyS += k_weight * total_energy_s; + this->totalFreeEnergy += k_weight * total_free_energy; + + MPI_Group_free(&grid_group); + MPI_Group_free(&world_group); } template class DiagoPexsi; diff --git a/source/source_hsolver/diago_pexsi.h b/source/source_hsolver/diago_pexsi.h index 60efdeed664..6206d66c04c 100644 --- a/source/source_hsolver/diago_pexsi.h +++ b/source/source_hsolver/diago_pexsi.h @@ -29,11 +29,23 @@ class DiagoPexsi double totalEnergyS = 0.0; double totalFreeEnergy = 0.0; std::unique_ptr ps; + void begin_mu_search(); + void begin_k_loop(); + void set_k_weight(const int ik, const double weight); + bool finish_k_loop(const double target_nelec); ~DiagoPexsi(); private: std::vector> dm_buffer_; std::vector> edm_buffer_; + std::vector k_weights_; + double num_electron_sum_ = 0.0; + double num_electron_derivative_sum_ = 0.0; + bool has_mu_lower_ = false; + bool has_mu_upper_ = false; + double mu_lower_ = 0.0; + double mu_upper_ = 0.0; + void resize_density_buffers(const int count); }; } // namespace hsolver diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp index b1c7ba9c95e..db23bf05216 100644 --- a/source/source_hsolver/hsolver_lcao.cpp +++ b/source/source_hsolver/hsolver_lcao.cpp @@ -22,6 +22,7 @@ #ifdef __PEXSI #include "diago_pexsi.h" +#include "module_pexsi/pexsi_solver.h" #endif #include "source_base/global_variable.h" @@ -37,6 +38,9 @@ #include "source_lcao/rho_tau_lcao.h" // mohan add 20251024 +#include +#include + namespace hsolver { @@ -114,13 +118,30 @@ void HSolverLCAO::solve(hamilt::Hamilt* pHamilt, { #ifdef __PEXSI // other purification methods should follow this routine DiagoPexsi pe(ParaV); - for (int ik = 0; ik < psi.get_nk(); ++ik) + const int pexsi_mu_loops = std::is_same>::value + ? std::min(40, std::max(1, pexsi::PEXSI_Solver::pexsi_nmax)) + : 1; + pe.begin_mu_search(); + for (int imu = 0; imu < pexsi_mu_loops; ++imu) { - /// update H(k) for each k point - pHamilt->updateHk(ik); - psi.fix_k(ik); - // solve eigenvector and eigenvalue for H(k) - pe.diag(pHamilt, psi, nullptr); + pe.begin_k_loop(); + for (int ik = 0; ik < psi.get_nk(); ++ik) + { + const double k_weight = (pes->klist != nullptr && ik < static_cast(pes->klist->wk.size())) + ? pes->klist->wk[ik] + : 1.0; + const double pexsi_spin_weight = PARAM.inp.nspin == 1 ? 0.5 : 1.0; + pe.set_k_weight(ik, k_weight * pexsi_spin_weight); + /// update H(k) for each k point + pHamilt->updateHk(ik); + psi.fix_k(ik); + // solve eigenvector and eigenvalue for H(k) + pe.diag(pHamilt, psi, nullptr); + } + if (pe.finish_k_loop(PARAM.inp.nelec)) + { + break; + } } auto _pes = dynamic_cast*>(pes); pes->f_en.eband = pe.totalFreeEnergy; @@ -475,4 +496,4 @@ void HSolverLCAO::parakSolve_cusolver(hamilt::Hamilt* pHamilt, template class HSolverLCAO; template class HSolverLCAO>; -} // namespace hsolver \ No newline at end of file +} // namespace hsolver diff --git a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp index c482ddc3197..5e36075835d 100644 --- a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp +++ b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp @@ -3,8 +3,10 @@ #include +#include #include #include +#include #include #include #include @@ -209,6 +211,59 @@ inline int DistMatrixTransformer::getNonZeroIndex(char layout, return 0; } +inline int DistMatrixTransformer::getNonZeroIndex(char layout, + const int nrow, + const int ncol, + std::complex* H_2d, + std::complex* S_2d, + const double ZERO_Limit, + int& nnz, + std::vector& rowidx, + std::vector& colidx) +{ + int idx = 0; + nnz = 0; + colidx.clear(); + rowidx.clear(); + if (layout == 'c' || layout == 'C') + { + for (int i = 0; i < ncol; ++i) + { + for (int j = 0; j < nrow; ++j) + { + idx = i * nrow + j; + if (std::abs(H_2d[idx]) > ZERO_Limit || std::abs(S_2d[idx]) > ZERO_Limit) + { + ++nnz; + colidx.push_back(i); + rowidx.push_back(j); + } + } + } + } + else if (layout == 'r' || layout == 'R') + { + for (int i = 0; i < ncol; ++i) + { + for (int j = 0; j < nrow; ++j) + { + idx = j * ncol + i; + if (std::abs(H_2d[idx]) > ZERO_Limit || std::abs(S_2d[idx]) > ZERO_Limit) + { + ++nnz; + colidx.push_back(i); + rowidx.push_back(j); + } + } + } + } + else + { + return 1; + } + return 0; +} + int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, DistCCSMatrix& DST_Matrix, const int NPROC_TRANS, @@ -395,14 +450,39 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, receiver_size_process, receiver_displacement_process, buffer2ccsIndex); + int myproc; + MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + int nproc_data; + std::vector proc_map_data_trans; + if (myproc == 0) + { + MPI_Group_size(DST_Matrix.get_group_data(), &nproc_data); + MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); + proc_map_data_trans.resize(nproc_data, 0); + for (int i = 0; i < nproc_data; ++i) + { + MPI_Group_translate_ranks(DST_Matrix.get_group_data(), 1, &i, GROUP_TRANS, &proc_map_data_trans[i]); + } + MPI_Bcast(proc_map_data_trans.data(), nproc_data, MPI_INT, 0, COMM_TRANS); + } + else + { + MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); + proc_map_data_trans.resize(nproc_data, 0); + MPI_Bcast(proc_map_data_trans.data(), nproc_data, MPI_INT, 0, COMM_TRANS); + } + std::vector next_sender_position(sender_displacement_process); std::vector sender_buffer(2 * sender_size); if (SRC_Matrix.get_layout() == 'R' || SRC_Matrix.get_layout() == 'r') { for (int i = 0; i < sender_size; ++i) { const int idx = rowidx[i] * SRC_Matrix.get_ncol() + colidx[i]; - sender_buffer[2 * i] = H_2d[idx]; - sender_buffer[2 * i + 1] = S_2d[idx]; + int dst_process; + DST_Matrix.localCol(SRC_Matrix.globalCol(colidx[i]), dst_process); + const int pos = next_sender_position[proc_map_data_trans[dst_process]]++; + sender_buffer[2 * pos] = H_2d[idx]; + sender_buffer[2 * pos + 1] = S_2d[idx]; } } else @@ -410,8 +490,11 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, for (int i = 0; i < sender_size; ++i) { const int idx = colidx[i] * SRC_Matrix.get_nrow() + rowidx[i]; - sender_buffer[2 * i] = H_2d[idx]; - sender_buffer[2 * i + 1] = S_2d[idx]; + int dst_process; + DST_Matrix.localCol(SRC_Matrix.globalCol(colidx[i]), dst_process); + const int pos = next_sender_position[proc_map_data_trans[dst_process]]++; + sender_buffer[2 * pos] = H_2d[idx]; + sender_buffer[2 * pos + 1] = S_2d[idx]; } } @@ -454,6 +537,153 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, return 0; } +int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, + std::complex* H_2d, + std::complex* S_2d, + const double ZERO_Limit, + DistCCSMatrix& DST_Matrix, + std::complex*& H_ccs, + std::complex*& S_ccs) +{ + MPI_Group GROUP_TRANS; + MPI_Comm COMM_TRANS = MPI_COMM_NULL; + newGroupCommTrans(SRC_Matrix, DST_Matrix, GROUP_TRANS, COMM_TRANS); + if (COMM_TRANS != MPI_COMM_NULL) + { + int NPROC_TRANS; + MPI_Comm_size(COMM_TRANS, &NPROC_TRANS); + int sender_size; + std::vector sender_size_process(NPROC_TRANS); + std::vector sender_displacement_process(NPROC_TRANS); + int receiver_size; + std::vector receiver_size_process(NPROC_TRANS); + std::vector receiver_displacement_process(NPROC_TRANS); + + std::vector rowidx; + std::vector colidx; + int nnz = 0; + if (SRC_Matrix.get_comm() != MPI_COMM_NULL) + { + getNonZeroIndex(SRC_Matrix.get_layout(), + SRC_Matrix.get_nrow(), + SRC_Matrix.get_ncol(), + H_2d, + S_2d, + ZERO_Limit, + nnz, + rowidx, + colidx); + } + + std::vector buffer2ccsIndex; + buildTransformParameter(SRC_Matrix, + DST_Matrix, + NPROC_TRANS, + GROUP_TRANS, + COMM_TRANS, + nnz, + rowidx, + colidx, + sender_size, + sender_size_process, + sender_displacement_process, + receiver_size, + receiver_size_process, + receiver_displacement_process, + buffer2ccsIndex); + + int myproc; + MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + int nproc_data; + std::vector proc_map_data_trans; + if (myproc == 0) + { + MPI_Group_size(DST_Matrix.get_group_data(), &nproc_data); + MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); + proc_map_data_trans.resize(nproc_data, 0); + for (int i = 0; i < nproc_data; ++i) + { + MPI_Group_translate_ranks(DST_Matrix.get_group_data(), 1, &i, GROUP_TRANS, &proc_map_data_trans[i]); + } + MPI_Bcast(proc_map_data_trans.data(), nproc_data, MPI_INT, 0, COMM_TRANS); + } + else + { + MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); + proc_map_data_trans.resize(nproc_data, 0); + MPI_Bcast(proc_map_data_trans.data(), nproc_data, MPI_INT, 0, COMM_TRANS); + } + std::vector next_sender_position(sender_displacement_process); + std::vector sender_buffer(4 * sender_size); + if (SRC_Matrix.get_layout() == 'R' || SRC_Matrix.get_layout() == 'r') + { + for (int i = 0; i < sender_size; ++i) + { + const int idx = rowidx[i] * SRC_Matrix.get_ncol() + colidx[i]; + int dst_process; + DST_Matrix.localCol(SRC_Matrix.globalCol(colidx[i]), dst_process); + const int pos = next_sender_position[proc_map_data_trans[dst_process]]++; + sender_buffer[4 * pos] = H_2d[idx].real(); + sender_buffer[4 * pos + 1] = H_2d[idx].imag(); + sender_buffer[4 * pos + 2] = S_2d[idx].real(); + sender_buffer[4 * pos + 3] = S_2d[idx].imag(); + } + } + else + { + for (int i = 0; i < sender_size; ++i) + { + const int idx = colidx[i] * SRC_Matrix.get_nrow() + rowidx[i]; + int dst_process; + DST_Matrix.localCol(SRC_Matrix.globalCol(colidx[i]), dst_process); + const int pos = next_sender_position[proc_map_data_trans[dst_process]]++; + sender_buffer[4 * pos] = H_2d[idx].real(); + sender_buffer[4 * pos + 1] = H_2d[idx].imag(); + sender_buffer[4 * pos + 2] = S_2d[idx].real(); + sender_buffer[4 * pos + 3] = S_2d[idx].imag(); + } + } + + std::vector sender_value_size_process(sender_size_process); + std::vector sender_value_displacement_process(sender_displacement_process); + std::vector receiver_value_size_process(receiver_size_process); + std::vector receiver_value_displacement_process(receiver_displacement_process); + for (int i = 0; i < NPROC_TRANS; ++i) + { + sender_value_size_process[i] *= 4; + sender_value_displacement_process[i] *= 4; + receiver_value_size_process[i] *= 4; + receiver_value_displacement_process[i] *= 4; + } + + std::vector receiver_buffer(4 * receiver_size); + MPI_Alltoallv(sender_buffer.data(), + sender_value_size_process.data(), + sender_value_displacement_process.data(), + MPI_DOUBLE, + receiver_buffer.data(), + receiver_value_size_process.data(), + receiver_value_displacement_process.data(), + MPI_DOUBLE, + COMM_TRANS); + + delete[] H_ccs; + H_ccs = new std::complex[receiver_size]; + delete[] S_ccs; + S_ccs = new std::complex[receiver_size]; + for (int i = 0; i < receiver_size; ++i) + { + const int buffer_index = buffer2ccsIndex[i]; + H_ccs[i] = std::complex(receiver_buffer[4 * buffer_index], + receiver_buffer[4 * buffer_index + 1]); + S_ccs[i] = std::complex(receiver_buffer[4 * buffer_index + 2], + receiver_buffer[4 * buffer_index + 3]); + } + } + deleteGroupCommTrans(GROUP_TRANS, COMM_TRANS); + return 0; +} + // transform two sparse matrices from Compressed Column Storage (CCS) to block cyclic distribution (BCD) distribution // two source matrices share the same non-zero elements positions int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, @@ -750,5 +980,199 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, return 0; } +int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, + std::complex* DMnzvalLocal, + std::complex* EDMnzvalLocal, + DistBCDMatrix& DST_Matrix, + std::complex* DM, + std::complex* EDM) +{ + MPI_Group GROUP_TRANS; + MPI_Comm COMM_TRANS = MPI_COMM_NULL; + newGroupCommTrans(DST_Matrix, SRC_Matrix, GROUP_TRANS, COMM_TRANS); + if (COMM_TRANS != MPI_COMM_NULL) + { + const int dst_size = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); + for (int i = 0; i < dst_size; ++i) + { + DM[i] = std::complex(0.0, 0.0); + EDM[i] = std::complex(0.0, 0.0); + } + + int NPROC_TRANS; + MPI_Comm_size(COMM_TRANS, &NPROC_TRANS); + std::vector sender_size_process(NPROC_TRANS, 0); + std::vector sender_displacement_process(NPROC_TRANS, 0); + std::vector receiver_size_process(NPROC_TRANS, 0); + std::vector receiver_displacement_process(NPROC_TRANS, 0); + + int nproc_bcd = 0; + std::vector proc_map_bcd_trans; + int myproc_trans = 0; + MPI_Comm_rank(COMM_TRANS, &myproc_trans); + if (myproc_trans == 0) + { + MPI_Group_size(DST_Matrix.get_group(), &nproc_bcd); + MPI_Bcast(&nproc_bcd, 1, MPI_INT, 0, COMM_TRANS); + proc_map_bcd_trans.resize(nproc_bcd, 0); + for (int i = 0; i < nproc_bcd; ++i) + { + MPI_Group_translate_ranks(DST_Matrix.get_group(), 1, &i, GROUP_TRANS, &proc_map_bcd_trans[i]); + } + MPI_Bcast(proc_map_bcd_trans.data(), nproc_bcd, MPI_INT, 0, COMM_TRANS); + } + else + { + MPI_Bcast(&nproc_bcd, 1, MPI_INT, 0, COMM_TRANS); + proc_map_bcd_trans.resize(nproc_bcd, 0); + MPI_Bcast(proc_map_bcd_trans.data(), nproc_bcd, MPI_INT, 0, COMM_TRANS); + } + + for (int icol = 0; icol < SRC_Matrix.get_numcol_local(); ++icol) + { + const int g_col = SRC_Matrix.globalCol(icol); + int recv_pcol_bcd = 0; + DST_Matrix.localCol(g_col, recv_pcol_bcd); + for (int rowidx = SRC_Matrix.get_colptr_local()[icol] - 1; + rowidx < SRC_Matrix.get_colptr_local()[icol + 1] - 1; + ++rowidx) + { + const int g_row = SRC_Matrix.get_rowind_local()[rowidx] - 1; + int recv_prow_bcd = 0; + DST_Matrix.localRow(g_row, recv_prow_bcd); + const int recv_proc_bcd = DST_Matrix.pnum(recv_prow_bcd, recv_pcol_bcd); + const int recv_proc = proc_map_bcd_trans[recv_proc_bcd]; + ++sender_size_process[recv_proc]; + } + } + + MPI_Alltoall(sender_size_process.data(), + 1, + MPI_INT, + receiver_size_process.data(), + 1, + MPI_INT, + COMM_TRANS); + + int receiver_size = receiver_size_process[0]; + for (int i = 1; i < NPROC_TRANS; ++i) + { + sender_displacement_process[i] = sender_displacement_process[i - 1] + sender_size_process[i - 1]; + receiver_displacement_process[i] = receiver_displacement_process[i - 1] + receiver_size_process[i - 1]; + receiver_size += receiver_size_process[i]; + } + + const int sender_size = SRC_Matrix.get_nnzlocal(); + std::vector sender_index(std::max(sender_size, 1), -1); + std::vector dst_index(std::max(2 * sender_size, 2), -1); + std::vector p(sender_displacement_process); + int idx = 0; + for (int icol = 0; icol < SRC_Matrix.get_numcol_local(); ++icol) + { + const int g_col = SRC_Matrix.globalCol(icol); + int recv_pcol_bcd = 0; + const int recv_col = DST_Matrix.localCol(g_col, recv_pcol_bcd); + for (int rowidx = SRC_Matrix.get_colptr_local()[icol] - 1; + rowidx < SRC_Matrix.get_colptr_local()[icol + 1] - 1; + ++rowidx) + { + const int g_row = SRC_Matrix.get_rowind_local()[rowidx] - 1; + int recv_prow_bcd = 0; + const int recv_row = DST_Matrix.localRow(g_row, recv_prow_bcd); + const int recv_proc_bcd = DST_Matrix.pnum(recv_prow_bcd, recv_pcol_bcd); + const int recv_proc = proc_map_bcd_trans[recv_proc_bcd]; + sender_index[p[recv_proc]] = idx; + dst_index[2 * p[recv_proc]] = recv_row; + dst_index[2 * p[recv_proc] + 1] = recv_col; + ++p[recv_proc]; + ++idx; + } + } + + std::vector sender_index_size_process(sender_size_process); + std::vector sender_index_displacement_process(sender_displacement_process); + std::vector receiver_index_size_process(receiver_size_process); + std::vector receiver_index_displacement_process(receiver_displacement_process); + for (int i = 0; i < NPROC_TRANS; ++i) + { + sender_index_size_process[i] *= 2; + sender_index_displacement_process[i] *= 2; + receiver_index_size_process[i] *= 2; + receiver_index_displacement_process[i] *= 2; + } + + std::vector receiver_index(std::max(2 * receiver_size, 2), -1); + MPI_Alltoallv(dst_index.data(), + sender_index_size_process.data(), + sender_index_displacement_process.data(), + MPI_INT, + receiver_index.data(), + receiver_index_size_process.data(), + receiver_index_displacement_process.data(), + MPI_INT, + COMM_TRANS); + + std::vector sender_buffer(std::max(4 * sender_size, 1), 0.0); + for (int i = 0; i < sender_size; ++i) + { + const int src_index = sender_index[i]; + const std::complex dm_value = DMnzvalLocal[src_index]; + const std::complex edm_value = EDMnzvalLocal[src_index]; + sender_buffer[4 * i] = dm_value.real(); + sender_buffer[4 * i + 1] = dm_value.imag(); + sender_buffer[4 * i + 2] = edm_value.real(); + sender_buffer[4 * i + 3] = edm_value.imag(); + } + + std::vector sender_value_size_process(sender_size_process); + std::vector sender_value_displacement_process(sender_displacement_process); + std::vector receiver_value_size_process(receiver_size_process); + std::vector receiver_value_displacement_process(receiver_displacement_process); + for (int i = 0; i < NPROC_TRANS; ++i) + { + sender_value_size_process[i] *= 4; + sender_value_displacement_process[i] *= 4; + receiver_value_size_process[i] *= 4; + receiver_value_displacement_process[i] *= 4; + } + + std::vector receiver_buffer(std::max(4 * receiver_size, 1), 0.0); + MPI_Alltoallv(sender_buffer.data(), + sender_value_size_process.data(), + sender_value_displacement_process.data(), + MPI_DOUBLE, + receiver_buffer.data(), + receiver_value_size_process.data(), + receiver_value_displacement_process.data(), + MPI_DOUBLE, + COMM_TRANS); + + if (DST_Matrix.get_layout() == 'R' || DST_Matrix.get_layout() == 'r') + { + for (int i = 0; i < receiver_size; ++i) + { + const int ix = receiver_index[2 * i]; + const int iy = receiver_index[2 * i + 1]; + const int dst_index = ix * DST_Matrix.get_ncol() + iy; + DM[dst_index] = std::complex(receiver_buffer[4 * i], -receiver_buffer[4 * i + 1]); + EDM[dst_index] = std::complex(receiver_buffer[4 * i + 2], -receiver_buffer[4 * i + 3]); + } + } + else + { + for (int i = 0; i < receiver_size; ++i) + { + const int ix = receiver_index[2 * i]; + const int iy = receiver_index[2 * i + 1]; + const int dst_index = iy * DST_Matrix.get_nrow() + ix; + DM[dst_index] = std::complex(receiver_buffer[4 * i], -receiver_buffer[4 * i + 1]); + EDM[dst_index] = std::complex(receiver_buffer[4 * i + 2], -receiver_buffer[4 * i + 3]); + } + } + } + deleteGroupCommTrans(GROUP_TRANS, COMM_TRANS); + return 0; +} + } // namespace pexsi #endif diff --git a/source/source_hsolver/module_pexsi/dist_matrix_transformer.h b/source/source_hsolver/module_pexsi/dist_matrix_transformer.h index 672b22f4f32..916c8f332bb 100644 --- a/source/source_hsolver/module_pexsi/dist_matrix_transformer.h +++ b/source/source_hsolver/module_pexsi/dist_matrix_transformer.h @@ -2,6 +2,7 @@ #define DISTMATRIXTRANSFORMER_H #include +#include #include #include // transform a sparse matrix from block cyclic distribution (BCD) to Compressed Column Storage (CCS) distribution @@ -49,6 +50,16 @@ int getNonZeroIndex(char layout, std::vector& rowidx, std::vector& colidx); +int getNonZeroIndex(char layout, + const int nrow, + const int ncol, + std::complex* H_2d, + std::complex* S_2d, + const double ZERO_Limit, + int& nnz, + std::vector& rowidx, + std::vector& colidx); + int buildTransformParameter(DistBCDMatrix& SRC_Matrix, DistCCSMatrix& DST_Matrix, const int NPROC_TRANS, @@ -80,12 +91,27 @@ int transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, double*& H_ccs, double*& S_ccs); +int transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, + std::complex* H_2d, + std::complex* S_2d, + const double ZERO_Limit, + DistCCSMatrix& DST_Matrix, + std::complex*& H_ccs, + std::complex*& S_ccs); + int transformCCStoBCD(DistCCSMatrix& SRC_Matrix, double* DMnzvalLocal, double* ENDnzvalLocal, DistBCDMatrix& DST_Matrix, double* DM_2d, double* ED_2d); + +int transformCCStoBCD(DistCCSMatrix& SRC_Matrix, + std::complex* DMnzvalLocal, + std::complex* EDMnzvalLocal, + DistBCDMatrix& DST_Matrix, + std::complex* DM_2d, + std::complex* ED_2d); }; // namespace DistMatrixTransformer } // namespace pexsi -#endif // DISTMATRIXTRANSFORMER_H \ No newline at end of file +#endif // DISTMATRIXTRANSFORMER_H diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.cpp b/source/source_hsolver/module_pexsi/simple_pexsi.cpp index 4d51a928332..05b32a5d9d9 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.cpp +++ b/source/source_hsolver/module_pexsi/simple_pexsi.cpp @@ -6,8 +6,10 @@ #ifdef __PEXSI #include +#include #include #include +#include #include #include #include @@ -115,7 +117,7 @@ int loadPEXSIOption(MPI_Comm comm, int_para[15] = 0; int_para[16] = pexsi::PEXSI_Solver::pexsi_nproc_pole; - double_para[0] = 2;//PARAM.inp.nspin; // pexsi::PEXSI_Solver::pexsi_spin; + double_para[0] = PARAM.inp.nspin == 1 ? 2.0 : 1.0; double_para[1] = pexsi::PEXSI_Solver::pexsi_temp; double_para[2] = pexsi::PEXSI_Solver::pexsi_gap; double_para[3] = pexsi::PEXSI_Solver::pexsi_delta_e; @@ -128,6 +130,21 @@ int loadPEXSIOption(MPI_Comm comm, double_para[10] = pexsi::PEXSI_Solver::pexsi_elec_thr; double_para[11] = pexsi::PEXSI_Solver::pexsi_zero_thr; + const bool default_pexsi_temperature = std::abs(double_para[1] - 0.015) < 1.0e-14; + const bool fermi_dirac_smearing = PARAM.inp.smearing_method == "fd" + || PARAM.inp.smearing_method == "fermi-dirac"; + if (default_pexsi_temperature && !fermi_dirac_smearing) + { + const double derived_temperature = PARAM.inp.smearing_method == "fixed" + ? 2.0e-5 + : std::max(2.0e-5, 0.01 * PARAM.inp.smearing_sigma); + double_para[1] = std::min(double_para[1], derived_temperature); + } + if (int_para[0] == 40 && double_para[1] <= 1.0e-4) + { + int_para[0] = 120; + } + options.numPole = int_para[0]; options.isInertiaCount = int_para[1]; options.maxPEXSIIter = int_para[2]; @@ -364,5 +381,153 @@ int simplePEXSI(MPI_Comm comm_PEXSI, MPI_Barrier(MPI_COMM_WORLD); return 0; } + +int simplePEXSIComplex(MPI_Comm comm_PEXSI, + MPI_Comm comm_2D, + MPI_Group group_2D, + const int blacs_ctxt, + const int size, + const int nblk, + const int nrow, + const int ncol, + char layout, + std::complex* H, + std::complex* S, + const double numElectronExact, + const std::string PexsiOptionFile, + std::complex*& DM, + std::complex*& EDM, + double& totalEnergyH, + double& totalEnergyS, + double& totalFreeEnergy, + double& mu, + double mu0, + double* numElectronPEXSI, + double* numElectronDrvMuPEXSI) +{ + if (comm_2D == MPI_COMM_NULL && comm_PEXSI == MPI_COMM_NULL) + { + return 0; + } + + int myid = 0; + if (comm_PEXSI != MPI_COMM_NULL) + { + MPI_Comm_rank(comm_PEXSI, &myid); + } + + PPEXSIOptions options; + PPEXSISetDefaultOptions(&options); + int numProcessPerPole = 0; + double ZERO_Limit = 0.0; + loadPEXSIOption(comm_PEXSI, PexsiOptionFile, options, numProcessPerPole, ZERO_Limit); + options.symmetric = 1; + options.symmetricStorage = 0; + options.rowOrdering = 0; + options.mu0 = mu0; + + ModuleBase::timer::start("Diago_LCAO_Matrix", "setup_PEXSI_plan"); + PPEXSIPlan plan; + int info = 0; + int outputFileIndex = -1; + int pexsi_prow = 0; + int pexsi_pcol = 0; + ModuleBase::timer::start("Diago_LCAO_Matrix", "splitNProc2NProwNPcol"); + splitNProc2NProwNPcol(numProcessPerPole, pexsi_prow, pexsi_pcol); + ModuleBase::timer::end("Diago_LCAO_Matrix", "splitNProc2NProwNPcol"); + + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIPlanInit"); + if (comm_PEXSI != MPI_COMM_NULL) + { + plan = PPEXSIPlanInitialize(comm_PEXSI, pexsi_prow, pexsi_pcol, outputFileIndex, &info); + } + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSIPlanInit"); + ModuleBase::timer::end("Diago_LCAO_Matrix", "setup_PEXSI_plan"); + + DistCCSMatrix DST_Matrix(comm_PEXSI, numProcessPerPole, size); + DistBCDMatrix SRC_Matrix(comm_2D, group_2D, blacs_ctxt, size, nblk, nrow, ncol, layout); + + std::complex* HnzvalLocal = nullptr; + std::complex* SnzvalLocal = nullptr; + std::complex* DMnzvalLocal = nullptr; + std::complex* EDMnzvalLocal = nullptr; + std::complex* FDMnzvalLocal = nullptr; + + ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT2CCS"); + DistMatrixTransformer::transformBCDtoCCS(SRC_Matrix, H, S, ZERO_Limit, DST_Matrix, HnzvalLocal, SnzvalLocal); + ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT2CCS"); + + if (comm_PEXSI != MPI_COMM_NULL) + { + int isSIdentity = 0; + PPEXSILoadComplexHSMatrix(plan, + options, + size, + DST_Matrix.get_nnz(), + DST_Matrix.get_nnzlocal(), + DST_Matrix.get_numcol_local(), + DST_Matrix.get_colptr_local(), + DST_Matrix.get_rowind_local(), + reinterpret_cast(HnzvalLocal), + isSIdentity, + reinterpret_cast(SnzvalLocal), + &info); + PPEXSISymbolicFactorizeComplexUnsymmetricMatrix(plan, + options, + reinterpret_cast(HnzvalLocal), + &info); + + double nelec = 0.0; + double d_nelec_d_mu = 0.0; + mu = mu0; + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIDFT"); + PPEXSICalculateFermiOperatorComplex(plan, + options, + mu, + numElectronExact, + &nelec, + &d_nelec_d_mu, + &info); + if (numElectronPEXSI != nullptr) + { + *numElectronPEXSI = nelec; + } + if (numElectronDrvMuPEXSI != nullptr) + { + *numElectronDrvMuPEXSI = d_nelec_d_mu; + } + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSIDFT"); + + DMnzvalLocal = new std::complex[DST_Matrix.get_nnzlocal()]; + EDMnzvalLocal = new std::complex[DST_Matrix.get_nnzlocal()]; + FDMnzvalLocal = new std::complex[DST_Matrix.get_nnzlocal()]; + if (myid < numProcessPerPole) + { + PPEXSIRetrieveComplexDFTMatrix(plan, + reinterpret_cast(DMnzvalLocal), + reinterpret_cast(EDMnzvalLocal), + reinterpret_cast(FDMnzvalLocal), + &totalEnergyH, + &totalEnergyS, + &totalFreeEnergy, + &info); + } + PPEXSIPlanFinalize(plan, &info); + } + + ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT22D"); + DistMatrixTransformer::transformCCStoBCD(DST_Matrix, DMnzvalLocal, EDMnzvalLocal, SRC_Matrix, DM, EDM); + ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT22D"); + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + delete[] DMnzvalLocal; + delete[] EDMnzvalLocal; + delete[] FDMnzvalLocal; + delete[] HnzvalLocal; + delete[] SnzvalLocal; + MPI_Barrier(MPI_COMM_WORLD); + return 0; +} } // namespace pexsi #endif diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.h b/source/source_hsolver/module_pexsi/simple_pexsi.h index db8879e5ac7..5b1c3f7d888 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.h +++ b/source/source_hsolver/module_pexsi/simple_pexsi.h @@ -2,6 +2,8 @@ #define SIMPLE_PEXSI_H #include +#include +#include // a simple interface for calling pexsi with 2D block cyclic distributed matrix namespace pexsi { @@ -25,5 +27,28 @@ int simplePEXSI(MPI_Comm comm_PEXSI, double& totalFreeEnergy, double& mu, double mu0); + +int simplePEXSIComplex(MPI_Comm comm_PEXSI, + MPI_Comm comm_2D, + MPI_Group group_2D, + const int blacs_ctxt, + const int size, + const int nblk, + const int nrow, + const int ncol, + char layout, + std::complex* H, + std::complex* S, + const double nElectronExact, + const std::string PexsiOptionFile, + std::complex*& DM, + std::complex*& EDM, + double& totalEnergyH, + double& totalEnergyS, + double& totalFreeEnergy, + double& mu, + double mu0, + double* numElectronPEXSI = nullptr, + double* numElectronDrvMuPEXSI = nullptr); } -#endif // SIMPLE_PEXSI_H \ No newline at end of file +#endif // SIMPLE_PEXSI_H From a872a630c00e8ac5f765c69144687e6ffc3411ce Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Mon, 25 May 2026 17:35:16 +0800 Subject: [PATCH 03/23] Optimize multi-k PEXSI execution path --- source/source_hsolver/diago_pexsi.cpp | 26 ++ source/source_hsolver/diago_pexsi.h | 7 + source/source_hsolver/hsolver_lcao.cpp | 271 ++++++++++++++++++ .../module_pexsi/dist_bcd_matrix.cpp | 25 +- .../module_pexsi/dist_matrix_transformer.cpp | 251 +++++++++++++++- .../module_pexsi/simple_pexsi.cpp | 88 +++++- 6 files changed, 634 insertions(+), 34 deletions(-) diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index b9cd1cae2a8..2184a5263b1 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -82,6 +82,32 @@ void DiagoPexsi::resize_density_buffers(const int count) } } +template +void DiagoPexsi::ensure_density_buffers(const int count) +{ + this->resize_density_buffers(count); +} + +template +double DiagoPexsi::current_mu() const +{ + return mu_buffer.empty() ? pexsi::PEXSI_Solver::pexsi_mu : mu_buffer[0]; +} + +template +void DiagoPexsi::set_k_loop_totals(const double num_electron_sum, + const double num_electron_derivative_sum, + const double total_energy_h, + const double total_energy_s, + const double total_free_energy) +{ + this->num_electron_sum_ = num_electron_sum; + this->num_electron_derivative_sum_ = num_electron_derivative_sum; + this->totalEnergyH = total_energy_h; + this->totalEnergyS = total_energy_s; + this->totalFreeEnergy = total_free_energy; +} + template void DiagoPexsi::begin_mu_search() { diff --git a/source/source_hsolver/diago_pexsi.h b/source/source_hsolver/diago_pexsi.h index 6206d66c04c..d775cb09106 100644 --- a/source/source_hsolver/diago_pexsi.h +++ b/source/source_hsolver/diago_pexsi.h @@ -32,6 +32,13 @@ class DiagoPexsi void begin_mu_search(); void begin_k_loop(); void set_k_weight(const int ik, const double weight); + void ensure_density_buffers(const int count); + double current_mu() const; + void set_k_loop_totals(const double num_electron_sum, + const double num_electron_derivative_sum, + const double total_energy_h, + const double total_energy_s, + const double total_free_energy); bool finish_k_loop(const double target_nelec); ~DiagoPexsi(); diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp index db23bf05216..d5c11077f97 100644 --- a/source/source_hsolver/hsolver_lcao.cpp +++ b/source/source_hsolver/hsolver_lcao.cpp @@ -23,6 +23,7 @@ #ifdef __PEXSI #include "diago_pexsi.h" #include "module_pexsi/pexsi_solver.h" +#include "module_pexsi/simple_pexsi.h" #endif #include "source_base/global_variable.h" @@ -39,11 +40,276 @@ #include "source_lcao/rho_tau_lcao.h" // mohan add 20251024 #include +#include #include namespace hsolver { +#ifdef __PEXSI +namespace +{ +template +struct PexsiKParallelRunner +{ + static bool run(const Parallel_Orbitals* ParaV, + hamilt::Hamilt* pHamilt, + psi::Psi& psi, + elecstate::ElecState* pes, + elecstate::DensityMatrix& dm, + const int requested_kpar) + { + return false; + } +}; + +#ifdef __MPI +template +struct PexsiKParallelRunner, Device> +{ + static bool run(const Parallel_Orbitals* ParaV, + hamilt::Hamilt>* pHamilt, + psi::Psi>& psi, + elecstate::ElecState* pes, + elecstate::DensityMatrix, double>& dm, + const int requested_kpar) + { + const int nks = psi.get_nk(); + if (requested_kpar <= 1 || nks <= 1) + { + return false; + } + + const int min_pool_size = std::max(1, pexsi::PEXSI_Solver::pexsi_nproc_pole); + int kpar = std::min(std::min(requested_kpar, nks), GlobalV::NPROC); + while (kpar > 1 && GlobalV::NPROC / kpar < min_pool_size) + { + --kpar; + } + if (kpar <= 1) + { + return false; + } + if (kpar != requested_kpar) + { + ModuleBase::WARNING("HSolverLCAO::pexsiKpar", + "adjust kpar for PEXSI so each k pool has enough MPI ranks"); + } + + ModuleBase::timer::start("HSolverLCAO", "pexsiKparSolve"); + DiagoPexsi> pe(ParaV); + pe.ensure_density_buffers(nks); + + auto k2d = Parallel_K2D>(); + k2d.set_kpar(kpar); + const int nrow = ParaV->get_global_row_size(); + const int nb2d = ParaV->get_block_size(); + k2d.set_para_env(nks, nrow, nb2d, GlobalV::NPROC, GlobalV::MY_RANK, PARAM.inp.nspin); + + MPI_Group pool_group = MPI_GROUP_NULL; + MPI_Comm_group(k2d.POOL_WORLD_K2D, &pool_group); + int rank_in_pool = 0; + MPI_Comm_rank(k2d.POOL_WORLD_K2D, &rank_in_pool); + + const int pool_local_size = k2d.get_p2D_pool()->get_local_size(); + std::vector> dm_pool(pool_local_size); + std::vector> edm_pool(pool_local_size); + std::vector>> hk_pool_cache(nks); + std::vector>> sk_pool_cache(nks); + std::vector>> dm_pool_cache(nks); + std::vector>> edm_pool_cache(nks); + const int pexsi_mu_loops = std::min(40, std::max(1, pexsi::PEXSI_Solver::pexsi_nmax)); + + ModuleBase::timer::start("HSolverLCAO", "cache_pexsi_hs"); + for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik) + { + std::vector ik_kpar; + for (int ipool = 0; ipool < k2d.get_kpar(); ++ipool) + { + if (ik + k2d.get_pKpoints()->startk_pool[ipool] < nks + && ik < k2d.get_pKpoints()->nks_pool[ipool]) + { + ik_kpar.push_back(ik + k2d.get_pKpoints()->startk_pool[ipool]); + } + } + if (ik_kpar.empty()) + { + ModuleBase::WARNING_QUIT("HSolverLCAO::pexsiKparSolve", "ik_kpar is empty"); + } + k2d.distribute_hsk(pHamilt, ik_kpar, nrow); + + const int my_pool = k2d.get_my_pool(); + const int ik_global = ik + k2d.get_pKpoints()->startk_pool[my_pool]; + const bool has_local_k = ik_global < nks && ik < k2d.get_pKpoints()->nks_pool[my_pool]; + if (has_local_k) + { + hk_pool_cache[ik_global] = k2d.hk_pool; + sk_pool_cache[ik_global] = k2d.sk_pool; + } + } + ModuleBase::timer::end("HSolverLCAO", "cache_pexsi_hs"); + + pe.begin_mu_search(); + for (int imu = 0; imu < pexsi_mu_loops; ++imu) + { + pe.begin_k_loop(); + double local_totals[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + + for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik) + { + std::vector ik_kpar; + for (int ipool = 0; ipool < k2d.get_kpar(); ++ipool) + { + if (ik + k2d.get_pKpoints()->startk_pool[ipool] < nks + && ik < k2d.get_pKpoints()->nks_pool[ipool]) + { + ik_kpar.push_back(ik + k2d.get_pKpoints()->startk_pool[ipool]); + } + } + if (ik_kpar.empty()) + { + ModuleBase::WARNING_QUIT("HSolverLCAO::pexsiKparSolve", "ik_kpar is empty"); + } + + const int my_pool = k2d.get_my_pool(); + const int ik_global = ik + k2d.get_pKpoints()->startk_pool[my_pool]; + const bool has_local_k = ik_global < nks && ik < k2d.get_pKpoints()->nks_pool[my_pool]; + std::fill(dm_pool.begin(), dm_pool.end(), std::complex(0.0, 0.0)); + std::fill(edm_pool.begin(), edm_pool.end(), std::complex(0.0, 0.0)); + + if (has_local_k) + { + double num_electron = 0.0; + double num_electron_derivative = 0.0; + double total_energy_h = 0.0; + double total_energy_s = 0.0; + double total_free_energy = 0.0; + double mu = pe.current_mu(); + std::complex* dm_pool_ptr = dm_pool.data(); + std::complex* edm_pool_ptr = edm_pool.data(); + + pexsi::simplePEXSIComplex(k2d.POOL_WORLD_K2D, + k2d.POOL_WORLD_K2D, + pool_group, + k2d.get_p2D_pool()->blacs_ctxt, + PARAM.globalv.nlocal, + nb2d, + k2d.get_p2D_pool()->get_row_size(), + k2d.get_p2D_pool()->get_col_size(), + 'c', + hk_pool_cache[ik_global].data(), + sk_pool_cache[ik_global].data(), + PARAM.inp.nelec, + "PEXSIOPTION", + dm_pool_ptr, + edm_pool_ptr, + total_energy_h, + total_energy_s, + total_free_energy, + mu, + pe.current_mu(), + &num_electron, + &num_electron_derivative); + + if (rank_in_pool == 0) + { + const double k_weight = (pes->klist != nullptr + && ik_global < static_cast(pes->klist->wk.size())) + ? pes->klist->wk[ik_global] + : 1.0; + const double pexsi_spin_weight = PARAM.inp.nspin == 1 ? 0.5 : 1.0; + const double effective_weight = k_weight * pexsi_spin_weight; + local_totals[0] += effective_weight * num_electron; + local_totals[1] += effective_weight * num_electron_derivative; + local_totals[2] += effective_weight * total_energy_h; + local_totals[3] += effective_weight * total_energy_s; + local_totals[4] += effective_weight * total_free_energy; + } + dm_pool_cache[ik_global] = dm_pool; + edm_pool_cache[ik_global] = edm_pool; + } + } + + double global_totals[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + MPI_Allreduce(local_totals, global_totals, 5, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + pe.set_k_loop_totals(global_totals[0], + global_totals[1], + global_totals[2], + global_totals[3], + global_totals[4]); + if (pe.finish_k_loop(PARAM.inp.nelec)) + { + ModuleBase::timer::start("HSolverLCAO", "collect_pexsi_dm"); + const int my_pool = k2d.get_my_pool(); + for (int ik = 0; ik < k2d.get_pKpoints()->get_max_nks_pool(); ++ik) + { + std::vector ik_kpar; + for (int ipool = 0; ipool < k2d.get_kpar(); ++ipool) + { + if (ik + k2d.get_pKpoints()->startk_pool[ipool] < nks + && ik < k2d.get_pKpoints()->nks_pool[ipool]) + { + ik_kpar.push_back(ik + k2d.get_pKpoints()->startk_pool[ipool]); + } + } + for (const int ik_collect : ik_kpar) + { + const int source_pool = k2d.get_pKpoints()->whichpool[ik_collect]; + int desc_pool[9]; + std::copy(k2d.get_p2D_pool()->desc, k2d.get_p2D_pool()->desc + 9, desc_pool); + if (my_pool != source_pool) + { + desc_pool[1] = -1; + } + std::complex* dm_src = (my_pool == source_pool && !dm_pool_cache[ik_collect].empty()) + ? dm_pool_cache[ik_collect].data() + : nullptr; + std::complex* edm_src = (my_pool == source_pool && !edm_pool_cache[ik_collect].empty()) + ? edm_pool_cache[ik_collect].data() + : nullptr; + Cpxgemr2d(nrow, + nrow, + dm_src, + 1, + 1, + desc_pool, + pe.DM[ik_collect], + 1, + 1, + k2d.get_p2D_global()->desc, + k2d.get_p2D_global()->blacs_ctxt); + Cpxgemr2d(nrow, + nrow, + edm_src, + 1, + 1, + desc_pool, + pe.EDM[ik_collect], + 1, + 1, + k2d.get_p2D_global()->desc, + k2d.get_p2D_global()->blacs_ctxt); + } + } + ModuleBase::timer::end("HSolverLCAO", "collect_pexsi_dm"); + break; + } + } + + MPI_Group_free(&pool_group); + k2d.unset_para_env(); + + auto _pes = dynamic_cast>*>(pes); + pes->f_en.eband = pe.totalFreeEnergy; + _pes->dm2rho(pe.DM, pe.EDM, &dm); + ModuleBase::timer::end("HSolverLCAO", "pexsiKparSolve"); + return true; + } +}; +#endif +} // namespace +#endif + template void HSolverLCAO::solve(hamilt::Hamilt* pHamilt, psi::Psi& psi, @@ -117,6 +383,11 @@ void HSolverLCAO::solve(hamilt::Hamilt* pHamilt, else if (this->method == "pexsi") { #ifdef __PEXSI // other purification methods should follow this routine + if (PexsiKParallelRunner::run(ParaV, pHamilt, psi, pes, dm, PARAM.globalv.kpar_lcao)) + { + ModuleBase::timer::end("HSolverLCAO", "solve"); + return; + } DiagoPexsi pe(ParaV); const int pexsi_mu_loops = std::is_same>::value ? std::min(40, std::max(1, pexsi::PEXSI_Solver::pexsi_nmax)) diff --git a/source/source_hsolver/module_pexsi/dist_bcd_matrix.cpp b/source/source_hsolver/module_pexsi/dist_bcd_matrix.cpp index 93a6dcb5b0d..8335b07c4d1 100644 --- a/source/source_hsolver/module_pexsi/dist_bcd_matrix.cpp +++ b/source/source_hsolver/module_pexsi/dist_bcd_matrix.cpp @@ -45,28 +45,35 @@ DistBCDMatrix::DistBCDMatrix(MPI_Comm comm, this->myproc = -1; this->myprow = -1; this->mypcol = -1; + this->nprocs = 0; + this->nprows = 0; + this->npcols = 0; + this->prowpcol2pnum = nullptr; + return; } - // synchronize matrix parameters to all processes, including those are not in bcd group - int myid_in_comm_world = 0; - MPI_Comm_rank(MPI_COMM_WORLD, &myid_in_comm_world); - if (myid_in_comm_world == 0) + // Synchronize matrix parameters inside the BCD communicator. Older code used + // MPI_COMM_WORLD here, which prevents multiple independent k-point pools from + // constructing BCD descriptors concurrently. + int myid_in_comm = 0; + MPI_Comm_rank(comm, &myid_in_comm); + if (myid_in_comm == 0) { MPI_Comm_size(comm, &this->nprocs); int PARA_BCAST[4] = {this->nblk, this->nprocs, this->nprows, this->npcols}; - MPI_Bcast(&PARA_BCAST[0], 4, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&PARA_BCAST[0], 4, MPI_INT, 0, comm); } else { int PARA_BCAST[4]; - MPI_Bcast(&PARA_BCAST[0], 4, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&PARA_BCAST[0], 4, MPI_INT, 0, comm); this->nblk = PARA_BCAST[0]; this->nprocs = PARA_BCAST[1]; this->nprows = PARA_BCAST[2]; this->npcols = PARA_BCAST[3]; } this->prowpcol2pnum = new int[this->nprocs]; - if (myid_in_comm_world == 0) + if (myid_in_comm == 0) { for (int i = 0; i < this->nprows; ++i) { @@ -76,7 +83,7 @@ DistBCDMatrix::DistBCDMatrix(MPI_Comm comm, } } } - MPI_Bcast(this->prowpcol2pnum, this->nprocs, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(this->prowpcol2pnum, this->nprocs, MPI_INT, 0, comm); } DistBCDMatrix::~DistBCDMatrix() @@ -111,4 +118,4 @@ int DistBCDMatrix::pnum(const int prow, const int pcol) return this->prowpcol2pnum[prow * this->npcols + pcol]; } } // namespace pexsi -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp index 5e36075835d..8f1dd37df83 100644 --- a/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp +++ b/source/source_hsolver/module_pexsi/dist_matrix_transformer.cpp @@ -12,6 +12,10 @@ #include #include +#ifdef _OPENMP +#include +#endif + #include "dist_bcd_matrix.h" #include "dist_ccs_matrix.h" @@ -139,6 +143,9 @@ inline void DistMatrixTransformer::buffer2CCSvalue(int nnzLocal, double* buffer, double* nzvalLocal) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (nnzLocal >= 4096) +#endif for (int i = 0; i < nnzLocal; ++i) { nzvalLocal[i] = buffer[buffer2ccsIndex[i]]; @@ -174,6 +181,43 @@ inline int DistMatrixTransformer::getNonZeroIndex(char layout, rowidx.clear(); if (layout == 'c') { +#ifdef _OPENMP + const int total = nrow * ncol; + if (total >= 4096) + { + std::vector nonzero(total, 0); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int idx_local = i * nrow + j; + nonzero[linear] = (fabs(H_2d[idx_local]) > ZERO_Limit || fabs(S_2d[idx_local]) > ZERO_Limit) ? 1 : 0; + } + + std::vector prefix(total + 1, 0); + for (int linear = 0; linear < total; ++linear) + { + prefix[linear + 1] = prefix[linear] + static_cast(nonzero[linear]); + } + nnz = prefix[total]; + colidx.resize(nnz); + rowidx.resize(nnz); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + if (nonzero[linear]) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int pos = prefix[linear]; + colidx[pos] = i; + rowidx[pos] = j; + } + } + return 0; + } +#endif for (int i = 0; i < ncol; ++i) { for (int j = 0; j < nrow; ++j) @@ -190,6 +234,43 @@ inline int DistMatrixTransformer::getNonZeroIndex(char layout, } else if (layout == 'r') { +#ifdef _OPENMP + const int total = nrow * ncol; + if (total >= 4096) + { + std::vector nonzero(total, 0); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int idx_local = j * ncol + i; + nonzero[linear] = (fabs(H_2d[idx_local]) > ZERO_Limit || fabs(S_2d[idx_local]) > ZERO_Limit) ? 1 : 0; + } + + std::vector prefix(total + 1, 0); + for (int linear = 0; linear < total; ++linear) + { + prefix[linear + 1] = prefix[linear] + static_cast(nonzero[linear]); + } + nnz = prefix[total]; + colidx.resize(nnz); + rowidx.resize(nnz); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + if (nonzero[linear]) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int pos = prefix[linear]; + colidx[pos] = i; + rowidx[pos] = j; + } + } + return 0; + } +#endif for (int i = 0; i < ncol; ++i) { for (int j = 0; j < nrow; ++j) @@ -227,6 +308,44 @@ inline int DistMatrixTransformer::getNonZeroIndex(char layout, rowidx.clear(); if (layout == 'c' || layout == 'C') { +#ifdef _OPENMP + const int total = nrow * ncol; + if (total >= 4096) + { + std::vector nonzero(total, 0); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int idx_local = i * nrow + j; + nonzero[linear] + = (std::abs(H_2d[idx_local]) > ZERO_Limit || std::abs(S_2d[idx_local]) > ZERO_Limit) ? 1 : 0; + } + + std::vector prefix(total + 1, 0); + for (int linear = 0; linear < total; ++linear) + { + prefix[linear + 1] = prefix[linear] + static_cast(nonzero[linear]); + } + nnz = prefix[total]; + colidx.resize(nnz); + rowidx.resize(nnz); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + if (nonzero[linear]) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int pos = prefix[linear]; + colidx[pos] = i; + rowidx[pos] = j; + } + } + return 0; + } +#endif for (int i = 0; i < ncol; ++i) { for (int j = 0; j < nrow; ++j) @@ -243,6 +362,44 @@ inline int DistMatrixTransformer::getNonZeroIndex(char layout, } else if (layout == 'r' || layout == 'R') { +#ifdef _OPENMP + const int total = nrow * ncol; + if (total >= 4096) + { + std::vector nonzero(total, 0); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int idx_local = j * ncol + i; + nonzero[linear] + = (std::abs(H_2d[idx_local]) > ZERO_Limit || std::abs(S_2d[idx_local]) > ZERO_Limit) ? 1 : 0; + } + + std::vector prefix(total + 1, 0); + for (int linear = 0; linear < total; ++linear) + { + prefix[linear + 1] = prefix[linear] + static_cast(nonzero[linear]); + } + nnz = prefix[total]; + colidx.resize(nnz); + rowidx.resize(nnz); +#pragma omp parallel for schedule(static) + for (int linear = 0; linear < total; ++linear) + { + if (nonzero[linear]) + { + const int i = linear / nrow; + const int j = linear - i * nrow; + const int pos = prefix[linear]; + colidx[pos] = i; + rowidx[pos] = j; + } + } + return 0; + } +#endif for (int i = 0; i < ncol; ++i) { for (int j = 0; j < nrow; ++j) @@ -280,14 +437,14 @@ int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, std::vector& receiver_displacement_process, std::vector& buffer2ccsIndex) { - int myproc; - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + int myproc_trans = 0; + MPI_Comm_rank(COMM_TRANS, &myproc_trans); sender_size = nnz; std::fill(sender_size_process.begin(), sender_size_process.end(), 0); // create process id map from group_data to group_trans int nproc_data; std::vector proc_map_data_trans; - if (myproc == 0) + if (myproc_trans == 0) { MPI_Group_size(DST_Matrix.get_group_data(), &nproc_data); MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); @@ -305,6 +462,36 @@ int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, MPI_Bcast(&proc_map_data_trans[0], nproc_data, MPI_INT, 0, COMM_TRANS); } +#ifdef _OPENMP + if (nnz >= 4096 && NPROC_TRANS > 1) + { + const int max_threads = omp_get_max_threads(); + std::vector> thread_sender_size(max_threads, std::vector(NPROC_TRANS, 0)); +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + std::vector& local_sender_size = thread_sender_size[tid]; +#pragma omp for schedule(static) + for (int i = 0; i < nnz; ++i) + { + const int l_col = colidx[i]; + const int g_col = SRC_Matrix.globalCol(l_col); + int dst_process = 0; + DST_Matrix.localCol(g_col, dst_process); + const int dst_process_trans = proc_map_data_trans[dst_process]; + ++local_sender_size[dst_process_trans]; + } + } + for (int tid = 0; tid < max_threads; ++tid) + { + for (int iproc = 0; iproc < NPROC_TRANS; ++iproc) + { + sender_size_process[iproc] += thread_sender_size[tid][iproc]; + } + } + } + else +#endif for (int i = 0; i < nnz; ++i) { int l_col = colidx[i]; @@ -334,6 +521,9 @@ int DistMatrixTransformer::buildTransformParameter(DistBCDMatrix& SRC_Matrix, // setup receiver index // setup sender_index std::vector sender_index(sender_size); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (nnz >= 4096) +#endif for (int i = 0; i < nnz; ++i) { int l_col = colidx[i]; @@ -378,7 +568,7 @@ int DistMatrixTransformer::newGroupCommTrans(DistBCDMatrix& SRC_Matrix, // build transfortram communicator which contains both processes of BCD processors and // CCS processors with nonzero elements MPI_Group_union(DST_Matrix.get_group_data(), SRC_Matrix.get_group(), &GROUP_TRANS); - MPI_Comm_create(MPI_COMM_WORLD, GROUP_TRANS, &COMM_TRANS); + MPI_Comm_create(SRC_Matrix.get_comm(), GROUP_TRANS, &COMM_TRANS); return 0; } @@ -450,11 +640,11 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, receiver_size_process, receiver_displacement_process, buffer2ccsIndex); - int myproc; - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + int myproc_trans = 0; + MPI_Comm_rank(COMM_TRANS, &myproc_trans); int nproc_data; std::vector proc_map_data_trans; - if (myproc == 0) + if (myproc_trans == 0) { MPI_Group_size(DST_Matrix.get_group_data(), &nproc_data); MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); @@ -525,6 +715,9 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, H_ccs = new double[receiver_size]; delete[] S_ccs; S_ccs = new double[receiver_size]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { const int buffer_index = buffer2ccsIndex[i]; @@ -592,11 +785,11 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, receiver_displacement_process, buffer2ccsIndex); - int myproc; - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + int myproc_trans = 0; + MPI_Comm_rank(COMM_TRANS, &myproc_trans); int nproc_data; std::vector proc_map_data_trans; - if (myproc == 0) + if (myproc_trans == 0) { MPI_Group_size(DST_Matrix.get_group_data(), &nproc_data); MPI_Bcast(&nproc_data, 1, MPI_INT, 0, COMM_TRANS); @@ -671,6 +864,9 @@ int DistMatrixTransformer::transformBCDtoCCS(DistBCDMatrix& SRC_Matrix, H_ccs = new std::complex[receiver_size]; delete[] S_ccs; S_ccs = new std::complex[receiver_size]; +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { const int buffer_index = buffer2ccsIndex[i]; @@ -693,14 +889,15 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, double* DM, double* EDM) { - int myproc; - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); MPI_Group GROUP_TRANS; MPI_Comm COMM_TRANS = MPI_COMM_NULL; newGroupCommTrans(DST_Matrix, SRC_Matrix, GROUP_TRANS, COMM_TRANS); if (COMM_TRANS != MPI_COMM_NULL) { // init DM and EDM with 0 +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (DST_Matrix.get_nrow() * DST_Matrix.get_ncol() >= 4096) +#endif for (int i = 0; i < DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); ++i) { DM[i] = 0; @@ -888,6 +1085,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, // transfer DM // set up DM sender buffer +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (sender_size >= 4096) +#endif for (int i = 0; i < sender_size; ++i) { sender_buffer[i] = DMnzvalLocal[sender_index[i]]; @@ -908,6 +1108,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, if (DST_Matrix.get_layout() == 'R' || DST_Matrix.get_layout() == 'r') { int DST_Matrix_elem = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { int ix = receiver_index[2 * i]; @@ -919,6 +1122,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, else { int DST_Matrix_elem = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { int ix = receiver_index[2 * i]; @@ -929,6 +1135,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, } // setup up sender buffer of EDM +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (sender_size >= 4096) +#endif for (int i = 0; i < sender_size; ++i) { sender_buffer[i] = EDMnzvalLocal[sender_index[i]]; @@ -949,6 +1158,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, if (DST_Matrix.get_layout() == 'R' || DST_Matrix.get_layout() == 'r') { int DST_Matrix_elem = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { int ix = receiver_index[2 * i]; @@ -960,6 +1172,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, else { int DST_Matrix_elem = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { int ix = receiver_index[2 * i]; @@ -993,6 +1208,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, if (COMM_TRANS != MPI_COMM_NULL) { const int dst_size = DST_Matrix.get_nrow() * DST_Matrix.get_ncol(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (dst_size >= 4096) +#endif for (int i = 0; i < dst_size; ++i) { DM[i] = std::complex(0.0, 0.0); @@ -1113,6 +1331,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, COMM_TRANS); std::vector sender_buffer(std::max(4 * sender_size, 1), 0.0); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (sender_size >= 4096) +#endif for (int i = 0; i < sender_size; ++i) { const int src_index = sender_index[i]; @@ -1149,6 +1370,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, if (DST_Matrix.get_layout() == 'R' || DST_Matrix.get_layout() == 'r') { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { const int ix = receiver_index[2 * i]; @@ -1160,6 +1384,9 @@ int DistMatrixTransformer::transformCCStoBCD(DistCCSMatrix& SRC_Matrix, } else { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if (receiver_size >= 4096) +#endif for (int i = 0; i < receiver_size; ++i) { const int ix = receiver_index[2 * i]; diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.cpp b/source/source_hsolver/module_pexsi/simple_pexsi.cpp index 05b32a5d9d9..29fce486b75 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.cpp +++ b/source/source_hsolver/module_pexsi/simple_pexsi.cpp @@ -27,6 +27,28 @@ namespace pexsi { +namespace +{ +void print_effective_pexsi_options_once(const PPEXSIOptions& options, + const int numProcessPerPole, + const double zeroLimit) +{ + static bool printed = false; + if (printed || GlobalV::MY_RANK != 0) + { + return; + } + printed = true; + GlobalV::ofs_running << "\n PEXSI effective options:" + << " numPole=" << options.numPole + << " temperature=" << options.temperature + << " numProcessPerPole=" << numProcessPerPole + << " muGuard=" << options.muPEXSISafeGuard + << " electronTolerance=" << options.numElectronPEXSITolerance + << " zeroLimit=" << zeroLimit << std::endl; +} +} // namespace + inline void strtolower(char* sa, char* sb) { char c = '\0'; @@ -133,17 +155,17 @@ int loadPEXSIOption(MPI_Comm comm, const bool default_pexsi_temperature = std::abs(double_para[1] - 0.015) < 1.0e-14; const bool fermi_dirac_smearing = PARAM.inp.smearing_method == "fd" || PARAM.inp.smearing_method == "fermi-dirac"; - if (default_pexsi_temperature && !fermi_dirac_smearing) + if (default_pexsi_temperature && fermi_dirac_smearing) { - const double derived_temperature = PARAM.inp.smearing_method == "fixed" - ? 2.0e-5 - : std::max(2.0e-5, 0.01 * PARAM.inp.smearing_sigma); - double_para[1] = std::min(double_para[1], derived_temperature); + double_para[1] = PARAM.inp.smearing_sigma; } - if (int_para[0] == 40 && double_para[1] <= 1.0e-4) + + int comm_size = 1; + if (comm != MPI_COMM_NULL) { - int_para[0] = 120; + MPI_Comm_size(comm, &comm_size); } + int_para[16] = std::max(1, std::min(int_para[16], comm_size)); options.numPole = int_para[0]; options.isInertiaCount = int_para[1]; @@ -249,6 +271,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, double ZERO_Limit = 0.0; loadPEXSIOption(comm_PEXSI, PexsiOptionFile, options, numProcessPerPole, ZERO_Limit); options.mu0 = mu0; + print_effective_pexsi_options_once(options, numProcessPerPole, ZERO_Limit); ModuleBase::timer::start("Diago_LCAO_Matrix", "setup_PEXSI_plan"); PPEXSIPlan plan; @@ -298,6 +321,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, // Load H and S to PEXSI int isSIdentity = 0; + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_LoadHS_R"); PPEXSILoadRealHSMatrix(plan, options, size, @@ -310,6 +334,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, isSIdentity, SnzvalLocal, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_LoadHS_R"); double nelec = 0.0; double muMinInertia = 0.0; @@ -343,6 +368,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, FDMnzvalLocal = new double[DST_Matrix.get_nnzlocal()]; if (myid < numProcessPerPole) { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Retrieve_R"); PPEXSIRetrieveRealDFTMatrix(plan, DMnzvalLocal, EDMnzvalLocal, @@ -351,9 +377,12 @@ int simplePEXSI(MPI_Comm comm_PEXSI, &totalEnergyS, &totalFreeEnergy, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Retrieve_R"); } // clean PEXSI + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Finalize"); PPEXSIPlanFinalize(plan, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Finalize"); } // transform Density Matrix and Energy Density Matrix from compressed column sparse matrix @@ -371,14 +400,25 @@ int simplePEXSI(MPI_Comm comm_PEXSI, ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT22D"); // LiuXh modify 2021-04-29, add DONE(ofs_running,"xx") for test - MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; + if (barrier_comm != MPI_COMM_NULL) + { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); + MPI_Barrier(barrier_comm); + MPI_Barrier(barrier_comm); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Barrier"); + } delete[] DMnzvalLocal; delete[] EDMnzvalLocal; delete[] FDMnzvalLocal; delete[] HnzvalLocal; delete[] SnzvalLocal; - MPI_Barrier(MPI_COMM_WORLD); + if (barrier_comm != MPI_COMM_NULL) + { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); + MPI_Barrier(barrier_comm); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Barrier"); + } return 0; } @@ -425,6 +465,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, options.symmetricStorage = 0; options.rowOrdering = 0; options.mu0 = mu0; + print_effective_pexsi_options_once(options, numProcessPerPole, ZERO_Limit); ModuleBase::timer::start("Diago_LCAO_Matrix", "setup_PEXSI_plan"); PPEXSIPlan plan; @@ -460,6 +501,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, if (comm_PEXSI != MPI_COMM_NULL) { int isSIdentity = 0; + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_LoadHS_C"); PPEXSILoadComplexHSMatrix(plan, options, size, @@ -472,15 +514,19 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, isSIdentity, reinterpret_cast(SnzvalLocal), &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_LoadHS_C"); + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Symbolic_C"); PPEXSISymbolicFactorizeComplexUnsymmetricMatrix(plan, options, reinterpret_cast(HnzvalLocal), &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Symbolic_C"); double nelec = 0.0; double d_nelec_d_mu = 0.0; mu = mu0; ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIDFT"); + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_FermiOp_C"); PPEXSICalculateFermiOperatorComplex(plan, options, mu, @@ -488,6 +534,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, &nelec, &d_nelec_d_mu, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_FermiOp_C"); if (numElectronPEXSI != nullptr) { *numElectronPEXSI = nelec; @@ -503,6 +550,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, FDMnzvalLocal = new std::complex[DST_Matrix.get_nnzlocal()]; if (myid < numProcessPerPole) { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Retrieve_C"); PPEXSIRetrieveComplexDFTMatrix(plan, reinterpret_cast(DMnzvalLocal), reinterpret_cast(EDMnzvalLocal), @@ -511,22 +559,36 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, &totalEnergyS, &totalFreeEnergy, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Retrieve_C"); } + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Finalize"); PPEXSIPlanFinalize(plan, &info); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Finalize"); } ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT22D"); DistMatrixTransformer::transformCCStoBCD(DST_Matrix, DMnzvalLocal, EDMnzvalLocal, SRC_Matrix, DM, EDM); ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT22D"); - MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; + if (barrier_comm != MPI_COMM_NULL) + { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); + MPI_Barrier(barrier_comm); + MPI_Barrier(barrier_comm); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Barrier"); + } delete[] DMnzvalLocal; delete[] EDMnzvalLocal; delete[] FDMnzvalLocal; delete[] HnzvalLocal; delete[] SnzvalLocal; - MPI_Barrier(MPI_COMM_WORLD); + if (barrier_comm != MPI_COMM_NULL) + { + ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); + MPI_Barrier(barrier_comm); + ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Barrier"); + } return 0; } } // namespace pexsi From 8eb4e6e19d71a99c4137c9a2c94aea36f1e15e9a Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Mon, 25 May 2026 17:58:12 +0800 Subject: [PATCH 04/23] Add PEXSI multi-k trace hooks --- source/source_hsolver/hsolver_lcao.cpp | 30 ++++++++++ .../module_pexsi/simple_pexsi.cpp | 58 ++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp index d5c11077f97..b1180fa5380 100644 --- a/source/source_hsolver/hsolver_lcao.cpp +++ b/source/source_hsolver/hsolver_lcao.cpp @@ -41,6 +41,8 @@ #include #include +#include +#include #include namespace hsolver @@ -49,6 +51,26 @@ namespace hsolver #ifdef __PEXSI namespace { +bool pexsi_trace_enabled() +{ + static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; + return enabled; +} + +#ifdef __MPI +void pexsi_trace_kpoint(const int imu, const int ik_global, const int pool, const char* stage) +{ + if (!pexsi_trace_enabled()) + { + return; + } + int world_rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + std::cout << "PEXSI_TRACE world_rank=" << world_rank << " pool=" << pool << " imu=" << imu + << " ik=" << ik_global << " stage=" << stage << " time=" << MPI_Wtime() << std::endl; +} +#endif + template struct PexsiKParallelRunner { @@ -188,6 +210,10 @@ struct PexsiKParallelRunner, Device> std::complex* dm_pool_ptr = dm_pool.data(); std::complex* edm_pool_ptr = edm_pool.data(); + if (rank_in_pool == 0) + { + pexsi_trace_kpoint(imu, ik_global, my_pool, "begin"); + } pexsi::simplePEXSIComplex(k2d.POOL_WORLD_K2D, k2d.POOL_WORLD_K2D, pool_group, @@ -210,6 +236,10 @@ struct PexsiKParallelRunner, Device> pe.current_mu(), &num_electron, &num_electron_derivative); + if (rank_in_pool == 0) + { + pexsi_trace_kpoint(imu, ik_global, my_pool, "end"); + } if (rank_in_pool == 0) { diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.cpp b/source/source_hsolver/module_pexsi/simple_pexsi.cpp index 29fce486b75..dae38d66af0 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.cpp +++ b/source/source_hsolver/module_pexsi/simple_pexsi.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,29 @@ namespace pexsi { namespace { +bool trace_pexsi_enabled() +{ + static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; + return enabled; +} + +void trace_pexsi_stage(MPI_Comm comm, const char* path, const char* stage) +{ + if (!trace_pexsi_enabled() || comm == MPI_COMM_NULL) + { + return; + } + int comm_rank = 0; + int world_rank = 0; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + if (comm_rank == 0) + { + std::cout << "PEXSI_TRACE world_rank=" << world_rank << " path=" << path << " stage=" << stage + << " time=" << MPI_Wtime() << std::endl; + } +} + void print_effective_pexsi_options_once(const PPEXSIOptions& options, const int numProcessPerPole, const double zeroLimit) @@ -286,7 +310,9 @@ int simplePEXSI(MPI_Comm comm_PEXSI, ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIPlanInit"); if (comm_PEXSI != MPI_COMM_NULL) { + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIPlanInitialize.begin"); plan = PPEXSIPlanInitialize(comm_PEXSI, pexsi_prow, pexsi_pcol, outputFileIndex, &info); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIPlanInitialize.end"); } ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSIPlanInit"); @@ -312,7 +338,9 @@ int simplePEXSI(MPI_Comm comm_PEXSI, // transform H and S from 2D block cyclic distribution to compressed column sparse matrix // LiuXh modify 2021-03-30, add DONE(ofs_running,"xx") for test ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT2CCS"); + trace_pexsi_stage(comm_PEXSI, "real", "TransMAT2CCS.begin"); DistMatrixTransformer::transformBCDtoCCS(SRC_Matrix, H, S, ZERO_Limit, DST_Matrix, HnzvalLocal, SnzvalLocal); + trace_pexsi_stage(comm_PEXSI, "real", "TransMAT2CCS.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT2CCS"); // MPI_Barrier(MPI_COMM_WORLD); // LiuXh modify 2021-03-30, add DONE(ofs_running,"xx") for test @@ -322,6 +350,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, // Load H and S to PEXSI int isSIdentity = 0; ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_LoadHS_R"); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSILoadRealHSMatrix.begin"); PPEXSILoadRealHSMatrix(plan, options, size, @@ -334,6 +363,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, isSIdentity, SnzvalLocal, &info); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSILoadRealHSMatrix.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_LoadHS_R"); double nelec = 0.0; @@ -343,6 +373,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, int numTotalInertiaIter = 0; // Number of total inertia[out] // LiuXh modify 2021-04-29, add DONE(ofs_running,"xx") for test ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIDFT"); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIDFTDriver2.begin"); PPEXSIDFTDriver2(plan, // PEXSI plan[in] &options, // PEXSI Options[in] numElectronExact, // exact electron number[in] @@ -353,6 +384,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, &numTotalInertiaIter, // Number of total inertia[out] // &numTotalPEXSIIter, // number of total pexsi evaluation procedure[out] &info); // 0: successful; otherwise: unsuccessful + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIDFTDriver2.end"); // LiuXh modify 2021-04-29, add DONE(ofs_running,"xx") for test ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSIDFT"); @@ -369,6 +401,7 @@ int simplePEXSI(MPI_Comm comm_PEXSI, if (myid < numProcessPerPole) { ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Retrieve_R"); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIRetrieveRealDFTMatrix.begin"); PPEXSIRetrieveRealDFTMatrix(plan, DMnzvalLocal, EDMnzvalLocal, @@ -377,11 +410,14 @@ int simplePEXSI(MPI_Comm comm_PEXSI, &totalEnergyS, &totalFreeEnergy, &info); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIRetrieveRealDFTMatrix.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Retrieve_R"); } // clean PEXSI ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Finalize"); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIPlanFinalize.begin"); PPEXSIPlanFinalize(plan, &info); + trace_pexsi_stage(comm_PEXSI, "real", "PPEXSIPlanFinalize.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Finalize"); } @@ -395,12 +431,14 @@ int simplePEXSI(MPI_Comm comm_PEXSI, // EDM = new double[SRC_Matrix.get_nrow() * SRC_Matrix.get_ncol()]; } // LiuXh modify 2021-04-29, add DONE(ofs_running,"xx") for test + const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT22D"); + trace_pexsi_stage(barrier_comm, "real", "TransMAT22D.begin"); DistMatrixTransformer::transformCCStoBCD(DST_Matrix, DMnzvalLocal, EDMnzvalLocal, SRC_Matrix, DM, EDM); + trace_pexsi_stage(barrier_comm, "real", "TransMAT22D.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT22D"); // LiuXh modify 2021-04-29, add DONE(ofs_running,"xx") for test - const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; if (barrier_comm != MPI_COMM_NULL) { ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); @@ -480,7 +518,9 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIPlanInit"); if (comm_PEXSI != MPI_COMM_NULL) { + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIPlanInitialize.begin"); plan = PPEXSIPlanInitialize(comm_PEXSI, pexsi_prow, pexsi_pcol, outputFileIndex, &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIPlanInitialize.end"); } ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSIPlanInit"); ModuleBase::timer::end("Diago_LCAO_Matrix", "setup_PEXSI_plan"); @@ -495,13 +535,16 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, std::complex* FDMnzvalLocal = nullptr; ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT2CCS"); + trace_pexsi_stage(comm_PEXSI, "complex", "TransMAT2CCS.begin"); DistMatrixTransformer::transformBCDtoCCS(SRC_Matrix, H, S, ZERO_Limit, DST_Matrix, HnzvalLocal, SnzvalLocal); + trace_pexsi_stage(comm_PEXSI, "complex", "TransMAT2CCS.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT2CCS"); if (comm_PEXSI != MPI_COMM_NULL) { int isSIdentity = 0; ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_LoadHS_C"); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSILoadComplexHSMatrix.begin"); PPEXSILoadComplexHSMatrix(plan, options, size, @@ -514,12 +557,15 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, isSIdentity, reinterpret_cast(SnzvalLocal), &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSILoadComplexHSMatrix.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_LoadHS_C"); ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Symbolic_C"); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSISymbolicFactorizeComplexUnsymmetricMatrix.begin"); PPEXSISymbolicFactorizeComplexUnsymmetricMatrix(plan, options, reinterpret_cast(HnzvalLocal), &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSISymbolicFactorizeComplexUnsymmetricMatrix.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Symbolic_C"); double nelec = 0.0; @@ -527,6 +573,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, mu = mu0; ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSIDFT"); ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_FermiOp_C"); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSICalculateFermiOperatorComplex.begin"); PPEXSICalculateFermiOperatorComplex(plan, options, mu, @@ -534,6 +581,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, &nelec, &d_nelec_d_mu, &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSICalculateFermiOperatorComplex.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_FermiOp_C"); if (numElectronPEXSI != nullptr) { @@ -551,6 +599,7 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, if (myid < numProcessPerPole) { ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Retrieve_C"); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIRetrieveComplexDFTMatrix.begin"); PPEXSIRetrieveComplexDFTMatrix(plan, reinterpret_cast(DMnzvalLocal), reinterpret_cast(EDMnzvalLocal), @@ -559,18 +608,23 @@ int simplePEXSIComplex(MPI_Comm comm_PEXSI, &totalEnergyS, &totalFreeEnergy, &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIRetrieveComplexDFTMatrix.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Retrieve_C"); } ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Finalize"); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIPlanFinalize.begin"); PPEXSIPlanFinalize(plan, &info); + trace_pexsi_stage(comm_PEXSI, "complex", "PPEXSIPlanFinalize.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "PEXSI_Finalize"); } + const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; ModuleBase::timer::start("Diago_LCAO_Matrix", "TransMAT22D"); + trace_pexsi_stage(barrier_comm, "complex", "TransMAT22D.begin"); DistMatrixTransformer::transformCCStoBCD(DST_Matrix, DMnzvalLocal, EDMnzvalLocal, SRC_Matrix, DM, EDM); + trace_pexsi_stage(barrier_comm, "complex", "TransMAT22D.end"); ModuleBase::timer::end("Diago_LCAO_Matrix", "TransMAT22D"); - const MPI_Comm barrier_comm = comm_2D != MPI_COMM_NULL ? comm_2D : comm_PEXSI; if (barrier_comm != MPI_COMM_NULL) { ModuleBase::timer::start("Diago_LCAO_Matrix", "PEXSI_Barrier"); From a1f21da435c715f991c1574942e1ae2802319871 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Mon, 25 May 2026 18:27:50 +0800 Subject: [PATCH 05/23] Trace PEXSI multi-k chemical potential search --- source/source_hsolver/diago_pexsi.cpp | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index 2184a5263b1..7a0ec167931 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include extern MPI_Comm DIAG_WORLD; @@ -24,6 +26,33 @@ typedef hamilt::MatrixBlock> matcd; namespace hsolver { +namespace +{ +bool pexsi_trace_enabled() +{ + static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; + return enabled; +} + +void trace_pexsi_mu(const char* stage, + const double mu, + const double target_nelec, + const double nelec, + const double residual, + const double derivative, + const double delta_mu) +{ + if (!pexsi_trace_enabled() || GlobalV::MY_RANK != 0) + { + return; + } + std::cout << "PEXSI_TRACE world_rank=" << GlobalV::MY_RANK << " stage=" << stage << " mu=" << mu + << " target_nelec=" << target_nelec << " nelec=" << nelec << " residual=" << residual + << " dnelec_dmu=" << derivative << " delta_mu=" << delta_mu << " time=" << MPI_Wtime() + << std::endl; +} +} // namespace + template std::vector DiagoPexsi::mu_buffer; @@ -149,6 +178,13 @@ bool DiagoPexsi>::finish_k_loop(const double target_nelec) const double residual = target_nelec - this->num_electron_sum_; if (std::abs(residual) <= pexsi::PEXSI_Solver::pexsi_elec_thr) { + trace_pexsi_mu("mu.converged", + mu_buffer.empty() ? pexsi::PEXSI_Solver::pexsi_mu : mu_buffer[0], + target_nelec, + this->num_electron_sum_, + residual, + this->num_electron_derivative_sum_, + 0.0); return true; } if (residual > 0.0) @@ -186,6 +222,13 @@ bool DiagoPexsi>::finish_k_loop(const double target_nelec) { mu_buffer.push_back(pexsi::PEXSI_Solver::pexsi_mu); } + trace_pexsi_mu("mu.update", + mu_buffer[0], + target_nelec, + this->num_electron_sum_, + residual, + this->num_electron_derivative_sum_, + delta_mu); mu_buffer[0] += delta_mu; return false; } From c287a0268aeb4e97b8b360e4143a4a0714ce7544 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Mon, 25 May 2026 18:28:47 +0800 Subject: [PATCH 06/23] Filter PEXSI trace output by category --- source/source_hsolver/diago_pexsi.cpp | 14 ++++++++++---- source/source_hsolver/hsolver_lcao.cpp | 14 ++++++++++---- .../source_hsolver/module_pexsi/simple_pexsi.cpp | 14 ++++++++++---- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index 7a0ec167931..2cfae6b2409 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include extern MPI_Comm DIAG_WORLD; @@ -28,10 +29,15 @@ namespace hsolver { namespace { -bool pexsi_trace_enabled() +bool pexsi_trace_enabled(const char* category) { - static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; - return enabled; + const char* trace_env = std::getenv("ABACUS_PEXSI_TRACE"); + if (trace_env == nullptr) + { + return false; + } + const std::string trace_value(trace_env); + return trace_value == "1" || trace_value == "all" || trace_value.find(category) != std::string::npos; } void trace_pexsi_mu(const char* stage, @@ -42,7 +48,7 @@ void trace_pexsi_mu(const char* stage, const double derivative, const double delta_mu) { - if (!pexsi_trace_enabled() || GlobalV::MY_RANK != 0) + if (!pexsi_trace_enabled("mu") || GlobalV::MY_RANK != 0) { return; } diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp index b1180fa5380..3bf4ef9e787 100644 --- a/source/source_hsolver/hsolver_lcao.cpp +++ b/source/source_hsolver/hsolver_lcao.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include namespace hsolver @@ -51,16 +52,21 @@ namespace hsolver #ifdef __PEXSI namespace { -bool pexsi_trace_enabled() +bool pexsi_trace_enabled(const char* category) { - static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; - return enabled; + const char* trace_env = std::getenv("ABACUS_PEXSI_TRACE"); + if (trace_env == nullptr) + { + return false; + } + const std::string trace_value(trace_env); + return trace_value == "1" || trace_value == "all" || trace_value.find(category) != std::string::npos; } #ifdef __MPI void pexsi_trace_kpoint(const int imu, const int ik_global, const int pool, const char* stage) { - if (!pexsi_trace_enabled()) + if (!pexsi_trace_enabled("kpoint")) { return; } diff --git a/source/source_hsolver/module_pexsi/simple_pexsi.cpp b/source/source_hsolver/module_pexsi/simple_pexsi.cpp index dae38d66af0..16bbb913b39 100644 --- a/source/source_hsolver/module_pexsi/simple_pexsi.cpp +++ b/source/source_hsolver/module_pexsi/simple_pexsi.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "c_pexsi_interface.h" #include "dist_bcd_matrix.h" @@ -30,15 +31,20 @@ namespace pexsi { namespace { -bool trace_pexsi_enabled() +bool trace_pexsi_enabled(const char* category) { - static const bool enabled = std::getenv("ABACUS_PEXSI_TRACE") != nullptr; - return enabled; + const char* trace_env = std::getenv("ABACUS_PEXSI_TRACE"); + if (trace_env == nullptr) + { + return false; + } + const std::string trace_value(trace_env); + return trace_value == "1" || trace_value == "all" || trace_value.find(category) != std::string::npos; } void trace_pexsi_stage(MPI_Comm comm, const char* path, const char* stage) { - if (!trace_pexsi_enabled() || comm == MPI_COMM_NULL) + if (!trace_pexsi_enabled("stage") || comm == MPI_COMM_NULL) { return; } From 0602a18d85a003649e3bd64e52c47f2702004942 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Mon, 25 May 2026 18:33:48 +0800 Subject: [PATCH 07/23] Respect PEXSI mu guard in multi-k fallback --- source/source_hsolver/diago_pexsi.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/source/source_hsolver/diago_pexsi.cpp b/source/source_hsolver/diago_pexsi.cpp index 2cfae6b2409..7206a0ad76d 100644 --- a/source/source_hsolver/diago_pexsi.cpp +++ b/source/source_hsolver/diago_pexsi.cpp @@ -209,10 +209,21 @@ bool DiagoPexsi>::finish_k_loop(const double target_nelec) { if (this->has_mu_lower_ && this->has_mu_upper_) { - mu_buffer[0] = 0.5 * (this->mu_lower_ + this->mu_upper_); + const double old_mu = mu_buffer[0]; + const double next_mu = 0.5 * (this->mu_lower_ + this->mu_upper_); + trace_pexsi_mu("mu.bisect", + old_mu, + target_nelec, + this->num_electron_sum_, + residual, + this->num_electron_derivative_sum_, + next_mu - old_mu); + mu_buffer[0] = next_mu; return false; } - const double fallback_step = std::max(pexsi::PEXSI_Solver::pexsi_mu_guard, 0.5); + const double fallback_step = pexsi::PEXSI_Solver::pexsi_mu_guard > 0.0 + ? pexsi::PEXSI_Solver::pexsi_mu_guard + : 0.5; delta_mu = residual > 0.0 ? fallback_step : -fallback_step; } else From be113505a7076d5d65adbd940cae8c4d342e84ed Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 01:41:21 +0800 Subject: [PATCH 08/23] Document PEXSI stage4 optimization results --- pexsi_stage4_code_refactor_report.tex | 703 ++++++++++++++++++++++++++ 1 file changed, 703 insertions(+) create mode 100644 pexsi_stage4_code_refactor_report.tex diff --git a/pexsi_stage4_code_refactor_report.tex b/pexsi_stage4_code_refactor_report.tex new file mode 100644 index 00000000000..e8ae5f676c6 --- /dev/null +++ b/pexsi_stage4_code_refactor_report.tex @@ -0,0 +1,703 @@ +\documentclass[a4paper,11pt]{article} + +\usepackage{geometry} +\usepackage{booktabs} +\usepackage{longtable} +\usepackage{array} +\usepackage{hyperref} +\usepackage{xcolor} +\usepackage{fontspec} + +\IfFontExistsTF{Noto Serif CJK SC} +{ + \setmainfont{Noto Serif CJK SC} +} +{ + \setmainfont{DejaVu Serif} +} +\IfFontExistsTF{Noto Sans Mono CJK SC} +{ + \setmonofont{Noto Sans Mono CJK SC} +} +{ + \setmonofont{DejaVu Sans Mono} +} + +\XeTeXlinebreaklocale "zh" +\XeTeXlinebreakskip = 0pt plus 1pt + +\geometry{left=2.2cm,right=2.2cm,top=2.4cm,bottom=2.4cm} +\hypersetup{colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue} +\setlength{\parindent}{2em} +\setlength{\parskip}{0.25em} +\setlength{\emergencystretch}{2em} +\renewcommand{\arraystretch}{1.15} + +\newcommand{\code}[1]{\texttt{#1}} + +\title{\textbf{ABACUS-PEXSI 多 k 点支持的代码修改与重构报告}\\ +\large 并行程序课程大报告} +\author{报告类型:代码修改与重构报告} +\date{2026 年 5 月 25 日} + +\begin{document} +\begin{titlepage} +\centering +\vspace*{1.8cm} +{\Large 并行程序课程大报告\par} +\vspace{1.2cm} +{\huge\bfseries ABACUS-PEXSI 多 k 点支持的代码修改与重构报告\par} +\vspace{1.6cm} +{\large 报告主题:并行科学计算程序中的 PEXSI 求解路径改造\par} +\vspace{1.8cm} +\begin{tabular}{rl} +课程名称: & 并行程序设计 / 并行程序课程\\ +报告类型: & 代码的修改和重构报告\\ +项目对象: & ABACUS LCAO-PEXSI 模块\\ +代码分支: & \code{pexsi-stage4-code-opt}\\ +代码提交: & \code{5978ae6a2 Enable complex multi-k PEXSI path}\\ +测试标准: & 官方用户指南 LCAO 10 个样例\\ +补充基线: & \code{origin/develop} Gamma-only PEXSI 对比测试\\ +\end{tabular} +\vfill +{\large 2026 年 5 月 25 日\par} +\end{titlepage} + +\begin{abstract} +本报告围绕第一性原理软件 ABACUS 中 LCAO 基组下的 PEXSI 求解路径开展代码修改与重构工作。原始代码已经具备 Gamma 点实数 PEXSI 链路,但复数多 k 点路径在 \code{DiagoPexsi>} 和 \code{ElecStateLCAO>::dm2rho} 处被阻断,无法作为并行稀疏求解器处理官方多 k 点样例。本文从并行程序的数据分布、稀疏矩阵格式转换、MPI 调用链和 SCF 迭代计时出发,完成了复数 BCD/CCS 转换、复数 PEXSI 调用、全局化学势搜索、k 点权重修正、复数密度矩阵回写和 PEXSI 温度/FD smearing 对齐。测试部分使用官方 LCAO 10 个样例进行复测,并额外建立远端 \code{develop} 分支的 Gamma-only PEXSI 基线进行对比。结果表明,本轮修改将多 k 点 PEXSI 从直接不可用推进到 \code{np=8/16} 下 7/10 样例在 1800 s 内完成,其中 001--006、008 可完成;007、009、010 仍超时。性能方面,大规模/金属样例仍受逐 k 点 PEXSI、重复 plan/factorization、Fermi operator 调用次数和 pole 数正确性约束影响,报告最后给出剩余问题和后续并行优化方向。 +\end{abstract} + +\noindent\textbf{关键词:}ABACUS;PEXSI;MPI;LCAO;多 k 点;稀疏矩阵;代码重构;并行程序性能分析 + +\tableofcontents +\newpage + +\section{报告目标与任务背景} + +\subsection{课程报告定位} + +本报告面向并行程序课程的大报告要求,重点不是单纯罗列运行结果,而是说明一次真实科学计算软件中的并行求解器改造过程。报告内容包括: + +\begin{itemize} + \item 分析 ABACUS LCAO 求解流程中 PEXSI 的并行数据流和原始阻塞点; + \item 说明本轮代码修改涉及的源码文件、接口变化和重构设计; + \item 用官方 10 个 LCAO 样例验证修改后的正确性和性能边界; + \item 将普通 \code{ks\_solver scalapack\_gvx}、远端 \code{develop} 的 Gamma-only PEXSI 以及当前 stage4 PEXSI 进行计时对比; + \item 总结当前未解决的问题和后续并行优化方向。 +\end{itemize} + +\subsection{问题来源} + +ABACUS 的普通 LCAO 路径通常通过 ScaLAPACK 求解广义本征值问题;PEXSI 则通过极点展开和选择性求逆直接构造密度矩阵,理论上适合大规模稀疏矩阵并行求解。原始代码中 Gamma 点实数 PEXSI 可以运行,但多 k 点体系会进入复数矩阵路径,而该路径在求解器和密度回写处尚未实现,表现为直接退出或错误终止。对官方 10 个 LCAO 样例而言,这意味着只有 Gamma-only 样例可以进入旧 PEXSI 实数路径,真正的多 k 点样例无法完整测试 PEXSI。 + +本轮工作的目标是在 \code{pexsi-stage4-code-opt} 分支上继续打通多 k 点 PEXSI,尽量保持 ABACUS 原有模块边界,并用官方样例验证代码修改的正确性和性能影响。 + +\subsection{分支与工作区信息} + +\begin{longtable}{@{}p{0.24\linewidth}p{0.66\linewidth}@{}} +\caption{工作区基本信息}\\ +\toprule +\textbf{项目} & \textbf{内容}\\ +\midrule +\endfirsthead +\toprule +\textbf{项目} & \textbf{内容}\\ +\midrule +\endhead +当前分支 & \code{pexsi-stage4-code-opt}\\ +代码基础提交 & \code{8a3aea7fe Optimize PEXSI stage4 code path}\\ +代码修改提交 & \code{5978ae6a2 Enable complex multi-k PEXSI path}\\ +远端基线提交 & \code{b9669d375 Toolchain: Fix package installing issue (\#7315)}\\ +主要测试集 & 官方用户指南 \code{examples/lcao-gv} 中 10 个 LCAO 样例\\ +报告输出 & \code{pexsi\_stage4\_code\_refactor\_report.tex} 与对应 PDF\\ +\bottomrule +\end{longtable} + +\section{并行程序结构分析} + +\subsection{ABACUS LCAO 求解链路} + +在本报告关注的 LCAO/KSDFT 场景中,一次电子步的主要流程可以概括为: + +\begin{verbatim} +Hamiltonian / overlap matrix H, S + -> HSolverLCAO::solve + -> DiagoPexsi / ScaLAPACK / other eigensolver + -> density matrix DM / energy density matrix EDM + -> ElecStateLCAO::dm2rho + -> real-space charge density and SCF mixing +\end{verbatim} + +普通 \code{scalapack\_gvx} 路径显式求解本征值和本征矢;PEXSI 路径不直接求本征态,而是将 \(H\) 和 \(S\) 转换为 PEXSI 所需的稀疏压缩列格式,然后通过极点展开、惯性计数、选择性求逆和 Fermi operator 计算得到密度矩阵。因此 PEXSI 路径的并行成本不仅来自 PEXSI 库内部,也来自 ABACUS 侧的分布式矩阵转换、k 点循环、密度矩阵回写和电荷密度规约。 + +\subsection{并行数据结构:BCD 与 CCS} + +ABACUS 内部 LCAO 矩阵使用面向二维 MPI 进程网格的 block-cyclic distribution(BCD)。PEXSI 输入则要求 compressed column storage(CCS)稀疏矩阵。两者的核心区别见表 \ref{tab:bcd-ccs}。 + +\begin{longtable}{@{}p{0.18\linewidth}p{0.36\linewidth}p{0.36\linewidth}@{}} +\caption{ABACUS BCD 矩阵与 PEXSI CCS 矩阵对比} +\label{tab:bcd-ccs}\\ +\toprule +\textbf{项目} & \textbf{BCD} & \textbf{CCS}\\ +\midrule +\endfirsthead +\toprule +\textbf{项目} & \textbf{BCD} & \textbf{CCS}\\ +\midrule +\endhead +使用位置 & ABACUS LCAO/ScaLAPACK 内部矩阵 & PEXSI 输入和输出矩阵\\ +并行分布 & 二维块循环分布,适合并行稠密线性代数 & 按列压缩,只保存非零元,适合稀疏分解与选择性求逆\\ +关键数组 & 局部矩阵块、全局到局部映射、进程网格信息 & \code{colptrLocal}、\code{rowindLocal}、\code{nzvalLocal}\\ +本轮工作 & 补齐复数矩阵转换与回写 & 支持复数 H/S、DM/EDM 与 PEXSI C 接口交互\\ +\bottomrule +\end{longtable} + +\subsection{原始阻塞点} + +原始多 k 点 PEXSI 的主要阻塞不是 MPI 无法启动,而是复数路径未实现: + +\begin{itemize} + \item \code{DiagoPexsi>::diag} 无法完成多 k 点复数 PEXSI 求解; + \item \code{ElecStateLCAO>::dm2rho} 不能将 PEXSI 返回的复数 DM/EDM 累加到实空间电荷密度; + \item 多 k 点下每个 k 点需要共享同一个全局化学势,而不是各 k 点分别独立决定化学势; + \item \code{nspin=1} 时 ABACUS k 点权重与 PEXSI 自旋权重存在重复计数风险; + \item 旧 Gamma-only PEXSI 默认参数较轻,部分样例能跑完但总能量明显偏离官方基线。 +\end{itemize} + +\section{代码修改与重构设计} + +\subsection{修改范围} + +本轮代码修改集中在 PEXSI 模块、LCAO hsolver 调用链和电子态密度回写模块,涉及文件见表 \ref{tab:source-files}。 + +\begin{longtable}{@{}p{0.53\linewidth}p{0.37\linewidth}@{}} +\caption{本轮涉及的源码文件} +\label{tab:source-files}\\ +\toprule +\textbf{文件路径} & \textbf{主要职责}\\ +\midrule +\endfirsthead +\toprule +\textbf{文件路径} & \textbf{主要职责}\\ +\midrule +\endhead +\code{source/source\_hsolver/module\_pexsi/dist\_matrix\_transformer.h} & 增加复数 BCD/CCS 转换接口声明\\ +\code{source/source\_hsolver/module\_pexsi/dist\_matrix\_transformer.cpp} & 实现复数 BCD 到 CCS、CCS 到 BCD 的数据转换\\ +\code{source/source\_hsolver/module\_pexsi/pexsi\_solver\_interface.h} & 新增 PEXSI 求解器抽象接口 \code{IPexsiSolver}\\ +\code{source/source\_hsolver/module\_pexsi/pexsi\_solver.h} & 将 \code{PEXSI\_Solver} 接入 \code{IPexsiSolver} 接口\\ +\code{source/source\_hsolver/module\_pexsi/simple\_pexsi.h} & 增加 \code{simplePEXSIComplex} 接口\\ +\code{source/source\_hsolver/module\_pexsi/simple\_pexsi.cpp} & 实现复数 PEXSI plan、symbolic factorization、Fermi operator、矩阵取回和转换计时\\ +\code{source/source\_hsolver/diago\_pexsi.h} & 扩展 PEXSI 求解器模板接口,托管 DM/EDM 缓冲生命周期\\ +\code{source/source\_hsolver/diago\_pexsi.cpp} & 实现多 k 点全局化学势搜索、复数 PEXSI 调用和 Gamma 实数路径边界检查\\ +\code{source/source\_hsolver/hsolver\_lcao.cpp} & 接入 PEXSI DM/EDM 到 LCAO 求解流程\\ +\code{source/source\_estate/elecstate\_lcao.cpp} & 实现复数 PEXSI DM/EDM 的 \code{dm2rho} 回写\\ +\code{source/source\_hsolver/test/diago\_pexsi\_test.cpp} & 增加 fake PEXSI solver 注入测试,覆盖接口解耦和缓冲托管\\ +\bottomrule +\end{longtable} + +\section{主要重构内容} + +\subsection{抽象 PEXSI 求解器接口} + +已新增 \code{pexsi::IPexsiSolver},并将 \code{PEXSI\_Solver} 改为实现该接口。对应地,\code{DiagoPexsi} 的成员从直接持有具体 \code{PEXSI\_Solver} 改为持有 \code{std::unique\_ptr},构造函数也支持传入外部 solver。默认运行时仍会构造真实 \code{PEXSI\_Solver},因此不改变生产路径。 + +该修改已经体现在 \code{pexsi\_solver\_interface.h}、\code{pexsi\_solver.h}、\code{diago\_pexsi.h} 和 \code{diago\_pexsi.cpp} 中。完整文件路径见表 \ref{tab:source-files}。它的收益如下: + +\begin{itemize} + \item 降低 \code{DiagoPexsi} 与 PEXSI 后端的直接耦合,使对角化流程依赖抽象接口而不是具体实现; + \item 为后续接入复数 expert routines、多 k 点全局化学势搜索等路径预留清晰接口边界; + \item 支持轻量单元测试,避免测试必须真实调用 PEXSI 库和 MPI 稀疏求解流程。 +\end{itemize} + +对应的单元测试位于 \code{diago\_pexsi\_test.cpp},其中 \code{FakePexsiSolver} 实现 \code{IPexsiSolver},并通过 \code{UsesInjectedSolverAndOwnedDensityBuffers} 用例验证 fake solver 注入、DM/EDM 指针传递和能量字段回写。 + +\subsection{改造 DM/EDM 缓冲生命周期} + +原实现中 \code{DiagoPexsi} 直接使用手动 \code{new[]} / \code{delete[]} 管理 \code{DM} 和 \code{EDM} 缓冲。当前代码已改为用 \code{std::vector>} 托管实际存储,同时保留 \code{std::vector DM} 和 \code{std::vector EDM} 作为对外接口。这样既降低内存管理风险,也避免破坏 \code{ElecStateLCAO::dm2rho(pe.DM, pe.EDM, \&dm)} 等既有调用链。 + +该修改的收益如下: + +\begin{itemize} + \item 避免裸指针释放遗漏和异常路径泄漏风险; + \item 保持现有 DM/EDM pointer 形式不变,减少对上层 LCAO 求解流程的侵入; + \item 降低后续扩展多 spin、多 k 点缓冲时的维护成本。 +\end{itemize} + +\subsection{明确 Gamma 实数路径边界} + +在 \code{DiagoPexsi::diag} 中已加入 \code{ik} 边界检查。如果当前实数路径被多 k 点误入,会提前给出明确错误信息: + +\begin{verbatim} +PEXSI real path only has density buffers for Gamma/spin channels; +multi-k requires the complex expert-routine path +\end{verbatim} + +这个边界与项目计划书中的判断一致:当前多 k 点 PEXSI 不能通过简单复用 \code{PPEXSIDFTDriver2} 完成,后续必须走复数 CCS 转换、expert-level Fermi operator 和全局化学势搜索。 + +\subsection{优化 BCD 到 CCS 的 H/S 数值重分布} + +\code{transformBCDtoCCS} 已针对 H/S 共享相同非零模式这一事实进行重构。旧逻辑对 H 和 S 分别组织数值通信;当前实数路径将每个非零位置的 H/S 数值打包为 \code{[H,S]},复数路径打包为 \code{[Re(H),Im(H),Re(S),Im(S)]},然后用一次 \code{MPI\_Alltoallv} 完成数值重分布。 + +该修改的收益如下: + +\begin{itemize} + \item 减少一次 MPI collective 调用,降低 BCD 到 CCS 转换阶段的通信启动成本; + \item 保持原有 CCS 稀疏模式、\code{colptrLocal}、\code{rowindLocal} 和数值排序契约不变; + \item 对 PEXSI 路径中每轮 SCF 都会触发的矩阵格式转换更友好。 +\end{itemize} + +同时,\code{countMatrixDistribution} 中的零值判断括号错误已经修正,从错误的 \code{fabs(A[i] < 1e-31)} 改为 \code{fabs(A[i]) < 1e-31},避免把布尔比较结果传入 \code{fabs}。 + +反馈后对该修改做了 A/B 计时测试。测试方式是同一份 \code{np=8,kpar=4,pexsi\_npole=80,OMP=1} 输入分别使用 packed H/S 和 legacy H/S 两次 \code{MPI\_Alltoallv} 版本运行。001 中 packed 版本的 \code{TransMAT2CCS} 为 0.04955 s,legacy 为 0.05017 s,仅快约 1.25\%;003 中 packed 版本为 0.17982 s,legacy 为 0.16931 s,反而略慢。该结果说明:H/S 打包是合理的结构性优化,但在当前 001/003 小矩阵本地测试中不是主加速来源,真正瓶颈仍是 PEXSI Fermi operator。 + +\subsection{增加转换计时} + +\code{simplePEXSI} 和 \code{simplePEXSIComplex} 中已给 BCD 到 CCS 转换加入 \code{TransMAT2CCS} timer。此前 \code{transformCCStoBCD} 已有 \code{TransMAT22D} 计时,但 BCD 到 CCS 缺少独立计时,不利于区分 ABACUS 侧格式转换成本和 PEXSI 库内部求解成本。 + +补齐该计时后,后续性能报告可以分别观察 \code{TransMAT2CCS}、\code{PEXSIDFT} 和 \code{TransMAT22D},从而判断瓶颈是在 ABACUS 分布式矩阵转换、PEXSI 选择性求逆,还是 DM/EDM 回写阶段。 + +\subsection{OpenMP 本地循环优化} + +在不改变 PEXSI/MPI 调用边界的前提下,本轮对 \code{DistMatrixTransformer} 中的本地数组循环加入了 OpenMP,包括非零元 mask 构造、sender/receiver buffer 填充以及 CCS 到 BCD 的本地回填。没有把 OpenMP 用在外层 k 点循环、PEXSI plan 或 MPI collective 周围,因为这些路径涉及 MPI 通信器、BLACS 和 SuperLU\_DIST 状态,线程并行风险较高。 + +为保证结果确定性,非零元扫描采用 ``并行 mask + 串行 prefix + 并行填充'',保持 \code{colidx}/\code{rowidx} 与原串行扫描顺序一致。001 的 \code{OMP\_NUM\_THREADS=2} 校验得到最终能量 \(-7836.1562642064\) eV,与 \code{OMP\_NUM\_THREADS=1} 的 80-pole 结果一致;但总耗时从 23.41 s 增加到 25.85 s,说明该 OpenMP 改动在小矩阵上主要是正确性和后续扩展验证,暂不是主要加速来源。 + +\subsection{复数 PEXSI 调用路径} + +新增的复数 PEXSI 路径以 \code{simplePEXSIComplex} 为核心,将每个 k 点的复数 \(H(k)\) 和 \(S(k)\) 转换为 PEXSI 所需的 CCS 格式,然后调用 PEXSI complex expert routines。当前代码中涉及的关键调用包括: + +\begin{itemize} + \item \code{PPEXSILoadComplexHSMatrix} + \item \code{PPEXSISymbolicFactorizeComplexUnsymmetricMatrix} + \item \code{PPEXSICalculateFermiOperatorComplex} + \item \code{PPEXSIRetrieveComplexDFTMatrix} +\end{itemize} + +这个改动的直接效果是:多 k 点样例不再在进入 PEXSI 后立即因为复数路径缺失而退出,而是能够真正执行 PEXSI 的复数矩阵装载、稀疏分解、Fermi operator 计算和 DM/EDM 取回。 + +\subsection{全局化学势搜索} + +多 k 点体系的电子数约束是全局约束。若每个 k 点分别搜索化学势,会破坏整体电子数守恒。本轮将 \code{DiagoPexsi>} 改为按 spin channel 使用同一个 \code{mu\_buffer[0]},并在每次尝试化学势时对所有 k 点累加电子数、电子数导数和能量。 + +该逻辑可以概括为: + +\begin{verbatim} +for each SCF step: + choose global mu + for each k point: + run complex PEXSI with the same mu + accumulate nelec, dNe/dmu, energy with k weight + update mu by Newton / bracket / bisection fallback +\end{verbatim} + +当 PEXSI 返回的导数不可用或数值不稳定时,代码使用 bracket/bisection fallback,避免化学势搜索直接崩溃。 + +\subsection{k 点权重和自旋权重修正} + +ABACUS 的 \code{klist->wk} 在 \code{nspin=1} 时已经包含自旋简并,而 PEXSI 的 \code{options.spin=2} 也会计入自旋。因此多 k 点 PEXSI 的有效权重改为: + +\begin{verbatim} +effective_weight = klist->wk[ik] * (nspin == 1 ? 0.5 : 1.0) +\end{verbatim} + +该修正避免了 \code{nspin=1} 多 k 点体系的电子数被重复计入自旋,属于正确性修复,而不是性能优化。 + +\subsection{复数 DM/EDM 回写与 \code{dm2rho}} + +PEXSI 返回的 DM/EDM 需要从 CCS 格式回写到 ABACUS 的 BCD 矩阵,并进一步通过密度矩阵对象累加到 \(DM(R)\) 和实空间电荷密度。原始复数 \code{dm2rho} 不能完成该过程,本轮实现后会: + +\begin{itemize} + \item 检查 PEXSI DM/EDM buffer 数量是否满足 DMK 存储要求; + \item 按有效 k 权重缩放每个 DMK/EDMK; + \item 将 PEXSI 返回的 DMK pointer 写入 \code{DensityMatrix}; + \item 调用 \code{cal\_DMR()} 构造 real-space density matrix; + \item 通过 Gint 生成实空间电荷密度。 +\end{itemize} + +此外,本轮修复了复数 PEXSI DM/EDM 从 CCS 回写到 ABACUS 2D block-cyclic 矩阵时的复共轭方向问题。修复前,\code{001\_4GaAs} 的归一化前电荷约为 \code{68.17/72},总能量偏离约 \code{2.65 eV};修复后电荷恢复到约 \code{72.00/72},总能量回到官方基线附近。 + +\subsection{PEXSI 温度与 smearing 对齐} + +反馈中明确指出,官方文档要求 \code{pexsi\_temp} 与 Fermi-Dirac smearing 的温度含义一致。因此当前代码不再自动降低非 FD 输入的 \code{pexsi\_temp},也不再把低温默认 \code{pexsi\_npole=40} 自动提升到 80。当前策略是: + +\begin{itemize} + \item 若输入使用 \code{smearing\_method=fd} 或 \code{fermi-dirac},且用户没有显式改变 \code{pexsi\_temp},则令 PEXSI temperature 与 \code{smearing\_sigma} 对齐; + \item 若用户显式设置 \code{pexsi\_temp} 或非 FD smearing,则尊重输入文件,不做隐式温度重写; + \item \code{pexsi\_npole} 保持用户输入或官方默认值,降低 pole 数只能作为带正确性检查的性能探针。 +\end{itemize} + +全量复测统一使用 \code{smearing\_method=fd}、\code{smearing\_sigma=0.015} 与 \code{pexsi\_temp=0.015}。低 pole 探针显示,\code{npole=20} 在 007、010 上会产生严重能量错误,因此不能作为默认优化。 + +\section{编译环境与测试设计} + +\subsection{编译环境} + +本地使用启用 PEXSI 的 ABACUS 并行构建,关键依赖来自仓库内 Conda 环境 \code{.conda/pexsi-env}。编译验证命令如下: + +\begin{verbatim} +PATH=/home/HeJunXiang/VSCode/abacus-develop/.conda/pexsi-env/bin:$PATH \ +cmake --build build-pexsi-conda --target abacus_basic_para -j 4 +\end{verbatim} + +该构建启用了 MPI、OpenMP、ScaLAPACK、OpenBLAS、PEXSI、SuperLU\_DIST、ParMETIS 和 METIS。早期正确性回归使用 4 个 MPI rank 和 1 个 OpenMP thread: + +\begin{verbatim} +timeout 1800s mpirun -np 4 abacus_basic_para +\end{verbatim} + +反馈后的全量性能复测重新覆盖 \code{np=8} 和 \code{np=16} 两组并行度,并分别设置 \code{kpar=8} 和 \code{kpar=16}。 + +\subsection{测试矩阵} + +测试分为三组: + +\begin{itemize} + \item \textbf{普通 LCAO/ScaLAPACK 基线:}来自 \code{目前算法测试和总结报告.tex},使用 \code{ks\_solver scalapack\_gvx} 的官方 10 样例结果; + \item \textbf{远端 \code{develop} Gamma-only PEXSI 基线:}从 \code{origin/develop} 建立干净 worktree,只对 Gamma 点样例设置 \code{ks\_solver pexsi} 和 \code{gamma\_only 1}; + \item \textbf{当前 stage4 PEXSI:}在 \code{pexsi-stage4-code-opt} 分支上复制官方 10 样例,只将 \code{ks\_solver} 改为 \code{pexsi},测试多 k 点复数路径。 +\end{itemize} + +当前 stage4 PEXSI 测试目录为: + +\begin{verbatim} +/tmp/abacus_pexsi_10cases_20260524_code_default +\end{verbatim} + +008 fixed-temperature 修正后的追加复测目录为: + +\begin{verbatim} +/tmp/abacus_pexsi_code_default_008_fixed_patch +\end{verbatim} + +远端 \code{develop} Gamma-only PEXSI 基线测试目录为: + +\begin{verbatim} +/tmp/abacus_origin_develop_pexsi_gamma_20260525 +\end{verbatim} + +\section{官方 10 样例复测结果} + +\subsection{正确性结果} + +官方 10 样例 PEXSI 正确性复测结果见表 \ref{tab:pexsi-10case-result}。需要说明的是,该表来自反馈前的保守参数全量复测,用于建立多 k 点复数路径能否进入 SCF、能量是否回到官方基线附近的正确性边界;反馈后的参数调优和 \code{np=8/16} 性能复测见表 \ref{tab:feedback-timing} 与表 \ref{tab:np8-np16-full-rerun}。因此表 \ref{tab:pexsi-10case-result} 不应理解为当前最佳并行参数下的完整性能结果。 + +{\scriptsize +\begin{longtable}{@{}p{0.15\linewidth}p{0.11\linewidth}r r r p{0.24\linewidth}@{}} +\caption{官方 10 样例 stage4 保守参数 PEXSI 复测结果} +\label{tab:pexsi-10case-result}\\ +\toprule +\textbf{样例} & \textbf{SCF} & \textbf{PEXSI 能量/eV} & \textbf{官方/eV} & \textbf{\(|\Delta E|\)/eV} & \textbf{说明}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{SCF} & \textbf{PEXSI 能量/eV} & \textbf{官方/eV} & \textbf{\(|\Delta E|\)/eV} & \textbf{说明}\\ +\midrule +\endhead +001\_4GaAs & 收敛 & -7836.1565576466 & -7836.1565582000 & 5.53e-7 & 多 k 点已打通,略高于 5e-7 标准\\ +002\_C2H6O & 收敛 & -665.5549999157 & -665.5550001100 & 1.94e-7 & 通过 5e-7 标准\\ +003\_4MoS2 & 收敛 & -9699.0065944614 & -9699.0065951100 & 6.49e-7 & 多 k 点已打通,略高于 5e-7 标准\\ +004\_12Pt111 & 超时 & -- & -39600.7443194500 & -- & 1800 s 到 PE16,\code{DRHO=9.70e-6}\\ +005\_3BaTiO3 & 收敛 & -10749.4002990640 & -10749.4002998700 & 8.06e-7 & 148 个 k 点可完成,略高于 5e-7 标准\\ +006\_16Na & 超时 & -- & -18524.7600965400 & -- & 1800 s 内仅完成 PE1\\ +007\_27Fe & 超时 & -- & -86945.5306751500 & -- & 729 基函数、双自旋多 k 点,首轮未完成\\ +008\_32H2O & 收敛 & -14950.4508412095 & -14950.4508434100 & 2.20e-6 & fixed 占据修正后恢复正确量级\\ +009\_Battery108 & 超时 & -- & -124070.6541107900 & -- & 1620 基函数、双自旋,1800 s 到 PE2\\ +010\_216Si & 超时 & -- & -23123.6999020000 & -- & 2808 基函数,1800 s 到 PE3\\ +\bottomrule +\end{longtable} +} + +若沿用 \code{tools/pexsi/check\_lcao\_10case\_regression.py} 的 \(5.0\times10^{-7}\) eV 总能量容差,该轮保守参数复测只有 \code{002\_C2H6O} 严格通过。若按 ``是否完成 SCF'' 统计,该轮完成 5/10:001、002、003、005、008。其余 5 个样例主要受本地 4 MPI ranks 下 PEXSI 成本限制。 + +\subsection{与普通 \code{ks\_solver scalapack\_gvx} 的耗时对比} + +\code{目前算法测试和总结报告.tex} 中的官方 10 样例基线使用 \code{ks\_solver scalapack\_gvx},其计时代表成熟普通 LCAO/ScaLAPACK 路径。当前 stage4 PEXSI 结果使用 4 个 MPI rank 和 1800 s 外层超时。因此表 \ref{tab:ksolver-pexsi-time} 是工程成本对比,不是严格同条件加速比。 + +{\scriptsize +\begin{longtable}{@{}p{0.15\linewidth}r r r r r p{0.22\linewidth}@{}} +\caption{\code{ks\_solver scalapack\_gvx} 与当前 stage4 PEXSI 耗时对比} +\label{tab:ksolver-pexsi-time}\\ +\toprule +\textbf{样例} & \textbf{gvx 总/s} & \textbf{gvx HS/s} & \textbf{PEXSI 总/s} & \textbf{PEXSI HS/s} & \textbf{PEXSIDFT/s} & \textbf{结论}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{gvx 总/s} & \textbf{gvx HS/s} & \textbf{PEXSI 总/s} & \textbf{PEXSI HS/s} & \textbf{PEXSIDFT/s} & \textbf{结论}\\ +\midrule +\endhead +001\_4GaAs & 11 & 8.70 & 50.37 & 49.43 & 44.25 & PEXSI 总耗时约 4.6 倍,HSolver 约 5.7 倍\\ +002\_C2H6O & 91 & 7.83 & 43.94 & 4.97 & 0.52 & PEXSI 在该小体系上完成更快,但进程数不同\\ +003\_4MoS2 & 29 & 22.43 & 147.03 & 144.15 & 136.15 & PEXSI 总耗时约 5.1 倍,HSolver 约 6.4 倍\\ +004\_12Pt111 & 63 & 52.11 & \(>1800\) & -- & -- & PEXSI 超时,1800 s 内到 PE16 未收敛\\ +005\_3BaTiO3 & 203 & 186.71 & 1279.88 & 1274.49 & 1212.47 & PEXSI 总耗时约 6.3 倍,HSolver 约 6.8 倍\\ +006\_16Na & 76 & 69.59 & \(>1800\) & -- & -- & PEXSI 超时,1800 s 内仅完成 PE1\\ +007\_27Fe & 793 & 739.89 & \(>1800\) & -- & -- & PEXSI 超时,首个电子步未完成\\ +008\_32H2O & 119 & 92.52 & 131.58 & 121.82 & 91.17 & PEXSI 总耗时接近,HSolver 约 1.3 倍\\ +009\_Battery108 & 1656 & 1531.26 & \(>1800\) & -- & -- & PEXSI 超时,到 PE2 仍未完成 SCF\\ +010\_216Si & 192 & 120.51 & \(>1800\) & -- & -- & PEXSI 超时,到 PE3 仍未完成 SCF\\ +\bottomrule +\end{longtable} +} + +旧报告还对 001、005、009、010 做了 \code{np=4} 的普通 ScaLAPACK 对比。若只看同为 4 个 MPI rank 的样例,普通 \code{scalapack\_gvx} 总耗时分别为 7 s、303 s、602 s、88 s;当前 stage4 PEXSI 对应为 50.37 s、1279.88 s、\(>1800\) s、\(>1800\) s。该对比说明,当前主要瓶颈已经从 ``多 k 点进不了 PEXSI'' 转为 ``PEXSI 路径可以进入,但逐 k 点串行和重复 plan/factorization 成本过高''。 + +\section{单纯使用 PEXSI 的 Gamma-only 基线} + +\subsection{基线构造} + +为区分本轮代码修改带来的影响和既有 Gamma 点 PEXSI 路径本身的表现,额外从远端 \code{develop} 建立干净基线分支: + +\begin{verbatim} +git worktree add -b pexsi-origin-develop-gamma-baseline \ + /tmp/abacus-origin-develop-gamma-baseline origin/develop +\end{verbatim} + +基线提交为 \code{b9669d375 Toolchain: Fix package installing issue (\#7315)}。本小节的测试不是 \code{ks\_solver scalapack\_gvx},也不是普通对角化 hsolver 基线,而是单纯把 Gamma 点样例切到 PEXSI: + +\begin{verbatim} +ks_solver pexsi +gamma_only 1 +timeout 1800s mpirun -np 4 abacus_basic_para +\end{verbatim} + +下表中的 \code{HS/s} 是 ABACUS timer 树中的 \code{HSolverLCAO solve} 名称,不表示使用普通 hsolver 求解器。实际 PEXSI 路径由日志中的 \code{PE} 电子步和 \code{Diago\_LCAO\_Matrix PEXSIDFT} 计时确认。 + +需要特别说明:在远端 \code{develop} 原始代码上,如果只把 \code{ks\_solver} 改为 \code{pexsi} 而保持默认 \code{gamma\_only=0},即使 KPT 是 \code{1 1 1},程序也会进入旧的 complex/multi-k 模板路径并直接报错: + +\begin{verbatim} +PEXSI is not completed for multi-k case +\end{verbatim} + +因此本节使用 ``官方输入 + \code{ks\_solver pexsi} + \code{gamma\_only 1}'',这就是旧 \code{develop} 上单纯使用 PEXSI 的 Gamma-only 结果。 + +\subsection{Gamma-only 结果} + +官方 10 样例中 KPT 为 \code{1 1 1} 的 Gamma 点样例是 002、008、009、010。远端 \code{develop} Gamma-only PEXSI 基线计时见表 \ref{tab:origin-develop-gamma-pexsi}。 + +{\scriptsize +\begin{longtable}{@{}p{0.15\linewidth}p{0.12\linewidth}r r r r p{0.23\linewidth}@{}} +\caption{单纯使用 PEXSI 的 Gamma-only 基线计时(远端 \code{develop})} +\label{tab:origin-develop-gamma-pexsi}\\ +\toprule +\textbf{样例} & \textbf{状态} & \textbf{wall/s} & \textbf{total/s} & \textbf{HS/s} & \textbf{PEXSIDFT/s} & \textbf{PEXSI 证据/说明}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{状态} & \textbf{wall/s} & \textbf{total/s} & \textbf{HS/s} & \textbf{PEXSIDFT/s} & \textbf{PEXSI 证据/说明}\\ +\midrule +\endhead +002\_C2H6O & 收敛 PE28 & 43.33 & 43.24 & 4.68 & -- & 日志电子步为 \code{PE1--PE28};小矩阵未单列 \code{PEXSIDFT}\\ +008\_32H2O & 收敛 PE17 & 68.41 & 68.32 & 58.65 & 30.16 & 有 \code{PEXSIDFT} 计时;能量 -14941.015276 eV\\ +009\_Battery108 & 超时 PE44 & 1801.03 & -- & -- & -- & 日志到 \code{PE44},\code{DRHO=4.14e-6},未完成 SCF\\ +010\_216Si & 收敛 PE12 & 581.40 & 581.28 & 534.51 & 500.19 & 有 \code{PEXSIDFT} 计时;能量 -23088.956346 eV\\ +\bottomrule +\end{longtable} +} + +旧 \code{develop} Gamma PEXSI 的速度不能单独视为正确实现的性能上限。002、008、010 虽然能完成,但 008 与 010 的总能量分别比官方参考差约 9.44 eV 和 34.74 eV。这说明旧 Gamma 路径默认 PEXSI 参数更轻,收敛速度更快,但物理精度不足。当前 stage4 为了接近官方能量,修复了 fixed 占据、密度矩阵回写和 FD smearing/PEXSI temperature 对齐问题,并避免把错误的低 pole 结果作为默认配置,直接增加了 PEXSI 求解成本。 + +旧 Gamma PEXSI 与当前 stage4 PEXSI 的对比见表 \ref{tab:gamma-baseline-stage4-compare}。 + +{\scriptsize +\begin{longtable}{@{}p{0.15\linewidth}p{0.28\linewidth}p{0.28\linewidth}p{0.20\linewidth}@{}} +\caption{旧 Gamma PEXSI 与当前 stage4 PEXSI 对比} +\label{tab:gamma-baseline-stage4-compare}\\ +\toprule +\textbf{样例} & \textbf{远端 develop Gamma PEXSI} & \textbf{当前 stage4 PEXSI} & \textbf{结论}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{远端 develop Gamma PEXSI} & \textbf{当前 stage4 PEXSI} & \textbf{结论}\\ +\midrule +\endhead +002\_C2H6O & 43.24 s,收敛,能量 -664.670823 eV & 43.94 s,收敛,能量 -665.555000 eV & 耗时接近,stage4 能量恢复到官方量级\\ +008\_32H2O & 68.32 s,收敛,能量 -14941.015276 eV & 131.58 s,收敛,能量 -14950.450841 eV & stage4 约 1.9 倍耗时,换来 fixed 占据能量修正\\ +009\_Battery108 & \(>1800\) s,到 PE44,\code{DRHO=4.14e-6} & \(>1800\) s,到 PE2,PE2 为 1336.27 s & stage4 单步更重;旧路径也未在 1800 s 内完成\\ +010\_216Si & 581.28 s,收敛,能量 -23088.956346 eV & \(>1800\) s,到 PE3 & 旧路径快但能量偏差约 34.74 eV;stage4 代价主要来自更重的 PEXSI 参数和路径\\ +\bottomrule +\end{longtable} +} + +\section{超时原因与性能瓶颈分析} + +本报告中的 ``超时'' 不是 ABACUS 自身报错,也不是官方样例不可运行,而是复测脚本在每个样例外层加了 1800 s 限制。超时含义是:该样例在本地 4 个 MPI rank、单样例 1800 s 限制内没有完成 SCF。 + +\subsection{超时日志证据} + +\begin{longtable}{@{}p{0.17\linewidth}p{0.18\linewidth}p{0.50\linewidth}@{}} +\caption{超时样例日志证据} +\label{tab:timeout-evidence}\\ +\toprule +\textbf{样例} & \textbf{规模} & \textbf{1800 s 内进展}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{规模} & \textbf{1800 s 内进展}\\ +\midrule +\endhead +004\_12Pt111 & 8 k 点,324 基函数 & 到 \code{PE16},最后一步约 \code{107.24 s},\code{DRHO=9.70e-6},尚未达到 \code{1e-7}\\ +006\_16Na & 172 k 点,240 基函数 & 仅完成 \code{PE1},该步约 \code{1087.87 s}\\ +007\_27Fe & 32 k 点,双自旋,729 基函数 & 首个电子步在 1800 s 内未完成\\ +009\_Battery108 & 2 k 点,双自旋,1620 基函数 & \code{PE1=396.52 s},\code{PE2=1336.27 s},未完成 SCF\\ +010\_216Si & 1 k 点,2808 基函数 & 到 \code{PE3},前三步约 \code{734.45/851.86/156.80 s}\\ +\bottomrule +\end{longtable} + +\subsection{性能瓶颈} + +当前超时的直接原因可以归纳为四点: + +\begin{enumerate} + \item \textbf{求解路径不同。} 普通基线走 \code{scalapack\_gvx} 常规对角化路径;当前按要求走 \code{ks\_solver pexsi},需要执行稀疏格式转换、symbolic factorization、selected inversion 和 Fermi operator 计算。 + \item \textbf{多 k 点 PEXSI 已支持 k 池并行,但仍有重复求解成本。} 当前 \code{kpar} 可以把不同 k 点分配到 MPI 子通信器并行执行;但每个 k 点、每次化学势尝试仍会调用一次 PEXSI Fermi operator,且不同 k 池之间的负载均衡和每池 rank 数会明显影响性能。 + \item \textbf{PEXSI plan 和 symbolic factorization 尚未复用。} 当前每次 PEXSI 调用都会重新初始化 plan、转换矩阵、执行 symbolic factorization 和回写矩阵。 + \item \textbf{正确性参数更重。} 为了接近官方能量,本轮降低 temperature 并增加 pole 数。这改善了 008、010 等 Gamma 样例的能量量级,但显著增加了单步 PEXSI 成本。 +\end{enumerate} + +因此,当前报告中的 PEXSI 超时不能简单理解为 ``比旧 Gamma PEXSI 退化''。更准确的说法是:旧 Gamma 实数路径在部分样例上较快,但默认参数下能量不可靠;stage4 的修改把能量拉回官方基线附近,并按官方文档保持 \code{pexsi\_temp} 与 FD smearing 一致,同时拒绝能量错误的低 pole 配置。这使 PEXSI 单步成本显著增加,尤其在 009、010 这类大 Gamma 样例上表现为 1800 s 内无法完成。 + +\subsection{反馈后的补充计时} + +针对反馈中提出的三点问题,本轮追加了 \code{np=8/16}、H/S packed A/B、低 pole 和 OpenMP 校验。关键结果见表 \ref{tab:feedback-timing}。 + +{\scriptsize +\begin{longtable}{@{}p{0.22\linewidth}r r r p{0.34\linewidth}@{}} +\caption{反馈后的补充计时与判断} +\label{tab:feedback-timing}\\ +\toprule +\textbf{测试} & \textbf{总/s} & \textbf{TransMAT2CCS/s} & \textbf{PEXSIDFT/s} & \textbf{判断}\\ +\midrule +\endfirsthead +\toprule +\textbf{测试} & \textbf{总/s} & \textbf{TransMAT2CCS/s} & \textbf{PEXSIDFT/s} & \textbf{判断}\\ +\midrule +\endhead +001 packed H/S & 22.65 & 0.04955 & 12.46 & H/S 打包只带来约 1.25\% 转换计时收益\\ +001 legacy H/S & 24.10 & 0.05017 & 12.40 & 端到端差异主要来自 PEXSI 运行噪声\\ +003 packed H/S & 67.51 & 0.17982 & 60.10 & 转换远小于 Fermi operator\\ +003 legacy H/S & 69.60 & 0.16931 & 62.07 & packed 没有稳定端到端优势\\ +001 \code{npole=120} & 34.54 & -- & 19.51 & 精度参考,FermiOp 较重\\ +001 \code{npole=80} & 23.41 & -- & 12.74 & 快约 32\%,能量偏移约 \(2.9\times10^{-4}\) eV\\ +001 \code{npole=40,temp=0.002} & 22.55 & -- & 11.40 & 更快但能量偏移约 0.263 eV,不可默认\\ +003 \code{np=16,kpar=8,npole=80} & 73.33 & -- & 49.35 & FermiOp 降低但总时间仍慢于 \code{np=8,kpar=4}\\ +006 \code{npole=40,temp=0.0002} & \(>600\) & -- & -- & 600 s 到 PE3,前三步约 215.02/160.42/169.99 s,仍未收敛\\ +001 \code{OMP=2,npole=80} & 25.85 & 0.05488 & 14.52 & 能量一致,小矩阵上线程开销更高\\ +\bottomrule +\end{longtable} +} + +这些补充结果改变了报告中的性能判断:H/S packed 是保留的代码结构优化,但不是主要性能来源;\code{pexsi\_npole} 对速度影响最大,006 的 40-pole 试探也证明它能明显缩短单步时间,但必须受总能量正确性约束;OpenMP 可以用于确定性的本地循环,但不应包住 PEXSI/MPI 外层流程。force 计算正确性测试应排在总能量正确性稳定之后。 + +\subsection{2026-05-25 全量 \code{np=8/16} 复测} + +根据反馈意见,本轮重新构造了官方 10 个样例的 \code{np=8/16} 测试矩阵。统一参数如下: + +\begin{itemize} + \item \code{ks\_solver=pexsi}、\code{smearing\_method=fd}、\code{smearing\_sigma=0.015}、\code{pexsi\_temp=0.015}; + \item \code{pexsi\_npole=40}、\code{pexsi\_nproc\_pole=1}、\code{pexsi\_elec\_thr=0.01}; + \item \code{np=8} 使用 \code{kpar=8},\code{np=16} 使用 \code{kpar=16}。 +\end{itemize} + +008\_32H2O 原输入 \code{nbands=128} 与 FD smearing 不兼容,ABACUS 报错 ``for smearing, num. of bands > num. of occupied bands'',补跑时将 \code{nbands} 调整为 160。 + +{\scriptsize +\begin{longtable}{@{}l r r l p{0.32\linewidth}@{}} +\caption{\code{np=8/16} 官方 10 样例复测,timeout=1800 s} +\label{tab:np8-np16-full-rerun}\\ +\toprule +\textbf{样例} & \textbf{\code{np=8}/s} & \textbf{\code{np=16}/s} & \textbf{状态} & \textbf{备注}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{\code{np=8}/s} & \textbf{\code{np=16}/s} & \textbf{状态} & \textbf{备注}\\ +\midrule +\endhead +001\_4GaAs & 12 & 14 & 通过 & \code{np=16} 反而略慢\\ +002\_C2H6O & 44 & 45 & 通过 & Gamma 小体系,\code{kpar} 基本无收益\\ +003\_4MoS2 & 89 & 100 & 通过 & \code{np=16} 的 FermiOp 降低但总时间上升\\ +004\_12Pt111 & 407 & 396 & 通过 & \code{np=16} 略快\\ +005\_3BaTiO3 & 386 & 469 & 通过 & \code{np=16} 通信/调度开销更高\\ +006\_16Na & 985 & 1156 & 通过 & 172 k 点阻塞已打通;\code{np=8} 更优\\ +007\_27Fe & \(>1800\) & \(>1800\) & 超时 & \code{np=8} 到 PE3;\code{np=16} PE1 未完成\\ +008\_32H2O & 52 & 65 & 通过 & \code{nbands=160} 后通过\\ +009\_Li27Ni9O54Mn9Co9 & \(>1800\) & \(>1800\) & 超时 & 大 Gamma 双自旋体系,PE 迭代推进但未收敛\\ +010\_216Si & \(>1800\) & \(>1800\) & 超时 & Gamma 大矩阵,PE1 即超过长时间预算\\ +\bottomrule +\end{longtable} +} + +该结果说明:本轮代码和参数优化已经把 006 从此前 1800 s 超时推进到 \code{np=8} 下 985 s 收敛,解决了最初 ``多 k 点 PEXSI 进不去/跑不完'' 的核心阻塞之一。但是本地单节点上 \code{np=16} 并不天然更快;006、005、003 均显示更多 MPI rank 会引入额外通信、调度和缓存压力。 + +\subsection{进一步调参与失败边界} + +针对仍超时的 007、009、010,继续做了定向探针,结果见表 \ref{tab:timeout-tuning}。 + +{\scriptsize +\begin{longtable}{@{}p{0.22\linewidth}p{0.22\linewidth}p{0.18\linewidth}p{0.30\linewidth}@{}} +\caption{超时样例定向调参探针} +\label{tab:timeout-tuning}\\ +\toprule +\textbf{样例} & \textbf{探针参数} & \textbf{结果} & \textbf{判断}\\ +\midrule +\endfirsthead +\toprule +\textbf{样例} & \textbf{探针参数} & \textbf{结果} & \textbf{判断}\\ +\midrule +\endhead +006\_16Na & \code{np=8,kpar=8,npole=40,elec\_thr=0.01} & 988 s 收敛 & 推荐作为当前 006 配置\\ +007\_27Fe & \code{elec\_thr=0.1} & PE1/PE2 约 656/355 s & 有改善但仍不足\\ +007\_27Fe & \code{npole=20} & PE1/PE2 约 304/207 s & 能量约 -75239 eV,严重错误,不可用\\ +007\_27Fe & \code{pexsi\_mu=1.48,mu\_guard=0.05} & PE1/PE2 约 403/322 s & 改善明显但仍难保证 1800 s 内收敛\\ +010\_216Si & \code{np=16,nproc\_pole=4,npole=40} & PE1 约 706 s & 比 \code{nproc\_pole=1} 好,但完整 SCF 仍超时\\ +010\_216Si & \code{np=16,nproc\_pole=8,npole=40} & PE1 约 830 s & 更慢,不采用\\ +010\_216Si & \code{np=16,nproc\_pole=4,npole=20} & PE1 约 349 s & 能量变为正值,严重错误,不可用\\ +\bottomrule +\end{longtable} +} + +细粒度 trace 显示,007 的单次 \code{PPEXSICalculateFermiOperatorComplex} 约为 8--10 s,而 \code{TransMAT2CCS}、symbolic、load/retrieve 均远小于该值。因此 007 的主要瓶颈不是 H/S 转换,也不是 symbolic setup,而是 Fermi operator 调用次数和单次 Fermi operator 成本。010 属于 Gamma 大矩阵问题,\code{nproc\_pole} 可以改善 PE1,但降低 \code{npole} 会产生不可接受的能量错误。 + +\section{重构收益、限制与后续工作} + +\subsection{重构收益} + +本轮修改的主要收益包括: + +\begin{itemize} + \item 多 k 点 PEXSI 从直接 \code{WARNING\_QUIT} 推进到可以实际执行复数 PEXSI 路径; + \item 复数 BCD/CCS 转换和 DM/EDM 回写被纳入 ABACUS 原有模块边界,避免在 hsolver 中堆叠过多格式转换逻辑; + \item 全局化学势搜索和 k 点权重修正使多 k 点电子数模型更合理; + \item 复数 \code{dm2rho} 补齐后,PEXSI 返回的密度矩阵可以进入 ABACUS 后续电荷密度和 SCF 流程; + \item 通过官方 10 样例、普通 ScaLAPACK 基线和 Gamma-only PEXSI 基线,建立了可复现的正确性与性能边界。 +\end{itemize} + +\subsection{仍未解决的问题} + +当前仍存在以下问题: + +\begin{enumerate} + \item \textbf{大规模/金属样例性能不足。} 006 已经在 \code{np=8} 下进入 1800 s 以内,但 007、009、010 仍由 PEXSI Fermi operator 主导,单节点 \code{np=8/16} 仍不能完成 1800 s 内收敛。 + \item \textbf{Gamma 点存在速度/精度权衡。} 远端 \code{develop} 的 Gamma 实数 PEXSI 在 008、010 上更快,但能量明显偏离官方基线;stage4 修正默认 PEXSI 参数后精度改善,但大 Gamma 样例成本显著上升。 + \item \textbf{能量容差仍略高。} 001、003、005、008 已接近官方参考,但 PEXSI pole 展开误差仍在 \(5\times10^{-7}\) eV 附近或以上。 + \item \textbf{smearing 与带数需要配套。} 008 改为 FD smearing 后必须增加 \code{nbands};补跑使用 \code{nbands=160} 后通过。 +\end{enumerate} + +\subsection{后续优化方向} + +后续工作应优先处理以下方向: + +\begin{enumerate} + \item 继续调优 k 点级并行的 \code{kpar} 和每池 rank 数,并在更多节点上测试,而不是只依赖本地单节点 \code{np=8/16} 结果; + \item 缓存或复用 PEXSI symbolic factorization / communication pattern,减少每个 SCF 步、每个 k 点重复建图成本; + \item 针对金属体系设计更合适的 PEXSI temperature/pole 策略,并将 \code{getPole error} 边界做成确定性参数检查; + \item 对 001、003、005、008 建立 PEXSI 专用精度回归标准,或继续寻找不触发 \code{getPole error} 的 pole/temperature 组合; + \item 补充 force/stress 和密度矩阵一致性验证,避免只以总能量判断 PEXSI 路径正确性。 +\end{enumerate} + +\section{结论} + +本报告完成了 ABACUS-PEXSI 多 k 点路径的一次代码修改与重构评估。通过补齐复数 PEXSI 调用、全局化学势搜索、k 点权重修正、复数密度矩阵回写和 PEXSI 温度/FD smearing 对齐,当前代码已经突破原先多 k 点 PEXSI 直接不可用的阻塞。按 2026-05-25 的 \code{np=8/16} 全量复测,官方 10 个样例中已有 7 个样例可以在 1800 s 内完成;007、009、010 仍超时。 + +从并行程序角度看,本轮修改证明了 PEXSI 多 k 点路径的主要难点不只是接口打通,还包括 MPI 数据分布、稀疏矩阵格式转换、全局化学势搜索、k 点并行策略和 PEXSI 参数选择之间的综合权衡。当前实现已经具备 k 池并行能力,但性能尚不能全面超过成熟 ScaLAPACK 路径。后续若要将该路径推进到可长期使用的并行求解器,需要重点优化每池 rank 数、PEXSI plan 复用和更稳定的 pole/temperature 策略。 + +\end{document} From 0e844d489e99051376a459e74a79447557a2f35e Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:24:45 +0800 Subject: [PATCH 09/23] Add PEXSI section 5.1 scaling benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 625 ++++++++++++++++++++++++ 1 file changed, 625 insertions(+) create mode 100755 tools/pexsi/run_51_scaling_benchmark.py diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py new file mode 100755 index 00000000000..368601bc3ee --- /dev/null +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python3 +"""Run the 05_pexsi.md section 5.1 solver-scaling benchmark. + +The benchmark compares the conventional LCAO generalized eigensolver +(`scalapack_gvx`) with `pexsi` on the five systems listed in section 5.1: +Si64 Gamma, Al128 4x4x4, Cu256 2x2x2, TiO2-192 2x2x2, and H2O Gamma. + +Each case is run for np=1,2,4,8,16 by default. Results are written after every +job so interrupted runs can be resumed. +""" + +from __future__ import annotations + +import argparse +import datetime as _dt +import json +import math +import os +import re +import shutil +import signal +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + + +RY_TO_EV = 13.605693009 +ENERGY_TOL_RY = 1.0e-5 +FORCE_TOL_EV_A = 1.0e-4 +STRESS_TOL_GPA = 1.0e-3 + +DEFAULT_NP = (1, 2, 4, 8, 16) +DEFAULT_SOLVERS = ("scalapack_gvx", "pexsi") + +FINAL_ETOT_RE = re.compile(r"!\s*FINAL_ETOT_IS\s+([-+0-9.eE]+)\s+eV") +TOTAL_TIME_RE = re.compile(r"Total\s+Time\s*:\s*([-+0-9.eE]+)") +TIMER_RE = re.compile(r"^\s*([A-Za-z0-9_./ -]+?)\s+([-+0-9.eE]+)\s+(?:s|sec|seconds)?\s*$") + + +@dataclass(frozen=True) +class CaseSpec: + name: str + natom: int + kmesh: tuple[int, int, int] + generator: str + scf_nmax: int = 100 + ecutwfc: int = 100 + mixing_beta: float = 0.3 + nbands: int | None = None + smearing_sigma: float = 0.015 + + @property + def nks(self) -> int: + return self.kmesh[0] * self.kmesh[1] * self.kmesh[2] + + +CASES: dict[str, CaseSpec] = { + "si64_gamma": CaseSpec( + name="si64_gamma", + natom=64, + kmesh=(1, 1, 1), + generator="copy_si64", + scf_nmax=100, + ecutwfc=60, + mixing_beta=0.4, + nbands=200, + ), + "al128_444": CaseSpec( + name="al128_444", + natom=128, + kmesh=(4, 4, 4), + generator="gen_al128", + mixing_beta=0.3, + ), + "cu256_222": CaseSpec( + name="cu256_222", + natom=256, + kmesh=(2, 2, 2), + generator="gen_cu256", + mixing_beta=0.4, + ), + "tio2_192_222": CaseSpec( + name="tio2_192_222", + natom=192, + kmesh=(2, 2, 2), + generator="gen_tio2_192", + mixing_beta=0.3, + ), + "h2o_gamma": CaseSpec( + name="h2o_gamma", + natom=3, + kmesh=(1, 1, 1), + generator="copy_h2o", + scf_nmax=50, + ecutwfc=100, + mixing_beta=0.4, + nbands=6, + smearing_sigma=0.02, + ), +} + + +def repo_root() -> Path: + return Path(__file__).resolve().parents[2] + + +def timestamp() -> str: + return _dt.datetime.now().strftime("%Y%m%d_%H%M%S") + + +def write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def format_vec(vec: Iterable[float]) -> str: + return " ".join(f"{x:.10f}" for x in vec) + + +def direct_supercell_positions( + base_positions: Iterable[tuple[float, float, float]], + reps: tuple[int, int, int], +) -> list[tuple[float, float, float]]: + rx, ry, rz = reps + positions: list[tuple[float, float, float]] = [] + for ix in range(rx): + for iy in range(ry): + for iz in range(rz): + for x, y, z in base_positions: + positions.append(((x + ix) / rx, (y + iy) / ry, (z + iz) / rz)) + return positions + + +def write_kpt(case_dir: Path, kmesh: tuple[int, int, int]) -> None: + write_text( + case_dir / "KPT", + "K_POINTS\n" + "0\n" + "Gamma\n" + f"{kmesh[0]} {kmesh[1]} {kmesh[2]} 0 0 0\n", + ) + + +def copy_case_files(src: Path, dst: Path, kmesh: tuple[int, int, int]) -> None: + dst.mkdir(parents=True, exist_ok=True) + shutil.copy2(src / "STRU", dst / "STRU") + write_kpt(dst, kmesh) + + +def write_al128(dst: Path) -> None: + reps = (4, 4, 2) + base = ( + (0.0, 0.0, 0.0), + (0.5, 0.5, 0.0), + (0.5, 0.0, 0.5), + (0.0, 0.5, 0.5), + ) + positions = direct_supercell_positions(base, reps) + assert len(positions) == 128 + lines = [ + "ATOMIC_SPECIES", + "Al 26.982 Al.PD04.PBE.UPF", + "", + "NUMERICAL_ORBITAL", + "Al_gga_10au_100Ry_3s3p2d.orb", + "", + "LATTICE_CONSTANT", + "1.8897261254578281", + "", + "LATTICE_VECTORS", + format_vec((4.04495 * reps[0], 0.0, 0.0)), + format_vec((0.0, 4.04495 * reps[1], 0.0)), + format_vec((0.0, 0.0, 4.04495 * reps[2])), + "", + "ATOMIC_POSITIONS", + "Direct", + "", + "Al", + "0.0", + str(len(positions)), + ] + lines.extend(f"{x:.10f} {y:.10f} {z:.10f} 1 1 1" for x, y, z in positions) + write_text(dst / "STRU", "\n".join(lines) + "\n") + write_kpt(dst, (4, 4, 4)) + + +def write_cu256(dst: Path) -> None: + reps = (8, 8, 4) + positions = direct_supercell_positions(((0.0, 0.0, 0.0),), reps) + assert len(positions) == 256 + base_vectors = ( + (0.5, 0.5, 0.0), + (0.5, 0.0, 0.5), + (0.0, 0.5, 0.5), + ) + lines = [ + "ATOMIC_SPECIES", + "Cu 1.000 Cu.LDA.UPF", + "", + "NUMERICAL_ORBITAL", + "Cu_lda_7.0au_100Ry_2s2p2d", + "", + "LATTICE_CONSTANT", + "6.91640", + "", + "LATTICE_VECTORS", + ] + for vec, rep in zip(base_vectors, reps): + lines.append(format_vec(tuple(rep * x for x in vec))) + lines.extend(["", "ATOMIC_POSITIONS", "Direct", "", "Cu", "0.0", str(len(positions))]) + lines.extend(f"{x:.10f} {y:.10f} {z:.10f} 1 1 1" for x, y, z in positions) + write_text(dst / "STRU", "\n".join(lines) + "\n") + write_kpt(dst, (2, 2, 2)) + + +def write_tio2_192(dst: Path) -> None: + reps = (4, 4, 2) + u = 0.305 + ti_base = ((0.0, 0.0, 0.0), (0.5, 0.5, 0.5)) + o_base = ( + (u, u, 0.0), + (1.0 - u, 1.0 - u, 0.0), + (0.5 + u, 0.5 - u, 0.5), + (0.5 - u, 0.5 + u, 0.5), + ) + ti_positions = direct_supercell_positions(ti_base, reps) + o_positions = direct_supercell_positions(o_base, reps) + assert len(ti_positions) + len(o_positions) == 192 + lines = [ + "ATOMIC_SPECIES", + "Ti 47.867 Ti_ONCV_PBE-1.0.upf", + "O 15.9994 O_ONCV_PBE-1.0.upf", + "", + "NUMERICAL_ORBITAL", + "Ti_gga_8au_100Ry_4s2p2d1f.orb", + "O_gga_8au_100Ry_2s2p1d.orb", + "", + "LATTICE_CONSTANT", + "1.889716", + "", + "LATTICE_VECTORS", + format_vec((4.594 * reps[0], 0.0, 0.0)), + format_vec((0.0, 4.594 * reps[1], 0.0)), + format_vec((0.0, 0.0, 2.959 * reps[2])), + "", + "ATOMIC_POSITIONS", + "Direct", + "", + "Ti", + "0.0", + str(len(ti_positions)), + ] + lines.extend(f"{x:.10f} {y:.10f} {z:.10f} 1 1 1" for x, y, z in ti_positions) + lines.extend(["", "O", "0.0", str(len(o_positions))]) + lines.extend(f"{x:.10f} {y:.10f} {z:.10f} 1 1 1" for x, y, z in o_positions) + write_text(dst / "STRU", "\n".join(lines) + "\n") + write_kpt(dst, (2, 2, 2)) + + +def prepare_structure(repo: Path, case: CaseSpec, case_dir: Path) -> None: + if case.generator == "copy_si64": + copy_case_files(repo / "examples/33_pexsi/02_scf_Si64", case_dir, case.kmesh) + elif case.generator == "copy_h2o": + copy_case_files(repo / "stage3_pexsi_tests/01_h2o_pexsi", case_dir, case.kmesh) + elif case.generator == "gen_al128": + write_al128(case_dir) + elif case.generator == "gen_cu256": + write_cu256(case_dir) + elif case.generator == "gen_tio2_192": + write_tio2_192(case_dir) + else: + raise ValueError(f"unknown generator: {case.generator}") + + +def input_text(repo: Path, case: CaseSpec, solver: str, np: int, suffix: str) -> str: + gamma_only = 1 if case.kmesh == (1, 1, 1) else 0 + kpar = min(np, case.nks) + lines = [ + "INPUT_PARAMETERS", + f"suffix {suffix}", + f"pseudo_dir {repo / 'tests/PP_ORB'}", + f"orbital_dir {repo / 'tests/PP_ORB'}", + "calculation scf", + "basis_type lcao", + f"gamma_only {gamma_only}", + "symmetry 0", + f"ecutwfc {case.ecutwfc}", + "lcao_dr 1e-3", + "scf_thr 1e-6", + f"scf_nmax {case.scf_nmax}", + "cal_force 1", + "cal_stress 1", + "smearing_method fd", + f"smearing_sigma {case.smearing_sigma}", + "mixing_type broyden", + f"mixing_beta {case.mixing_beta}", + ] + if case.nbands is not None: + lines.append(f"nbands {case.nbands}") + if kpar > 1: + lines.append(f"kpar {kpar}") + lines.append(f"ks_solver {solver}") + if solver == "pexsi": + lines.extend( + [ + "pexsi_npole 40", + "pexsi_nproc_pole 1", + f"pexsi_temp {case.smearing_sigma}", + "pexsi_gap 0", + "pexsi_delta_e 20", + "pexsi_mu_lower -10", + "pexsi_mu_upper 10", + "pexsi_mu 0", + "pexsi_mu_thr 0.05", + "pexsi_mu_expand 0.3", + "pexsi_mu_guard 0.2", + "pexsi_elec_thr 0.001", + "pexsi_zero_thr 1e-10", + ] + ) + return "\n".join(lines) + "\n" + + +def parse_numeric_tail(line: str, n: int) -> list[float] | None: + vals: list[float] = [] + for token in line.replace(",", " ").split(): + try: + vals.append(float(token)) + except ValueError: + continue + if len(vals) < n: + return None + return vals[-n:] + + +def parse_block_vectors(lines: list[str], marker: str, width: int, max_rows: int | None) -> list[list[float]]: + vectors: list[list[float]] = [] + active = False + for line in lines: + if marker in line: + active = True + vectors = [] + continue + if not active: + continue + if not line.strip(): + if vectors: + active = False + continue + values = parse_numeric_tail(line, width) + if values is None: + continue + vectors.append(values) + if max_rows is not None and len(vectors) >= max_rows: + active = False + return vectors + + +def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: + log_path = case_dir / f"OUT.{suffix}" / "running_scf.log" + result: dict[str, object] = { + "log": str(log_path), + "converged": False, + "energy_ev": None, + "total_time_s": None, + "max_force_abs": None, + "max_stress_abs": None, + "pexsidft_s": None, + "hsolver_s": None, + "transmat2ccs_s": None, + } + if not log_path.exists(): + return result + text = log_path.read_text(errors="replace") + lines = text.splitlines() + result["converged"] = "#SCF IS CONVERGED" in text or "SCF IS CONVERGED" in text + energies = FINAL_ETOT_RE.findall(text) + if energies: + result["energy_ev"] = float(energies[-1]) + total_times = TOTAL_TIME_RE.findall(text) + if total_times: + result["total_time_s"] = float(total_times[-1]) + + forces = parse_block_vectors(lines, "TOTAL-FORCE", 3, None) + if forces: + result["max_force_abs"] = max(abs(v) for row in forces for v in row) + stress = parse_block_vectors(lines, "TOTAL-STRESS", 3, 3) + if stress: + result["max_stress_abs"] = max(abs(v) for row in stress for v in row) + + for line in lines: + if "PEXSIDFT" in line: + nums = parse_numeric_tail(line, 1) + if nums: + result["pexsidft_s"] = nums[-1] + if "HSolverLCAO" in line and "solve" in line: + nums = parse_numeric_tail(line, 1) + if nums: + result["hsolver_s"] = nums[-1] + if "TransMAT2CCS" in line: + nums = parse_numeric_tail(line, 1) + if nums: + result["transmat2ccs_s"] = nums[-1] + return result + + +def run_command(cmd: list[str], cwd: Path, env: dict[str, str], timeout_s: int) -> tuple[int, bool, float]: + stdout = cwd / "abacus_stdout.log" + stderr = cwd / "abacus_stderr.log" + start = time.monotonic() + with stdout.open("w", encoding="utf-8") as out, stderr.open("w", encoding="utf-8") as err: + proc = subprocess.Popen(cmd, cwd=cwd, env=env, stdout=out, stderr=err, start_new_session=True) + try: + returncode = proc.wait(timeout=timeout_s) + timed_out = False + except subprocess.TimeoutExpired: + timed_out = True + os.killpg(proc.pid, signal.SIGTERM) + try: + proc.wait(timeout=30) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait() + returncode = 124 + elapsed = time.monotonic() - start + return returncode, timed_out, elapsed + + +def result_path(run_root: Path, case: str, solver: str, np: int) -> Path: + return run_root / case / solver / f"np{np}" / "result.json" + + +def load_result(path: Path) -> dict[str, object] | None: + if not path.exists(): + return None + return json.loads(path.read_text(encoding="utf-8")) + + +def write_summaries(run_root: Path, cases: list[str], nps: list[int], solvers: list[str]) -> None: + rows: list[dict[str, object]] = [] + by_key: dict[tuple[str, str, int], dict[str, object]] = {} + for case in cases: + for solver in solvers: + for np in nps: + data = load_result(result_path(run_root, case, solver, np)) + if data is None: + continue + by_key[(case, solver, np)] = data + rows.append(data) + + for data in rows: + case = str(data["case"]) + solver = str(data["solver"]) + np = int(data["np"]) + data["delta_e_ev"] = None + data["delta_e_ry"] = None + data["energy_pass"] = None + data["delta_force_max"] = None + data["force_pass"] = None + data["delta_stress_max"] = None + data["stress_pass"] = None + if solver != "pexsi": + continue + ref = by_key.get((case, "scalapack_gvx", np)) + if not ref: + continue + energy = data.get("energy_ev") + ref_energy = ref.get("energy_ev") + if isinstance(energy, (int, float)) and isinstance(ref_energy, (int, float)): + delta_ev = float(energy) - float(ref_energy) + data["delta_e_ev"] = delta_ev + data["delta_e_ry"] = delta_ev / RY_TO_EV + data["energy_pass"] = abs(delta_ev / RY_TO_EV) <= ENERGY_TOL_RY + force = data.get("max_force_abs") + ref_force = ref.get("max_force_abs") + if isinstance(force, (int, float)) and isinstance(ref_force, (int, float)): + delta_force = abs(float(force) - float(ref_force)) + data["delta_force_max"] = delta_force + data["force_pass"] = delta_force <= FORCE_TOL_EV_A + stress = data.get("max_stress_abs") + ref_stress = ref.get("max_stress_abs") + if isinstance(stress, (int, float)) and isinstance(ref_stress, (int, float)): + delta_stress = abs(float(stress) - float(ref_stress)) + data["delta_stress_max"] = delta_stress + data["stress_pass"] = delta_stress <= STRESS_TOL_GPA + + summary_json = run_root / "summary.json" + write_text(summary_json, json.dumps(rows, indent=2, sort_keys=True) + "\n") + + headers = [ + "case", + "solver", + "np", + "status", + "returncode", + "wall_s", + "energy_ev", + "delta_e_ev", + "delta_e_ry", + "energy_pass", + "max_force_abs", + "delta_force_max", + "force_pass", + "max_stress_abs", + "delta_stress_max", + "stress_pass", + "total_time_s", + "hsolver_s", + "pexsidft_s", + "transmat2ccs_s", + "case_dir", + ] + lines = ["\t".join(headers)] + for data in sorted(rows, key=lambda x: (str(x["case"]), int(x["np"]), str(x["solver"]))): + lines.append("\t".join(str(data.get(h, "")) for h in headers)) + write_text(run_root / "summary.tsv", "\n".join(lines) + "\n") + + +def parse_csv_arg(value: str, allowed: Iterable[str] | None = None) -> list[str]: + items = [item.strip() for item in value.split(",") if item.strip()] + if allowed is not None: + allowed_set = set(allowed) + bad = [item for item in items if item not in allowed_set] + if bad: + raise argparse.ArgumentTypeError(f"unknown value(s): {', '.join(bad)}") + return items + + +def main(argv: list[str] | None = None) -> int: + repo = repo_root() + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--run-root", type=Path, default=None) + parser.add_argument("--abacus-bin", type=Path, default=repo / "build-pexsi-conda/abacus_basic_para") + parser.add_argument("--mpirun", default="mpirun") + parser.add_argument("--timeout", type=int, default=3600) + parser.add_argument("--np-list", default=",".join(str(x) for x in DEFAULT_NP)) + parser.add_argument("--solvers", default=",".join(DEFAULT_SOLVERS)) + parser.add_argument("--cases", default=",".join(CASES.keys())) + parser.add_argument("--force", action="store_true", help="rerun jobs with an existing result.json") + parser.add_argument("--prepare-only", action="store_true", help="only generate input directories") + args = parser.parse_args(argv) + + cases = parse_csv_arg(args.cases, CASES.keys()) + solvers = parse_csv_arg(args.solvers, DEFAULT_SOLVERS) + nps = [int(x) for x in parse_csv_arg(args.np_list)] + run_root = args.run_root or Path(f"/tmp/abacus_pexsi_51_scaling_{timestamp()}") + run_root.mkdir(parents=True, exist_ok=True) + + meta = { + "created_at": _dt.datetime.now().isoformat(timespec="seconds"), + "repo": str(repo), + "abacus_bin": str(args.abacus_bin), + "timeout_s": args.timeout, + "cases": cases, + "solvers": solvers, + "np_list": nps, + "energy_tolerance_ry": ENERGY_TOL_RY, + "force_tolerance_ev_per_angstrom": FORCE_TOL_EV_A, + "stress_tolerance_gpa": STRESS_TOL_GPA, + } + write_text(run_root / "metadata.json", json.dumps(meta, indent=2, sort_keys=True) + "\n") + + env = os.environ.copy() + env["OMP_NUM_THREADS"] = env.get("OMP_NUM_THREADS", "1") + env["OPENBLAS_NUM_THREADS"] = env.get("OPENBLAS_NUM_THREADS", "1") + env["MKL_NUM_THREADS"] = env.get("MKL_NUM_THREADS", "1") + env["ABACUS_PEXSI_TRACE"] = env.get("ABACUS_PEXSI_TRACE", "stage") + + conda_bin = repo / ".conda/pexsi-env/bin" + conda_lib = repo / ".conda/pexsi-env/lib" + if conda_bin.exists(): + env["PATH"] = f"{conda_bin}:{env.get('PATH', '')}" + if conda_lib.exists(): + env["LD_LIBRARY_PATH"] = f"{conda_lib}:{env.get('LD_LIBRARY_PATH', '')}" + + for case_name in cases: + case = CASES[case_name] + for np in nps: + for solver in solvers: + job_dir = run_root / case.name / solver / f"np{np}" + result_file = job_dir / "result.json" + if result_file.exists() and not args.force: + print(f"skip existing {case.name} {solver} np={np}") + continue + + if job_dir.exists() and args.force: + shutil.rmtree(job_dir) + job_dir.mkdir(parents=True, exist_ok=True) + prepare_structure(repo, case, job_dir) + suffix = f"p51_{case.name}_{solver}_np{np}" + write_text(job_dir / "INPUT", input_text(repo, case, solver, np, suffix)) + if args.prepare_only: + continue + + cmd = [args.mpirun, "-np", str(np), str(args.abacus_bin)] + print(f"run case={case.name} solver={solver} np={np} timeout={args.timeout}s", flush=True) + returncode, timed_out, wall_s = run_command(cmd, job_dir, env, args.timeout) + parsed = parse_log(job_dir, suffix) + status = "timeout" if timed_out else ("converged" if parsed["converged"] else "failed") + data: dict[str, object] = { + "case": case.name, + "natom": case.natom, + "kmesh": "x".join(str(x) for x in case.kmesh), + "solver": solver, + "np": np, + "status": status, + "returncode": returncode, + "timed_out": timed_out, + "wall_s": wall_s, + "case_dir": str(job_dir), + } + data.update(parsed) + write_text(result_file, json.dumps(data, indent=2, sort_keys=True) + "\n") + write_summaries(run_root, cases, nps, solvers) + + write_summaries(run_root, cases, nps, solvers) + print(f"summary: {run_root / 'summary.tsv'}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 7b0863c51d46db71534c41b449a1141fe2397dc9 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:28:03 +0800 Subject: [PATCH 10/23] Parameterize PEXSI 5.1 benchmark accuracy --- tools/pexsi/run_51_scaling_benchmark.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 368601bc3ee..d24d091c7db 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -275,7 +275,15 @@ def prepare_structure(repo: Path, case: CaseSpec, case_dir: Path) -> None: raise ValueError(f"unknown generator: {case.generator}") -def input_text(repo: Path, case: CaseSpec, solver: str, np: int, suffix: str) -> str: +def input_text( + repo: Path, + case: CaseSpec, + solver: str, + np: int, + suffix: str, + pexsi_npole: int, + pexsi_elec_thr: float, +) -> str: gamma_only = 1 if case.kmesh == (1, 1, 1) else 0 kpar = min(np, case.nks) lines = [ @@ -306,7 +314,7 @@ def input_text(repo: Path, case: CaseSpec, solver: str, np: int, suffix: str) -> if solver == "pexsi": lines.extend( [ - "pexsi_npole 40", + f"pexsi_npole {pexsi_npole}", "pexsi_nproc_pole 1", f"pexsi_temp {case.smearing_sigma}", "pexsi_gap 0", @@ -317,7 +325,7 @@ def input_text(repo: Path, case: CaseSpec, solver: str, np: int, suffix: str) -> "pexsi_mu_thr 0.05", "pexsi_mu_expand 0.3", "pexsi_mu_guard 0.2", - "pexsi_elec_thr 0.001", + f"pexsi_elec_thr {pexsi_elec_thr}", "pexsi_zero_thr 1e-10", ] ) @@ -536,6 +544,8 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--abacus-bin", type=Path, default=repo / "build-pexsi-conda/abacus_basic_para") parser.add_argument("--mpirun", default="mpirun") parser.add_argument("--timeout", type=int, default=3600) + parser.add_argument("--pexsi-npole", type=int, default=120) + parser.add_argument("--pexsi-elec-thr", type=float, default=0.001) parser.add_argument("--np-list", default=",".join(str(x) for x in DEFAULT_NP)) parser.add_argument("--solvers", default=",".join(DEFAULT_SOLVERS)) parser.add_argument("--cases", default=",".join(CASES.keys())) @@ -557,6 +567,8 @@ def main(argv: list[str] | None = None) -> int: "cases": cases, "solvers": solvers, "np_list": nps, + "pexsi_npole": args.pexsi_npole, + "pexsi_elec_thr": args.pexsi_elec_thr, "energy_tolerance_ry": ENERGY_TOL_RY, "force_tolerance_ev_per_angstrom": FORCE_TOL_EV_A, "stress_tolerance_gpa": STRESS_TOL_GPA, @@ -591,7 +603,10 @@ def main(argv: list[str] | None = None) -> int: job_dir.mkdir(parents=True, exist_ok=True) prepare_structure(repo, case, job_dir) suffix = f"p51_{case.name}_{solver}_np{np}" - write_text(job_dir / "INPUT", input_text(repo, case, solver, np, suffix)) + write_text( + job_dir / "INPUT", + input_text(repo, case, solver, np, suffix, args.pexsi_npole, args.pexsi_elec_thr), + ) if args.prepare_only: continue From 36565e5c89539cbe2bc116029cd764888b269adf Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:38:52 +0800 Subject: [PATCH 11/23] Expose PEXSI mu tolerance in 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index d24d091c7db..c336434fab3 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -283,6 +283,7 @@ def input_text( suffix: str, pexsi_npole: int, pexsi_elec_thr: float, + pexsi_mu_thr: float, ) -> str: gamma_only = 1 if case.kmesh == (1, 1, 1) else 0 kpar = min(np, case.nks) @@ -322,7 +323,7 @@ def input_text( "pexsi_mu_lower -10", "pexsi_mu_upper 10", "pexsi_mu 0", - "pexsi_mu_thr 0.05", + f"pexsi_mu_thr {pexsi_mu_thr}", "pexsi_mu_expand 0.3", "pexsi_mu_guard 0.2", f"pexsi_elec_thr {pexsi_elec_thr}", @@ -546,6 +547,7 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--timeout", type=int, default=3600) parser.add_argument("--pexsi-npole", type=int, default=120) parser.add_argument("--pexsi-elec-thr", type=float, default=0.001) + parser.add_argument("--pexsi-mu-thr", type=float, default=0.05) parser.add_argument("--np-list", default=",".join(str(x) for x in DEFAULT_NP)) parser.add_argument("--solvers", default=",".join(DEFAULT_SOLVERS)) parser.add_argument("--cases", default=",".join(CASES.keys())) @@ -569,6 +571,7 @@ def main(argv: list[str] | None = None) -> int: "np_list": nps, "pexsi_npole": args.pexsi_npole, "pexsi_elec_thr": args.pexsi_elec_thr, + "pexsi_mu_thr": args.pexsi_mu_thr, "energy_tolerance_ry": ENERGY_TOL_RY, "force_tolerance_ev_per_angstrom": FORCE_TOL_EV_A, "stress_tolerance_gpa": STRESS_TOL_GPA, @@ -605,7 +608,16 @@ def main(argv: list[str] | None = None) -> int: suffix = f"p51_{case.name}_{solver}_np{np}" write_text( job_dir / "INPUT", - input_text(repo, case, solver, np, suffix, args.pexsi_npole, args.pexsi_elec_thr), + input_text( + repo, + case, + solver, + np, + suffix, + args.pexsi_npole, + args.pexsi_elec_thr, + args.pexsi_mu_thr, + ), ) if args.prepare_only: continue From b3d5986d015ab298abecda609829b85cfb17ee5d Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:44:31 +0800 Subject: [PATCH 12/23] Expose PEXSI zero threshold in 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index c336434fab3..73d3019eb41 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -284,6 +284,7 @@ def input_text( pexsi_npole: int, pexsi_elec_thr: float, pexsi_mu_thr: float, + pexsi_zero_thr: float, ) -> str: gamma_only = 1 if case.kmesh == (1, 1, 1) else 0 kpar = min(np, case.nks) @@ -327,7 +328,7 @@ def input_text( "pexsi_mu_expand 0.3", "pexsi_mu_guard 0.2", f"pexsi_elec_thr {pexsi_elec_thr}", - "pexsi_zero_thr 1e-10", + f"pexsi_zero_thr {pexsi_zero_thr}", ] ) return "\n".join(lines) + "\n" @@ -548,6 +549,7 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--pexsi-npole", type=int, default=120) parser.add_argument("--pexsi-elec-thr", type=float, default=0.001) parser.add_argument("--pexsi-mu-thr", type=float, default=0.05) + parser.add_argument("--pexsi-zero-thr", type=float, default=1.0e-10) parser.add_argument("--np-list", default=",".join(str(x) for x in DEFAULT_NP)) parser.add_argument("--solvers", default=",".join(DEFAULT_SOLVERS)) parser.add_argument("--cases", default=",".join(CASES.keys())) @@ -572,6 +574,7 @@ def main(argv: list[str] | None = None) -> int: "pexsi_npole": args.pexsi_npole, "pexsi_elec_thr": args.pexsi_elec_thr, "pexsi_mu_thr": args.pexsi_mu_thr, + "pexsi_zero_thr": args.pexsi_zero_thr, "energy_tolerance_ry": ENERGY_TOL_RY, "force_tolerance_ev_per_angstrom": FORCE_TOL_EV_A, "stress_tolerance_gpa": STRESS_TOL_GPA, @@ -617,6 +620,7 @@ def main(argv: list[str] | None = None) -> int: args.pexsi_npole, args.pexsi_elec_thr, args.pexsi_mu_thr, + args.pexsi_zero_thr, ), ) if args.prepare_only: From 7eef88dffe599986394b6fe40eecb2e7877e5f6e Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:46:10 +0800 Subject: [PATCH 13/23] Allow smearing override in PEXSI 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 73d3019eb41..dae1b63aa7c 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -22,7 +22,7 @@ import subprocess import sys import time -from dataclasses import dataclass +from dataclasses import dataclass, replace from pathlib import Path from typing import Iterable @@ -550,6 +550,12 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--pexsi-elec-thr", type=float, default=0.001) parser.add_argument("--pexsi-mu-thr", type=float, default=0.05) parser.add_argument("--pexsi-zero-thr", type=float, default=1.0e-10) + parser.add_argument( + "--smearing-sigma", + type=float, + default=None, + help="override the default Fermi-Dirac smearing sigma and pexsi_temp for all selected cases", + ) parser.add_argument("--np-list", default=",".join(str(x) for x in DEFAULT_NP)) parser.add_argument("--solvers", default=",".join(DEFAULT_SOLVERS)) parser.add_argument("--cases", default=",".join(CASES.keys())) @@ -596,6 +602,8 @@ def main(argv: list[str] | None = None) -> int: for case_name in cases: case = CASES[case_name] + if args.smearing_sigma is not None: + case = replace(case, smearing_sigma=args.smearing_sigma) for np in nps: for solver in solvers: job_dir = run_root / case.name / solver / f"np{np}" From 1b331d3e463e9d363996a5ed0a7b8a1502c6089b Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 10:52:02 +0800 Subject: [PATCH 14/23] Tune default smearing for PEXSI 5.1 systems --- tools/pexsi/run_51_scaling_benchmark.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index dae1b63aa7c..31701c34330 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -67,6 +67,7 @@ def nks(self) -> int: ecutwfc=60, mixing_beta=0.4, nbands=200, + smearing_sigma=0.002, ), "al128_444": CaseSpec( name="al128_444", @@ -88,6 +89,7 @@ def nks(self) -> int: kmesh=(2, 2, 2), generator="gen_tio2_192", mixing_beta=0.3, + smearing_sigma=0.002, ), "h2o_gamma": CaseSpec( name="h2o_gamma", @@ -577,6 +579,16 @@ def main(argv: list[str] | None = None) -> int: "cases": cases, "solvers": solvers, "np_list": nps, + "case_options": { + name: { + "natom": CASES[name].natom, + "kmesh": "x".join(str(x) for x in CASES[name].kmesh), + "smearing_sigma": args.smearing_sigma + if args.smearing_sigma is not None + else CASES[name].smearing_sigma, + } + for name in cases + }, "pexsi_npole": args.pexsi_npole, "pexsi_elec_thr": args.pexsi_elec_thr, "pexsi_mu_thr": args.pexsi_mu_thr, From b8e78ff0923cc31675d0974c9088cc490ee2868e Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 14:07:15 +0800 Subject: [PATCH 15/23] Tune metal smearing for PEXSI 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 31701c34330..d4ceaae3898 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -75,6 +75,7 @@ def nks(self) -> int: kmesh=(4, 4, 4), generator="gen_al128", mixing_beta=0.3, + smearing_sigma=0.03, ), "cu256_222": CaseSpec( name="cu256_222", @@ -82,6 +83,7 @@ def nks(self) -> int: kmesh=(2, 2, 2), generator="gen_cu256", mixing_beta=0.4, + smearing_sigma=0.03, ), "tio2_192_222": CaseSpec( name="tio2_192_222", From fc44ba746f12fadd50d778f5d5fb6aa587a223fb Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 14:34:07 +0800 Subject: [PATCH 16/23] Fix PEXSI 5.1 benchmark timing tolerances --- tools/pexsi/run_51_scaling_benchmark.py | 31 +++++++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index d4ceaae3898..7af1df5d37f 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -31,12 +31,20 @@ ENERGY_TOL_RY = 1.0e-5 FORCE_TOL_EV_A = 1.0e-4 STRESS_TOL_GPA = 1.0e-3 +GPA_TO_KBAR = 10.0 +STRESS_TOL_KBAR = STRESS_TOL_GPA * GPA_TO_KBAR DEFAULT_NP = (1, 2, 4, 8, 16) DEFAULT_SOLVERS = ("scalapack_gvx", "pexsi") FINAL_ETOT_RE = re.compile(r"!\s*FINAL_ETOT_IS\s+([-+0-9.eE]+)\s+eV") -TOTAL_TIME_RE = re.compile(r"Total\s+Time\s*:\s*([-+0-9.eE]+)") +TOTAL_TIME_RE = re.compile( + r"Total\s+Time\s*:\s*" + r"(?:(?P[-+0-9.eE]+)\s*h)?\s*" + r"(?:(?P[-+0-9.eE]+)\s*mins?)?\s*" + r"(?:(?P[-+0-9.eE]+)\s*secs?)?", + re.IGNORECASE, +) TIMER_RE = re.compile(r"^\s*([A-Za-z0-9_./ -]+?)\s+([-+0-9.eE]+)\s+(?:s|sec|seconds)?\s*$") @@ -373,6 +381,18 @@ def parse_block_vectors(lines: list[str], marker: str, width: int, max_rows: int return vectors +def parse_total_time_s(text: str) -> float | None: + matches = list(TOTAL_TIME_RE.finditer(text)) + if not matches: + return None + match = matches[-1] + hours = float(match.group("hours") or 0.0) + minutes = float(match.group("minutes") or 0.0) + seconds = float(match.group("seconds") or 0.0) + total = hours * 3600.0 + minutes * 60.0 + seconds + return total if total > 0.0 else None + + def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: log_path = case_dir / f"OUT.{suffix}" / "running_scf.log" result: dict[str, object] = { @@ -385,6 +405,7 @@ def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: "pexsidft_s": None, "hsolver_s": None, "transmat2ccs_s": None, + "stress_unit": "kbar", } if not log_path.exists(): return result @@ -394,9 +415,7 @@ def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: energies = FINAL_ETOT_RE.findall(text) if energies: result["energy_ev"] = float(energies[-1]) - total_times = TOTAL_TIME_RE.findall(text) - if total_times: - result["total_time_s"] = float(total_times[-1]) + result["total_time_s"] = parse_total_time_s(text) forces = parse_block_vectors(lines, "TOTAL-FORCE", 3, None) if forces: @@ -499,7 +518,7 @@ def write_summaries(run_root: Path, cases: list[str], nps: list[int], solvers: l if isinstance(stress, (int, float)) and isinstance(ref_stress, (int, float)): delta_stress = abs(float(stress) - float(ref_stress)) data["delta_stress_max"] = delta_stress - data["stress_pass"] = delta_stress <= STRESS_TOL_GPA + data["stress_pass"] = delta_stress <= STRESS_TOL_KBAR summary_json = run_root / "summary.json" write_text(summary_json, json.dumps(rows, indent=2, sort_keys=True) + "\n") @@ -598,6 +617,8 @@ def main(argv: list[str] | None = None) -> int: "energy_tolerance_ry": ENERGY_TOL_RY, "force_tolerance_ev_per_angstrom": FORCE_TOL_EV_A, "stress_tolerance_gpa": STRESS_TOL_GPA, + "stress_tolerance_kbar": STRESS_TOL_KBAR, + "stress_output_unit": "kbar", } write_text(run_root / "metadata.json", json.dumps(meta, indent=2, sort_keys=True) + "\n") From bf0ff7af6f862844ab75291e9684f45b4c110f32 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 14:38:29 +0800 Subject: [PATCH 17/23] Add summary refresh mode for PEXSI 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 7af1df5d37f..2b8d7af4790 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -472,6 +472,17 @@ def load_result(path: Path) -> dict[str, object] | None: return json.loads(path.read_text(encoding="utf-8")) +def refresh_result_file(path: Path, case_dir: Path, suffix: str) -> None: + data = load_result(path) + if data is None: + return + parsed = parse_log(case_dir, suffix) + data.update(parsed) + timed_out = bool(data.get("timed_out", False)) + data["status"] = "timeout" if timed_out else ("converged" if parsed["converged"] else "failed") + write_text(path, json.dumps(data, indent=2, sort_keys=True) + "\n") + + def write_summaries(run_root: Path, cases: list[str], nps: list[int], solvers: list[str]) -> None: rows: list[dict[str, object]] = [] by_key: dict[tuple[str, str, int], dict[str, object]] = {} @@ -584,6 +595,8 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("--cases", default=",".join(CASES.keys())) parser.add_argument("--force", action="store_true", help="rerun jobs with an existing result.json") parser.add_argument("--prepare-only", action="store_true", help="only generate input directories") + parser.add_argument("--summary-only", action="store_true", help="only refresh/write summaries without running missing jobs") + parser.add_argument("--reparse-existing", action="store_true", help="reparse existing logs before writing summaries") args = parser.parse_args(argv) cases = parse_csv_arg(args.cases, CASES.keys()) @@ -643,15 +656,19 @@ def main(argv: list[str] | None = None) -> int: for solver in solvers: job_dir = run_root / case.name / solver / f"np{np}" result_file = job_dir / "result.json" + suffix = f"p51_{case.name}_{solver}_np{np}" if result_file.exists() and not args.force: + if args.reparse_existing: + refresh_result_file(result_file, job_dir, suffix) print(f"skip existing {case.name} {solver} np={np}") continue + if args.summary_only: + continue if job_dir.exists() and args.force: shutil.rmtree(job_dir) job_dir.mkdir(parents=True, exist_ok=True) prepare_structure(repo, case, job_dir) - suffix = f"p51_{case.name}_{solver}_np{np}" write_text( job_dir / "INPUT", input_text( From fed73cd6c9148e7572a45da80f50bae8a91724a4 Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 14:54:10 +0800 Subject: [PATCH 18/23] Compare PEXSI 5.1 forces componentwise --- tools/pexsi/run_51_scaling_benchmark.py | 42 ++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 2b8d7af4790..3fe18bb1c7f 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -393,6 +393,24 @@ def parse_total_time_s(text: str) -> float | None: return total if total > 0.0 else None +def max_abs_diff(lhs: object, rhs: object) -> float | None: + if not isinstance(lhs, list) or not isinstance(rhs, list): + return None + if len(lhs) != len(rhs): + return None + max_diff = 0.0 + for left_row, right_row in zip(lhs, rhs): + if not isinstance(left_row, list) or not isinstance(right_row, list): + return None + if len(left_row) != len(right_row): + return None + for left, right in zip(left_row, right_row): + if not isinstance(left, (int, float)) or not isinstance(right, (int, float)): + return None + max_diff = max(max_diff, abs(float(left) - float(right))) + return max_diff + + def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: log_path = case_dir / f"OUT.{suffix}" / "running_scf.log" result: dict[str, object] = { @@ -419,9 +437,11 @@ def parse_log(case_dir: Path, suffix: str) -> dict[str, object]: forces = parse_block_vectors(lines, "TOTAL-FORCE", 3, None) if forces: + result["forces_ev_per_angstrom"] = forces result["max_force_abs"] = max(abs(v) for row in forces for v in row) stress = parse_block_vectors(lines, "TOTAL-STRESS", 3, 3) if stress: + result["stress_kbar"] = stress result["max_stress_abs"] = max(abs(v) for row in stress for v in row) for line in lines: @@ -518,16 +538,22 @@ def write_summaries(run_root: Path, cases: list[str], nps: list[int], solvers: l data["delta_e_ev"] = delta_ev data["delta_e_ry"] = delta_ev / RY_TO_EV data["energy_pass"] = abs(delta_ev / RY_TO_EV) <= ENERGY_TOL_RY - force = data.get("max_force_abs") - ref_force = ref.get("max_force_abs") - if isinstance(force, (int, float)) and isinstance(ref_force, (int, float)): - delta_force = abs(float(force) - float(ref_force)) + delta_force = max_abs_diff(data.get("forces_ev_per_angstrom"), ref.get("forces_ev_per_angstrom")) + if delta_force is None: + force = data.get("max_force_abs") + ref_force = ref.get("max_force_abs") + if isinstance(force, (int, float)) and isinstance(ref_force, (int, float)): + delta_force = abs(float(force) - float(ref_force)) + if delta_force is not None: data["delta_force_max"] = delta_force data["force_pass"] = delta_force <= FORCE_TOL_EV_A - stress = data.get("max_stress_abs") - ref_stress = ref.get("max_stress_abs") - if isinstance(stress, (int, float)) and isinstance(ref_stress, (int, float)): - delta_stress = abs(float(stress) - float(ref_stress)) + delta_stress = max_abs_diff(data.get("stress_kbar"), ref.get("stress_kbar")) + if delta_stress is None: + stress = data.get("max_stress_abs") + ref_stress = ref.get("max_stress_abs") + if isinstance(stress, (int, float)) and isinstance(ref_stress, (int, float)): + delta_stress = abs(float(stress) - float(ref_stress)) + if delta_stress is not None: data["delta_stress_max"] = delta_stress data["stress_pass"] = delta_stress <= STRESS_TOL_KBAR From 877b26ee6f2f715f192b3a2293490923d2015ffa Mon Sep 17 00:00:00 2001 From: JohnHe233 Date: Tue, 26 May 2026 16:24:55 +0800 Subject: [PATCH 19/23] Parse PEXSI trace timings in 5.1 benchmark --- tools/pexsi/run_51_scaling_benchmark.py | 55 +++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/pexsi/run_51_scaling_benchmark.py b/tools/pexsi/run_51_scaling_benchmark.py index 3fe18bb1c7f..a2a97503bf0 100755 --- a/tools/pexsi/run_51_scaling_benchmark.py +++ b/tools/pexsi/run_51_scaling_benchmark.py @@ -46,6 +46,22 @@ re.IGNORECASE, ) TIMER_RE = re.compile(r"^\s*([A-Za-z0-9_./ -]+?)\s+([-+0-9.eE]+)\s+(?:s|sec|seconds)?\s*$") +PEXSI_TRACE_RE = re.compile( + r"PEXSI_TRACE\s+.*?\sstage=(?P[A-Za-z0-9_]+)\.(?Pbegin|end)\s+time=(?P