diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt
index b115d6d4cd..6b364562a0 100644
--- a/source/source_hsolver/CMakeLists.txt
+++ b/source/source_hsolver/CMakeLists.txt
@@ -13,6 +13,7 @@ list(APPEND objects
     diago_pxxxgvx.cpp
     diag_hs_para.cpp
     diago_params.cpp
+    diago_ppcg.cpp
 
 )
 
diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp
new file mode 100644
index 0000000000..fc49be8367
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.cpp
@@ -0,0 +1,542 @@
+#include "source_hsolver/diago_ppcg.h"
+
+#include "diago_iter_assist.h"
+#include "para_linear_transform.h"
+#include "source_base/global_function.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/parallel_comm.h"
+
+#include <ATen/kernels/blas.h>
+#include <ATen/kernels/lapack.h>
+#include <cstring>
+#include <limits>
+
+namespace hsolver
+{
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::DiagoPPCG(const Real* precondition_in)
+{ // Records the container type/device tags and wraps the caller's precondition array.
+    this->r_type = ct::DataTypeToEnum<Real>::value;
+    this->t_type = ct::DataTypeToEnum<T>::value;
+    this->device_type = ct::DeviceTypeToEnum<Device>::value;
+    // Non-owning host view of the caller's array. NOTE(review): n_basis is still 0 here (init_iter() sets it later).
+    this->h_prec = std::move(ct::TensorMap((void*)precondition_in, r_type, ct::DeviceType::CpuDevice, {this->n_basis}));
+    // Pointer aliases to the member constants 1, 0 and -1.
+    this->one = &one_;
+    this->zero = &zero_;
+    this->neg_one = &neg_one_;
+}
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::~DiagoPPCG()
+{
+    // h_prec only wraps the caller's precondition array (TensorMap in the constructor), so nothing is freed here.
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim)
+{
+    this->n_band = nband;
+    this->n_band_l = nband_l;
+    this->n_basis = nbasis;
+    this->n_dim = ndim;
+    // Per-band scalars: eigenvalues for all n_band bands, residual norms for the n_band_l local bands only.
+    this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band}));
+    this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l}));
+    // Work blocks, each n_band_l x n_basis, allocated on the compute device (psi is re-mapped onto psi_in in diag()).
+    this->psi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->w = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hw = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->p = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->hp = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis}));
+    // Device-side copy of the preconditioner; filled by calc_prec().
+    this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis}));
+    // Locking starts empty; resize() only value-initializes entries that did not exist before.
+    this->nlocked = 0;
+    this->eigen_locked.resize(this->n_band, static_cast<Real>(0.0));
+
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false);
+    // Gather every rank's local band count and build exclusive prefix sums for the MPI_Allgatherv calls.
+    this->all_n_band_l.resize(this->plintrans.nproc_col); // NOTE(review): assumes nproc_col == size of BP_WORLD
+    MPI_Allgather(&this->n_band_l, 1, MPI_INT, this->all_n_band_l.data(), 1, MPI_INT, BP_WORLD);
+    this->band_displs.resize(this->plintrans.nproc_col);
+    this->band_displs[0] = 0;
+    for (int i = 1; i < this->plintrans.nproc_col; ++i)
+    {
+        this->band_displs[i] = this->band_displs[i - 1] + this->all_n_band_l[i - 1];
+    }
+#else
+    this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band);
+    this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false);
+    this->all_n_band_l = {this->n_band_l}; // serial build: one "rank" owning every band
+    this->band_displs = {0};
+#endif
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_prec()
+{ // Copies n_basis preconditioner entries from the mapped host array (h_prec) into the device tensor prec.
+    syncmem_var_h2d_op()(this->prec.template data<Real>(), this->h_prec.template data<Real>(), this->n_basis);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out)
+{ // Invokes the caller's callback as hpsi_func(psi_in, hpsi_out, n_basis, n_band_l); hpsi_out is overwritten.
+    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_grad(const ct::Tensor& prec_in,
+                                     ct::Tensor& err_out,
+                                     ct::Tensor& psi_in,
+                                     ct::Tensor& hpsi_in,
+                                     ct::Tensor& grad_out,
+                                     const int nlocked_in)
+{
+    int start_nband = 0; // global index of this process's first band
+#ifdef __MPI
+    if (this->plintrans.nproc_col > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    int local_nlocked = std::max(0, nlocked_in - start_nband); // locked bands that fall inside the local range
+    local_nlocked = std::min(local_nlocked, this->n_band_l);
+
+    // Locked bands: zero residual and zero error. err_out may be device memory, so it is never indexed on the host.
+    for (int ib = 0; ib < local_nlocked; ++ib)
+    {
+        setmem_complex_op()(grad_out.data<T>() + ib * this->n_basis, 0, this->n_basis);
+        setmem_var_op()(err_out.data<Real>() + ib, 0, 1);
+    }
+
+    for (int ib = local_nlocked; ib < this->n_band_l; ++ib)
+    {
+        T* psi_ptr = psi_in.data<T>() + ib * this->n_basis;
+        T* hpsi_ptr = hpsi_in.data<T>() + ib * this->n_basis;
+        T* grad_ptr = grad_out.data<T>() + ib * this->n_basis;
+
+        // 1. Normalize psi (and scale hpsi consistently): divide both by ||psi|| = sqrt(<psi|psi>)
+        Real norm = ModuleBase::dot_real_op<T, Device>()(this->n_dim, psi_ptr, psi_ptr, true);
+        norm = sqrt(norm); // the op below divides by its last argument, so pass the norm itself, not its reciprocal
+        ModuleBase::vector_div_constant_op<T, Device>()(this->n_dim, psi_ptr, psi_ptr, norm);
+        ModuleBase::vector_div_constant_op<T, Device>()(this->n_dim, hpsi_ptr, hpsi_ptr, norm);
+
+        // 2. Rayleigh quotient: epsilo = <psi|hpsi>
+        Real epsilo = ModuleBase::dot_real_op<T, Device>()(this->n_dim, psi_ptr, hpsi_ptr, true);
+
+        // 3. Residual: grad = hpsi - epsilo * psi
+        ModuleBase::vector_add_vector_op<T, Device>()(this->n_dim, grad_ptr, hpsi_ptr, 1.0, psi_ptr, -epsilo);
+
+        // 4. Error = ||raw residual||, stored through the host->device copy op so GPU tensors are handled too
+        const Real err = sqrt(ModuleBase::dot_real_op<T, Device>()(this->n_dim, grad_ptr, grad_ptr, true));
+        syncmem_var_h2d_op()(err_out.data<Real>() + ib, &err, 1);
+
+        // 5. Apply preconditioner: grad = grad / prec
+        ModuleBase::vector_div_vector_op<T, Device>()(this->n_dim, grad_ptr, grad_ptr, prec_in.data<Real>());
+    }
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::update_locking(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    // Stage this process's n_band_l residual norms on the host
+    std::vector<Real> local_err(this->n_band_l);
+    if (err_in.device_type() == ct::DeviceType::GpuDevice)
+    {
+        syncmem_var_d2h_op()(local_err.data(), err_in.data<Real>(), this->n_band_l);
+    }
+    else
+    {
+        std::memcpy(local_err.data(), err_in.data<Real>(), this->n_band_l * sizeof(Real));
+    }
+    // All-band copies of the errors and thresholds, filled below by gathering (MPI) or plain copy (serial).
+    std::vector<Real> global_err(this->n_band, static_cast<Real>(0.0));
+    std::vector<double> global_ethr(this->n_band, 0.0);
+
+#ifdef __MPI
+    MPI_Datatype mpi_real_type = (sizeof(Real) == sizeof(float)) ? MPI_FLOAT : MPI_DOUBLE;
+    MPI_Allgatherv(local_err.data(),
+                   this->n_band_l,
+                   mpi_real_type,
+                   global_err.data(),
+                   this->all_n_band_l.data(),
+                   this->band_displs.data(),
+                   mpi_real_type,
+                   BP_WORLD);
+    // NOTE(review): the first n_band_l entries of ethr_band are sent as this rank's thresholds - confirm with caller.
+    std::vector<double> local_ethr_double(ethr_band.begin(), ethr_band.end());
+    MPI_Allgatherv(local_ethr_double.data(),
+                   this->n_band_l,
+                   MPI_DOUBLE,
+                   global_ethr.data(),
+                   this->all_n_band_l.data(),
+                   this->band_displs.data(),
+                   MPI_DOUBLE,
+                   BP_WORLD);
+#else
+    for (int i = 0; i < this->n_band_l; ++i)
+    {
+        global_err[i] = local_err[i];
+        global_ethr[i] = ethr_band[i];
+    }
+#endif
+
+    // Copy all n_band current eigenvalues from this->eigen to the host
+    std::vector<Real> current_eigen(this->n_band, static_cast<Real>(0.0));
+    syncmem_var_d2h_op()(current_eigen.data(), this->eigen.data<Real>(), this->n_band);
+
+    // Extend the locked prefix: stop at the first band whose error still exceeds its threshold
+    while (this->nlocked < this->n_band)
+    {
+        if (global_err[this->nlocked] <= global_ethr[this->nlocked])
+        {
+            this->eigen_locked[this->nlocked] = current_eigen[this->nlocked]; // remember the value used to freeze it
+            this->nlocked++;
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out)
+{
+    // Step 1: hsub_tmp = psi^H * grad, the n_band x n_band overlap between the bands and the residuals
+    this->pmmcn.multiply(1.0, psi_in.data<T>(), grad_out.data<T>(), 0.0, hsub_tmp.data<T>());
+
+    // Step 2: grad <- grad - psi * hsub_tmp, i.e. subtract the part of grad that lies in span(psi)
+    this->plintrans.act(-1.0, psi_in.data<T>(), hsub_tmp.data<T>(), 1.0, grad_out.data<T>());
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::orth_cholesky(ct::Tensor& workspace_in,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out,
+                                         ct::Tensor& hsub_out)
+{ // Not called by diag(), which deliberately skips Cholesky re-orthogonalization (see the note in its loop).
+    // Gram matrix: hsub_out = psi_out^H * psi_out
+    this->pmmcn.multiply(1.0, psi_out.data<T>(), psi_out.data<T>(), 0.0, hsub_out.data<T>());
+    // NOTE(review): presumably clears the lower triangle so only the upper part is used below - confirm set_matrix.
+    ct::kernels::set_matrix<T, ct_Device>()('L', hsub_out.data<T>(), this->n_band);
+    // Factorize (potrf, 'U') and invert the triangular factor in place (trtri).
+    ct::kernels::lapack_potrf<T, ct_Device>()('U', this->n_band, hsub_out.data<T>(), this->n_band);
+    ct::kernels::lapack_trtri<T, ct_Device>()('U', 'N', this->n_band, hsub_out.data<T>(), this->n_band);
+
+    // Apply the inverted factor to psi and hpsi; workspace_in holds each product before it is copied back
+    this->plintrans.act(1.0, psi_out.data<T>(), hsub_out.data<T>(), 0.0, workspace_in.data<T>());
+    syncmem_complex_op()(psi_out.data<T>(), workspace_in.data<T>(), this->n_band_l * this->n_basis);
+
+    this->plintrans.act(1.0, hpsi_out.data<T>(), hsub_out.data<T>(), 0.0, workspace_in.data<T>());
+    syncmem_complex_op()(hpsi_out.data<T>(), workspace_in.data<T>(), this->n_band_l * this->n_basis);
+}
+
+template <typename T, typename Device>
+bool DiagoPPCG<T, Device>::test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    // Read the per-band errors on the host: device tensors are copied first, host tensors are read in place.
+    std::vector<Real> host_copy;
+    Real* band_err = err_in.data<Real>();
+    if (err_in.device_type() == ct::DeviceType::GpuDevice)
+    {
+        host_copy.resize(this->n_band_l);
+        band_err = host_copy.data();
+        syncmem_var_d2h_op()(band_err, err_in.data<Real>(), this->n_band_l);
+    }
+    // One local band above its threshold is enough to flag this process as not converged.
+    bool not_conv = false;
+    for (int ib = 0; ib < this->n_band_l; ++ib)
+    {
+        not_conv = not_conv || (band_err[ib] > ethr_band[ib]);
+    }
+#ifdef __MPI
+    // Logical OR over BP_WORLD so every rank in that communicator returns the same answer.
+    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+#endif
+    return not_conv;
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::diag_subspace(const HPsiFunc& hpsi_func,
+                                         const bool has_p,
+                                         ct::Tensor& psi_out,
+                                         ct::Tensor& hpsi_out,
+                                         ct::Tensor& p_out,
+                                         ct::Tensor& hp_out,
+                                         const int nlocked_in)
+{ // Rayleigh-Ritz in span[X, W(, P)]: updates psi/hpsi/p/hp in place and stores the lowest n_band Ritz values.
+    const int n_sub = has_p ? 3 * this->n_band : 2 * this->n_band;
+
+    // 1. Compute H|w>
+    this->calc_hpsi(hpsi_func, this->w.data<T>(), this->hw);
+
+    // 2. Compute overlap (S) and Hamiltonian (H) projection blocks.
+    //    Only the diagonal and upper off-diagonal blocks are computed explicitly;
+    //    the lower off-diagonal blocks are filled by Hermitian conjugation in step 3.
+    ct::Tensor b_00(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor b_01(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor b_11(t_type, device_type, {this->n_band, this->n_band});
+
+    this->pmmcn.multiply(one_, psi_out.data<T>(), psi_out.data<T>(), zero_, b_00.data<T>());
+    this->pmmcn.multiply(one_, psi_out.data<T>(), this->w.data<T>(), zero_, b_01.data<T>());
+    this->pmmcn.multiply(one_, this->w.data<T>(), this->w.data<T>(), zero_, b_11.data<T>());
+
+    ct::Tensor bh_00(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor bh_01(t_type, device_type, {this->n_band, this->n_band});
+    ct::Tensor bh_11(t_type, device_type, {this->n_band, this->n_band});
+
+    this->pmmcn.multiply(one_, psi_out.data<T>(), hpsi_out.data<T>(), zero_, bh_00.data<T>());
+    this->pmmcn.multiply(one_, psi_out.data<T>(), this->hw.data<T>(), zero_, bh_01.data<T>());
+    this->pmmcn.multiply(one_, this->w.data<T>(), this->hw.data<T>(), zero_, bh_11.data<T>());
+
+    ct::Tensor b_02, b_12, b_22, bh_02, bh_12, bh_22;
+    if (has_p)
+    {
+        b_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        b_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        b_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+        bh_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band});
+
+        this->pmmcn.multiply(one_, psi_out.data<T>(), p_out.data<T>(), zero_, b_02.data<T>());
+        this->pmmcn.multiply(one_, this->w.data<T>(), p_out.data<T>(), zero_, b_12.data<T>());
+        this->pmmcn.multiply(one_, p_out.data<T>(), p_out.data<T>(), zero_, b_22.data<T>());
+
+        this->pmmcn.multiply(one_, psi_out.data<T>(), hp_out.data<T>(), zero_, bh_02.data<T>());
+        this->pmmcn.multiply(one_, this->w.data<T>(), hp_out.data<T>(), zero_, bh_12.data<T>());
+        this->pmmcn.multiply(one_, p_out.data<T>(), hp_out.data<T>(), zero_, bh_22.data<T>());
+    }
+
+    // 3. Assemble projected matrices on CPU (column-major, leading dimension n_sub)
+    ct::Tensor hsub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor ssub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor vcc_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub});
+    ct::Tensor eigen_cpu(r_type, ct::DeviceType::CpuDevice, {n_sub});
+
+    // Helper: copy one n_band x n_band device block into H_sub (to_h) or S_sub at (row_off, col_off), conjugate-transposed if hc
+    auto copy_block = [&](const ct::Tensor& dev_block, int row_off, int col_off, bool to_h, bool hc) {
+        std::vector<T> tmp(this->n_band * this->n_band);
+        syncmem_complex_d2h_op()(tmp.data(), dev_block.data<T>(), this->n_band * this->n_band);
+        T* dest = to_h ? hsub_cpu.data<T>() : ssub_cpu.data<T>();
+        for (int j = 0; j < this->n_band; ++j)
+        {
+            for (int i = 0; i < this->n_band; ++i)
+            {
+                T val = hc ? std::conj(tmp[j + i * this->n_band]) : tmp[i + j * this->n_band];
+                dest[(row_off + i) + (col_off + j) * n_sub] = val;
+            }
+        }
+    };
+
+    // S_sub assembly
+    copy_block(b_00, 0, 0, false, false);
+    copy_block(b_01, 0, this->n_band, false, false);
+    copy_block(b_01, this->n_band, 0, false, true); // b_10 = b_01^H
+    copy_block(b_11, this->n_band, this->n_band, false, false);
+
+    // H_sub assembly
+    copy_block(bh_00, 0, 0, true, false);
+    copy_block(bh_01, 0, this->n_band, true, false);
+    copy_block(bh_01, this->n_band, 0, true, true); // bh_10 = bh_01^H
+    copy_block(bh_11, this->n_band, this->n_band, true, false);
+
+    if (has_p)
+    {
+        copy_block(b_02, 0, 2 * this->n_band, false, false);
+        copy_block(b_02, 2 * this->n_band, 0, false, true);
+        copy_block(b_12, this->n_band, 2 * this->n_band, false, false);
+        copy_block(b_12, 2 * this->n_band, this->n_band, false, true);
+        copy_block(b_22, 2 * this->n_band, 2 * this->n_band, false, false);
+
+        copy_block(bh_02, 0, 2 * this->n_band, true, false);
+        copy_block(bh_02, 2 * this->n_band, 0, true, true);
+        copy_block(bh_12, this->n_band, 2 * this->n_band, true, false);
+        copy_block(bh_12, 2 * this->n_band, this->n_band, true, true);
+        copy_block(bh_22, 2 * this->n_band, 2 * this->n_band, true, false);
+    }
+
+    // 4. Freeze locked bands: their X rows/columns become unit vectors carrying the stored eigenvalue.
+    //    NOTE(review): the W columns of locked bands were zeroed in calc_grad, so S_sub may be singular - confirm.
+    for (int i = 0; i < nlocked_in; ++i)
+    {
+        for (int j = 0; j < n_sub; ++j)
+        {
+            T s_val = (j == i) ? one_ : zero_;
+            T h_val = (j == i) ? static_cast<T>(this->eigen_locked[i]) : zero_;
+            hsub_cpu.data<T>()[i + j * n_sub] = h_val;
+            hsub_cpu.data<T>()[j + i * n_sub] = h_val;
+            ssub_cpu.data<T>()[i + j * n_sub] = s_val;
+            ssub_cpu.data<T>()[j + i * n_sub] = s_val;
+        }
+    }
+
+    // 5. Solve generalized eigenvalue problem H_sub * v = lambda * S_sub * v
+    hsolver::hegvd_op<T, base_device::DEVICE_CPU>()(nullptr, n_sub, n_sub, hsub_cpu.data<T>(), ssub_cpu.data<T>(),
+                                                    eigen_cpu.data<Real>(), vcc_cpu.data<T>());
+
+    // Ensure locked eigenvalues remain unchanged (overwrite in case of numerical drift)
+    for (int i = 0; i < nlocked_in && i < this->n_band; ++i)
+    {
+        eigen_cpu.data<Real>()[i] = this->eigen_locked[i];
+    }
+
+    // 6. Repack: vcc_cpu has leading dimension n_sub, but plintrans was dimensioned for n_band x n_band transforms.
+    const int blk_size = this->n_band * this->n_band;
+    ct::Tensor coef_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, this->n_band});
+    for (int idx = 0; idx < n_sub * this->n_band; ++idx)
+    {
+        const int row = idx % this->n_band, col = (idx % blk_size) / this->n_band, blk = idx / blk_size;
+        coef_cpu.data<T>()[idx] = vcc_cpu.data<T>()[(blk * this->n_band + row) + col * n_sub]; // C_blk(row, col)
+    }
+    ct::Tensor vcc_dev = coef_cpu.to_device<ct_Device>(); // contiguous blocks: C_X at 0, C_W at blk_size, C_P at 2*blk_size
+
+    // 7. Update psi = X*C_X + W*C_W + (P*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, psi_out.data<T>(), vcc_dev.data<T>() + 0, 0.0, this->work.data<T>());
+    this->plintrans.act(1.0, this->w.data<T>(), vcc_dev.data<T>() + blk_size, 1.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, p_out.data<T>(), vcc_dev.data<T>() + 2 * blk_size, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(psi_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 8. Update hpsi = HX*C_X + HW*C_W + (HP*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, hpsi_out.data<T>(), vcc_dev.data<T>() + 0, 0.0, this->work.data<T>());
+    this->plintrans.act(1.0, this->hw.data<T>(), vcc_dev.data<T>() + blk_size, 1.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, hp_out.data<T>(), vcc_dev.data<T>() + 2 * blk_size, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(hpsi_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 9. Update p = W*C_W + (P*C_P)  -- LOBPCG style, no X component
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->w.data<T>(), vcc_dev.data<T>() + blk_size, 0.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, p_out.data<T>(), vcc_dev.data<T>() + 2 * blk_size, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(p_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 10. Update hp = HW*C_W + (HP*C_P)
+    setmem_complex_op()(this->work.data<T>(), 0, this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->hw.data<T>(), vcc_dev.data<T>() + blk_size, 0.0, this->work.data<T>());
+    if (has_p)
+    {
+        this->plintrans.act(1.0, hp_out.data<T>(), vcc_dev.data<T>() + 2 * blk_size, 1.0, this->work.data<T>());
+    }
+    syncmem_complex_op()(hp_out.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // 11. Update eigenvalues with the lowest n_band eigenvalues from subspace diagonalization
+    syncmem_var_h2d_op()(this->eigen.data<Real>(), eigen_cpu.data<Real>(), this->n_band);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
+                                T* psi_in,
+                                Real* eigenvalue_in,
+                                const std::vector<double>& ethr_band)
+{
+    const int current_scf_iter = hsolver::DiagoIterAssist<T, Device>::SCF_ITER;
+
+    // Wrap the caller's psi_in buffer without copying, so every update of this->psi is written into psi_in itself
+    this->psi = std::move(ct::TensorMap(psi_in, t_type, device_type, {this->n_band_l, this->n_basis}));
+
+    // Refresh the device copy of the preconditioner from the host array
+    this->calc_prec();
+
+    // Apply H to the initial guess: hpsi = H * psi_in
+    this->calc_hpsi(hpsi_func, psi_in, this->hpsi);
+
+    // Initial Rayleigh-Ritz in span(psi): build hsub_init = hpsi^H * psi and diagonalize it (values go to this->eigen)
+    ct::Tensor hsub_init(t_type, device_type, {this->n_band, this->n_band});
+    this->pmmcn.multiply(one_, this->hpsi.data<T>(), this->psi.data<T>(), zero_, hsub_init.data<T>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band,
+                                              hsub_init.data<T>(),
+                                              this->n_band,
+                                              this->eigen.data<Real>());
+
+    // Rotate psi and hpsi with hsub_init, which is used as the eigenvector matrix after lapack_heevd
+    this->plintrans.act(1.0, this->psi.data<T>(), hsub_init.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->psi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+    this->plintrans.act(1.0, this->hpsi.data<T>(), hsub_init.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->hpsi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // Start with an empty search direction (P and HP); the first iteration therefore uses the 2-block [X, W] space
+    setmem_complex_op()(this->p.data<T>(), 0, this->n_band_l * this->n_basis);
+    setmem_complex_op()(this->hp.data<T>(), 0, this->n_band_l * this->n_basis);
+
+    // Allocate a reusable n_band x n_band tensor for the projection overlap (also reused by the final diagonalization)
+    ct::Tensor hsub_orth(t_type, device_type, {this->n_band, this->n_band});
+    // Iteration budget: nline steps when SCF_ITER > 1, otherwise 6 * nline. Locking restarts on every call.
+    int ntry = 0;
+    int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6;
+    this->nlocked = 0;
+
+    do
+    {
+        ++ntry;
+
+        // 1. Calculate preconditioned residual w and error for active bands only
+        this->calc_grad(this->prec, this->err_st, this->psi, this->hpsi, this->w, this->nlocked);
+
+        // 2. Update locking status: scan from current nlocked forward
+        this->update_locking(this->err_st, ethr_band);
+
+        // 3. Exit if all bands have converged
+        if (this->nlocked >= this->n_band)
+        {
+            break;
+        }
+
+        // 4. Project active residual to orthogonal complement of psi
+        this->orth_projection(this->psi, hsub_orth, this->w);
+
+        // 5. Expanded subspace diagonalization with locking
+        //    Locked bands are frozen in the subspace problem; P is included from the second iteration on (ntry > 1)
+        this->diag_subspace(hpsi_func, ntry > 1, this->psi, this->hpsi, this->p, this->hp, this->nlocked);
+
+        // Note: orth_cholesky is intentionally skipped here.
+        // The Rayleigh-Ritz step already provides orthonormal vectors (within numerical precision).
+        // Global Cholesky would destroy the locking by remixing all bands.
+
+    } while (ntry < max_iter && this->nlocked < this->n_band);
+
+    // Final subspace diagonalization to obtain accurate eigenvalues; only psi is rotated, hpsi is not used afterwards
+    this->pmmcn.multiply(one_, this->hpsi.data<T>(), this->psi.data<T>(), zero_, hsub_orth.data<T>());
+    ct::kernels::lapack_heevd<T, ct_Device>()(this->n_band,
+                                              hsub_orth.data<T>(),
+                                              this->n_band,
+                                              this->eigen.data<Real>());
+    this->plintrans.act(1.0, this->psi.data<T>(), hsub_orth.data<T>(), 0.0, this->work.data<T>());
+    syncmem_complex_op()(this->psi.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    // Copy this process's n_band_l eigenvalues, starting at its global band offset, to the host output array
+    int start_nband = 0;
+#ifdef __MPI
+    if (this->plintrans.nproc_col > 1)
+    {
+        start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP];
+    }
+#endif
+    syncmem_var_d2h_op()(eigenvalue_in, this->eigen.data<Real>() + start_nband, this->n_band_l);
+}
+
+// Explicit template instantiations: complex float/double on CPU, plus GPU variants when built with CUDA or ROCm
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_CPU>;
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_GPU>;
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hsolver
diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h
new file mode 100644
index 0000000000..d0ca5a2ebb
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.h
@@ -0,0 +1,201 @@
+#ifndef DIAGO_PPCG_H_
+#define DIAGO_PPCG_H_
+
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_device/memory_op.h"
+#include "source_base/module_device/types.h"
+#include "source_base/para_gemm.h"
+#include "source_hsolver/kernels/hegvd_op.h"
+#include "source_hsolver/para_linear_transform.h"
+
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <source_base/macros.h>
+
+namespace hsolver
+{
+
+/**
+ * @class DiagoPPCG
+ * @brief Iterative eigensolver labelled PPCG (Projected Preconditioned Conjugate Gradient) for the lowest bands.
+ *
+ * The implementation is a block, LOBPCG-like Rayleigh-Ritz iteration driven by an H|psi> callback.
+ * It uses an expanded subspace [X, W, P] where X is the current eigenvector approximation,
+ * W is the preconditioned residual, and P is the conjugate search direction from previous steps.
+ *
+ * @tparam T The scalar type of the wavefunctions (instantiated for std::complex<float> and std::complex<double>).
+ * @tparam Device The device used for calculations (e.g., cpu or gpu).
+ */
+template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
+class DiagoPPCG
+{
+  private:
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    /**
+     * @brief Constructor for DiagoPPCG class.
+     *
+     * @param precondition_in Pointer to the host precondition array; it is mapped, not copied, and never freed here.
+     */
+    explicit DiagoPPCG(const Real* precondition_in);
+
+    /**
+     * @brief Destructor for DiagoPPCG class.
+     */
+    ~DiagoPPCG();
+
+    /**
+     * @brief Set the problem sizes and allocate the work tensors; call before diag().
+     *
+     * @param nband The number of bands of all processes.
+     * @param nband_l The number of bands of current process.
+     * @param nbasis The number of basis functions. Leading dimension of psi.
+     * @param ndim The number of valid dimension of psi.
+     */
+    void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);
+
+    using HPsiFunc = std::function<void(T*, T*, const int, const int)>;
+
+    /**
+     * @brief Diagonalize the Hamiltonian using the PPCG method.
+     *
+     * @param hpsi_func A function computing the product of the Hamiltonian matrix H
+     * and a wavefunction blockvector X; called as hpsi_func(psi, hpsi, n_basis, n_band_l).
+     * @param psi_in Pointer to the wavefunction block, mapped as n_band_l bands of length n_basis; updated in place.
+     * @param eigenvalue_in Pointer to the host output array; receives the n_band_l eigenvalues of the local bands.
+     * @param ethr_band Convergence threshold for each band.
+     */
+    void diag(const HPsiFunc& hpsi_func, T* psi_in, Real* eigenvalue_in, const std::vector<double>& ethr_band);
+
+  private:
+    /// the number of bands of all processes
+    int n_band = 0;
+    /// the number of bands of current process
+    int n_band_l = 0;
+    /// the leading dimension (stride between consecutive bands) of psi
+    int n_basis = 0;
+    /// valid dimension of psi
+    int n_dim = 0;
+    /// base iteration count of the ppcg loop: diag() runs at most nline steps, or 6 * nline when SCF_ITER <= 1
+    int nline = 4;
+
+    /// parallel matrix multiplication
+    ModuleBase::PGemmCN<T, Device> pmmcn;
+    PLinearTransform<T, Device> plintrans;
+
+    ct::DataType r_type = ct::DataType::DT_INVALID;
+    ct::DataType t_type = ct::DataType::DT_INVALID;
+    ct::DeviceType device_type = ct::DeviceType::UnKnown;
+
+    ct::Tensor h_prec = {};
+    ct::Tensor prec = {};
+    ct::Tensor eigen = {};
+
+    /// Number of globally converged (locked) bands; always a contiguous run starting at band 0
+    int nlocked = 0;
+    /// Locked eigenvalues on CPU
+    std::vector<Real> eigen_locked;
+    /// Per-process local band counts and their prefix sums, used for error gathering
+    std::vector<int> all_n_band_l;
+    std::vector<int> band_displs;
+    ct::Tensor err_st = {};
+
+    ct::Tensor psi = {}, hpsi = {};
+    ct::Tensor w = {}, hw = {};
+    ct::Tensor p = {}, hp = {};
+    ct::Tensor work = {};
+
+    Device* ctx = {};
+    const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;
+    const T one_ = static_cast<T>(1.0), zero_ = static_cast<T>(0.0), neg_one_ = static_cast<T>(-1.0);
+
+    /**
+     * @brief Update the precondition array from host to device.
+     */
+    void calc_prec();
+
+    /**
+     * @brief Apply the H operator to psi and obtain the hpsi matrix.
+     */
+    void calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out);
+
+    /**
+     * @brief Calculate the preconditioned residual (gradient) and error.
+     *
+     * @param prec_in Input preconditioner.
+     * @param err_out Output error for each local band.
+     * @param psi_in Input wavefunction; the active bands are rescaled in place.
+     * @param hpsi_in H|psi> matrix; rescaled in place together with psi_in.
+     * @param grad_out Output preconditioned residual; zeroed for locked bands (global index < nlocked_in).
+     */
+    void calc_grad(const ct::Tensor& prec_in,
+                   ct::Tensor& err_out,
+                   ct::Tensor& psi_in,
+                   ct::Tensor& hpsi_in,
+                   ct::Tensor& grad_out,
+                   const int nlocked_in = 0);
+
+    /**
+     * @brief Orthogonalize grad to psi using S-inner product (S=I for norm-conserving).
+     *
+     * @param psi_in Input wavefunction.
+     * @param hsub_tmp Workspace for overlap matrix.
+     * @param grad_out Input/Output gradient.
+     */
+    void orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out);
+
+    /**
+     * @brief Perform expanded subspace diagonalization and update X, P, HX, HP.
+     *
+     * @param hpsi_func Hamiltonian application function.
+     * @param has_p If true, use 3-block [X, W, P]; otherwise use 2-block [X, W].
+     * @param psi_out Input/Output wavefunction.
+     * @param hpsi_out Input/Output H|psi>.
+     * @param p_out Input/Output search direction.
+     * @param hp_out Input/Output H|p>.
+     */
+    void diag_subspace(const HPsiFunc& hpsi_func,
+                       const bool has_p,
+                       ct::Tensor& psi_out,
+                       ct::Tensor& hpsi_out,
+                       ct::Tensor& p_out,
+                       ct::Tensor& hp_out,
+                       const int nlocked_in = 0);
+
+    /**
+     * @brief Orthogonalize and normalize psi using Cholesky decomposition; hpsi receives the same transform.
+     */
+    void orth_cholesky(ct::Tensor& workspace_in, ct::Tensor& psi_out, ct::Tensor& hpsi_out, ct::Tensor& hsub_out);
+
+    /**
+     * @brief Update locking status: scan errors from current nlocked forward
+     *        and lock bands that have converged.
+     */
+    void update_locking(const ct::Tensor& err_in, const std::vector<double>& ethr_band);
+
+    /**
+     * @brief Return true if at least one band is still above its threshold, i.e. the iteration has NOT converged.
+     */
+    bool test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band);
+
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+    using setmem_var_op = ct::kernels::set_memory<Real, ct_Device>;
+    using resmem_var_op = ct::kernels::resize_memory<Real, ct_Device>;
+    using delmem_var_op = ct::kernels::delete_memory<Real, ct_Device>;
+    using syncmem_var_h2d_op = ct::kernels::synchronize_memory<Real, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_var_d2h_op = ct::kernels::synchronize_memory<Real, ct::DEVICE_CPU, ct_Device>;
+
+    using setmem_complex_op = ct::kernels::set_memory<T, ct_Device>;
+    using delmem_complex_op = ct::kernels::delete_memory<T, ct_Device>;
+    using resmem_complex_op = ct::kernels::resize_memory<T, ct_Device>;
+    using syncmem_complex_op = ct::kernels::synchronize_memory<T, ct_Device, ct_Device>;
+    using syncmem_complex_h2d_op = ct::kernels::synchronize_memory<T, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_complex_d2h_op = ct::kernels::synchronize_memory<T, ct::DEVICE_CPU, ct_Device>;
+
+    using gemm_op = ModuleBase::gemm_op<T, Device>;
+};
+
+} // namespace hsolver
+
+#endif // DIAGO_PPCG_H_
diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt
index 1b1529adb4..13a0b0032d 100644
--- a/source/source_hsolver/test/CMakeLists.txt
+++ b/source/source_hsolver/test/CMakeLists.txt
@@ -16,6 +16,17 @@ if (ENABLE_MPI)
             ../../source_hamilt/operator.cpp
             ../../source_pw/module_pwdft/op_pw.cpp
   )
+  AddTest(
+    TARGET MODULE_HSOLVER_ppcg_perf  # solver timing comparison; its main() returns 0 regardless of the results
+    LIBS parameter  ${math_libs} base psi device container
+    SOURCES diago_ppcg_perf_test.cpp
+            ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_cg.cpp ../diago_david.cpp
+            ../para_linear_transform.cpp ../diago_iter_assist.cpp ../diag_const_nums.cpp
+            ../kernels/hegvd_op.cpp
+            ../../source_basis/module_pw/test/test_tool.cpp
+            ../../source_hamilt/operator.cpp
+            ../../source_pw/module_pwdft/op_pw.cpp
+  )
   AddTest(
     TARGET MODULE_HSOLVER_cg
     LIBS parameter  ${math_libs} base psi device container
diff --git a/source/source_hsolver/test/diago_ppcg_perf_test.cpp b/source/source_hsolver/test/diago_ppcg_perf_test.cpp
new file mode 100644
index 0000000000..6983d2eacc
--- /dev/null
+++ b/source/source_hsolver/test/diago_ppcg_perf_test.cpp
@@ -0,0 +1,284 @@
+#include "../diag_comm_info.h"
+#include "../diago_bpcg.h"
+#include "../diago_cg.h"
+#include "../diago_david.h"
+#include "../diago_iter_assist.h"
+#include "../diago_ppcg.h"
+#include "diago_mock.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_external/lapack_connector.h"
+#include "source_psi/psi.h"
+
+#include <cmath>
+#include <complex>
+#include <iomanip>
+#include <iostream>
+#include <mpi.h>
+#include <random>
+#include <vector>
+
+#ifdef __MPI
+#include "source_base/parallel_comm.h"
+#endif
+
+using T = std::complex<double>;
+
+// Reference eigenvalues of the npw x npw Hermitian matrix hm via LAPACK zheev (values only, stored in e).
+void lapack_eigenvalues(int npw, const std::vector<T>& hm, double* e)
+{
+    std::vector<T> tmp = hm; // zheev overwrites its matrix argument, so hand it a copy
+    int lwork = 2 * npw, info = 0;
+    char jobz = 'N', uplo = 'U';
+    std::vector<T> work(lwork);
+    std::vector<double> rwork(3 * npw - 2);
+    zheev_(&jobz, &uplo, &npw, tmp.data(), &npw, e, work.data(), &lwork, rwork.data(), &info);
+    if (info != 0) { std::cerr << "lapack_eigenvalues: zheev failed, info = " << info << std::endl; }
+}
+
+// Build the H|psi> callback for a dense dim x dim matrix: hpsi_out = hmat * psi_in, all nvec vectors in one gemm.
+auto make_hpsi_func(const std::vector<T>& hmat, int dim)
+{
+    // hmat is captured by value, so the returned callback does not depend on the caller's vector staying alive.
+    return [hmat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) {
+        T one = T(1.0);
+        T zero = T(0.0);
+        ModuleBase::gemm_op<T, base_device::DEVICE_CPU>()('N',
+                                                          'N',
+                                                          dim,
+                                                          nvec,
+                                                          dim,
+                                                          &one,
+                                                          hmat.data(),
+                                                          dim,
+                                                          psi_in,
+                                                          ld_psi,
+                                                          &zero,
+                                                          hpsi_out,
+                                                          ld_psi);
+    };
+}
+
+// Overlap callback for the norm-conserving case: S is the identity, so S|psi> is a straight copy of psi.
+auto spsi_identity = [](T* psi_in, T* spsi_out, const int ld_psi, const int nvec) {
+    for (int idx = 0, n_elem = ld_psi * nvec; idx < n_elem; ++idx)
+    {
+        spsi_out[idx] = psi_in[idx];
+    }
+};
+
+struct PerfResult
+{
+    std::string name;      // solver label shown in the summary table
+    double time{0.0};      // wall-clock seconds measured around diag()
+    double max_err{0.0};   // largest |computed - reference| eigenvalue difference
+    bool converged{false}; // whether max_err stayed below the acceptance tolerance
+};
+
+// -------------------- PPCG --------------------
+PerfResult test_ppcg(int nband,
+                     int npw,
+                     double ethr,
+                     const psi::Psi<T>& psi0,
+                     const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                     double* precondition,
+                     const std::vector<double>& e_ref)
+{
+    psi::Psi<T> wfc(psi0); // private copy for diag(); psi0 itself is never modified
+    wfc.fix_k(0);
+    std::vector<double> eig(nband, 0.0);
+    std::vector<double> thr(nband, ethr); // same threshold for every band
+
+    hsolver::DiagoPPCG<T> solver(precondition);
+    solver.init_iter(nband, nband, npw, npw);
+    hsolver::DiagoIterAssist<T>::SCF_ITER = 1; // first SCF step
+
+    const double t_begin = MPI_Wtime();
+    solver.diag(hpsi_func, wfc.get_pointer(), eig.data(), thr);
+    const double elapsed = MPI_Wtime() - t_begin;
+
+    double worst = 0.0; // largest |eig - e_ref| over the nband computed values
+    for (int ib = 0; ib < nband; ++ib)
+    {
+        worst = std::max(worst, std::abs(eig[ib] - e_ref[ib]));
+    }
+    return {"PPCG", elapsed, worst, worst < 1e-2};
+}
+
+// -------------------- BPCG --------------------
+PerfResult test_bpcg(int nband,
+                     int npw,
+                     double ethr,
+                     const psi::Psi<T>& psi0,
+                     const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                     double* precondition,
+                     const std::vector<double>& e_ref)
+{
+    psi::Psi<T> wfc(psi0); // private copy for diag(); psi0 itself is never modified
+    wfc.fix_k(0);
+    std::vector<double> eig(nband, 0.0);
+    std::vector<double> thr(nband, ethr); // same threshold for every band
+
+    hsolver::DiagoBPCG<T> solver(precondition);
+    solver.init_iter(nband, nband, npw, npw);
+    hsolver::DiagoIterAssist<T>::SCF_ITER = 1;
+
+    const double t_begin = MPI_Wtime();
+    solver.diag(hpsi_func, wfc.get_pointer(), eig.data(), thr);
+    const double elapsed = MPI_Wtime() - t_begin;
+
+    double worst = 0.0; // largest |eig - e_ref| over the nband computed values
+    for (int ib = 0; ib < nband; ++ib)
+    {
+        worst = std::max(worst, std::abs(eig[ib] - e_ref[ib]));
+    }
+    return {"BPCG", elapsed, worst, worst < 1e-2};
+}
+
+// -------------------- CG --------------------
+PerfResult test_cg(int nband,
+                   int npw,
+                   double ethr,
+                   int maxiter,
+                   const psi::Psi<T>& psi0,
+                   const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                   double* precondition,
+                   const std::vector<double>& e_ref)
+{
+    psi::Psi<T> wfc(psi0); // private copy for diag(); psi0 itself is never modified
+    wfc.fix_k(0);
+    std::vector<double> eig(nband, 0.0);
+    std::vector<double> thr(nband, ethr); // same threshold for every band
+
+    hsolver::DiagoCG<T> solver("pw", "scf");
+    hsolver::DiagoIterAssist<T>::PW_DIAG_NMAX = maxiter;
+    hsolver::DiagoIterAssist<T>::PW_DIAG_THR = ethr;
+
+    const double t_begin = MPI_Wtime();
+    solver.diag(hpsi_func, spsi_identity, npw, nband, npw, wfc.get_pointer(), eig.data(), thr, precondition);
+    const double elapsed = MPI_Wtime() - t_begin;
+
+    double worst = 0.0; // largest |eig - e_ref| over the nband computed values
+    for (int ib = 0; ib < nband; ++ib)
+    {
+        worst = std::max(worst, std::abs(eig[ib] - e_ref[ib]));
+    }
+    return {"CG", elapsed, worst, worst < 1e-2};
+}
+
+// -------------------- Davidson --------------------
+PerfResult test_david(int nband,
+                      int npw,
+                      double ethr,
+                      int maxiter,
+                      const psi::Psi<T>& psi0,
+                      const std::function<void(T*, T*, const int, const int)>& hpsi_func,
+                      double* precondition,
+                      const std::vector<double>& e_ref)
+{
+    psi::Psi<T> wfc(psi0); // private copy for diag(); psi0 itself is never modified
+    wfc.fix_k(0);
+    std::vector<double> eig(nband, 0.0);
+    std::vector<double> thr(nband, ethr); // same threshold for every band
+
+#ifdef __MPI
+    int my_rank = 0, n_ranks = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
+    const hsolver::diag_comm_info comm_info(MPI_COMM_WORLD, my_rank, n_ranks);
+#else
+    const hsolver::diag_comm_info comm_info(0, 1);
+#endif
+
+    hsolver::DiagoDavid<T> solver(precondition, nband, npw, 4, false, comm_info);
+
+    const double t_begin = MPI_Wtime();
+    solver.diag(hpsi_func, spsi_identity, npw, wfc.get_pointer(), eig.data(), thr, maxiter);
+    const double elapsed = MPI_Wtime() - t_begin;
+
+    double worst = 0.0; // largest |eig - e_ref| over the nband computed values
+    for (int ib = 0; ib < nband; ++ib)
+    {
+        worst = std::max(worst, std::abs(eig[ib] - e_ref[ib]));
+    }
+    return {"Davidson", elapsed, worst, worst < 1e-2};
+}
+
+// ============================================================
+int main(int argc, char** argv) // benchmark driver: one problem, four solvers, one summary table
+{
+    MPI_Init(&argc, &argv);
+    int rank = 0, nproc = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+#ifdef __MPI
+    BP_WORLD = MPI_COMM_WORLD;
+#endif
+
+    // ---------- test parameters ----------
+    int nband = 20;
+    int npw = 500;
+    int sparsity = 0; // 0 = dense
+    double ethr = 1e-5; // per-band threshold handed to every solver
+    int maxiter = 300; // only forwarded to the CG and Davidson runs
+    // -------------------------------------
+
+    // generate Hamiltonian and preconditioner, then mirror the problem into the DIAGOTEST globals
+    HPsi<T> hpsi_gen(nband, npw, sparsity);
+    DIAGOTEST::hmatrix = hpsi_gen.hamilt();
+    DIAGOTEST::npw = npw;
+    DIAGOTEST::npw_local = new int[1];
+    DIAGOTEST::npw_local[0] = npw;
+    DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix;
+
+    double* precondition = hpsi_gen.precond();
+
+    // LAPACK reference: all npw eigenvalues are computed, the solvers are checked against the first nband
+    std::vector<double> e_ref(npw);
+    lapack_eigenvalues(npw, DIAGOTEST::hmatrix, e_ref.data());
+
+    // initial psi guess: Hamiltonian entries hmatrix[j * npw + i], each scaled by a random factor in [0.1, 1.0]
+    psi::Psi<T> psi0(1, nband, npw, npw, true);
+    std::default_random_engine p(1); // fixed seed, so every run starts from the same guess
+    std::uniform_int_distribution<unsigned> u(1, 10);
+    for (int i = 0; i < nband; ++i)
+    {
+        for (int j = 0; j < npw; ++j)
+        {
+            double r = static_cast<double>(u(p)) / 10.0;
+            psi0(0, i, j) = DIAGOTEST::hmatrix[j * npw + i] * r;
+        }
+    }
+
+    auto hpsi_func = make_hpsi_func(DIAGOTEST::hmatrix_local, npw);
+
+    // run benchmarks: each solver gets the same psi0, callback, preconditioner and reference values
+    PerfResult r_ppcg = test_ppcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_bpcg = test_bpcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_cg = test_cg(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref);
+    PerfResult r_david = test_david(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref);
+
+    if (rank == 0) // only rank 0 prints the table
+    {
+        std::cout << "\n========================================\n";
+        std::cout << "  Diagonalization Performance Comparison\n";
+        std::cout << "  nband=" << nband << ", npw=" << npw << ", sparsity=" << sparsity << "\n";
+        std::cout << "========================================\n";
+        std::cout << std::setw(10) << "Method" << std::setw(14) << "Time(s)" << std::setw(14) << "MaxError"
+                  << std::setw(8) << "OK" << "\n";
+        std::cout << "----------------------------------------\n";
+        auto print = [](const PerfResult& r) {
+            std::cout << std::setw(10) << r.name << std::setw(14) << std::scientific << std::setprecision(3) << r.time
+                      << std::setw(14) << r.max_err << std::setw(8) << (r.converged ? "Yes" : "No") << "\n";
+        };
+        print(r_ppcg);
+        print(r_bpcg);
+        print(r_cg);
+        print(r_david);
+        std::cout << "========================================\n\n";
+    }
+
+    delete[] DIAGOTEST::npw_local; // releases the new int[1] allocated above
+    MPI_Finalize();
+    return 0; // results are reported, not asserted: the exit code does not reflect the OK column
+}