diff --git a/source/source_estate/elecstate_print.cpp b/source/source_estate/elecstate_print.cpp
index 517c002dd6d..df2bb9b4821 100644
--- a/source/source_estate/elecstate_print.cpp
+++ b/source/source_estate/elecstate_print.cpp
@@ -58,6 +58,7 @@ void print_scf_iterinfo(const std::string& ks_solver,
            {"scalapack_gvx", "GV"},
            {"cusolver", "CU"},
            {"bpcg", "BP"},
+           {"lobpcg", "LB"},
            {"pexsi", "PE"},
            {"cusolvermp", "CM"},
            {"sdft", "CT"}}; // CT = Chebyshev Trace, for pure SDFT (nbands=0) where no H diagonalization is performed
diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt
index b115d6d4cd2..2e30217a6e5 100644
--- a/source/source_hsolver/CMakeLists.txt
+++ b/source/source_hsolver/CMakeLists.txt
@@ -4,6 +4,7 @@ list(APPEND objects
     diago_david.cpp
     diago_dav_subspace.cpp
     diago_bpcg.cpp
+    diago_lobpcg.cpp
     para_linear_transform.cpp
     hsolver_pw.cpp
     hsolver_lcaopw.cpp
diff --git a/source/source_hsolver/diago_lobpcg.cpp b/source/source_hsolver/diago_lobpcg.cpp
new file mode 100644
index 00000000000..8ae513ea080
--- /dev/null
+++ b/source/source_hsolver/diago_lobpcg.cpp
@@ -0,0 +1,694 @@
+#include "source_hsolver/diago_lobpcg.h"
+
+#include "diago_iter_assist.h"
+#include "source_base/global_function.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/parallel_comm.h"
+
+#include <ATen/kernels/lapack.h>
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <limits>
+#include <stdexcept>
+#include <type_traits>
+
+namespace hsolver {
+
+// ============================================================================
+// Band-major helpers (CPU, n_band_l == n_band required)
+//
+// Psi is stored band-major: psi_data[ib * n_basis + ig],
+// shape [n_band_l, n_basis].  The BLAS view is n_basis rows by n_band_l cols.
+//
+// Subspace matrices (C, V) are column-major for direct LAPACK use:
+//   C[col * ld + row] = C[j * nb + i] (nb = leading dimension).
+// ============================================================================
+
+// ============================================================================
+// File-static helpers
+// ============================================================================
+
+template <typename T>
+static void mirror_lower(T* mat, int ld, int active_sub)
+{
+    for (int c = 0; c < active_sub; c++)
+        for (int r = c + 1; r < active_sub; r++)
+            mat[c * ld + r] = std::conj(mat[r * ld + c]);
+}
+
+template <typename T>
+static void clean_hermitian_diag(T* mat, int ld, int active_sub)
+{
+    using Real = typename GetTypeReal<T>::type;
+    for (int i = 0; i < active_sub; i++) {
+        int idx = i * ld + i;
+        mat[idx] = T(std::real(mat[idx]), static_cast<Real>(0));
+    }
+}
+
+template <typename T>
+static bool append_orthonormal_block(
+    const int nvec, const int nbs, const typename GetTypeReal<T>::type thresh,
+    const T* block, const T* hblock,
+    std::vector<T>& basis, std::vector<T>& hbasis)
+{
+    using Real = typename GetTypeReal<T>::type;
+    bool appended = false;
+    const Real thresh2 = thresh * thresh;
+
+    for (int ib = 0; ib < nvec; ++ib) {
+        std::vector<T> q(nbs);
+        std::vector<T> hq(nbs);
+        for (int ig = 0; ig < nbs; ++ig) {
+            q[ig] = block[ib * nbs + ig];
+            hq[ig] = hblock[ib * nbs + ig];
+        }
+
+        const int nold = static_cast<int>(basis.size() / nbs);
+        for (int jq = 0; jq < nold; ++jq) {
+            T dot = static_cast<T>(0.0);
+            for (int ig = 0; ig < nbs; ++ig) {
+                dot += std::conj(basis[jq * nbs + ig]) * q[ig];
+            }
+#ifdef __MPI
+            Parallel_Reduce::reduce_pool(&dot, 1);
+#endif
+            for (int ig = 0; ig < nbs; ++ig) {
+                q[ig] -= dot * basis[jq * nbs + ig];
+                hq[ig] -= dot * hbasis[jq * nbs + ig];
+            }
+        }
+
+        Real norm2 = static_cast<Real>(0.0);
+        for (int ig = 0; ig < nbs; ++ig) {
+            norm2 += std::norm(q[ig]);
+        }
+#ifdef __MPI
+        Parallel_Reduce::reduce_pool(&norm2, 1);
+#endif
+        if (norm2 <= thresh2) {
+            continue;
+        }
+
+        const Real inv_norm = static_cast<Real>(1.0) / std::sqrt(norm2);
+        for (int ig = 0; ig < nbs; ++ig) {
+            basis.push_back(q[ig] * inv_norm);
+            hbasis.push_back(hq[ig] * inv_norm);
+        }
+        appended = true;
+    }
+
+    return appended;
+}
+
+// ============================================================================
+// Constructor / Destructor
+// ============================================================================
+
+template <typename T, typename Device>
+DiagoLobpcg<T, Device>::DiagoLobpcg(const Real* precondition)
+{
+    this->r_type   = ct::DataTypeToEnum<Real>::value;
+    this->t_type   = ct::DataTypeToEnum<T>::value;
+    this->dev_type = ct::DeviceTypeToEnum<Device>::value;
+    this->h_prec_ptr = precondition;
+    this->one     = &one_;
+    this->zero    = &zero_;
+    this->neg_one = &neg_one_;
+}
+
+template <typename T, typename Device>
+DiagoLobpcg<T, Device>::~DiagoLobpcg() {}
+
+// ============================================================================
+// init_iter
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::init_iter(int nband, int nband_l,
+                                        int nbasis, int ndim)
+{
+    this->n_band   = nband;
+    this->n_band_l = nband_l;
+    this->n_basis  = nbasis;
+    this->n_dim    = ndim;
+    this->nsub     = 3 * n_band;
+    this->has_pdir = false;
+
+    this->eigen     = ct::Tensor(r_type, dev_type, {this->n_band});
+    this->sub_eigen = ct::Tensor(r_type, dev_type, {this->nsub});
+    this->err_st    = ct::Tensor(r_type, dev_type, {this->n_band_l});
+
+    this->hsub = ct::Tensor(t_type, dev_type, {this->nsub, this->nsub});
+    this->tmp_hsub = ct::Tensor(t_type, dev_type, {this->n_band, this->n_band});
+
+    this->hpsi  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->spsi  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->grad  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->hgrad = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->pdir  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->hpdir = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+
+    this->work   = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->hwork  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->pwork  = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+    this->hpwork = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis});
+
+    this->prec = ct::Tensor(r_type, dev_type, {this->n_basis});
+    this->h_prec = ct::TensorMap(
+        (void*)this->h_prec_ptr, this->r_type,
+        ct::DeviceType::CpuDevice, {this->n_basis});
+
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD,
+                              this->n_band_l, this->n_basis,
+                              this->n_band_l, this->n_basis,
+                              this->n_dim, this->n_band);
+    this->plintrans.set_dimension(this->n_dim, this->n_band_l,
+                                  this->n_band_l, this->n_basis,
+                                  BP_WORLD, false);
+#else
+    this->pmmcn.set_dimension(this->n_band_l, this->n_basis,
+                              this->n_band_l, this->n_basis,
+                              this->n_dim, this->n_band);
+    this->plintrans.set_dimension(this->n_dim, this->n_band_l,
+                                  this->n_band_l, this->n_basis,
+                                  false);
+#endif
+}
+
+// ============================================================================
+// calc_prec
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::calc_prec()
+{
+    syncmem_var_h2d_op()(this->prec.template data<Real>(),
+                         this->h_prec.template data<Real>(),
+                         this->n_basis);
+}
+
+// ============================================================================
+// calc_hpsi_with_block / calc_spsi_with_block
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::calc_hpsi_with_block(
+    const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out)
+{
+    hpsi_func(psi_in, hpsi_out.data<T>(), this->n_basis, this->n_band_l);
+}
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::calc_spsi_with_block(
+    const SPsiFunc& spsi_func, const T* psi_in, ct::Tensor& spsi_out)
+{
+    spsi_func(psi_in, spsi_out.data<T>(), this->n_basis, this->n_band_l);
+}
+
+// ============================================================================
+// rayleigh_ritz (NC, S=I)
+//
+// psi_in is assumed orthonormal.  H_sub = <psi|H|psi>,  heevd,  rotate.
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::rayleigh_ritz(
+    ct::Tensor& psi_inout, ct::Tensor& hpsi_inout, ct::Tensor& eigen_out)
+{
+    const int nb = this->n_band;
+    const int nbs = this->n_basis;
+    const int local_sz = this->n_band_l * nbs;
+
+    // Zero-fill H_sub before inner_product to eliminate stale data
+    for (int ii = 0; ii < nb * nb; ++ii)
+        this->tmp_hsub.data<T>()[ii] = static_cast<T>(0.0);
+
+    this->pmmcn.multiply(this->one_, psi_inout.data<T>(), hpsi_inout.data<T>(),
+                         this->zero_, this->tmp_hsub.data<T>());
+    mirror_lower(this->tmp_hsub.data<T>(), nb, nb);
+    clean_hermitian_diag(this->tmp_hsub.data<T>(), nb, nb);
+
+    // Force exact Hermitian symmetrization.
+    {
+        T* hsub = this->tmp_hsub.data<T>();
+        for (int jj = 0; jj < nb; ++jj) {
+            hsub[jj * nb + jj] = T(std::real(hsub[jj * nb + jj]), 0.0);
+            for (int ii = jj + 1; ii < nb; ++ii) {
+                T a = hsub[jj * nb + ii];
+                T b = std::conj(hsub[ii * nb + jj]);
+                T avg = static_cast<T>(0.5) * (a + b);
+                hsub[jj * nb + ii] = avg;
+                hsub[ii * nb + jj] = std::conj(avg);
+            }
+        }
+    }
+
+    ct::kernels::lapack_heevd<T, ct_Device>()(
+        nb, this->tmp_hsub.data<T>(), nb, eigen_out.data<Real>());
+
+    setmem_complex_op()(this->work.data<T>(), static_cast<T>(0.0), local_sz);
+    this->plintrans.act(this->one_, psi_inout.data<T>(),
+                        this->tmp_hsub.data<T>(),
+                        this->zero_, this->work.data<T>());
+    syncmem_complex_op()(psi_inout.data<T>(), this->work.data<T>(), local_sz);
+
+    setmem_complex_op()(this->work.data<T>(), static_cast<T>(0.0), local_sz);
+    this->plintrans.act(this->one_, hpsi_inout.data<T>(),
+                        this->tmp_hsub.data<T>(),
+                        this->zero_, this->work.data<T>());
+    syncmem_complex_op()(hpsi_inout.data<T>(), this->work.data<T>(), local_sz);
+}
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::generalized_rayleigh_ritz(
+    ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&)
+{
+    ModuleBase::WARNING_QUIT("DiagoLobpcg",
+        "Generalized R-R (USPP) is not implemented yet.");
+}
+
+// ============================================================================
+// compute_residual — NC: R = HX - lambda*X,  grad = R ./ prec
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::compute_residual(
+    const ct::Tensor& psi_in, const ct::Tensor& hpsi_in,
+    const ct::Tensor& eigen_in, const ct::Tensor& prec_in,
+    ct::Tensor& grad_out, ct::Tensor& err_out)
+{
+    const Real* _prec  = prec_in.data<Real>();
+    const Real* _eigen = eigen_in.data<Real>();
+    const T*    _psi   = psi_in.data<T>();
+    const T*    _hpsi  = hpsi_in.data<T>();
+    T*          _grad  = grad_out.data<T>();
+    Real*       _err   = err_out.data<Real>();
+
+    for (int ib = 0; ib < this->n_band_l; ib++) {
+        const int  ioff   = ib * this->n_basis;
+        const Real lambda = _eigen[ib];
+        Real       err_j  = 0.0;
+        for (int ig = 0; ig < this->n_dim; ig++) {
+            const int idx = ioff + ig;
+            const T   r   = _hpsi[idx] - lambda * _psi[idx];
+            _grad[idx]    = r / std::max(_prec[ig],
+                                             static_cast<Real>(1e-8));
+            err_j        += std::norm(r);
+        }
+        for (int ig = this->n_dim; ig < this->n_basis; ig++)
+            _grad[ioff + ig] = static_cast<T>(0.0);
+        _err[ib] = err_j;
+    }
+#ifdef __MPI
+    Parallel_Reduce::reduce_pool(_err, this->n_band_l);
+#endif
+    for (int ib = 0; ib < this->n_band_l; ib++)
+        _err[ib] = std::sqrt(_err[ib]);
+}
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::compute_residual_s(
+    const ct::Tensor&, const ct::Tensor&, const ct::Tensor&,
+    const ct::Tensor&, const ct::Tensor&, ct::Tensor&, ct::Tensor&)
+{
+    ModuleBase::WARNING_QUIT("DiagoLobpcg",
+        "compute_residual_s (USPP) is not implemented yet.");
+}
+
+// ============================================================================
+// orth_projection — grad -= psi * <psi|grad>   [S=I]
+//   inner(i,j) = <psi_i|grad_j>   (col-major)
+//   grad_j    -= sum_i psi_i * inner(i,j)
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::orth_projection(
+    const ct::Tensor& psi_in, ct::Tensor& hsub_work, ct::Tensor& grad_out)
+{
+    this->pmmcn.multiply(this->one_, psi_in.data<T>(), grad_out.data<T>(),
+                         this->zero_, hsub_work.data<T>());
+
+    this->plintrans.act(this->neg_one_, psi_in.data<T>(),
+                        hsub_work.data<T>(),
+                        this->one_, grad_out.data<T>());
+}
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::orth_projection_s(
+    const ct::Tensor&, const ct::Tensor&, ct::Tensor&, ct::Tensor&)
+{
+    ModuleBase::WARNING_QUIT("DiagoLobpcg",
+        "orth_projection_s (USPP) is not implemented yet.");
+}
+
+// ============================================================================
+// rotate_wf
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::rotate_wf(
+    const ct::Tensor& hsub_in, ct::Tensor& psi_out, ct::Tensor& workspace_in)
+{
+    const int nbs = this->n_basis;
+    setmem_complex_op()(workspace_in.data<T>(), static_cast<T>(0.0),
+                        this->n_band_l * nbs);
+    this->plintrans.act(this->one_, psi_out.data<T>(),
+                        hsub_in.data<T>(),
+                        this->zero_, workspace_in.data<T>());
+    syncmem_complex_op()(psi_out.data<T>(), workspace_in.data<T>(),
+                         this->n_band_l * nbs);
+}
+
+// ============================================================================
+// orth_cholesky — S=I Cholesky orthonormalization
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::orth_cholesky(
+    ct::Tensor& workspace_in, ct::Tensor& psi_out,
+    ct::Tensor& hpsi_out, ct::Tensor& hsub_out)
+{
+    const int nb  = this->n_band;
+    const int nbs = this->n_basis;
+
+    T* psi_d  = psi_out.data<T>();
+    T* hpsi_d = hpsi_out.data<T>();
+    const Real eps = static_cast<Real>(100)
+                   * std::numeric_limits<Real>::epsilon();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        for (int jb = 0; jb < ib; ++jb) {
+            T dot = static_cast<T>(0.0);
+            for (int ig = 0; ig < nbs; ++ig) {
+                dot += std::conj(psi_d[jb * nbs + ig])
+                     * psi_d[ib * nbs + ig];
+            }
+#ifdef __MPI
+            Parallel_Reduce::reduce_pool(&dot, 1);
+#endif
+            for (int ig = 0; ig < nbs; ++ig) {
+                psi_d[ib * nbs + ig]  -= dot * psi_d[jb * nbs + ig];
+                hpsi_d[ib * nbs + ig] -= dot * hpsi_d[jb * nbs + ig];
+            }
+        }
+
+        Real norm2 = static_cast<Real>(0.0);
+        for (int ig = 0; ig < nbs; ++ig) {
+            norm2 += std::norm(psi_d[ib * nbs + ig]);
+        }
+#ifdef __MPI
+        Parallel_Reduce::reduce_pool(&norm2, 1);
+#endif
+        if (!(norm2 > eps)) {
+            throw std::runtime_error("orth_cholesky failed: dependent vectors");
+        }
+        const Real inv_norm = static_cast<Real>(1.0) / std::sqrt(norm2);
+        for (int ig = 0; ig < nbs; ++ig) {
+            psi_d[ib * nbs + ig]  *= inv_norm;
+            hpsi_d[ib * nbs + ig] *= inv_norm;
+        }
+    }
+}
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::orth_cholesky_s(
+    ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&)
+{
+    ModuleBase::WARNING_QUIT("DiagoLobpcg",
+        "orth_cholesky_s (USPP) is not implemented yet.");
+}
+
+// ============================================================================
+// test_error
+// ============================================================================
+
+template <typename T, typename Device>
+bool DiagoLobpcg<T, Device>::test_error(
+    const ct::Tensor& err_in, const std::vector<double>& ethr_band)
+{
+    Real* _err_st = err_in.data<Real>();
+    bool  not_conv = false;
+    std::vector<Real> tmp_cpu;
+    if (err_in.device_type() == ct::DeviceType::GpuDevice) {
+        tmp_cpu.resize(this->n_band_l);
+        _err_st = tmp_cpu.data();
+        syncmem_var_d2h_op()(_err_st, err_in.data<Real>(), this->n_band_l);
+    }
+    for (int ii = 0; ii < this->n_band_l; ii++)
+        if (_err_st[ii] > ethr_band[ii]) not_conv = true;
+#ifdef __MPI
+    MPI_Allreduce(MPI_IN_PLACE, &not_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD);
+#endif
+    return not_conv;
+}
+
+// ============================================================================
+// lobpcg_update — generalized R-R on subspace W = [X, Z, P]
+//
+// H_sub = <W|H|W>,  S_sub = <W|W>
+// H_sub C = S_sub C Lambda  (hegvd)
+// X_new = V_XX*X + V_ZX*Z + V_PX*P
+// P_new = V_ZX*Z + V_PX*P         (soft restart)
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::lobpcg_update(
+    ct::Tensor& psi, ct::Tensor& hpsi,
+    ct::Tensor& grad, ct::Tensor& hgrad,
+    ct::Tensor& pdir, ct::Tensor& hpdir,
+    ct::Tensor& eigen)
+{
+    const int n    = this->n_band;
+    const int nbs  = this->n_basis;
+    const int local_sz = this->n_band_l * nbs;
+    const Real eps = static_cast<Real>(100)
+                   * std::numeric_limits<Real>::epsilon();
+
+    std::vector<T> basis;
+    std::vector<T> hbasis;
+    basis.reserve((this->has_pdir ? 3 : 2) * local_sz);
+    hbasis.reserve((this->has_pdir ? 3 : 2) * local_sz);
+
+    append_orthonormal_block<T>(n, nbs, eps, psi.data<T>(), hpsi.data<T>(),
+                                basis, hbasis);
+    append_orthonormal_block<T>(n, nbs, eps, grad.data<T>(), hgrad.data<T>(),
+                                basis, hbasis);
+    if (this->has_pdir) {
+        append_orthonormal_block<T>(n, nbs, eps, pdir.data<T>(), hpdir.data<T>(),
+                                    basis, hbasis);
+    }
+
+    const int m = static_cast<int>(basis.size() / nbs);
+    if (m < n) {
+        throw std::runtime_error("LOBPCG subspace lost rank");
+    }
+
+    T* hsub_d = this->hsub.data<T>();
+    setmem_complex_op()(hsub_d, static_cast<T>(0.0), this->nsub * this->nsub);
+
+    for (int jc = 0; jc < m; ++jc) {
+        for (int ir = 0; ir < m; ++ir) {
+            T sum = static_cast<T>(0.0);
+            for (int ig = 0; ig < nbs; ++ig) {
+                sum += std::conj(basis[ir * nbs + ig])
+                     * hbasis[jc * nbs + ig];
+            }
+            hsub_d[jc * this->nsub + ir] = sum;
+        }
+    }
+#ifdef __MPI
+    for (int jc = 0; jc < m; ++jc) {
+        Parallel_Reduce::reduce_pool(hsub_d + jc * this->nsub, m);
+    }
+#endif
+    clean_hermitian_diag(hsub_d, this->nsub, m);
+    for (int jc = 0; jc < m; ++jc) {
+        for (int ir = jc + 1; ir < m; ++ir) {
+            const T avg = static_cast<T>(0.5)
+                        * (hsub_d[jc * this->nsub + ir]
+                        +  std::conj(hsub_d[ir * this->nsub + jc]));
+            hsub_d[jc * this->nsub + ir] = avg;
+            hsub_d[ir * this->nsub + jc] = std::conj(avg);
+        }
+    }
+
+    ct::kernels::lapack_heevd<T, ct_Device>()(
+        m, hsub_d, this->nsub, this->sub_eigen.data<Real>());
+
+    const Real* sub = this->sub_eigen.data<Real>();
+    Real* eig = eigen.data<Real>();
+    for (int ib = 0; ib < n; ++ib) {
+        eig[ib] = sub[ib];
+    }
+
+    T* x_new = this->work.data<T>();
+    T* hx_new = this->hwork.data<T>();
+    T* p_new = this->pwork.data<T>();
+    T* hp_new = this->hpwork.data<T>();
+    for (int i = 0; i < local_sz; ++i) {
+        x_new[i] = static_cast<T>(0.0);
+        hx_new[i] = static_cast<T>(0.0);
+        p_new[i] = static_cast<T>(0.0);
+        hp_new[i] = static_cast<T>(0.0);
+    }
+
+    for (int ib = 0; ib < n; ++ib) {
+        for (int iq = 0; iq < m; ++iq) {
+            const T coeff = hsub_d[ib * this->nsub + iq];
+            for (int ig = 0; ig < nbs; ++ig) {
+                const int dst = ib * nbs + ig;
+                const int src = iq * nbs + ig;
+                x_new[dst] += coeff * basis[src];
+                hx_new[dst] += coeff * hbasis[src];
+            }
+            if (iq >= n) {
+                for (int ig = 0; ig < nbs; ++ig) {
+                    const int dst = ib * nbs + ig;
+                    const int src = iq * nbs + ig;
+                    p_new[dst] += coeff * basis[src];
+                    hp_new[dst] += coeff * hbasis[src];
+                }
+            }
+        }
+    }
+
+    syncmem_complex_op()(psi.data<T>(),   x_new,  local_sz);
+    syncmem_complex_op()(hpsi.data<T>(),  hx_new, local_sz);
+    syncmem_complex_op()(pdir.data<T>(),  p_new,  local_sz);
+    syncmem_complex_op()(hpdir.data<T>(), hp_new, local_sz);
+
+    this->has_pdir = true;
+}
+
+// ============================================================================
+// diag — main LOBPCG loop (generalized interface, S=I currently supported)
+// ============================================================================
+
+template <typename T, typename Device>
+void DiagoLobpcg<T, Device>::diag(
+    const HPsiFunc& hpsi_func, const SPsiFunc& spsi_func,
+    T* psi_in, Real* eigenvalue_in, const std::vector<double>& ethr_band)
+{
+    // ---- runtime guard -----------------------------------------------------
+    if (this->n_band_l != this->n_band) {
+        ModuleBase::WARNING_QUIT("DiagoLobpcg",
+            "Explicit-loop LOBPCG requires n_band_l == n_band. "
+            "Band-parallel MPI is not supported in this implementation.");
+    }
+
+    this->has_pdir = false;
+    const int scf_iter = DiagoIterAssist<T, Device>::SCF_ITER;
+
+    this->psi = ct::TensorMap(psi_in, t_type, dev_type,
+                               {this->n_band_l, this->n_basis});
+
+    this->calc_prec();
+
+    this->calc_hpsi_with_block(hpsi_func, psi_in, this->hpsi);
+    this->calc_spsi_with_block(spsi_func, psi_in, this->spsi);
+    for (int ib = 0; ib < this->n_band_l; ++ib) {
+        for (int ig = 0; ig < this->n_dim; ++ig) {
+            const int idx = ib * this->n_basis + ig;
+            if (std::abs(this->spsi.data<T>()[idx] - this->psi.data<T>()[idx])
+                > static_cast<Real>(100) * std::numeric_limits<Real>::epsilon()) {
+                ModuleBase::WARNING_QUIT("DiagoLobpcg",
+                    "Generalized LOBPCG for S != I is not implemented yet.");
+            }
+        }
+    }
+    // Re-orthonormalize before initial R-R so H_sub is well-conditioned
+    this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub);
+    this->rayleigh_ritz(this->psi, this->hpsi, this->eigen);
+
+    setmem_complex_op()(this->pdir.data<T>(),  static_cast<T>(0.0),
+                         this->n_basis * this->n_band_l);
+    setmem_complex_op()(this->hpdir.data<T>(), static_cast<T>(0.0),
+                         this->n_basis * this->n_band_l);
+
+    const int max_iter = (scf_iter > 1) ? this->nline : (this->nline * 20);
+
+    for (int ntry = 0; ntry < max_iter; ++ntry) {
+        this->compute_residual(this->psi, this->hpsi, this->eigen,
+                               this->prec, this->grad, this->err_st);
+        if (!this->test_error(this->err_st, ethr_band))
+            break;
+
+        const int psi_sz = this->n_basis * this->n_band_l;
+        const int eig_sz = this->n_band_l;
+
+        this->orth_projection(this->psi, this->tmp_hsub, this->grad);
+
+        this->calc_hpsi_with_block(hpsi_func, this->grad.data<T>(), this->hgrad);
+
+        // Backup stable state in case lobpcg_update corrupts psi/hpsi
+        std::vector<T> psi_bak(psi_sz), hpsi_bak(psi_sz);
+        std::vector<Real> eigen_bak(eig_sz);
+        std::copy(this->psi.data<T>(),  this->psi.data<T>()  + psi_sz, psi_bak.data());
+        std::copy(this->hpsi.data<T>(), this->hpsi.data<T>() + psi_sz, hpsi_bak.data());
+        std::copy(this->eigen.data<Real>(), this->eigen.data<Real>() + eig_sz, eigen_bak.data());
+
+        try {
+            this->lobpcg_update(this->psi, this->hpsi,
+                                 this->grad, this->hgrad,
+                                 this->pdir, this->hpdir,
+                                 this->eigen);
+        } catch (const std::exception& e1) {
+            std::copy(psi_bak.data(), psi_bak.data() + psi_sz, this->psi.data<T>());
+            std::copy(hpsi_bak.data(), hpsi_bak.data() + psi_sz, this->hpsi.data<T>());
+            std::copy(eigen_bak.data(), eigen_bak.data() + eig_sz, this->eigen.data<Real>());
+
+            setmem_complex_op()(this->pdir.data<T>(),  static_cast<T>(0.0), psi_sz);
+            setmem_complex_op()(this->hpdir.data<T>(), static_cast<T>(0.0), psi_sz);
+            this->has_pdir = false;
+
+            try {
+                this->lobpcg_update(this->psi, this->hpsi,
+                                     this->grad, this->hgrad,
+                                     this->pdir, this->hpdir,
+                                     this->eigen);
+            } catch (const std::exception& e2) {
+                std::copy(psi_bak.data(), psi_bak.data() + psi_sz, this->psi.data<T>());
+                std::copy(hpsi_bak.data(), hpsi_bak.data() + psi_sz, this->hpsi.data<T>());
+                std::copy(eigen_bak.data(), eigen_bak.data() + eig_sz, this->eigen.data<Real>());
+
+                this->calc_hpsi_with_block(hpsi_func, this->psi.data<T>(), this->hpsi);
+                this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub);
+                this->rayleigh_ritz(this->psi, this->hpsi, this->eigen);
+            }
+        }
+
+        this->calc_hpsi_with_block(hpsi_func, this->psi.data<T>(), this->hpsi);
+        this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub);
+        this->rayleigh_ritz(this->psi, this->hpsi, this->eigen);
+        this->orth_projection(this->psi, this->tmp_hsub, this->pdir);
+        this->calc_hpsi_with_block(hpsi_func, this->pdir.data<T>(), this->hpdir);
+
+        if (scf_iter == 1 && ((ntry + 1) % this->nline == 0)) {
+            this->calc_hpsi_with_block(hpsi_func, this->psi.data<T>(), this->hpsi);
+            this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub);
+            this->rayleigh_ritz(this->psi, this->hpsi, this->eigen);
+            setmem_complex_op()(this->pdir.data<T>(),  static_cast<T>(0.0),
+                                 this->n_basis * this->n_band_l);
+            setmem_complex_op()(this->hpdir.data<T>(), static_cast<T>(0.0),
+                                 this->n_basis * this->n_band_l);
+            this->has_pdir = false;
+        }
+    }
+
+    this->calc_hpsi_with_block(hpsi_func, this->psi.data<T>(), this->hpsi);
+    this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub);
+    this->rayleigh_ritz(this->psi, this->hpsi, this->eigen);
+    // Ensure hpsi = H*psi is consistent for external residual checks
+    this->calc_hpsi_with_block(hpsi_func, this->psi.data<T>(), this->hpsi);
+
+    syncmem_var_d2h_op()(eigenvalue_in,
+                          this->eigen.data<Real>(),
+                          this->n_band_l);
+}
+
+template class DiagoLobpcg<std::complex<double>, base_device::DEVICE_CPU>;
+
+} // namespace hsolver
diff --git a/source/source_hsolver/diago_lobpcg.h b/source/source_hsolver/diago_lobpcg.h
new file mode 100644
index 00000000000..cbd07798a7b
--- /dev/null
+++ b/source/source_hsolver/diago_lobpcg.h
@@ -0,0 +1,232 @@
+#ifndef DIAGO_LOBPCG_H_
+#define DIAGO_LOBPCG_H_
+
+#include <complex>
+#include <functional>
+#include <type_traits>
+#include <vector>
+
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_device/memory_op.h"
+#include "source_base/module_device/types.h"
+#include "source_base/para_gemm.h"
+#include "source_hamilt/hamilt.h"
+#include "source_hsolver/kernels/hegvd_op.h"
+#include "source_hsolver/para_linear_transform.h"
+
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <source_base/macros.h>
+
+namespace hsolver {
+
+/**
+ * @class DiagoLobpcg
+ * @brief Locally Optimal Block Preconditioned Conjugate Gradient eigensolver.
+ *
+ * LOBPCG maintains a block Rayleigh-Ritz subspace:
+ *   - First iteration:  W = [X, Z]           (2-block, no valid P yet)
+ *   - Subsequent:       W = [X, Z, P]        (3-block)
+ * where X = current eigenvectors, Z = preconditioned residual, P = search directions.
+ *
+ * @note Currently supports NC (S=I) + CPU only.
+ *       GPU and USPP support are planned for subsequent phases.
+ *
+ * @tparam T      Complex floating-point type.
+ * @tparam Device Must be base_device::DEVICE_CPU (GPU not yet supported).
+ */
+template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
+class DiagoLobpcg
+{
+  private:
+    static_assert(std::is_same<Device, base_device::DEVICE_CPU>::value,
+                  "DiagoLobpcg currently supports CPU only.");
+
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    /// @brief H * psi -> hpsi.
+    using HPsiFunc = std::function<void(T*, T*, const int, const int)>;
+
+    /// @brief S * psi -> spsi.
+    using SPsiFunc = std::function<void(const T*, T*, const int, const int)>;
+
+    /// Constructor — stores host preconditioner pointer.
+    explicit DiagoLobpcg(const Real* precondition);
+
+    ~DiagoLobpcg();
+
+    /// Allocate workspace and bind h_prec TensorMap (after n_basis is known).
+    void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);
+
+    /// Set max inner iterations per SCF step (default 4).
+    void set_nline(const int n) { this->nline = n; }
+
+    /// Generalized diagonalization interface. Currently supports S = I only.
+    void diag(const HPsiFunc& hpsi_func,
+              const SPsiFunc& spsi_func,
+              T* psi_in,
+              Real* eigenvalue_in,
+              const std::vector<double>& ethr_band);
+
+  private:
+    // ---- dimensions ----
+    int n_band   = 0;   ///< total bands (global)
+    int n_band_l = 0;   ///< local bands
+    int n_basis  = 0;   ///< basis functions (lda of psi)
+    int n_dim    = 0;   ///< valid dimension (= current_ngk)
+    int nline    = 4;   ///< max inner iterations per SCF step
+    int nsub     = 0;   ///< physical leading dim of hsub (= 3*n_band)
+    bool has_pdir = false; ///< true when P block holds valid directions
+
+    // ---- parallel ops ----
+    ModuleBase::PGemmCN<T, Device> pmmcn;
+    PLinearTransform<T, Device> plintrans;
+
+    // ---- type traits ----
+    ct::DataType r_type   = ct::DataType::DT_INVALID;
+    ct::DataType t_type   = ct::DataType::DT_INVALID;
+    ct::DeviceType dev_type = ct::DeviceType::UnKnown;
+
+    // ---- preconditioner ----
+    ct::Tensor prec        = {};  ///< device copy [n_basis]
+    const Real* h_prec_ptr = nullptr; ///< host pointer (saved in ctor)
+    ct::Tensor h_prec      = {};  ///< host TensorMap [n_basis] (bound in init_iter)
+
+    // ---- eigenvalues & convergence ----
+    ct::Tensor eigen     = {}; ///< output eigenvalues [n_band]
+    ct::Tensor sub_eigen = {}; ///< subspace eigenvalues [nsub] = [3*n_band]
+    ct::Tensor err_st    = {}; ///< residual norm per band [n_band_l]
+
+    // ---- core blocks ----
+    // Layout for {n_band_l, n_basis} tensors:
+    //   data[ib * n_basis + ig] — band-major contiguous.
+    //   BLAS view: n_basis rows × n_band_l cols, column-major.
+    ct::Tensor psi   = {};   ///< X (TensorMap → psi_in, no ownership)
+    ct::Tensor hpsi  = {};   ///< HX
+    ct::Tensor spsi  = {};   ///< SX  (Phase 2)
+    ct::Tensor grad  = {};   ///< Z = T(R)
+    ct::Tensor hgrad = {};   ///< HZ
+    ct::Tensor pdir  = {};   ///< P
+    ct::Tensor hpdir = {};   ///< HP
+
+    // ---- subspace matrices ----
+    ct::Tensor hsub = {};    ///< H_sub [nsub × nsub]
+
+    // ---- workspace ----
+    ct::Tensor work    = {}; ///< [n_band_l, n_basis]
+    ct::Tensor hwork   = {}; ///< [n_band_l, n_basis]
+    ct::Tensor pwork   = {}; ///< [n_band_l, n_basis] (P update)
+    ct::Tensor hpwork  = {}; ///< [n_band_l, n_basis] (HP update)
+    ct::Tensor tmp_hsub = {}; ///< scratch [n_band, n_band]
+
+    // ---- GEMM constants (following BPCG pattern) ----
+    Device* ctx = {};
+    const T one_     = static_cast<T>(1.0);
+    const T zero_    = static_cast<T>(0.0);
+    const T neg_one_ = static_cast<T>(-1.0);
+    const T* one     = nullptr;
+    const T* zero    = nullptr;
+    const T* neg_one = nullptr;
+
+    // ---- helpers ----
+
+    void calc_prec();
+
+    void calc_hpsi_with_block(const HPsiFunc& hpsi_func,
+                              T* psi_in,
+                              ct::Tensor& hpsi_out);
+
+    void calc_spsi_with_block(const SPsiFunc& spsi_func,
+                              const T* psi_in,
+                              ct::Tensor& spsi_out);
+
+    /// Standard R-R: H_sub = psi^H * hpsi → heevd → rotate.
+    /// multiply(hpsi, psi) = psi^H * hpsi.
+    void rayleigh_ritz(ct::Tensor& psi_inout,
+                       ct::Tensor& hpsi_inout,
+                       ct::Tensor& eigen_out);
+
+    /// Generalized R-R. NOT IMPLEMENTED — aborts.
+    void generalized_rayleigh_ritz(ct::Tensor& psi_inout,
+                                   ct::Tensor& hpsi_inout,
+                                   ct::Tensor& spsi_inout,
+                                   ct::Tensor& eigen_out);
+
+    /// NC residual: R = HX - X*Lambda, Z = R ./ prec.
+    /// CPU-only: direct loops.
+    void compute_residual(const ct::Tensor& psi_in,
+                          const ct::Tensor& hpsi_in,
+                          const ct::Tensor& eigen_in,
+                          const ct::Tensor& prec_in,
+                          ct::Tensor& grad_out,
+                          ct::Tensor& err_out);
+
+    /// USPP residual. NOT IMPLEMENTED — aborts.
+    void compute_residual_s(const ct::Tensor& psi_in,
+                            const ct::Tensor& hpsi_in,
+                            const ct::Tensor& spsi_in,
+                            const ct::Tensor& eigen_in,
+                            const ct::Tensor& prec_in,
+                            ct::Tensor& grad_out,
+                            ct::Tensor& err_out);
+
+    /// grad -= psi * (psi^H * grad).  multiply(grad, psi) = psi^H * grad.
+    void orth_projection(const ct::Tensor& psi_in,
+                         ct::Tensor& hsub_work,
+                         ct::Tensor& grad_out);
+
+    /// S-orthogonalize. NOT IMPLEMENTED — aborts.
+    void orth_projection_s(const ct::Tensor& psi_in,
+                           const ct::Tensor& spsi_in,
+                           ct::Tensor& hsub_work,
+                           ct::Tensor& grad_out);
+
+    /// Core subspace update (2-block first, then 3-block).
+    /// Orthonormalizes W = [X, Z, P] before Rayleigh-Ritz for stability.
+    void lobpcg_update(ct::Tensor& psi,
+                       ct::Tensor& hpsi,
+                       ct::Tensor& grad,
+                       ct::Tensor& hgrad,
+                       ct::Tensor& pdir,
+                       ct::Tensor& hpdir,
+                       ct::Tensor& eigen);
+
+    /// psi = psi * U  (via plintrans).
+    void rotate_wf(const ct::Tensor& hsub_in,
+                   ct::Tensor& psi_out,
+                   ct::Tensor& workspace_in);
+
+    /// S=I Cholesky: psi^H*psi → potrf(U) → trtri → psi *= U^{-1}, hpsi *= U^{-1}.
+    void orth_cholesky(ct::Tensor& workspace_in,
+                       ct::Tensor& psi_out,
+                       ct::Tensor& hpsi_out,
+                       ct::Tensor& hsub_out);
+
+    /// S-Cholesky. NOT IMPLEMENTED — aborts.
+    void orth_cholesky_s(ct::Tensor& workspace_in,
+                         ct::Tensor& psi_out,
+                         ct::Tensor& hpsi_out,
+                         ct::Tensor& spsi_out,
+                         ct::Tensor& hsub_out);
+
+    bool test_error(const ct::Tensor& err_in, const std::vector<double>& ethr_band);
+
+    // ---- memory-op aliases ----
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+    using setmem_var_op    = ct::kernels::set_memory<Real, ct_Device>;
+    using resmem_var_op    = ct::kernels::resize_memory<Real, ct_Device>;
+    using delmem_var_op    = ct::kernels::delete_memory<Real, ct_Device>;
+    using syncmem_var_h2d_op = ct::kernels::synchronize_memory<Real, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_var_d2h_op = ct::kernels::synchronize_memory<Real, ct::DEVICE_CPU, ct_Device>;
+
+    using setmem_complex_op    = ct::kernels::set_memory<T, ct_Device>;
+    using delmem_complex_op    = ct::kernels::delete_memory<T, ct_Device>;
+    using resmem_complex_op    = ct::kernels::resize_memory<T, ct_Device>;
+    using syncmem_complex_op   = ct::kernels::synchronize_memory<T, ct_Device, ct_Device>;
+    using syncmem_complex_h2d_op = ct::kernels::synchronize_memory<T, ct_Device, ct::DEVICE_CPU>;
+    using syncmem_complex_d2h_op = ct::kernels::synchronize_memory<T, ct::DEVICE_CPU, ct_Device>;
+};
+
+} // namespace hsolver
+#endif // DIAGO_LOBPCG_H_
diff --git a/source/source_hsolver/hsolver_pw.cpp b/source/source_hsolver/hsolver_pw.cpp
index b88bc3b90dd..618ee316406 100644
--- a/source/source_hsolver/hsolver_pw.cpp
+++ b/source/source_hsolver/hsolver_pw.cpp
@@ -11,6 +11,7 @@
 #include "source_hsolver/diago_cg.h"
 #include "source_hsolver/diago_dav_subspace.h"
 #include "source_hsolver/diago_david.h"
+#include "source_hsolver/diago_lobpcg.h"
 #include "source_hsolver/diago_iter_assist.h"
 #include "source_io/module_parameter/parameter.h"
 #include "source_psi/psi.h"
@@ -18,11 +19,46 @@
 
 
 #include <algorithm>
+#include <type_traits>
 #include <vector>
 
 namespace hsolver
 {
 
+template <typename T, typename Device, typename HPsiFunc, typename SPsiFunc>
+typename std::enable_if<std::is_same<T, std::complex<double>>::value
+                            && std::is_same<Device, base_device::DEVICE_CPU>::value,
+                        void>::type
+run_lobpcg_pw(const HPsiFunc& hpsi_func,
+              const SPsiFunc& spsi_func,
+              psi::Psi<T, Device>& psi,
+              const std::vector<typename GetTypeReal<T>::type>& pre_condition,
+              typename GetTypeReal<T>::type* eigenvalue,
+              const std::vector<double>& ethr_band)
+{
+    const int nband_l = psi.get_nbands();
+    const int nbasis = psi.get_nbasis();
+    const int ndim = psi.get_current_ngk();
+    DiagoLobpcg<T, Device> lobpcg(pre_condition.data());
+    lobpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
+    lobpcg.diag(hpsi_func, spsi_func, psi.get_pointer(), eigenvalue, ethr_band);
+}
+
+template <typename T, typename Device, typename HPsiFunc, typename SPsiFunc>
+typename std::enable_if<!std::is_same<T, std::complex<double>>::value
+                            || !std::is_same<Device, base_device::DEVICE_CPU>::value,
+                        void>::type
+run_lobpcg_pw(const HPsiFunc&,
+              const SPsiFunc&,
+              psi::Psi<T, Device>&,
+              const std::vector<typename GetTypeReal<T>::type>&,
+              typename GetTypeReal<T>::type*,
+              const std::vector<double>&)
+{
+    ModuleBase::WARNING_QUIT("HSolverPW",
+        "LOBPCG is currently implemented only for CPU complex<double> PW calculations.");
+}
+
 template <typename T, typename Device>
 void HSolverPW<T, Device>::cal_smooth_ethr(const double& wk,
                                            const double* wg,
@@ -83,7 +119,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     this->nproc_in_pool = nproc_in_pool_in;
 
     // report if the specified diagonalization method is not supported
-    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg"};
+    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg", "lobpcg"};
     if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods))
     {
         ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!");
@@ -273,8 +309,8 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
         hpsi_info info(&psi_wrapper, bands_range, hpsi_out);
         hm->ops->hPsi(info);
     };
-    auto spsi_func = [hm](const T* psi_in, T* spsi_out, const int ld_psi, const int nvec) {
-        hm->sPsi(psi_in, spsi_out, ld_psi, ld_psi, nvec);
+    auto spsi_func = [hm, cur_nbasis](const T* psi_in, T* spsi_out, const int ld_psi, const int nvec) {
+        hm->sPsi(psi_in, spsi_out, ld_psi, cur_nbasis, nvec);
     };
 
     if (this->method == "cg")
@@ -323,6 +359,10 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
         bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
         bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
     }
+    else if (this->method == "lobpcg")
+    {
+        run_lobpcg_pw<T, Device>(hpsi_func, spsi_func, psi, pre_condition, eigenvalue, this->ethr_band);
+    }
     else if (this->method == "dav_subspace")
     {
         bool scf = this->calculation_type == "nscf" ? false : true;
diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt
index 1b1529adb4a..14d7c7dacc2 100644
--- a/source/source_hsolver/test/CMakeLists.txt
+++ b/source/source_hsolver/test/CMakeLists.txt
@@ -11,7 +11,15 @@ if (ENABLE_MPI)
   AddTest(
     TARGET MODULE_HSOLVER_bpcg
     LIBS parameter  ${math_libs} base psi device container
-    SOURCES diago_bpcg_test.cpp ../diago_bpcg.cpp ../para_linear_transform.cpp  ../diago_iter_assist.cpp  
+    SOURCES diago_bpcg_test.cpp ../diago_bpcg.cpp ../para_linear_transform.cpp  ../diago_iter_assist.cpp
+            ../../source_basis/module_pw/test/test_tool.cpp
+            ../../source_hamilt/operator.cpp
+            ../../source_pw/module_pwdft/op_pw.cpp
+  )
+  AddTest(
+    TARGET MODULE_HSOLVER_lobpcg
+    LIBS parameter  ${math_libs} base psi device container
+    SOURCES diago_lobpcg_test.cpp ../diago_lobpcg.cpp ../para_linear_transform.cpp  ../diago_iter_assist.cpp
             ../../source_basis/module_pw/test/test_tool.cpp
             ../../source_hamilt/operator.cpp
             ../../source_pw/module_pwdft/op_pw.cpp
@@ -76,14 +84,14 @@ if (ENABLE_MPI)
   AddTest(
     TARGET MODULE_HSOLVER_pw
     LIBS parameter  ${math_libs} psi device base container
-    SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_lobpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
     ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp
   )
 
   AddTest(
     TARGET MODULE_HSOLVER_sdft
     LIBS parameter  ${math_libs} psi device base container
-    SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_lobpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
                 ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp
     )
 
@@ -197,4 +205,4 @@ if (ENABLE_MPI)
       )
     endif()
   endif()
-endif()
\ No newline at end of file
+endif()
diff --git a/source/source_hsolver/test/diago_lobpcg_test.cpp b/source/source_hsolver/test/diago_lobpcg_test.cpp
new file mode 100644
index 00000000000..1cddaa6cfba
--- /dev/null
+++ b/source/source_hsolver/test/diago_lobpcg_test.cpp
@@ -0,0 +1,287 @@
+#include "../diago_iter_assist.h"
+#include "../diago_lobpcg.h"
+#include "source_base/global_variable.h"
+#include "source_base/module_external/lapack_connector.h"
+#include "source_base/parallel_comm.h"
+#include "source_basis/module_pw/test/test_tool.h"
+
+#ifdef __MPI
+#include "mpi.h"
+#endif
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <random>
+#include <vector>
+
+/************************************************
+ *  unit test of DiagoLobpcg (NC, CPU-only)
+ *
+ *  Validates eigenvalues, orthonormality, and
+ *  residual against LAPACK zheev for random
+ *  well-conditioned Hermitian matrices.
+ ***********************************************/
+
+using TestT      = std::complex<double>;
+using TestDevice = base_device::DEVICE_CPU;
+using TestReal   = double;
+
+/// Reference eigenvalues via LAPACK zheev (eigenvalues only).
+static int lapackEigen(int npw, std::vector<TestT>& hm, TestReal* e)
+{
+    int lwork = std::max(1, 2 * npw - 1);
+    std::vector<TestT> work(lwork);
+    std::vector<TestReal> rwork(std::max(1, 3 * npw - 2));
+    int info = 0;
+    char jobz = 'N', uplo = 'U';
+    zheev_(&jobz, &uplo, &npw, hm.data(), &npw, e,
+           work.data(), &lwork, rwork.data(), &info);
+    return info;
+}
+
+static void run_generalized_lobpcg()
+{
+    const int npw = 8, nband = 3;
+    std::vector<TestT> psi(nband * npw, {0.0, 0.0});
+    for (int ib = 0; ib < nband; ++ib)
+        psi[ib * npw + ib] = {1.0, 0.0};
+    std::vector<TestReal> eigens(nband, 0.0);
+    std::vector<TestReal> prec(npw, 1.0);
+    std::vector<double> ethr(nband, 1e-6);
+
+    auto hpsi_func = [](TestT* psi_in, TestT* hpsi_out,
+                        int ld_psi, int nvec) {
+        std::copy(psi_in, psi_in + ld_psi * nvec, hpsi_out);
+    };
+    auto spsi_func = [](const TestT* psi_in, TestT* spsi_out,
+                        int ld_psi, int nvec) {
+        std::copy(psi_in, psi_in + ld_psi * nvec, spsi_out);
+        spsi_out[0] *= 2.0;
+    };
+
+    hsolver::DiagoLobpcg<TestT, TestDevice> lobpcg(prec.data());
+    lobpcg.init_iter(nband, nband, npw, npw);
+    lobpcg.diag(hpsi_func, spsi_func, psi.data(), eigens.data(), ethr);
+}
+
+class DiagoLobpcgTest : public ::testing::Test
+{
+  protected:
+    static int idx(int row, int col, int ld) { return col * ld + row; }
+
+    void run_and_validate(int npw, int nband,
+                          const std::vector<TestT>& hmat,
+                          const std::vector<TestReal>& prec,
+                          const std::vector<TestReal>& e_ref,
+                          double eig_tol, double orth_tol, double res_tol)
+    {
+        const int ld = npw;
+
+        // ---- random orthonormal initial guess ----
+        std::vector<TestT> psi(nband * npw, {0.0, 0.0});
+        {
+            std::mt19937 gen(123);
+            std::uniform_real_distribution<TestReal> dist(-1.0, 1.0);
+            for (int ib = 0; ib < nband; ib++)
+                for (int ig = 0; ig < npw; ig++)
+                    psi[ib * npw + ig] = {dist(gen), dist(gen)};
+
+            for (int ib = 0; ib < nband; ib++)
+            {
+                for (int jb = 0; jb < ib; jb++)
+                {
+                    TestT dot = {0.0, 0.0};
+                    for (int ig = 0; ig < npw; ig++)
+                        dot += std::conj(psi[jb * npw + ig]) * psi[ib * npw + ig];
+                    for (int ig = 0; ig < npw; ig++)
+                        psi[ib * npw + ig] -= dot * psi[jb * npw + ig];
+                }
+                TestReal norm = 0.0;
+                for (int ig = 0; ig < npw; ig++)
+                    norm += std::norm(psi[ib * npw + ig]);
+                norm = std::sqrt(norm);
+                for (int ig = 0; ig < npw; ig++)
+                    psi[ib * npw + ig] /= norm;
+            }
+        }
+
+        auto hpsi_func = [&](TestT* psi_in, TestT* hpsi_out,
+                              int ld_psi, int nvec) {
+            for (int iv = 0; iv < nvec; iv++)
+                for (int i = 0; i < npw; i++)
+                {
+                    TestT sum = {0.0, 0.0};
+                    for (int j = 0; j < npw; j++)
+                        sum += hmat[idx(i, j, ld)] * psi_in[iv * ld_psi + j];
+                    hpsi_out[iv * ld_psi + i] = sum;
+                }
+        };
+
+        // ---- run LOBPCG ----
+        std::vector<TestReal> eigens(nband, 0.0);
+        std::vector<double> ethr(nband, 1e-6);
+
+        // SCF_ITER = 1 triggers periodic R-R restart that clears P, which
+        // naturally limits subspace ill-conditioning.  Use moderate nline.
+        const int old_scf = hsolver::DiagoIterAssist<TestT, TestDevice>::SCF_ITER;
+        hsolver::DiagoIterAssist<TestT, TestDevice>::SCF_ITER = 1;
+
+        hsolver::DiagoLobpcg<TestT, TestDevice> lobpcg(prec.data());
+        lobpcg.init_iter(nband, nband, npw, npw);
+        lobpcg.set_nline(4);
+        auto spsi_func = [](const TestT* psi_in, TestT* spsi_out,
+                            int ld_psi, int nvec) {
+            std::copy(psi_in, psi_in + ld_psi * nvec, spsi_out);
+        };
+        lobpcg.diag(hpsi_func, spsi_func, psi.data(), eigens.data(), ethr);
+
+        hsolver::DiagoIterAssist<TestT, TestDevice>::SCF_ITER = old_scf;
+
+        // ---- validate eigenvalues ----
+        for (int ib = 0; ib < nband; ib++)
+            ASSERT_NEAR(eigens[ib], e_ref[ib], eig_tol)
+                << "eigenvalue[" << ib << "] mismatch: "
+                << eigens[ib] << " vs ref " << e_ref[ib];
+
+        // ---- validate orthonormality ----
+        for (int i = 0; i < nband; i++)
+            for (int j = 0; j < nband; j++)
+            {
+                TestT dot = {0.0, 0.0};
+                for (int ig = 0; ig < npw; ig++)
+                    dot += std::conj(psi[i * npw + ig]) * psi[j * npw + ig];
+                if (i == j)
+                {
+                    EXPECT_NEAR(std::real(dot), 1.0, orth_tol)
+                        << "psi^H psi diag[" << i << "] = " << std::real(dot);
+                    EXPECT_NEAR(std::imag(dot), 0.0, orth_tol)
+                        << "psi^H psi diag[" << i << "] imag = " << std::imag(dot);
+                }
+                else
+                    EXPECT_NEAR(std::abs(dot), 0.0, orth_tol)
+                        << "psi^H psi (" << i << "," << j << ") = " << std::abs(dot);
+            }
+
+        // ---- validate residual: ||H*psi_i - eig_i * psi_i|| ----
+        for (int ib = 0; ib < nband; ib++)
+        {
+            TestReal res2 = 0.0;
+            for (int i = 0; i < npw; i++)
+            {
+                TestT hxi = {0.0, 0.0};
+                for (int j = 0; j < npw; j++)
+                    hxi += hmat[idx(i, j, ld)] * psi[ib * npw + j];
+                const auto r = hxi - eigens[ib] * psi[ib * npw + i];
+                res2 += std::norm(r);
+            }
+            EXPECT_LT(std::sqrt(res2), res_tol)
+                << "residual[" << ib << "] = " << std::sqrt(res2);
+        }
+    }
+
+    /// Build a strongly diagonally-dominant random Hermitian matrix.
+    static void build_well_conditioned(int npw, std::vector<TestT>& hmat,
+                                       std::vector<TestReal>& prec,
+                                       std::vector<TestReal>& e_ref)
+    {
+        const int ld = npw;
+        hmat.assign(npw * npw, {0.0, 0.0});
+
+        std::mt19937 gen(42);
+        std::uniform_real_distribution<TestReal> dist(-1.0, 1.0);
+
+        for (int i = 0; i < npw; i++)
+        {
+            for (int j = i; j < npw; j++)
+            {
+                TestReal re = dist(gen) * 0.5;
+                TestReal im = (i != j) ? dist(gen) * 0.5 : 0.0;
+                hmat[idx(i, j, ld)] = {re, im};
+                hmat[idx(j, i, ld)] = {re, -im};
+            }
+            hmat[idx(i, i, ld)] += TestT(
+                static_cast<TestReal>(2.0 * (i + 1) * (i + 1)), 0.0);
+        }
+
+        // Reference
+        auto hcopy = hmat;
+        e_ref.resize(npw);
+        ASSERT_EQ(lapackEigen(npw, hcopy, e_ref.data()), 0);
+
+        // Preconditioner
+        prec.resize(npw);
+        for (int i = 0; i < npw; i++)
+            prec[i] = std::max(static_cast<TestReal>(1.0),
+                               std::real(hmat[idx(i, i, ld)]));
+    }
+};
+
+// ============================================================================
+// Test cases: various matrix sizes and band counts
+// ============================================================================
+
+TEST_F(DiagoLobpcgTest, SmallMatrix)
+{
+    const int npw = 50, nband = 10;
+    std::vector<TestT> hmat;
+    std::vector<TestReal> prec, e_ref;
+    build_well_conditioned(npw, hmat, prec, e_ref);
+    run_and_validate(npw, nband, hmat, prec, e_ref, 1e-5, 1e-8, 1e-4);
+}
+
+TEST_F(DiagoLobpcgTest, MediumMatrix)
+{
+    const int npw = 200, nband = 20;
+    std::vector<TestT> hmat;
+    std::vector<TestReal> prec, e_ref;
+    build_well_conditioned(npw, hmat, prec, e_ref);
+    run_and_validate(npw, nband, hmat, prec, e_ref, 1e-4, 1e-6, 2e-4);
+}
+
+TEST_F(DiagoLobpcgTest, LargerMatrixFewBands)
+{
+    const int npw = 400, nband = 12;
+    std::vector<TestT> hmat;
+    std::vector<TestReal> prec, e_ref;
+    build_well_conditioned(npw, hmat, prec, e_ref);
+    run_and_validate(npw, nband, hmat, prec, e_ref, 1e-4, 1e-6, 3e-4);
+}
+
+TEST_F(DiagoLobpcgTest, NonIdentityOverlapNotImplemented)
+{
+    EXPECT_EXIT(
+        run_generalized_lobpcg(),
+        ::testing::ExitedWithCode(1),
+        ".*"
+    );
+}
+
+int main(int argc, char** argv)
+{
+    int nproc = 1, myrank = 0;
+
+#ifdef __MPI
+    int nproc_in_pool, kpar = 1, mypool, rank_in_pool;
+    setupmpi(argc, argv, nproc, myrank);
+    divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool);
+    MPI_Comm_split(MPI_COMM_WORLD, myrank, 0, &BP_WORLD);
+    GlobalV::NPROC_IN_POOL = nproc;
+#else
+    MPI_Init(&argc, &argv);
+#endif
+
+    testing::InitGoogleTest(&argc, argv);
+    ::testing::TestEventListeners& listeners =
+        ::testing::UnitTest::GetInstance()->listeners();
+    if (myrank != 0)
+        delete listeners.Release(listeners.default_result_printer());
+
+    int result = RUN_ALL_TESTS();
+    if (myrank == 0 && result != 0)
+        std::cout << "ERROR: some tests are not passed" << std::endl;
+
+    MPI_Finalize();
+    return result;
+}
diff --git a/source/source_io/module_parameter/read_input_item_elec_stru.cpp b/source/source_io/module_parameter/read_input_item_elec_stru.cpp
index 39f37febc54..7fdb3fa5417 100644
--- a/source/source_io/module_parameter/read_input_item_elec_stru.cpp
+++ b/source/source_io/module_parameter/read_input_item_elec_stru.cpp
@@ -131,7 +131,7 @@ Then the user has to correct the input file and restart the calculation.)";
         };
         item.check_value = [](const Input_Item& item, const Parameter& para) {
             const std::string& ks_solver = para.input.ks_solver;
-            const std::vector<std::string> pw_solvers = {"cg", "dav", "bpcg", "dav_subspace"};
+            const std::vector<std::string> pw_solvers = {"cg", "dav", "bpcg", "dav_subspace", "lobpcg"};
             const std::vector<std::string> lcao_solvers = {
                 "genelpa",
                 "elpa",
@@ -1040,7 +1040,7 @@ Use case: When experimental or high-level theoretical results suggest that the S
         item.annotation = "threshold for eigenvalues is cg electron iterations";
         item.category = "Plane wave related variables";
         item.type = "Real";
-        item.description = "Only used when you use ks_solver = cg/dav/dav_subspace/bpcg. It indicates the threshold for the first electronic iteration, from the second iteration the pw_diag_thr will be updated automatically. For nscf calculations with planewave basis set, pw_diag_thr should be <= 1e-3.";
+        item.description = "Only used when you use ks_solver = cg/dav/dav_subspace/bpcg/lobpcg. It indicates the threshold for the first electronic iteration, from the second iteration the pw_diag_thr will be updated automatically. For nscf calculations with planewave basis set, pw_diag_thr should be <= 1e-3.";
         item.default_value = "0.01";
         item.unit = "";
         item.availability = "";
@@ -1099,10 +1099,10 @@ Use case: When experimental or high-level theoretical results suggest that the S
         item.annotation = "max iteration number for cg";
         item.category = "Plane wave related variables";
         item.type = "Integer";
-        item.description = "Only useful when you use ks_solver = cg/dav/dav_subspace/bpcg. It indicates the maximal iteration number for cg/david/dav_subspace/bpcg method.";
+        item.description = "Only useful when you use ks_solver = cg/dav/dav_subspace/bpcg/lobpcg. It indicates the maximal iteration number for cg/david/dav_subspace/bpcg/lobpcg method.";
         item.default_value = "50";
         item.unit = "";
-        item.availability = "basis_type==pw, ks_solver==cg/dav/dav_subspace/bpcg";
+        item.availability = "basis_type==pw, ks_solver==cg/dav/dav_subspace/bpcg/lobpcg";
         read_sync_int(input.pw_diag_nmax);
         this->add_item(item);
     }