deepmodeling · dyzheng · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
@@ -149,6 +149,9 @@ elseif(USE_ROCM)
   )
 endif()
 
+# base library uses symbols from device library (memory_op, math_ylm_op)
+target_link_libraries(base PUBLIC device)
+
 if(ENABLE_COVERAGE)
   add_coverage(driver)
 endif()
diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -313,6 +313,9 @@ void gemm_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const ch
 {
     cublasOperation_t cutransA = judge_trans_op(true, transa, "gemm_op");
     cublasOperation_t cutransB = judge_trans_op(true, transb, "gemm_op");
+    if (cublas_handle == nullptr) {
+        CHECK_CUBLAS(cublasCreate(&cublas_handle));
+    }
     CHECK_CUBLAS(cublasZgemm(cublas_handle, cutransA, cutransB, m, n ,k, (double2*)alpha, (double2*)a , lda, (double2*)b, ldb, (double2*)beta, (double2*)c, ldc));
 }
 

diff --git a/source/source_base/module_container/base/macros/cuda.h b/source/source_base/module_container/base/macros/cuda.h
@@ -67,11 +67,13 @@ struct GetTypeCuda<double>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64F;
 };
+#if CUDA_VERSION >= 11000
 template <>
 struct GetTypeCuda<int64_t>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64I;
 };
+#endif
 template <>
 struct GetTypeCuda<std::complex<float>>
 {

diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
@@ -19,6 +19,8 @@
 namespace container {
 namespace cuSolverConnector {
 
+#if CUDA_VERSION >= 11000
+// Generic API (CUDA 11.0+)
 template <typename T>
 static inline
 void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, T* A, const int& lda)
@@ -47,6 +49,57 @@ void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& d
     CHECK_CUDA(cudaFree(d_work));
     CHECK_CUDA(cudaFree(d_info));
 }
+#else
+// Legacy API fallback (CUDA < 11.0)
+// The typed overloads below differ only in the element type and in the
+// cusolverDn<t>trtri_bufferSize / cusolverDn<t>trtri pair they call, so the
+// common sequence lives in one private helper:
+//   1. query the workspace size for the given matrix,
+//   2. allocate the device workspace and a device-side int for the info flag,
+//   3. call the trtri routine on A (triangular inverse per the cuSOLVER docs),
+//   4. free both device buffers.
+// CuT is the element type handed to cuSOLVER (float, double, cuComplex, cuDoubleComplex).
+// query_size / invert are the matching cusolverDn<t>trtri_bufferSize / cusolverDn<t>trtri.
+// NOTE(review): the value written to the info flag is never read back here.
+template <typename CuT, typename BufferSizeFn, typename TrtriFn>
+static inline void trtri_legacy_impl(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, CuT* A, const int& lda, BufferSizeFn query_size, TrtriFn invert)
+{
+    int work_elems = 0;
+    CHECK_CUSOLVER(query_size(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, &work_elems));
+    // work_elems is treated as an element count: it is scaled by sizeof(CuT) for the allocation.
+    CuT* workspace = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&workspace, work_elems * sizeof(CuT)));
+    int* dev_info = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&dev_info, sizeof(int)));
+    CHECK_CUSOLVER(invert(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, workspace, work_elems, dev_info));
+    CHECK_CUDA(cudaFree(workspace));
+    CHECK_CUDA(cudaFree(dev_info));
+}
+
+// Single precision, real.
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float* A, const int& lda)
+{
+    trtri_legacy_impl(cusolver_handle, uplo, diag, n, A, lda, cusolverDnStrtri_bufferSize, cusolverDnStrtri);
+}
+
+// Double precision, real.
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, double* A, const int& lda)
+{
+    trtri_legacy_impl(cusolver_handle, uplo, diag, n, A, lda, cusolverDnDtrtri_bufferSize, cusolverDnDtrtri);
+}
+
+// Single precision, complex: the std::complex<float> buffer is reinterpreted as cuComplex.
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<float>* A, const int& lda)
+{
+    trtri_legacy_impl(cusolver_handle, uplo, diag, n, reinterpret_cast<cuComplex*>(A), lda, cusolverDnCtrtri_bufferSize, cusolverDnCtrtri);
+}
+
+// Double precision, complex: the std::complex<double> buffer is reinterpreted as cuDoubleComplex.
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<double>* A, const int& lda)
+{
+    trtri_legacy_impl(cusolver_handle, uplo, diag, n, reinterpret_cast<cuDoubleComplex*>(A), lda, cusolverDnZtrtri_bufferSize, cusolverDnZtrtri);
+}
+#endif
 
 static inline
 void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float * A, const int& lda)

diff --git a/source/source_base/module_device/device_check.h b/source/source_base/module_device/device_check.h
@@ -67,6 +67,7 @@ static const char* _cusolverGetErrorString(cusolverStatus_t error)
         return "CUSOLVER_STATUS_ZERO_PIVOT";
     case CUSOLVER_STATUS_INVALID_LICENSE:
         return "CUSOLVER_STATUS_INVALID_LICENSE";
+#if CUDA_VERSION >= 11000
     case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
         return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
     case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
@@ -93,6 +94,7 @@ static const char* _cusolverGetErrorString(cusolverStatus_t error)
         return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
     case CUSOLVER_STATUS_INVALID_WORKSPACE:
         return "CUSOLVER_STATUS_INVALID_WORKSPACE";
+#endif
     default:
         return "<unknown>";
     }

diff --git a/source/source_base/parallel_global.cpp b/source/source_base/parallel_global.cpp
@@ -201,15 +201,30 @@ void Parallel_Global::read_pal_param(int argc,
 #ifdef __MPI
 void Parallel_Global::finalize_mpi()
 {
-    MPI_Comm_free(&POOL_WORLD);
-    if (KP_WORLD != MPI_COMM_NULL)
+    if (POOL_WORLD != MPI_COMM_NULL && POOL_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&POOL_WORLD);
+    }
+    if (KP_WORLD != MPI_COMM_NULL && KP_WORLD != MPI_COMM_WORLD)
     {
         MPI_Comm_free(&KP_WORLD);
     }
-    MPI_Comm_free(&INT_BGROUP);
-    MPI_Comm_free(&BP_WORLD);
-    MPI_Comm_free(&GRID_WORLD);
-    MPI_Comm_free(&DIAG_WORLD);
+    if (INT_BGROUP != MPI_COMM_NULL && INT_BGROUP != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&INT_BGROUP);
+    }
+    if (BP_WORLD != MPI_COMM_NULL && BP_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&BP_WORLD);
+    }
+    if (GRID_WORLD != MPI_COMM_NULL && GRID_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&GRID_WORLD);
+    }
+    if (DIAG_WORLD != MPI_COMM_NULL && DIAG_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&DIAG_WORLD);
+    }
     MPI_Finalize();
 }
 #endif

diff --git a/source/source_cell/read_atoms_helper.cpp b/source/source_cell/read_atoms_helper.cpp
@@ -453,6 +453,9 @@ bool parse_atom_properties(std::ifstream& ifpos,
             atom.lambda[ia].x /= ModuleBase::Ry_to_eV;
             atom.lambda[ia].y /= ModuleBase::Ry_to_eV;
             atom.lambda[ia].z /= ModuleBase::Ry_to_eV;
+            std::cout << "[DS-DIAG] STRU parse: lambda[" << ia << "]=("
+                      << atom.lambda[ia].x << ", " << atom.lambda[ia].y << ", "
+                      << atom.lambda[ia].z << ") Ry/uB (converted from eV/uB)" << std::endl;
         }
         else if ( tmpid == "sc")
         {

diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp
@@ -398,7 +398,27 @@ void ESolver_KS_LCAO<TK, TR>::hamilt2rho_single(UnitCell& ucell, int istep, int
     bool skip_charge = PARAM.inp.calculation == "nscf" ? true : false;
 
     // 2) run the inner lambda loop to contrain atomic moments with the DeltaSpin method
-    bool skip_solve = run_deltaspin_lambda_loop_lcao<TK>(iter - 1, this->drho, PARAM.inp);
+    bool skip_solve = false;
+    if (PARAM.inp.sc_mag_switch)
+    {
+        spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
+        if (PARAM.inp.sc_lambda_strategy == "linear_scan")
+        {
+            sc.run_lambda_linear_scan(iter - 1);
+            skip_solve = true;
+        }
+        else if (!sc.mag_converged() && this->drho > 0 && this->drho < PARAM.inp.sc_scf_thr)
+        {
+            sc.run_lambda_loop(iter - 1);
+            sc.set_mag_converged(true);
+            skip_solve = true;
+        }
+        else if (sc.mag_converged())
+        {
+            sc.run_lambda_loop(iter - 1);
+            skip_solve = true;
+        }
+    }
 
     // 3) run Hsolver
     if (!skip_solve)
@@ -407,6 +427,12 @@ void ESolver_KS_LCAO<TK, TR>::hamilt2rho_single(UnitCell& ucell, int istep, int
         hsolver_lcao_obj.solve(static_cast<hamilt::Hamilt<TK>*>(this->p_hamilt), this->psi[0], this->pelec, *this->dmat.dm, 
           this->chr, PARAM.inp.nspin, skip_charge);
     }
+    else
+    {
+        // Lambda loop updated the density matrix (DM) but not the real-space charge density.
+        // HSolver was skipped, so we need to sync rho from DM manually.
+        LCAO_domain::dm2rho(this->dmat.dm->get_DMR_vector(), PARAM.inp.nspin, &this->chr);
+    }
 
     // 4) EXX
 #ifdef __EXX

diff --git a/source/source_esolver/esolver_ks_pw.cpp b/source/source_esolver/esolver_ks_pw.cpp
@@ -189,7 +189,7 @@ void ESolver_KS_PW<T, Device>::iter_init(UnitCell& ucell, const int istep, const
 
     // update local occupations for DFT+U
     // should before lambda loop in DeltaSpin
-    pw::iter_init_dftu_pw(iter, istep, this->dftu, this->stp.template get_psi_t<T, Device>(), this->pelec->wg, ucell, PARAM.inp);
+    pw::iter_init_dftu_pw(iter, istep, this->dftu, this->stp.template get_psi_t<T, Device>(), this->pelec->wg, ucell, this->p_chgmix, this->kv.isk.data());
 }
 
 // Temporary, it should be replaced by hsolver later.

diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp
@@ -156,6 +156,7 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
                    PARAM.inp.sccut,
                    PARAM.inp.sc_drop_thr,
                    ucell,
+                   PARAM.inp.sc_direction_only,
                    &(this->pv),
                    PARAM.inp.nspin,
                    this->kv,

diff --git a/source/source_estate/elecstate_lcao.h b/source/source_estate/elecstate_lcao.h
@@ -3,6 +3,8 @@
 
 #include "elecstate.h"
 #include "source_estate/module_dm/density_matrix.h"
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_cell/klist.h"
 
 #include <vector>
 
@@ -24,12 +26,7 @@ class ElecStateLCAO : public ElecState
         this->classname = "ElecStateLCAO";
     }
 
-    virtual ~ElecStateLCAO()
-    {
-    }
-
-    // update charge density for next scf step
-    // void getNewRho() override;
+    virtual ~ElecStateLCAO() = default;
 
     static int out_wfc_lcao;
     static bool need_psi_grid;

diff --git a/source/source_estate/module_charge/charge_mixing.cpp b/source/source_estate/module_charge/charge_mixing.cpp
@@ -257,3 +257,34 @@ bool Charge_Mixing::if_scf_oscillate(const int iteration, const double drho, con
 
     return false;
 }
+
+// Set up (and clear) the mixing history used for the DFT+U occupation matrix.
+void Charge_Mixing::allocate_mixing_uom(int uom_size)
+{
+    ModuleBase::TITLE("Charge_Mixing", "allocate_mixing_uom");
+    ModuleBase::timer::start("Charge_Mixing", "allocate_mixing_uom");
+    // For nspin=2, uom_size already includes both spin channels
+    // (eff_pot_pw.size() = pot_index * 2 for nspin=2),
+    // so no extra spin fold factor is applied here.
+    this->mixing->init_mixing_data(this->uom_mdata, uom_size, sizeof(double));
+    this->uom_mdata.reset();
+    // End the timer after the work so the allocation is what gets measured.
+    ModuleBase::timer::end("Charge_Mixing", "allocate_mixing_uom");
+    return;
+}
+
+// Mix the DFT+U occupation data: uom_save_in is pushed as the input side and
+// uom_in as the output side; mix_data is then called with uom_in's buffer.
+void Charge_Mixing::mix_uom(std::vector<double>& uom_in, std::vector<double>& uom_save_in)
+{
+    ModuleBase::TITLE("Charge_Mixing", "mix_uom");
+    ModuleBase::timer::start("Charge_Mixing", "mix_uom");
+    double* uom_value_out = uom_in.data();
+    double* uom_value_in = uom_save_in.data();
+    // The arrays are handed over whole for every nspin; no per-spin splitting is done here.
+    this->mixing->push_data(this->uom_mdata, uom_value_in, uom_value_out, nullptr, false);
+    this->mixing->mix_data(this->uom_mdata, uom_value_out);
+    // End the timer after the work so the mixing itself is what gets measured.
+    ModuleBase::timer::end("Charge_Mixing", "mix_uom");
+    return;
+}
diff --git a/source/source_estate/module_charge/charge_mixing.h b/source/source_estate/module_charge/charge_mixing.h
@@ -50,6 +50,7 @@ class Charge_Mixing
                     double& tpiba_in);
 
     void close_kerker_gg0() { mixing_gg0 = 0.0; mixing_gg0_mag = 0.0; }
+    void conserve_setting() { mixing_beta = 0.01; mixing_beta_mag = 0.04; }
     /**
      * @brief initialize mixing, including constructing mixing and allocating memory for mixing data
      * @brief this function should be called at eachiterinit()
@@ -74,7 +75,20 @@ class Charge_Mixing
      */
     void mix_dmr(elecstate::DensityMatrix<double, double>* DM);
     void mix_dmr(elecstate::DensityMatrix<std::complex<double>, double>* DM);
-
+
+    /**
+     * @brief allocate and reset the mixing data uom_mdata
+     * @param uom_size length (number of doubles) of the DFT+U array to be mixed
+     */
+    void allocate_mixing_uom(int uom_size);
+
+    /**
+     * @brief DFT+U occupation matrix mixing
+     * @param uom_in output-side occupation matrix; its buffer is also the one passed to mix_data
+     * @param uom_save_in input-side occupation matrix (named "save"; presumably the previous step's values)
+     */
+    void mix_uom(std::vector<double>& uom_in, std::vector<double>& uom_save_in);
+
     /**
      * @brief Get the drho between rho and rho_save, similar for get_dkin
      *
@@ -118,6 +132,7 @@ class Charge_Mixing
     Base_Mixing::Mixing_Data tau_mdata;    ///< Mixing data for kinetic energy density
     Base_Mixing::Mixing_Data nhat_mdata;   ///< Mixing data for compensation density
     Base_Mixing::Mixing_Data dmr_mdata;    ///< Mixing data for real space density matrix
+    Base_Mixing::Mixing_Data uom_mdata;    ///< Mixing data for DFT+U occupation matrix
     Base_Mixing::Plain_Mixing* mixing_highf = nullptr; ///< The high_frequency part is mixed by plain mixing method.
 
     //======================================

diff --git a/source/source_estate/module_charge/chgmixing.cpp b/source/source_estate/module_charge/chgmixing.cpp
@@ -128,6 +128,13 @@ void module_charge::chgmixing_ks_pw(const int iter, // scf iteration number
     {
         p_chgmix->init_mixing();
         p_chgmix->mixing_restart_step = inp.scf_nmax + 1;
+        if (inp.dft_plus_u && inp.mixing_dftu)
+        {
+            // enable mixing_dftu for DFT+U occupation mixing
+            dftu.enable_mixing();
+            // allocate memory for uom_mdata
+            p_chgmix->allocate_mixing_uom(dftu.get_size_eff_pot_pw());
+        }
     }
 
     // For mixing restart
@@ -158,9 +165,9 @@ void module_charge::chgmixing_ks_pw(const int iter, // scf iteration number
 				{
 					dftu.uramping_update(); // update U by uramping if uramping > 0.01
 					std::cout << " U-Ramping! Current U = ";
-					for (int i = 0; i < dftu.U0.size(); i++)
+					for (int i = 0; i < dftu.get_num_u_types(); i++)
 					{
-						std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+						std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
 					}
 					std::cout << " eV " << std::endl;
 				}
@@ -184,13 +191,18 @@ void module_charge::chgmixing_ks_lcao(const int iter, // scf iteration number
         p_chgmix->mix_reset(); // init mixing
         p_chgmix->mixing_restart_step = inp.scf_nmax + 1;
         p_chgmix->mixing_restart_count = 0;
+        // enable mixing_dftu for DFT+U occupation mixing
+        if (inp.dft_plus_u && inp.mixing_dftu)
+        {
+            dftu.enable_mixing();
+        }
         // this output will be removed once the feeature is stable
         if (dftu.uramping > 0.01)
         {
             std::cout << " U-Ramping! Current U = ";
-            for (int i = 0; i < dftu.U0.size(); i++)
+            for (int i = 0; i < dftu.get_num_u_types(); i++)
             {
-                std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+                std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
             }
             std::cout << " eV " << std::endl;
         }
@@ -207,9 +219,9 @@ void module_charge::chgmixing_ks_lcao(const int iter, // scf iteration number
             if (dftu.uramping > 0.01)
             {
                 std::cout << " U-Ramping! Current U = ";
-                for (int i = 0; i < dftu.U0.size(); i++)
+                for (int i = 0; i < dftu.get_num_u_types(); i++)
                 {
-                    std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+                    std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
                 }
                 std::cout << " eV " << std::endl;
             }

diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h
@@ -597,11 +597,16 @@ struct Input_para
     double sc_thr = 1e-06;          ///< threshold for spin-constrained DFT in uB
     int nsc = 100;                  ///< maximum number of inner lambda loop
     int nsc_min = 2;                ///< minimum number of inner lambda loop
-    int sc_scf_nmin = 2;            ///< minimum number of outer scf loop before initial lambda loop
     double alpha_trial = 0.01;      ///< initial trial step size for lambda in eV/uB^2
     double sccut = 3.0;             ///< restriction of step size in eV/uB
     double sc_scf_thr = 1e-3;       ///< minimum number of outer scf loop before initial lambda loop
     double sc_drop_thr = 1e-3;      ///< threshold for lambda-loop threshold cutoff in spin-constrained DFT
+    std::string sc_lambda_strategy = "bfgs";  ///< lambda update strategy: bfgs, bfgs2, linear_response, augmented_lagrangian, hybrid_delayed, linear_scan
+    bool sc_direction_only = false; ///< only optimize the direction of magnetization
+    // linear_scan parameters
+    double sc_scan_lambda_start = 0.0;  ///< start value for lambda scan (eV/uB)
+    double sc_scan_lambda_end = 1.0;    ///< end value for lambda scan (eV/uB)
+    int sc_scan_steps = 20;             ///< number of steps in lambda scan
 
     // ==============   #Parameters (18.Quasiatomic Orbital analysis) =========
     ///<==========================================================