From f8bb7ab8f962051105f6b734da6121be212f11be Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Fri, 30 Jan 2026 13:22:41 +0800
Subject: [PATCH 1/8] fix comupte bug

---
 source/source_pw/module_pwdft/vl_pw.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/source/source_pw/module_pwdft/vl_pw.cpp b/source/source_pw/module_pwdft/vl_pw.cpp
index 672fbd6185..c1a3f4e8d0 100644
--- a/source/source_pw/module_pwdft/vl_pw.cpp
+++ b/source/source_pw/module_pwdft/vl_pw.cpp
@@ -3,7 +3,7 @@
 #include "source_base/libm/libm.h"
 #include "source_base/math_integral.h"
 #include "source_base/timer.h"
-
+#include "source_base/tool_quit.h"
 pseudopot_cell_vl::pseudopot_cell_vl()
 {
 	numeric = nullptr;
@@ -226,8 +226,14 @@ void pseudopot_cell_vl::vloc_of_g(const int& msh,
 			aux [ir] = aux1 [ir] * ModuleBase::libm::sin(gx * r [ir]) / gx;
 		}
 		ModuleBase::Integral::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
-		//  here we add the analytic fourier transform of the erf function
-		vloc_1d[ig] -= fac * ModuleBase::libm::exp(- gx2 * 0.25)/ gx2;
+		// here we add the analytic fourier transform of the erf function
+		// here exp(-gx2*0.25) < 1e-100, skip it to avoid 
+		// float underflow and abundant calculation
+		if (gx2 > 921)
+		{
+			continue;
+		}
+		vloc_1d[ig] -= fac * std::exp(- gx2 * 0.25)/ gx2;
 	} // enddo
 
 	const double d_fpi_omega = ModuleBase::FOUR_PI/ucell.omega;//mohan add 2008-06-04

From d6aaca7f7b624f7d92abb48152010b7bcf4a69f0 Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Fri, 30 Jan 2026 13:29:08 +0800
Subject: [PATCH 2/8] clean format

---
 source/source_pw/module_pwdft/vl_pw.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/source_pw/module_pwdft/vl_pw.cpp b/source/source_pw/module_pwdft/vl_pw.cpp
index c1a3f4e8d0..fb949bc20a 100644
--- a/source/source_pw/module_pwdft/vl_pw.cpp
+++ b/source/source_pw/module_pwdft/vl_pw.cpp
@@ -3,7 +3,7 @@
 #include "source_base/libm/libm.h"
 #include "source_base/math_integral.h"
 #include "source_base/timer.h"
-#include "source_base/tool_quit.h"
+
 pseudopot_cell_vl::pseudopot_cell_vl()
 {
 	numeric = nullptr;

From 64f07a30b56c227b57bd252c2efc2f7426410f8b Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Fri, 30 Jan 2026 13:31:22 +0800
Subject: [PATCH 3/8] add change

---
 source/source_pw/module_pwdft/vl_pw.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/source_pw/module_pwdft/vl_pw.cpp b/source/source_pw/module_pwdft/vl_pw.cpp
index fb949bc20a..5ff21a9aaf 100644
--- a/source/source_pw/module_pwdft/vl_pw.cpp
+++ b/source/source_pw/module_pwdft/vl_pw.cpp
@@ -233,7 +233,7 @@ void pseudopot_cell_vl::vloc_of_g(const int& msh,
 		{
 			continue;
 		}
-		vloc_1d[ig] -= fac * std::exp(- gx2 * 0.25)/ gx2;
+		vloc_1d[ig] -= fac *  ModuleBase::libm::exp(- gx2 * 0.25)/ gx2;
 	} // enddo
 
 	const double d_fpi_omega = ModuleBase::FOUR_PI/ucell.omega;//mohan add 2008-06-04

From 9fc712b1e9f84631b9047143cf0d0b5df61e2599 Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Fri, 30 Jan 2026 16:11:01 +0800
Subject: [PATCH 4/8] add exp

---
 source/source_base/libm/libm.h           | 27 ++++++++++++++++++++++++
 source/source_pw/module_pwdft/forces.cpp | 17 +++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/source/source_base/libm/libm.h b/source/source_base/libm/libm.h
index 13bf65de5c..65996ad43b 100644
--- a/source/source_base/libm/libm.h
+++ b/source/source_base/libm/libm.h
@@ -49,6 +49,33 @@ template<> inline double cos(double x) { return __cos(x); }
 template<> inline double sin(double x) { return __sin(x); }
 template<> inline void sincos(double x, double *s, double *c) { __sincos(x, s, c); }
 template<> inline std::complex<double> exp(const std::complex<double> &x) { return __cexp(x); }
+// A safe exponential function that truncates to zero for very small values
+// to avoid underflow and expensive calculations.
+// Threshold: exp(-230) approx 1e-100.
+inline std::complex<double> truncated_exp(const std::complex<double> &x)
+{
+    if (x.real() < -230) return std::complex<double>(0.0, 0.0);
+        return __cexp(x);
+}
+
+inline std::complex<float> truncated_exp(const std::complex<float> &x)
+{
+    if (x.real() < -230) return std::complex<float>(0.0, 0.0);
+        return __cexpf(x);
+}
+
+inline double truncated_exp(const double &x)
+{
+    if (x < -230) return 0.0;
+        return __exp(x);
+}
+
+inline float truncated_exp(const float &x)
+{
+    if (x < -230) return 0.0;
+        return __expf(x);
+}
+
 
 template<> inline float exp(float x) { return __expf(x); }
 template<> inline float cos(float x) { return __cosf(x); }
diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp
index fbcebe6304..5a9604dc41 100644
--- a/source/source_pw/module_pwdft/forces.cpp
+++ b/source/source_pw/module_pwdft/forces.cpp
@@ -537,8 +537,21 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         {
             ModuleBase::WARNING_QUIT("ewald", "Can't find optimal alpha.");
         }
+        const double damping_factor = ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha;
+        // damping_factor corresponds to the exponent (G^2 / 4alpha) 
+        // in the Ewald error estimate.
+        // It measures how strongly the terms are damped at the plane-wave cutoff.
+        // Asymptotic behavior: erfc(x) ~ exp(-x^2) for large x. 
+        // Here x^2 = damping_factor.
+        // If damping_factor > 230.0, exp(-230.0) approx 1e-100, 
+        // meaning the error is negligible.
+        if (damping_factor > 230.0)
+        {
+            upperbound = 0.0; 
+            break;
+        }
         upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI)
-                     * erfc(sqrt(ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha));
+                     * erfc(sqrt(damping_factor));
     } while (upperbound > 1.0e-6);
     const int ig0 = rho_basis->ig_gge0;
 #pragma omp parallel for
@@ -548,7 +561,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         {
             continue; // skip G=0
         }
-        aux[ig] *= ModuleBase::libm::exp(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0)
+        aux[ig] *= ModuleBase::libm::truncated_exp(std::complex<double>(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0, 0.0))
                 / (rho_basis->gg[ig] * ucell.tpiba2);
     }
 

From 25fc4357e8c12a72eb25615183a7e7a064ab2b08 Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Mon, 2 Feb 2026 17:54:26 +0800
Subject: [PATCH 5/8] fix bug

---
 .../module_pwdft/kernels/force_op.cpp         | 24 +++++++++++++-
 .../module_pwdft/kernels/stress_op.cpp        | 32 +++++++++++++++++--
 2 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/source/source_pw/module_pwdft/kernels/force_op.cpp b/source/source_pw/module_pwdft/kernels/force_op.cpp
index 029afdbf7c..85ac655148 100644
--- a/source/source_pw/module_pwdft/kernels/force_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/force_op.cpp
@@ -109,8 +109,30 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
 
                             for (int ipol = 0; ipol < 3; ipol++)
                             {
+                                #ifdef __SW
+                                const std::complex<FPTYPE>& dbecp_val = dbecp[ipol * nbands * nkb + ib * nkb + inkb];
+                                uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&dbecp_val));
+                                uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
+                                uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
+                                if ((exp_real <= 923) || (exp_imag <= 923)) {
+                                    continue;
+                                }
+                                const std::complex<FPTYPE>& becp_val = becp[ib * nkb + inkb];
+                                uint64_t* becp_parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&becp_val));
+                                uint64_t becp_exp_real = (becp_parts[0] >> 52) & 0x7FF;
+                                uint64_t becp_exp_imag = (becp_parts[1] >> 52) & 0x7FF;
+                                if ((becp_exp_real <= 923) || (becp_exp_imag <= 923)) {
+                                    continue;
+                                }
+                                uint64_t* lf_parts = reinterpret_cast<uint64_t*>(const_cast<FPTYPE*>(&local_force[ipol]));
+                                uint64_t lf_exp = (lf_parts[0] >> 52) & 0x7FF;
+                                if (lf_exp <= 923) {
+                                    continue;
+                                }
+                                #endif
                                 const FPTYPE dbb
-                                    = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) * becp[ib * nkb + inkb])
+                                    = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) 
+                                    * becp[ib * nkb + inkb])
                                           .real();
                                 local_force[ipol] -= ps * fac * dbb;
                                 // cf[iat*3+ipol] += ps * fac * dbb;
diff --git a/source/source_pw/module_pwdft/kernels/stress_op.cpp b/source/source_pw/module_pwdft/kernels/stress_op.cpp
index 46fccdeed6..08eb2d0d0b 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/stress_op.cpp
@@ -140,8 +140,28 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                 FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
                                 const int inkb1 = sum + ia * nproj + ip1;
                                 const int inkb2 = sum + ia * nproj + ip2;
-                                // out<<"\n ps = "<<ps;
 
+                                #ifdef __SW
+                                const std::complex<FPTYPE>& dbecp_val = dbecp[ib * nkb + inkb1];
+                                uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&dbecp_val));
+                                uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
+                                uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
+                                if ((exp_real <= 923) || (exp_imag <= 923)) {
+                                    continue;
+                                }
+                                const std::complex<FPTYPE>& becp_val = becp[ib * nkb + inkb2];
+                                uint64_t* becp_parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&becp_val));
+                                uint64_t becp_exp_real = (becp_parts[0] >> 52) & 0x7FF;
+                                uint64_t becp_exp_imag = (becp_parts[1] >> 52) & 0x7FF;
+                                if ((becp_exp_real <= 923) || (becp_exp_imag <= 923)) {
+                                    continue;
+                                }
+                                uint64_t* ls_parts = reinterpret_cast<uint64_t*>(const_cast<FPTYPE*>(&local_stress));
+                                uint64_t ls_exp = (ls_parts[0] >> 52) & 0x7FF;
+                                if (ls_exp <= 923) {
+                                    continue;
+                                }
+                                #endif
                                 const FPTYPE dbb = (conj(dbecp[ib * nkb + inkb1]) * becp[ib * nkb + inkb2]).real();
                                 local_stress -= ps * fac * dbb;
                             }
@@ -618,7 +638,8 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
             {
                 rhocg1 *= ModuleBase::FOUR_PI / omega / 2.0 / gx_arr[igl];
                 FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0;
-                rhocg1 += ModuleBase::FOUR_PI / omega * gx_arr[ngg] * ModuleBase::libm::exp(-g2a) * (g2a + 1)
+                rhocg1 += ModuleBase::FOUR_PI / omega * gx_arr[ngg] * 
+                           ModuleBase::libm::truncated_exp(-g2a) * (g2a + 1)
                           / pow(gx_arr[igl] * gx_arr[igl], 2);
                 drhocg [igl] = rhocg1;
             }
@@ -644,6 +665,13 @@ struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_CPU> {
 #endif
         for (int i = 0; i < npw; i++)
         {
+            #ifdef __SW
+            const std::complex<FPTYPE>& psi_i = psi[i];
+            uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&psi_i));
+            uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
+            uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
+            if ((exp_real <= 923) || (exp_imag <= 923)) {continue;}
+            #endif
             sum += fac * gk1[i] * gk2[i] * d_kfac[i] * std::norm(psi[i]);
         }
         return sum;

From 65af1da4b704e607ab0c3143f29d2f07ffb60c2f Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Tue, 3 Feb 2026 13:36:29 +0800
Subject: [PATCH 6/8] fix compute bug

---
 source/source_pw/module_pwdft/kernels/stress_op.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/source/source_pw/module_pwdft/kernels/stress_op.cpp b/source/source_pw/module_pwdft/kernels/stress_op.cpp
index 08eb2d0d0b..a3b555a110 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/stress_op.cpp
@@ -156,11 +156,6 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                 if ((becp_exp_real <= 923) || (becp_exp_imag <= 923)) {
                                     continue;
                                 }
-                                uint64_t* ls_parts = reinterpret_cast<uint64_t*>(const_cast<FPTYPE*>(&local_stress));
-                                uint64_t ls_exp = (ls_parts[0] >> 52) & 0x7FF;
-                                if (ls_exp <= 923) {
-                                    continue;
-                                }
                                 #endif
                                 const FPTYPE dbb = (conj(dbecp[ib * nkb + inkb1]) * becp[ib * nkb + inkb2]).real();
                                 local_stress -= ps * fac * dbb;

From ae01d71d9f82648bc59874bef70c5cd2dd00c18f Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Fri, 6 Feb 2026 22:51:04 +0800
Subject: [PATCH 7/8] fix avoid compute in libm

---
 source/source_base/libm/libm.h                | 27 -------------------
 source/source_pw/module_pwdft/forces.cpp      |  7 ++++-
 .../module_pwdft/kernels/stress_op.cpp        |  6 ++++-
 3 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/source/source_base/libm/libm.h b/source/source_base/libm/libm.h
index 65996ad43b..13bf65de5c 100644
--- a/source/source_base/libm/libm.h
+++ b/source/source_base/libm/libm.h
@@ -49,33 +49,6 @@ template<> inline double cos(double x) { return __cos(x); }
 template<> inline double sin(double x) { return __sin(x); }
 template<> inline void sincos(double x, double *s, double *c) { __sincos(x, s, c); }
 template<> inline std::complex<double> exp(const std::complex<double> &x) { return __cexp(x); }
-// A safe exponential function that truncates to zero for very small values
-// to avoid underflow and expensive calculations.
-// Threshold: exp(-230) approx 1e-100.
-inline std::complex<double> truncated_exp(const std::complex<double> &x)
-{
-    if (x.real() < -230) return std::complex<double>(0.0, 0.0);
-        return __cexp(x);
-}
-
-inline std::complex<float> truncated_exp(const std::complex<float> &x)
-{
-    if (x.real() < -230) return std::complex<float>(0.0, 0.0);
-        return __cexpf(x);
-}
-
-inline double truncated_exp(const double &x)
-{
-    if (x < -230) return 0.0;
-        return __exp(x);
-}
-
-inline float truncated_exp(const float &x)
-{
-    if (x < -230) return 0.0;
-        return __expf(x);
-}
-
 
 template<> inline float exp(float x) { return __expf(x); }
 template<> inline float cos(float x) { return __cosf(x); }
diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp
index b9e6823d8b..8f8b5e7650 100644
--- a/source/source_pw/module_pwdft/forces.cpp
+++ b/source/source_pw/module_pwdft/forces.cpp
@@ -561,7 +561,12 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         {
             continue; // skip G=0
         }
-        aux[ig] *= ModuleBase::libm::truncated_exp(std::complex<double>(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0, 0.0))
+        const double damping_factor = -1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0;
+        if (damping_factor < -230.0)
+        {
+            continue;// if the damping factor is too small, the contribution is negligible
+        }
+        aux[ig] *= ModuleBase::libm::exp(std::complex<double>(damping_factor, 0.0))
                 / (rho_basis->gg[ig] * ucell.tpiba2);
     }
 
diff --git a/source/source_pw/module_pwdft/kernels/stress_op.cpp b/source/source_pw/module_pwdft/kernels/stress_op.cpp
index a3b555a110..7c3489e335 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/stress_op.cpp
@@ -633,8 +633,12 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
             {
                 rhocg1 *= ModuleBase::FOUR_PI / omega / 2.0 / gx_arr[igl];
                 FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0;
+                if (-g2a < -230.0)
+                {
+                    continue;
+                }
                 rhocg1 += ModuleBase::FOUR_PI / omega * gx_arr[ngg] * 
-                           ModuleBase::libm::truncated_exp(-g2a) * (g2a + 1)
+                           ModuleBase::libm::exp(-g2a) * (g2a + 1)
                           / pow(gx_arr[igl] * gx_arr[igl], 2);
                 drhocg [igl] = rhocg1;
             }

From 7a739ecf6ced322f773b0b7ede7f2ddf295c7871 Mon Sep 17 00:00:00 2001
From: ubuntu <3158793232@qq.com>
Date: Sun, 8 Feb 2026 22:36:12 +0800
Subject: [PATCH 8/8] add truncated func

---
 source/source_base/truncated_func.h           | 116 ++++++++++++++++++
 source/source_pw/module_pwdft/forces.cpp      |  25 +---
 .../module_pwdft/kernels/force_op.cpp         |  33 ++---
 .../module_pwdft/kernels/stress_op.cpp        |  38 ++----
 source/source_pw/module_pwdft/vl_pw.cpp       |  10 +-
 5 files changed, 139 insertions(+), 83 deletions(-)
 create mode 100644 source/source_base/truncated_func.h

diff --git a/source/source_base/truncated_func.h b/source/source_base/truncated_func.h
new file mode 100644
index 0000000000..55ce64953a
--- /dev/null
+++ b/source/source_base/truncated_func.h
@@ -0,0 +1,116 @@
+#ifndef MODULE_BASE_TRUNCATED_FUNC_H
+#define MODULE_BASE_TRUNCATED_FUNC_H
+
+#include "source_base/libm/libm.h"
+#include <cstdint>
+#include <cstring>
+#include <complex>
+
+namespace ModuleBase
+{
+
+/**
+ * @brief Truncated exponential function to avoid underflow.
+ *
+ * This function returns 0 if the real part of the input is less than -230.0,
+ * otherwise it calls ModuleBase::libm::exp(x).
+ *
+ * @tparam FPTYPE The floating point type (float, double, or complex).
+ * @param x The input value.
+ * @return FPTYPE The result of the exponential function.
+ */
+template <typename FPTYPE>
+inline FPTYPE truncated_exp(FPTYPE x)
+{
+    if (std::real(x) < -230.0)
+    {
+        return static_cast<FPTYPE>(0.0);
+    }
+    return ModuleBase::libm::exp(x);
+}
+
+/**
+ * @brief Truncated complementary error function to avoid underflow for large arguments.
+ *
+ * This function returns 0 if the real part of the input is greater than 20.0,
+ * otherwise it calls std::erfc(x).
+ *
+ * @tparam FPTYPE The floating point type (float, double, or complex).
+ * @param x The input value.
+ * @return FPTYPE The result of the erfc function.
+ */
+template <typename FPTYPE>
+inline FPTYPE truncated_erfc(FPTYPE x)
+{
+    if (std::real(x) > 20.0)
+    {
+        return static_cast<FPTYPE>(0.0);
+    }
+    return std::erfc(x);
+}
+
+/**
+ * @brief Truncated value function to avoid underflow.
+ *
+ * This function returns 0 if the absolute value of the input is less than 1.0e-30,
+ * otherwise it returns the input x.
+ *
+ * @tparam FPTYPE The floating point type (float, double, or complex).
+ * @param x The input value.
+ * @return FPTYPE The result of the truncation.
+ */
+/**
+ * @brief Truncated value function to avoid underflow.
+ *
+ * This function modifies the input to 0 if its absolute value is less than 1.0e-30.
+ *
+ * @tparam FPTYPE The floating point type (float, double, or complex).
+ * @param x The input value to be checked and possibly truncated.
+ */
+template <typename FPTYPE>
+inline void truncated_underflow(FPTYPE& x)
+{
+    if (std::abs(x) < 1.0e-30)
+    {
+        x = static_cast<FPTYPE>(0.0);
+    }
+}
+
+template <>
+inline void truncated_underflow(double& x)
+{
+    const uint64_t u = *reinterpret_cast<const uint64_t*>(&x);
+    // The exponent bits are 52-62 (11 bits). The bias is 1023.
+    // 1e-30 corresponds to -100 in base-2 exponent roughly.
+    // 923 = 1023 - 100.
+    if (((u >> 52) & 0x7FF) <= 923)
+    {
+        x = 0.0;
+    }
+}
+
+template <>
+inline void truncated_underflow(float& x)
+{
+    const uint32_t u = *reinterpret_cast<const uint32_t*>(&x);
+    // The exponent bits are 23-30 (8 bits). The bias is 127.
+    // 1e-30 corresponds to -100 in base-2 exponent roughly.
+    // 27 = 127 - 100.
+    if (((u >> 23) & 0xFF) <= 27)
+    {
+        x = 0.0f;
+    }
+}
+
+template <typename T>
+inline void truncated_underflow(std::complex<T>& x)
+{
+    T* ptr = reinterpret_cast<T*>(&x);
+    truncated_underflow(ptr[0]);
+    truncated_underflow(ptr[1]);
+}
+
+
+} // namespace ModuleBase
+
+#endif // MODULE_BASE_TRUNCATED_FUNC_H
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp
index 8f8b5e7650..a6894c49ca 100644
--- a/source/source_pw/module_pwdft/forces.cpp
+++ b/source/source_pw/module_pwdft/forces.cpp
@@ -5,6 +5,7 @@
 // new
 #include "source_base/complexmatrix.h"
 #include "source_base/libm/libm.h"
+#include "source_base/truncated_func.h"
 #include "source_base/math_integral.h"
 #include "source_base/mathzone.h"
 #include "source_base/timer.h"
@@ -537,21 +538,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         {
             ModuleBase::WARNING_QUIT("ewald", "Can't find optimal alpha.");
         }
-        const double damping_factor = ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha;
-        // damping_factor corresponds to the exponent (G^2 / 4alpha) 
-        // in the Ewald error estimate.
-        // It measures how strongly the terms are damped at the plane-wave cutoff.
-        // Asymptotic behavior: erfc(x) ~ exp(-x^2) for large x. 
-        // Here x^2 = damping_factor.
-        // If damping_factor > 230.0, exp(-230.0) approx 1e-100, 
-        // meaning the error is negligible.
-        if (damping_factor > 230.0)
-        {
-            upperbound = 0.0; 
-            break;
-        }
-        upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI)
-                     * erfc(sqrt(damping_factor));
+        upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI)* ModuleBase::truncated_erfc(sqrt( ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha));
     } while (upperbound > 1.0e-6);
     const int ig0 = rho_basis->ig_gge0;
 #pragma omp parallel for
@@ -561,12 +548,8 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         {
             continue; // skip G=0
         }
-        const double damping_factor = -1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0;
-        if (damping_factor < -230.0)
-        {
-            continue;// if the damping factor is too small, the contribution is negligible
-        }
-        aux[ig] *= ModuleBase::libm::exp(std::complex<double>(damping_factor, 0.0))
+        aux[ig] *= ModuleBase::truncated_exp
+                (-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0)
                 / (rho_basis->gg[ig] * ucell.tpiba2);
     }
 
diff --git a/source/source_pw/module_pwdft/kernels/force_op.cpp b/source/source_pw/module_pwdft/kernels/force_op.cpp
index 85ac655148..0e0c34ccdd 100644
--- a/source/source_pw/module_pwdft/kernels/force_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/force_op.cpp
@@ -1,5 +1,7 @@
 #include "source_pw/module_pwdft/kernels/force_op.h"
 
+#include "source_base/truncated_func.h"
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -109,31 +111,12 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
 
                             for (int ipol = 0; ipol < 3; ipol++)
                             {
-                                #ifdef __SW
-                                const std::complex<FPTYPE>& dbecp_val = dbecp[ipol * nbands * nkb + ib * nkb + inkb];
-                                uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&dbecp_val));
-                                uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
-                                uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
-                                if ((exp_real <= 923) || (exp_imag <= 923)) {
-                                    continue;
-                                }
-                                const std::complex<FPTYPE>& becp_val = becp[ib * nkb + inkb];
-                                uint64_t* becp_parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&becp_val));
-                                uint64_t becp_exp_real = (becp_parts[0] >> 52) & 0x7FF;
-                                uint64_t becp_exp_imag = (becp_parts[1] >> 52) & 0x7FF;
-                                if ((becp_exp_real <= 923) || (becp_exp_imag <= 923)) {
-                                    continue;
-                                }
-                                uint64_t* lf_parts = reinterpret_cast<uint64_t*>(const_cast<FPTYPE*>(&local_force[ipol]));
-                                uint64_t lf_exp = (lf_parts[0] >> 52) & 0x7FF;
-                                if (lf_exp <= 923) {
-                                    continue;
-                                }
-                                #endif
-                                const FPTYPE dbb
-                                    = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) 
-                                    * becp[ib * nkb + inkb])
-                                          .real();
+#ifdef __SW
+                                ModuleBase::truncated_underflow(dbecp[ipol * nbands * nkb + ib * nkb + inkb]);
+                                ModuleBase::truncated_underflow(becp[ib * nkb + inkb]);
+                                ModuleBase::truncated_underflow(local_force[ipol]);
+#endif
+                                const FPTYPE dbb = (conj(dbecp[ipol * nbands * nkb + ib * nkb + inkb]) * becp[ib * nkb + inkb]).real();
                                 local_force[ipol] -= ps * fac * dbb;
                                 // cf[iat*3+ipol] += ps * fac * dbb;
                             }
diff --git a/source/source_pw/module_pwdft/kernels/stress_op.cpp b/source/source_pw/module_pwdft/kernels/stress_op.cpp
index 7c3489e335..6034a52710 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/stress_op.cpp
@@ -1,5 +1,6 @@
 #include "source_pw/module_pwdft/kernels/stress_op.h"
 
+#include "source_base/truncated_func.h"
 #include "source_base/constants.h"
 #include "source_base/libm/libm.h"
 #include "source_base/math_polyint.h"
@@ -140,23 +141,10 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                                 FPTYPE ps = deeq[((spin * deeq_2 + iat + ia) * deeq_3 + ip1) * deeq_4 + ip2] + ps_qq;
                                 const int inkb1 = sum + ia * nproj + ip1;
                                 const int inkb2 = sum + ia * nproj + ip2;
-
-                                #ifdef __SW
-                                const std::complex<FPTYPE>& dbecp_val = dbecp[ib * nkb + inkb1];
-                                uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&dbecp_val));
-                                uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
-                                uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
-                                if ((exp_real <= 923) || (exp_imag <= 923)) {
-                                    continue;
-                                }
-                                const std::complex<FPTYPE>& becp_val = becp[ib * nkb + inkb2];
-                                uint64_t* becp_parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&becp_val));
-                                uint64_t becp_exp_real = (becp_parts[0] >> 52) & 0x7FF;
-                                uint64_t becp_exp_imag = (becp_parts[1] >> 52) & 0x7FF;
-                                if ((becp_exp_real <= 923) || (becp_exp_imag <= 923)) {
-                                    continue;
-                                }
-                                #endif
+#ifdef __SW
+                                ModuleBase::truncated_underflow(dbecp[ib * nkb + inkb1]);
+                                ModuleBase::truncated_underflow(becp[ib * nkb + inkb2]);
+#endif
                                 const FPTYPE dbb = (conj(dbecp[ib * nkb + inkb1]) * becp[ib * nkb + inkb2]).real();
                                 local_stress -= ps * fac * dbb;
                             }
@@ -633,12 +621,8 @@ struct cal_stress_drhoc_aux_op<FPTYPE, base_device::DEVICE_CPU> {
             {
                 rhocg1 *= ModuleBase::FOUR_PI / omega / 2.0 / gx_arr[igl];
                 FPTYPE g2a = (gx_arr[igl]*gx_arr[igl]) / 4.0;
-                if (-g2a < -230.0)
-                {
-                    continue;
-                }
                 rhocg1 += ModuleBase::FOUR_PI / omega * gx_arr[ngg] * 
-                           ModuleBase::libm::exp(-g2a) * (g2a + 1)
+                           ModuleBase::truncated_exp(-g2a) * (g2a + 1)
                           / pow(gx_arr[igl] * gx_arr[igl], 2);
                 drhocg [igl] = rhocg1;
             }
@@ -664,13 +648,9 @@ struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_CPU> {
 #endif
         for (int i = 0; i < npw; i++)
         {
-            #ifdef __SW
-            const std::complex<FPTYPE>& psi_i = psi[i];
-            uint64_t* parts = reinterpret_cast<uint64_t*>(const_cast<std::complex<FPTYPE>*>(&psi_i));
-            uint64_t exp_real = (parts[0] >> 52) & 0x7FF;
-            uint64_t exp_imag = (parts[1] >> 52) & 0x7FF;
-            if ((exp_real <= 923) || (exp_imag <= 923)) {continue;}
-            #endif
+#ifdef __SW
+            ModuleBase::truncated_underflow(psi[i]);
+#endif
             sum += fac * gk1[i] * gk2[i] * d_kfac[i] * std::norm(psi[i]);
         }
         return sum;
diff --git a/source/source_pw/module_pwdft/vl_pw.cpp b/source/source_pw/module_pwdft/vl_pw.cpp
index 5ff21a9aaf..76ee526fbc 100644
--- a/source/source_pw/module_pwdft/vl_pw.cpp
+++ b/source/source_pw/module_pwdft/vl_pw.cpp
@@ -1,6 +1,7 @@
 #include "vl_pw.h"
 #include "source_io/module_parameter/parameter.h"
 #include "source_base/libm/libm.h"
+#include "source_base/truncated_func.h"
 #include "source_base/math_integral.h"
 #include "source_base/timer.h"
 
@@ -226,14 +227,7 @@ void pseudopot_cell_vl::vloc_of_g(const int& msh,
 			aux [ir] = aux1 [ir] * ModuleBase::libm::sin(gx * r [ir]) / gx;
 		}
 		ModuleBase::Integral::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
-		// here we add the analytic fourier transform of the erf function
-		// here exp(-gx2*0.25) < 1e-100, skip it to avoid 
-		// float underflow and abundant calculation
-		if (gx2 > 921)
-		{
-			continue;
-		}
-		vloc_1d[ig] -= fac *  ModuleBase::libm::exp(- gx2 * 0.25)/ gx2;
+		vloc_1d[ig] -= fac *  ModuleBase::truncated_exp(- gx2 * 0.25)/ gx2;
 	} // enddo
 
 	const double d_fpi_omega = ModuleBase::FOUR_PI/ucell.omega;//mohan add 2008-06-04