diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 995919ce6b..158cae85a0 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -524,7 +524,6 @@ OBJS_LCAO=DM_gamma.o\
       LCAO_set_fs.o\
       LCAO_set_st.o\
       LCAO_nl_mu.o\
-      LCAO_nl_beta.o\
       LCAO_nnr.o\
       LCAO_set_zero.o\
       LCAO_allocate.o\
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index c5607db6ba..bbed3c6da3 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -212,7 +212,7 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(Input& inp, UnitCell& ucell) {
 
     // 8) initialize DFT+U
     if (GlobalV::dft_plus_u) {
-        GlobalC::dftu.init(ucell, this->LM, this->kv.get_nks());
+        GlobalC::dftu.init(ucell, this->LM.ParaV, this->kv.get_nks());
     }
 
     // 9) initialize ppcell
@@ -1010,22 +1010,21 @@ void ESolver_KS_LCAO<TK, TR>::iter_finish(int iter) {
         && (!GlobalC::exx_info.info_global.separate_loop
             || iter == 1)) // to avoid saving the same value repeatedly
     {
-        std::vector<TK> Hexxk_save(this->orb_con.ParaV.get_local_size());
+        hamilt::HS_Matrix_K<TK> Hexxk_save(&this->orb_con.ParaV, 1);
         for (int ik = 0; ik < this->kv.get_nks(); ++ik) {
-            ModuleBase::GlobalFunc::ZEROS(Hexxk_save.data(), Hexxk_save.size());
+            Hexxk_save.set_zero_hk();
 
-            hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> opexx_save(
-                &this->LM,
-                nullptr,
-                &Hexxk_save,
-                this->kv);
+            hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> opexx_save(&Hexxk_save, 
+                                                                         &this->LM, 
+                                                                         nullptr, 
+                                                                         this->kv);
 
             opexx_save.contributeHk(ik);
 
             GlobalC::restart.save_disk("Hexx",
                                        ik,
                                        this->orb_con.ParaV.get_local_size(),
-                                       Hexxk_save.data());
+                                       Hexxk_save.get_hk());
         }
         if (GlobalV::MY_RANK == 0) {
             GlobalC::restart.save_disk("Eexx", 0, 1, &this->pelec->f_en.exx);
diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index f248c1429a..2226b591d8 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -170,6 +170,7 @@ void ESolver_KS_LCAO<TK, TR>::beforesolver(const int istep) {
             GlobalV::GAMMA_ONLY_LOCAL ? &(this->GG) : nullptr,
             GlobalV::GAMMA_ONLY_LOCAL ? nullptr : &(this->GK),
             &(this->LM),
+            &this->orb_con.ParaV,
             this->pelec->pot,
             this->kv,
             two_center_bundle_,
@@ -520,11 +521,9 @@ void ESolver_KS_LCAO<std::complex<double>, double>::get_S(void) {
 
     this->RA.for_2d(this->orb_con.ParaV, GlobalV::GAMMA_ONLY_LOCAL);
 
-    this->LM.ParaV = &this->orb_con.ParaV;
-
     if (this->p_hamilt == nullptr) {
         this->p_hamilt = new hamilt::HamiltLCAO<std::complex<double>, double>(
-            &this->LM,
+            &this->orb_con.ParaV,
             this->kv,
             *(two_center_bundle_.overlap_orb));
         dynamic_cast<hamilt::OperatorLCAO<std::complex<double>, double>*>(
@@ -569,7 +568,7 @@ void ESolver_KS_LCAO<std::complex<double>, std::complex<double>>::get_S(void) {
     if (this->p_hamilt == nullptr) {
         this->p_hamilt = new hamilt::HamiltLCAO<std::complex<double>,
                                                 std::complex<double>>(
-            &this->LM,
+            &this->orb_con.ParaV,
             this->kv,
             *(two_center_bundle_.overlap_orb));
         dynamic_cast<
@@ -593,12 +592,12 @@ void ESolver_KS_LCAO<std::complex<double>, std::complex<double>>::get_S(void) {
 }
 
 template <typename TK, typename TR>
-void ESolver_KS_LCAO<TK, TR>::nscf(void) {
+void ESolver_KS_LCAO<TK, TR>::nscf() {
     ModuleBase::TITLE("ESolver_KS_LCAO", "nscf");
 
     std::cout << " NON-SELF CONSISTENT CALCULATIONS" << std::endl;
 
-    time_t time_start = std::time(NULL);
+    time_t time_start = std::time(nullptr);
 
 #ifdef __EXX
 #ifdef __MPI
@@ -631,7 +630,7 @@ void ESolver_KS_LCAO<TK, TR>::nscf(void) {
                                  "HSolver has not been initialed!");
     }
 
-    time_t time_finish = std::time(NULL);
+    time_t time_finish = std::time(nullptr);
     ModuleBase::GlobalFunc::OUT_TIME("cal_bands", time_start, time_finish);
 
     GlobalV::ofs_running << " end of band structure calculation " << std::endl;
diff --git a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
index 2a16b09b17..7a5f9fdbcd 100644
--- a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
+++ b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
@@ -19,7 +19,8 @@ namespace ModuleESolver
 		GlobalC::dftu.cal_occup_m_gamma(
 				iter, 
 				dm, 
-				this->p_chgmix->get_mixing_beta());
+				this->p_chgmix->get_mixing_beta(),
+                this->p_hamilt);
     }
 
     //! dftu occupation matrix for multiple k-points using dm(complex)
diff --git a/source/module_hamilt_general/matrixblock.h b/source/module_hamilt_general/matrixblock.h
index e75cf4cac5..481a95b8a5 100644
--- a/source/module_hamilt_general/matrixblock.h
+++ b/source/module_hamilt_general/matrixblock.h
@@ -12,7 +12,7 @@ template <typename T> struct MatrixBlock
     T* p;
     size_t row;
     size_t col;
-    int* desc;
+    const int* desc;
 };
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/CMakeLists.txt b/source/module_hamilt_lcao/hamilt_lcaodft/CMakeLists.txt
index 68990c0808..0229e23779 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/CMakeLists.txt
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/CMakeLists.txt
@@ -36,8 +36,7 @@ if(ENABLE_LCAO)
         LCAO_nnr.cpp
 		LCAO_set_fs.cpp
 		LCAO_set_st.cpp
-		LCAO_nl_mu.cpp
-		LCAO_nl_beta.cpp
+		LCAO_nl_mu.cpp	
         LCAO_set_zero.cpp
         LCAO_allocate.cpp
 		record_adj.cpp
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index ada447c66d..5aec438f66 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -240,7 +240,6 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
         if (GlobalV::dft_plus_u == 2)
         {
             GlobalC::dftu.force_stress(pelec,
-                                       lm,
                                        pv,
                                        fsr, // mohan 2024-06-16
                                        force_dftu,
@@ -249,15 +248,13 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
         }
         else
         {
-            hamilt::DFTU<hamilt::OperatorLCAO<T, double>> tmp_dftu(&lm,
+            hamilt::DFTU<hamilt::OperatorLCAO<T, double>> tmp_dftu(nullptr, // HK and SK are not used for force&stress
                                                                    kv.kvec_d,
-                                                                   nullptr,
-                                                                   nullptr,
+                                                                   nullptr, // HR are not used for force&stress
                                                                    GlobalC::ucell,
                                                                    &GlobalC::GridD,
                                                                    two_center_bundle.overlap_orb_onsite.get(),
-                                                                   &GlobalC::dftu,
-                                                                   *(lm.ParaV));
+                                                                   &GlobalC::dftu);
 
             tmp_dftu.cal_force_stress(isforce, isstress, force_dftu, stress_dftu);
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index 512d90658c..e5e0681e70 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -77,7 +77,7 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
                               pv,
                               two_center_bundle,
                               &GlobalC::GridD,
-                              lm.Sloc.data());
+                              nullptr);
 
     // calculate dT in LCAP
     // allocation dt
@@ -101,11 +101,11 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
                               pv,
                               two_center_bundle,
                               &GlobalC::GridD,
-                              lm.Hloc_fixed.data());
+                              nullptr);
 
     LCAO_domain::build_Nonlocal_mu_new(lm,
                                        fsr,
-                                       lm.Hloc_fixed.data(),
+                                       nullptr,
                                        cal_deri,
                                        GlobalC::ucell,
                                        GlobalC::ORB,
@@ -116,8 +116,9 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
     if (INPUT.cal_syns)
     {
         cal_deri = false;
-
-        lm.zeros_HSgamma('S');
+        ModuleBase::WARNING_QUIT("cal_syns", "this function has been broken and will be fixed later.");
+        /*
+        std::vector<double> Sloc(pv.nloc, 0.0);
 
         LCAO_domain::build_ST_new(lm,
                                   fsr,
@@ -128,7 +129,7 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
                                   pv,
                                   two_center_bundle,
                                   &GlobalC::GridD,
-                                  lm.Sloc.data(),
+                                  Sloc.data(),
                                   INPUT.cal_syns,
                                   INPUT.dmax);
 
@@ -147,7 +148,7 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
                            GlobalV::DRANK);
 
         ModuleIO::save_mat(0,
-                           lm.Sloc.data(),
+                           Sloc.data(),
                            GlobalV::NLOCAL,
                            bit,
                            GlobalV::out_ndigits,
@@ -156,7 +157,7 @@ void Force_LCAO<double>::allocate(const Parallel_Orbitals& pv,
                            "S",
                            "data-" + std::to_string(0),
                            pv,
-                           GlobalV::DRANK);
+                           GlobalV::DRANK);*/
     }
 
     ModuleBase::timer::tick("Force_LCAO", "allocate");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index e0044d709e..56953592fd 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -131,7 +131,7 @@ void Force_LCAO<std::complex<double>>::allocate(const Parallel_Orbitals& pv,
     // calculate dVnl=<phi|dVnl|dphi> in LCAO
     LCAO_domain::build_Nonlocal_mu_new(lm,
                                        fsr,
-                                       lm.Hloc_fixed.data(),
+                                       nullptr,
                                        cal_deri,
                                        GlobalC::ucell,
                                        GlobalC::ORB,
@@ -143,6 +143,9 @@ void Force_LCAO<std::complex<double>>::allocate(const Parallel_Orbitals& pv,
     {
         cal_deri = false;
 
+        ModuleBase::WARNING_QUIT("cal_syns",
+                                 "This function has been broken and will be fixed later.");
+
         LCAO_domain::build_ST_new(lm,
                                   fsr,
                                   'S',
@@ -158,10 +161,9 @@ void Force_LCAO<std::complex<double>>::allocate(const Parallel_Orbitals& pv,
 
         for (int ik = 0; ik < nks; ik++)
         {
-            lm.zeros_HSk('S');
-            lm.folding_fixedH(ik, kvec_d, 1);
+            
             bool bit = false; // LiuXh, 2017-03-21
-            ModuleIO::save_mat(0,
+            /*ModuleIO::save_mat(0,
                                lm.Hloc2.data(),
                                GlobalV::NLOCAL,
                                bit,
@@ -183,7 +185,7 @@ void Force_LCAO<std::complex<double>>::allocate(const Parallel_Orbitals& pv,
                                "S",
                                "data-" + std::to_string(ik),
                                pv,
-                               GlobalV::DRANK);
+                               GlobalV::DRANK);*/
         }
     }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
index 68dcd2ef04..aa7a20c9e2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_allocate.cpp
@@ -12,12 +12,6 @@ void divide_HS_in_frag(LCAO_Matrix& lm, const bool isGamma, Parallel_Orbitals& p
     //(1), (2): set up matrix division have been moved into ORB_control
     // just pass `ParaV` as pointer is enough
     lm.ParaV = &pv;
-    // (3) allocate for S, H_fixed, H, and S_diag
-    if (isGamma) {
-        LCAO_domain::allocate_HS_gamma(lm, lm.ParaV->nloc);
-    } else {
-        LCAO_domain::allocate_HS_k(lm, lm.ParaV->nloc);
-    }
 #ifdef __DEEPKS
     // wenfei 2021-12-19
     // preparation for DeePKS
@@ -48,51 +42,4 @@ void divide_HS_in_frag(LCAO_Matrix& lm, const bool isGamma, Parallel_Orbitals& p
     return;
 }
 
-void allocate_HS_gamma(LCAO_Matrix& lm, const long& nloc) {
-    ModuleBase::TITLE("LCAO_Matrix", "allocate_HS_gamma");
-
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "nloc", nloc);
-
-    if (nloc == 0) {
-        return;
-    }
-
-    // because we initilize in the constructor function
-    // with dimension '1', so here we reconstruct these
-    // matrices
-
-    lm.Sloc.resize(nloc);
-    lm.Hloc_fixed.resize(nloc);
-    lm.Hloc.resize(nloc);
-
-    ModuleBase::GlobalFunc::ZEROS(lm.Sloc.data(), nloc);
-    ModuleBase::GlobalFunc::ZEROS(lm.Hloc_fixed.data(), nloc);
-    ModuleBase::GlobalFunc::ZEROS(lm.Hloc.data(), nloc);
-
-    return;
-}
-
-void allocate_HS_k(LCAO_Matrix& lm, const long& nloc) {
-    ModuleBase::TITLE("LCAO_Matrix", "allocate_HS_k");
-
-    ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "nloc", nloc);
-
-    if (nloc == 0) {
-        return; // mohan fix bug 2012-05-25
-    }
-
-    // because we initilize in the constructor function
-    // with dimension '1', so here we reconstruct these
-    // matrices
-    lm.Sloc2.resize(nloc);
-    lm.Hloc_fixed2.resize(nloc);
-    lm.Hloc2.resize(nloc);
-
-    ModuleBase::GlobalFunc::ZEROS(lm.Sloc2.data(), nloc);
-    ModuleBase::GlobalFunc::ZEROS(lm.Hloc_fixed2.data(), nloc);
-    ModuleBase::GlobalFunc::ZEROS(lm.Hloc2.data(), nloc);
-
-    return;
-}
-
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h
index 8ed6c57850..2f27bda1e2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h
@@ -16,14 +16,6 @@
 namespace LCAO_domain
 {
 
-// can used in gamma algorithm.
-void build_Nonlocal_beta_new(LCAO_Matrix& lm,
-                             double* Hloc,
-                             const UnitCell& ucell,
-                             const LCAO_Orbitals& orb,
-                             const TwoCenterIntegrator& intor_orb_beta,
-                             Grid_Driver* GridD);
-
 void build_Nonlocal_mu_new(LCAO_Matrix& lm,
                            ForceStressArrays& fsr, // mohan 2024-06-16
                            double* HlocR,
@@ -173,10 +165,6 @@ void zeros_HSR(const char& mtype, LCAO_HS_Arrays& HS_arrays);
 
 void divide_HS_in_frag(LCAO_Matrix& lm, const bool isGamma, Parallel_Orbitals& pv, const int& nks);
 
-void allocate_HS_k(LCAO_Matrix& lm, const long& nloc);
-
-void allocate_HS_gamma(LCAO_Matrix& lm, const long& nloc);
-
 } // namespace LCAO_domain
 
 #endif
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.cpp
index bd89d3d82d..489b72b780 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.cpp
@@ -11,79 +11,8 @@ LCAO_Matrix::LCAO_Matrix() {}
 
 LCAO_Matrix::~LCAO_Matrix() {}
 
-void LCAO_Matrix::set_HSgamma(const int& iw1_all,
-                              const int& iw2_all,
-                              const double& v,
-                              double* HSloc) {
+void LCAO_Matrix::set_HSgamma(const int& iw1_all, const int& iw2_all, const double& v, double* HSloc)
+{
     LCAO_Matrix::set_mat2d<double>(iw1_all, iw2_all, v, *this->ParaV, HSloc);
     return;
-}
-
-void LCAO_Matrix::zeros_HSgamma(const char& mtype) {
-    auto zeros_HSgamma_ker = [&](int num_threads, int thread_id) {
-        long long beg, len;
-        if (mtype == 'S') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Sloc.size(),
-                                           (long long)512,
-                                           beg,
-                                           len);
-
-            ModuleBase::GlobalFunc::ZEROS(this->Sloc.data() + beg, len);
-        } else if (mtype == 'T') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Hloc_fixed.size(),
-                                           (long long)512,
-                                           beg,
-                                           len);
-
-            ModuleBase::GlobalFunc::ZEROS(this->Hloc_fixed.data() + beg, len);
-        } else if (mtype == 'H') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Hloc.size(),
-                                           (long long)512,
-                                           beg,
-                                           len);
-
-            ModuleBase::GlobalFunc::ZEROS(this->Hloc.data() + beg, len);
-        }
-    };
-    ModuleBase::OMP_PARALLEL(zeros_HSgamma_ker);
-    return;
-}
-
-void LCAO_Matrix::zeros_HSk(const char& mtype) {
-    auto zeros_HSk_ker = [&](int num_threads, int thread_id) {
-        long long beg, len;
-        if (mtype == 'S') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Sloc2.size(),
-                                           (long long)256,
-                                           beg,
-                                           len);
-            ModuleBase::GlobalFunc::ZEROS(this->Sloc2.data() + beg, len);
-        } else if (mtype == 'T') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Hloc_fixed2.size(),
-                                           (long long)256,
-                                           beg,
-                                           len);
-            ModuleBase::GlobalFunc::ZEROS(this->Hloc_fixed2.data() + beg, len);
-        } else if (mtype == 'H') {
-            ModuleBase::BLOCK_TASK_DIST_1D(num_threads,
-                                           thread_id,
-                                           (long long)this->Hloc2.size(),
-                                           (long long)256,
-                                           beg,
-                                           len);
-            ModuleBase::GlobalFunc::ZEROS(this->Hloc2.data() + beg, len);
-        }
-    };
-    ModuleBase::OMP_PARALLEL(zeros_HSk_ker);
-    return;
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
index ed1b67617b..54fbc7b520 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h
@@ -19,12 +19,6 @@ class LCAO_Matrix {
     LCAO_Matrix();
     ~LCAO_Matrix();
 
-    // folding the fixed Hamiltonian (T+Vnl) if
-    // k-point algorithm is used.
-    void folding_fixedH(const int& ik,
-                        const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                        bool cal_syns = false);
-
     Parallel_Orbitals* ParaV;
 
 #ifdef __EXX
@@ -38,27 +32,6 @@ class LCAO_Matrix {
 #endif
 
   public:
-    //------------------------------
-    // H, S, Hfixed
-    // used in gamma only algorithm.
-    // thse matrix are used to
-    // diagonalize.
-    //------------------------------
-    std::vector<double> Hloc;
-    std::vector<double> Sloc;
-    std::vector<double> Hloc_fixed;
-
-    //------------------------------
-    // 1. Hamiltonian(vl),
-    // 2. overlap matrix Sloc2
-    // 3. fixed (vna+T+Vnl) matrix.
-    // used in kpoint algorithm.
-    // these matrix are used to
-    // diagonalize.
-    //------------------------------
-    std::vector<std::complex<double>> Hloc2;
-    std::vector<std::complex<double>> Sloc2;
-    std::vector<std::complex<double>> Hloc_fixed2;
 
     // Record all R direct coordinate information, even if HR or SR is a zero
     // matrix
@@ -79,10 +52,6 @@ class LCAO_Matrix {
                      const int& iw2_all,
                      const double& v,
                      double* HSloc);
-
-    void zeros_HSgamma(const char& mtype);
-
-    void zeros_HSk(const char& mtype);
 };
 
 #include "LCAO_matrix.hpp"
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
deleted file mode 100644
index e4f526a502..0000000000
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-#include "module_base/timer.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
-
-#ifdef __MKL
-#include <mkl_service.h> // mkl_get_max_threads
-#endif
-
-namespace LCAO_domain
-{
-
-void build_Nonlocal_beta_new(LCAO_Matrix& lm,
-                             double* HSloc,
-                             const UnitCell& ucell,
-                             const LCAO_Orbitals& orb,
-                             const TwoCenterIntegrator& intor_orb_beta,
-                             Grid_Driver* GridD) // update by liuyu 2021-04-07
-{
-    ModuleBase::TITLE("LCAO_domain", "b_NL_beta_new");
-    ModuleBase::timer::tick("LCAO_domain", "b_NL_beta_new");
-
-    const Parallel_Orbitals* pv = lm.ParaV;
-    const int npol = GlobalV::NPOL;
-
-#ifdef __MKL
-    const int mkl_threads = mkl_get_max_threads();
-    mkl_set_num_threads(1);
-#endif
-
-    const std::vector<AdjacentAtomInfo> adjs_all = GridD->get_adjs(ucell);
-
-#ifdef _OPENMP
-#pragma omp parallel
-    {
-        double* Nonlocal_thread;
-        Nonlocal_thread = new double[pv->nloc];
-        ModuleBase::GlobalFunc::ZEROS(Nonlocal_thread, pv->nloc);
-#pragma omp for schedule(dynamic)
-#endif
-        for (int iat = 0; iat < ucell.nat; iat++)
-        {
-            const int T0 = ucell.iat2it[iat];
-            const int I0 = ucell.iat2ia[iat];
-            Atom* atom0 = &ucell.atoms[T0];
-
-//=======================================================
-// Step1:
-// saves <beta|psi>, where beta runs over L0,M0 on atom I0
-// and psi runs over atomic basis sets on the current core
-//=======================================================
-#ifdef _OPENMP
-            std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot_thread;
-            nlm_tot_thread.resize(adjs_all[iat].adj_num + 1);
-#else
-        std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
-        nlm_tot.resize(adjs_all[iat].adj_num + 1);
-#endif
-
-            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            const double Rcut_Beta = ucell.infoNL.Beta[T0].get_rcut_max();
-
-            // outermost loop : all adjacent atoms
-            for (int ad_count = 0; ad_count < adjs_all[iat].adj_num + 1; ad_count++)
-            {
-                const int T1 = adjs_all[iat].ntype[ad_count];
-                const int I1 = adjs_all[iat].natom[ad_count];
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-                const ModuleBase::Vector3<double> tau1 = adjs_all[iat].adjacent_tau[ad_count];
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw * npol;
-
-#ifdef _OPENMP
-                nlm_tot_thread[ad_count].clear();
-#else
-            nlm_tot[ad_count].clear();
-#endif
-
-                // middle loop : atomic basis on current processor (either row or column)
-                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-                if (dist1 > Rcut_Beta + Rcut_AO1)
-                {
-                    continue;
-                }
-
-                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
-                {
-                    const int iw1_all = start1 + iw1;
-                    const int iw1_local = pv->global2local_row(iw1_all);
-                    const int iw2_local = pv->global2local_col(iw1_all);
-
-                    if (iw1_local < 0 && iw2_local < 0)
-                        continue;
-
-                    const int iw1_0 = iw1 / npol;
-                    std::vector<std::vector<double>> nlm;
-                    // 2D, but first dimension is only 1 here
-                    // for force, the right hand side is the gradient
-                    // and the first dimension is then 3
-                    // inner loop : all projectors (L0,M0)
-
-                    int L1 = atom1->iw2l[iw1_0];
-                    int N1 = atom1->iw2n[iw1_0];
-                    int m1 = atom1->iw2m[iw1_0];
-
-                    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
-
-                    ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
-                    intor_orb_beta.snap(T1, L1, N1, M1, T0, dtau * ucell.lat0, false, nlm);
-
-#ifdef _OPENMP
-                    nlm_tot_thread[ad_count].insert({iw1_all, nlm[0]});
-#else
-                nlm_tot[ad_count].insert({iw1_all, nlm[0]});
-#endif
-                } // end iw
-            }     // end ad
-
-            //=======================================================
-            // Step2:
-            // calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
-            // and accumulate the value to Hloc_fixed(i,j)
-            //=======================================================
-            for (int ad1_count = 0; ad1_count < adjs_all[iat].adj_num + 1; ad1_count++)
-            {
-                const int T1 = adjs_all[iat].ntype[ad1_count];
-                const int I1 = adjs_all[iat].natom[ad1_count];
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-                const ModuleBase::Vector3<double> tau1 = adjs_all[iat].adjacent_tau[ad1_count];
-                const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw * npol;
-                const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-                for (int ad2_count = 0; ad2_count < adjs_all[iat].adj_num + 1; ad2_count++)
-                {
-                    const int T2 = adjs_all[iat].ntype[ad2_count];
-                    const int I2 = adjs_all[iat].natom[ad2_count];
-                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-                    const ModuleBase::Vector3<double> tau2 = adjs_all[iat].adjacent_tau[ad2_count];
-                    const Atom* atom2 = &ucell.atoms[T2];
-                    const int nw2_tot = atom2->nw * npol;
-                    const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
-                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
-
-                    if (dist1 > Rcut_Beta + Rcut_AO1 || dist2 > Rcut_Beta + Rcut_AO2)
-                    {
-                        continue;
-                    }
-
-                    for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
-                    {
-                        const int iw1_all = start1 + iw1;
-                        const int iw1_local = pv->global2local_row(iw1_all);
-                        if (iw1_local < 0)
-                            continue;
-                        const int iw1_0 = iw1 / npol;
-                        for (int iw2 = 0; iw2 < nw2_tot; ++iw2)
-                        {
-                            const int iw2_all = start2 + iw2;
-                            const int iw2_local = pv->global2local_col(iw2_all);
-                            if (iw2_local < 0)
-                                continue;
-                            const int iw2_0 = iw2 / npol;
-#ifdef _OPENMP
-                            std::vector<double> nlm1 = nlm_tot_thread[ad1_count][iw1_all];
-                            std::vector<double> nlm2 = nlm_tot_thread[ad2_count][iw2_all];
-#else
-                        std::vector<double> nlm1 = nlm_tot[ad1_count][iw1_all];
-                        std::vector<double> nlm2 = nlm_tot[ad2_count][iw2_all];
-#endif
-
-                            assert(nlm1.size() == nlm2.size());
-#ifdef _OPENMP
-                            double nlm_thread = 0.0;
-#else
-                        double nlm = 0.0;
-#endif
-                            const int nproj = ucell.infoNL.nproj[T0];
-                            int ib = 0;
-                            for (int nb = 0; nb < nproj; nb++)
-                            {
-                                const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-                                for (int m = 0; m < 2 * L0 + 1; m++)
-                                {
-#ifdef _OPENMP
-                                    nlm_thread += nlm1[ib] * nlm2[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
-#else
-                                nlm += nlm1[ib] * nlm2[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
-#endif
-                                    ib += 1;
-                                }
-                            }
-                            assert(ib == nlm1.size());
-
-                            const int ir = pv->global2local_row(iw1_all);
-                            const int ic = pv->global2local_col(iw2_all);
-                            long index = 0;
-                            if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
-                            {
-                                index = ic * pv->nrow + ir;
-                            }
-                            else
-                            {
-                                index = ir * pv->ncol + ic;
-                            }
-#ifdef _OPENMP
-                            Nonlocal_thread[index] += nlm_thread;
-#else
-                        lm.set_HSgamma(iw1_all, iw2_all, nlm, HSloc);
-#endif
-                        } // iw2
-                    }     // iw1
-                }         // ad2
-            }             // ad1
-        }                 // end iat
-
-#ifdef _OPENMP
-#pragma omp critical(cal_nonlocal)
-        {
-            for (int i = 0; i < pv->nloc; i++)
-            {
-                lm.Hloc_fixed[i] += Nonlocal_thread[i];
-            }
-        }
-        delete[] Nonlocal_thread;
-#endif
-#ifdef _OPENMP
-    }
-#endif
-
-#ifdef __MKL
-    mkl_set_num_threads(mkl_threads);
-#endif
-
-    ModuleBase::timer::tick("LCAO_domain", "b_NL_beta_new");
-    return;
-}
-
-} // namespace LCAO_domain
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
index 25a0214cc7..13399b6722 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp
@@ -289,221 +289,3 @@ int Grid_Technique::binary_search_find_R2_offset(int val, int iat) const
     }
     return -1;
 }
-
-// be called in LCAO_Hamilt::calculate_Hk.
-void LCAO_Matrix::folding_fixedH(
-						const int &ik, 
-						const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-						bool cal_syns)
-{
-	ModuleBase::TITLE("LCAO_nnr","folding_fixedH");
-    ModuleBase::timer::tick("LCAO_nnr", "folding_fixedH");
-    const Parallel_Orbitals* pv = this->ParaV;
-
-	int tot_index = 0;
-#ifdef _OPENMP
-#pragma omp parallel reduction(+:tot_index)
-{
-	const int num_threads = omp_get_num_threads();
-	const int thread_id = omp_get_thread_num();
-#else
-	const int num_threads = 1;
-	const int thread_id = 0;
-#endif
-	ModuleBase::Vector3<double> dtau;
-	ModuleBase::Vector3<double> tau1;
-	ModuleBase::Vector3<double> tau2;
-
-	ModuleBase::Vector3<double> dtau1;
-	ModuleBase::Vector3<double> dtau2;
-	ModuleBase::Vector3<double> tau0;
-
-#ifdef __DEEPKS
-	if (GlobalV::deepks_scf)
-    {
-		int beg, len;
-		ModuleBase::BLOCK_TASK_DIST_1D(num_threads, thread_id, (int)pv->nloc, 1024, beg, len);
-		ModuleBase::GlobalFunc::ZEROS(GlobalC::ld.H_V_delta_k[ik].data() + beg, len);
-	}
-#endif
-
-#ifdef _OPENMP
-#pragma omp for schedule(dynamic)
-#endif
-	for (int iat=0; iat<GlobalC::ucell.nat; ++iat)
-	{
-		const int T1 = GlobalC::ucell.iat2it[iat];
-		Atom* atom1 = &GlobalC::ucell.atoms[T1];
-		const int I1 = GlobalC::ucell.iat2ia[iat];
-		{
-			tau1 = atom1->tau[I1];
-			//GlobalC::GridD.Find_atom(tau1);
-			AdjacentAtomInfo adjs;
-			GlobalC::GridD.Find_atom(GlobalC::ucell, tau1, T1, I1, &adjs);
-			Atom* atom1 = &GlobalC::ucell.atoms[T1];
-			const int start = GlobalC::ucell.itiaiw2iwt(T1,I1,0);
-			int index = pv->nlocstart[iat];
-
-			if (cal_syns)
-            {
-                for (int k = 0; k < 3; k++)
-                {
-                    tau1[k] = tau1[k] - atom1->vel[I1][k] * INPUT.mdp.md_dt / GlobalC::ucell.lat0 ;
-                }
-            }
-
-			// (2) search among all adjacent atoms.
-			for (int ad = 0; ad < adjs.adj_num+1; ++ad)
-			{
-				const int T2 = adjs.ntype[ad];
-				const int I2 = adjs.natom[ad];
-				Atom* atom2 = &GlobalC::ucell.atoms[T2];
-
-				tau2 = adjs.adjacent_tau[ad];
-				dtau = tau2 - tau1;
-				double distance = dtau.norm() * GlobalC::ucell.lat0;
-				double rcut = GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ORB.Phi[T2].getRcut();
-
-				bool adj = false;
-
-				if(distance < rcut) 
-				{
-					adj = true;
-				}
-				else if(distance >= rcut)
-				{
-					for (int ad0 = 0; ad0 < adjs.adj_num+1; ++ad0)
-					{
-						const int T0 = adjs.ntype[ad0]; 
-						const int I0 = adjs.natom[ad0]; 
-						//const int iat0 = GlobalC::ucell.itia2iat(T0, I0);
-						//const int start0 = GlobalC::ucell.itiaiw2iwt(T0, I0, 0);
-
-						tau0 = adjs.adjacent_tau[ad0];
-						dtau1 = tau0 - tau1;
-						dtau2 = tau0 - tau2;
-
-						double distance1 = dtau1.norm() * GlobalC::ucell.lat0;
-						double distance2 = dtau2.norm() * GlobalC::ucell.lat0;
-
-						double rcut1 = GlobalC::ORB.Phi[T1].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max();
-						double rcut2 = GlobalC::ORB.Phi[T2].getRcut() + GlobalC::ucell.infoNL.Beta[T0].get_rcut_max();
-
-						if( distance1 < rcut1 && distance2 < rcut2 )
-						{
-							adj = true;
-							break;
-						}
-					}
-				}
-
-				if(adj) // mohan fix bug 2011-06-26, should not be '<='
-				{
-					// (3) calculate the nu of atom (T2, I2)
-					const int start2 = GlobalC::ucell.itiaiw2iwt(T2,I2,0);
-					//------------------------------------------------
-					// exp(k dot dR)
-					// dR is the index of box in Crystal coordinates
-					//------------------------------------------------
-					ModuleBase::Vector3<double> dR(adjs.box[ad].x, adjs.box[ad].y, adjs.box[ad].z); 
-					const double arg = ( kvec_d[ik] * dR ) * ModuleBase::TWO_PI;
-					//const double arg = ( kvec_d[ik] * GlobalC::GridD.getBox(ad) ) * ModuleBase::TWO_PI;
-					double sinp, cosp;
-					ModuleBase::libm::sincos(arg, &sinp, &cosp);
-					const std::complex<double> kphase = std::complex <double> ( cosp,  sinp );
-
-					//--------------------------------------------------
-					// calculate how many matrix elements are in 
-					// this processor.
-					//--------------------------------------------------
-					//########################### EXPLAIN ###############################
-					// 1. overlap matrix with k point
-					// this->SlocR = < phi_0i | phi_Rj >, where 0, R are the cell index
-					// while i,j are the orbital index.
-
-					// 2. H_fixed=T+Vnl matrix element with k point (if Vna is not used).
-					// H_fixed=T+Vnl+Vna matrix element with k point (if Vna is used).
-					// this->Hloc_fixed = < phi_0i | H_fixed | phi_Rj>
-
-					// 3. H(k) |psi(k)> = S(k) | psi(k)> 
-					// Sloc2 is used to diagonalize for a give k point.
-					// Hloc_fixed2 is used to diagonalize (eliminate index R).
-					//###################################################################
-#define NW_IJ_LOOP_BEGIN() \
-					for(int ii=0; ii<atom1->nw*GlobalV::NPOL; ii++)				\
-					{															\
-						/* the index of orbitals in this processor */			\
-						const int iw1_all = start + ii;							\
-						const int mu = pv->global2local_row(iw1_all);				\
-						if(mu<0) {continue;}									\
-						for(int jj=0; jj<atom2->nw*GlobalV::NPOL; jj++)			\
-						{														\
-							int iw2_all = start2 + jj;							\
-							const int nu = pv->global2local_col(iw2_all);			\
-							if(nu<0) {continue;}
-
-								// DO STH.
-
-#define NW_IJ_LOOP_END() \
-							++index;											\
-							++tot_index;										\
-						}/*end jj*/												\
-					}/*end ii*/
-
-					if(GlobalV::NSPIN!=4)
-					{
-						auto compute_kernel = [&](int iic, int index) {
-#ifdef __DEEPKS
-							if(GlobalV::deepks_scf)
-							{
-								GlobalC::ld.H_V_delta_k[ik][iic] += GlobalC::ld.H_V_deltaR[index] * kphase;
-							}
-#endif
-						};
-						if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
-						{
-							NW_IJ_LOOP_BEGIN()
-							compute_kernel(mu+nu*pv->nrow, index);
-							NW_IJ_LOOP_END()
-						} else {
-							NW_IJ_LOOP_BEGIN()
-							compute_kernel(mu*pv->ncol+nu, index);
-							NW_IJ_LOOP_END()
-						}
-					}
-					else
-					{
-						auto compute_kernel = [&](int iic, int index, int iw1_all, int iw2_all) {
-#ifdef __DEEPKS
-							if(GlobalV::deepks_scf)
-							{
-								if (iw1_all % 2 == iw2_all % 2)
-								{
-									GlobalC::ld.H_V_delta_k[ik][iic] += GlobalC::ld.H_V_deltaR[index] * kphase;
-								}
-							}
-#endif
-						};
-						if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
-						{
-							NW_IJ_LOOP_BEGIN()
-							compute_kernel(mu+nu*pv->nrow, index, iw1_all, iw2_all);
-							NW_IJ_LOOP_END()
-						} else {
-							NW_IJ_LOOP_BEGIN()
-							compute_kernel(mu*pv->ncol+nu, index, iw1_all, iw2_all);
-							NW_IJ_LOOP_END()
-						}
-					}
-				}
-			}// end ad
-		}// end I1
-	} // end T1
-#ifdef _OPENMP
-}
-#endif
-	assert(tot_index==this->ParaV->nnr);
-
-	ModuleBase::timer::tick("LCAO_nnr","folding_fixedH");
-	return;
-}
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index 0a71ebff5e..1513a4f3a7 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -36,32 +36,31 @@ namespace hamilt
 {
 
 template <typename TK, typename TR>
-HamiltLCAO<TK, TR>::HamiltLCAO(LCAO_Matrix* LM_in, const K_Vectors& kv_in, const TwoCenterIntegrator& intor_overlap_orb)
+HamiltLCAO<TK, TR>::HamiltLCAO(const Parallel_Orbitals* paraV, const K_Vectors& kv_in, const TwoCenterIntegrator& intor_overlap_orb)
 {
     this->classname = "HamiltLCAO";
 
     this->kv = &kv_in;
 
     // Real space Hamiltonian is inited with template TR
-    this->hR = new HContainer<TR>(LM_in->ParaV);
-    this->sR = new HContainer<TR>(LM_in->ParaV);
+    this->hR = new HContainer<TR>(paraV);
+    this->sR = new HContainer<TR>(paraV);
+    this->hsk = new HS_Matrix_K<TK>(paraV);
 
-    this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+    this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                this->kv->kvec_d,
                                                                this->hR,
-                                                               &(this->getHk(LM_in)),
                                                                this->sR,
-                                                               &(this->getSk(LM_in)),
                                                                &GlobalC::ucell,
                                                                &GlobalC::GridD,
-                                                               &intor_overlap_orb,
-                                                               LM_in->ParaV);
+                                                               &intor_overlap_orb);
 }
 
 template <typename TK, typename TR>
 HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                                Gint_k* GK_in,
                                LCAO_Matrix* LM_in,
+                               const Parallel_Orbitals* paraV,
                                elecstate::Potential* pot_in,
                                const K_Vectors& kv_in,
                                const TwoCenterBundle& two_center_bundle,
@@ -72,13 +71,9 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
     this->classname = "HamiltLCAO";
 
     // Real space Hamiltonian is inited with template TR
-    this->hR = new HContainer<TR>(LM_in->ParaV);
-    this->sR = new HContainer<TR>(LM_in->ParaV);
-
-    const std::size_t row_col_size
-        = static_cast<std::size_t>(LM_in->ParaV->get_row_size()) * LM_in->ParaV->get_col_size();
-    this->getSk(LM_in).resize(row_col_size);
-    this->getHk(LM_in).resize(row_col_size);
+    this->hR = new HContainer<TR>(paraV);
+    this->sR = new HContainer<TR>(paraV);
+    this->hsk = new HS_Matrix_K<TK>(paraV);
 
     // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>) is registered without template
     std::vector<std::string> pot_register_in;
@@ -120,51 +115,42 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
         this->hR->fix_gamma();
         // initial operator for Gamma_only case
         // overlap term (<psi|psi>) is indispensable
-        // in Gamma_only case, target SR is LCAO_Matrix::Sloc, which is same as SK
-        this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+        // in Gamma_only case, target SK is this->hsk->get_sk(), the target SR is this->sR
+        this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                    this->kv->kvec_d,
                                                                    this->hR,
-                                                                   &(this->getHk(LM_in)),
                                                                    this->sR,
-                                                                   &(this->getSk(LM_in)),
                                                                    &GlobalC::ucell,
                                                                    &GlobalC::GridD,
-                                                                   two_center_bundle.overlap_orb.get(),
-                                                                   LM_in->ParaV);
+                                                                   two_center_bundle.overlap_orb.get());
 
-        // kinetic term (<psi|T|psi>),
-        // in Gamma_only case, target HR is LCAO_Matrix::Hloc_fixed, while target HK is LCAO_Matrix::Hloc
-        // LCAO_Matrix::Hloc_fixed2 is used for storing
+        // kinetic term (<psi|T|psi>)
         if (GlobalV::T_IN_H)
         {
-            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                            this->kv->kvec_d,
                                                                            this->hR,
-                                                                           &(this->getHk(LM_in)),
                                                                            &GlobalC::ucell,
                                                                            &GlobalC::GridD,
-                                                                           two_center_bundle.kinetic_orb.get(),
-                                                                           LM_in->ParaV);
+                                                                           two_center_bundle.kinetic_orb.get());
             this->getOperator()->add(ekinetic);
         }
 
         // nonlocal term (<psi|beta>D<beta|psi>)
-        // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc
+        // in general case, target HR is this->hR, while target HK is this->hsk->get_hk()
         if (GlobalV::VNL_IN_H)
         {
-            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                            this->kv->kvec_d,
                                                                            this->hR,
-                                                                           &(this->getHk(LM_in)),
                                                                            &GlobalC::ucell,
                                                                            &GlobalC::GridD,
-                                                                           two_center_bundle.overlap_orb_beta.get(),
-                                                                           LM_in->ParaV);
+                                                                           two_center_bundle.overlap_orb_beta.get());
             this->getOperator()->add(nonlocal);
         }
 
         // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>)
-        // in general case, target HR is Gint::hRGint, while target HK is LCAO_Matrix::Hloc
+        // in general case, target HR is Gint::hRGint, while target HK is this->hsk->get_hk()
         if (GlobalV::VL_IN_H)
         {
             // only Potential is not empty, Veff and Meta are available
@@ -174,14 +160,12 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                 pot_in->pot_register(pot_register_in);
                 // effective potential term
                 Operator<TK>* veff = new Veff<OperatorLCAO<TK, TR>>(GG_in,
-                                                                    LM_in,
+                                                                    this->hsk,
                                                                     this->kv->kvec_d,
                                                                     pot_in,
                                                                     this->hR, // no explicit call yet
-                                                                    &(this->getHk(LM_in)),
                                                                     &GlobalC::ucell,
-                                                                    &GlobalC::GridD,
-                                                                    LM_in->ParaV // no explicit call yet
+                                                                    &GlobalC::GridD
                 );
                 this->getOperator()->add(veff);
             }
@@ -190,10 +174,9 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
 #ifdef __DEEPKS
         if (GlobalV::deepks_scf)
         {
-            Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(this->hsk,
                                                                     this->kv->kvec_d,
                                                                     this->hR, // no explicit call yet
-                                                                    &(this->getHk(LM_in)),
                                                                     &GlobalC::ucell,
                                                                     &GlobalC::GridD,
                                                                     two_center_bundle.overlap_orb_alpha.get(),
@@ -209,23 +192,20 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
             Operator<TK>* dftu = nullptr;
             if (GlobalV::dft_plus_u == 2)
             {
-                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(LM_in,
+                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(this->hsk,
                                                               kv->kvec_d,
                                                               this->hR, // no explicit call yet
-                                                              &(this->getHk(LM_in)),
                                                               this->kv->isk);
             }
             else
             {
-                dftu = new DFTU<OperatorLCAO<TK, TR>>(LM_in,
+                dftu = new DFTU<OperatorLCAO<TK, TR>>(this->hsk,
                                                       this->kv->kvec_d,
                                                       this->hR,
-                                                      &(this->getHk(LM_in)),
                                                       GlobalC::ucell,
                                                       &GlobalC::GridD,
                                                       two_center_bundle.overlap_orb_onsite.get(),
-                                                      &GlobalC::dftu,
-                                                      *(LM_in->ParaV));
+                                                      &GlobalC::dftu);
             }
             this->getOperator()->add(dftu);
         }
@@ -235,7 +215,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
     {
         // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>)
         // Meta potential term (\sum_r <psi(r)|tau(r)|psi(r)>)
-        // in general case, target HR is Gint::pvpR_reduced, while target HK is LCAO_Matrix::Hloc2
+        // in general case, target HR is Gint::pvpR_reduced, while target HK is this->hsk->get_hk()
         if (GlobalV::VL_IN_H)
         {
             // only Potential is not empty, Veff and Meta are available
@@ -245,14 +225,12 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
                 pot_in->pot_register(pot_register_in);
                 // Veff term
                 this->getOperator() = new Veff<OperatorLCAO<TK, TR>>(GK_in,
-                                                                     LM_in,
+                                                                     this->hsk,
                                                                      kv->kvec_d,
                                                                      pot_in,
                                                                      this->hR,
-                                                                     &(this->getHk(LM_in)),
                                                                      &GlobalC::ucell,
-                                                                     &GlobalC::GridD,
-                                                                     LM_in->ParaV);
+                                                                     &GlobalC::GridD);
                 // reset spin index and real space Hamiltonian matrix
                 int start_spin = -1;
                 GK_in->reset_spin(start_spin);
@@ -263,16 +241,13 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
 
         // initial operator for multi-k case
         // overlap term is indispensable
-        Operator<TK>* overlap = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+        Operator<TK>* overlap = new OverlapNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                      this->kv->kvec_d,
                                                                      this->hR,
-                                                                     &(this->getHk(LM_in)),
                                                                      this->sR,
-                                                                     &(this->getSk(LM_in)),
                                                                      &GlobalC::ucell,
                                                                      &GlobalC::GridD,
-                                                                     two_center_bundle.overlap_orb.get(),
-                                                                     LM_in->ParaV);
+                                                                     two_center_bundle.overlap_orb.get());
         if (this->getOperator() == nullptr)
         {
             this->getOperator() = overlap;
@@ -283,32 +258,28 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
         }
 
         // kinetic term (<psi|T|psi>),
-        // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc2
+        // in general case, target HR is this->hR, while target HK is this->hsk->get_hk()
         if (GlobalV::T_IN_H)
         {
-            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                            this->kv->kvec_d,
                                                                            this->hR,
-                                                                           &(this->getHk(LM_in)),
                                                                            &GlobalC::ucell,
                                                                            &GlobalC::GridD,
-                                                                           two_center_bundle.kinetic_orb.get(),
-                                                                           LM_in->ParaV);
+                                                                           two_center_bundle.kinetic_orb.get());
             this->getOperator()->add(ekinetic);
         }
 
         // nonlocal term (<psi|beta>D<beta|psi>)
-        // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc2
+        // in general case, target HR is this->hR, while target HK is this->hsk->get_hk()
         if (GlobalV::VNL_IN_H)
         {
-            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(this->hsk,
                                                                            this->kv->kvec_d,
                                                                            this->hR,
-                                                                           &(this->getHk(LM_in)),
                                                                            &GlobalC::ucell,
                                                                            &GlobalC::GridD,
-                                                                           two_center_bundle.overlap_orb_beta.get(),
-                                                                           LM_in->ParaV);
+                                                                           two_center_bundle.overlap_orb_beta.get());
             // TDDFT velocity gague will calculate full non-local potential including the original one and the
             // correction on its own. So the original non-local potential term should be skipped
             if (GlobalV::ESOLVER_TYPE != "tddft" || elecstate::H_TDDFT_pw::stype != 1)
@@ -324,10 +295,9 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
 #ifdef __DEEPKS
         if (GlobalV::deepks_scf)
         {
-            Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(this->hsk,
                                                                     this->kv->kvec_d,
                                                                     hR,
-                                                                    &(this->getHk(LM_in)),
                                                                     &GlobalC::ucell,
                                                                     &GlobalC::GridD,
                                                                     two_center_bundle.overlap_orb_alpha.get(),
@@ -340,24 +310,20 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
         if (TD_Velocity::tddft_velocity)
         {
             elecstate::H_TDDFT_pw::update_At();
-            Operator<TK>* td_ekinetic = new TDEkinetic<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* td_ekinetic = new TDEkinetic<OperatorLCAO<TK, TR>>(this->hsk,
                                                                              this->hR,
-                                                                             &(this->getHk(LM_in)),
                                                                              this->sR,
                                                                              kv,
                                                                              &GlobalC::ucell,
                                                                              &GlobalC::GridD,
-                                                                             LM_in->ParaV,
                                                                              two_center_bundle.overlap_orb.get());
             this->getOperator()->add(td_ekinetic);
 
-            Operator<TK>* td_nonlocal = new TDNonlocal<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* td_nonlocal = new TDNonlocal<OperatorLCAO<TK, TR>>(this->hsk,
                                                                              this->kv->kvec_d,
                                                                              this->hR,
-                                                                             &(this->getHk(LM_in)),
                                                                              &GlobalC::ucell,
-                                                                             &GlobalC::GridD,
-                                                                             LM_in->ParaV);
+                                                                             &GlobalC::GridD);
             this->getOperator()->add(td_nonlocal);
         }
         if (GlobalV::dft_plus_u)
@@ -365,32 +331,28 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
             Operator<TK>* dftu = nullptr;
             if (GlobalV::dft_plus_u == 2)
             {
-                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(LM_in,
+                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(this->hsk,
                                                               kv->kvec_d,
                                                               this->hR, // no explicit call yet
-                                                              &(this->getHk(LM_in)),
                                                               this->kv->isk);
             }
             else
             {
-                dftu = new DFTU<OperatorLCAO<TK, TR>>(LM_in,
+                dftu = new DFTU<OperatorLCAO<TK, TR>>(this->hsk,
                                                       this->kv->kvec_d,
                                                       this->hR,
-                                                      &(this->getHk(LM_in)),
                                                       GlobalC::ucell,
                                                       &GlobalC::GridD,
                                                       two_center_bundle.overlap_orb_onsite.get(),
-                                                      &GlobalC::dftu,
-                                                      *(LM_in->ParaV));
+                                                      &GlobalC::dftu);
             }
             this->getOperator()->add(dftu);
         }
         if (GlobalV::sc_mag_switch)
         {
-            Operator<TK>* sc_lambda = new OperatorScLambda<OperatorLCAO<TK, TR>>(LM_in,
+            Operator<TK>* sc_lambda = new OperatorScLambda<OperatorLCAO<TK, TR>>(this->hsk,
                                                                                  kv->kvec_d,
                                                                                  this->hR, // no explicit call yet
-                                                                                 &(this->getHk(LM_in)),
                                                                                  this->kv->isk);
             this->getOperator()->add(sc_lambda);
         }
@@ -399,9 +361,9 @@ HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
 #ifdef __EXX
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        Operator<TK>* exx = new OperatorEXX<OperatorLCAO<TK, TR>>(LM_in,
+        Operator<TK>* exx = new OperatorEXX<OperatorLCAO<TK, TR>>(this->hsk,
+                                                                  LM_in,
                                                                   this->hR,
-                                                                  &(this->getHk(LM_in)),
                                                                   *this->kv,
                                                                   LM_in->Hexxd,
                                                                   LM_in->Hexxc,
@@ -484,71 +446,22 @@ Operator<TK>*& HamiltLCAO<TK, TR>::getOperator()
 {
     return this->ops;
 }
-// getHk
-template <>
-std::vector<double>& HamiltLCAO<double, double>::getHk(LCAO_Matrix* LM)
-{
-    return LM->Hloc;
-}
-
-template <>
-std::vector<std::complex<double>>& HamiltLCAO<std::complex<double>, double>::getHk(LCAO_Matrix* LM)
-{
-    return LM->Hloc2;
-}
-template <>
-std::vector<std::complex<double>>& HamiltLCAO<std::complex<double>, std::complex<double>>::getHk(LCAO_Matrix* LM)
-{
-    return LM->Hloc2;
-}
-
-// getSk
-template <>
-std::vector<double>& HamiltLCAO<double, double>::getSk(LCAO_Matrix* LM)
-{
-    return LM->Sloc;
-}
-
-template <>
-std::vector<std::complex<double>>& HamiltLCAO<std::complex<double>, double>::getSk(LCAO_Matrix* LM)
-{
-    return LM->Sloc2;
-}
-template <>
-std::vector<std::complex<double>>& HamiltLCAO<std::complex<double>, std::complex<double>>::getSk(LCAO_Matrix* LM)
-{
-    return LM->Sloc2;
-}
-
-// getHR
-template <typename TK, typename TR>
-HContainer<TR>*& HamiltLCAO<TK, TR>::getHR()
-{
-    return this->hR;
-}
-
-// getSR
-template <typename TK, typename TR>
-HContainer<TR>*& HamiltLCAO<TK, TR>::getSR()
-{
-    return this->sR;
-}
 
 template <typename TK, typename TR>
-void HamiltLCAO<TK, TR>::updateSk(const int ik, LCAO_Matrix* LM_in, const int hk_type)
+void HamiltLCAO<TK, TR>::updateSk(const int ik, const int hk_type)
 {
     ModuleBase::TITLE("HamiltLCAO", "updateSk");
     ModuleBase::timer::tick("HamiltLCAO", "updateSk");
-    ModuleBase::GlobalFunc::ZEROS(this->getSk(LM_in).data(), this->getSk(LM_in).size());
+    ModuleBase::GlobalFunc::ZEROS(this->getSk(), this->get_size_hsk());
     if (hk_type == 1) // collumn-major matrix for SK
     {
-        const int nrow = LM_in->ParaV->get_row_size();
-        hamilt::folding_HR(*this->sR, this->getSk(LM_in).data(), this->kv->kvec_d[ik], nrow, 1);
+        const int nrow = this->hsk->get_pv()->get_row_size();
+        hamilt::folding_HR(*this->sR, this->getSk(), this->kv->kvec_d[ik], nrow, 1);
     }
     else if (hk_type == 0) // row-major matrix for SK
     {
-        const int ncol = LM_in->ParaV->get_col_size();
-        hamilt::folding_HR(*this->sR, this->getSk(LM_in).data(), this->kv->kvec_d[ik], ncol, 0);
+        const int ncol = this->hsk->get_pv()->get_col_size();
+        hamilt::folding_HR(*this->sR, this->getSk(), this->kv->kvec_d[ik], ncol, 0);
     }
     ModuleBase::timer::tick("HamiltLCAO", "updateSk");
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h
index c388b71fbc..6226db3d71 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h
@@ -10,7 +10,7 @@
 #include "module_hamilt_lcao/module_gint/gint_gamma.h"
 #include "module_hamilt_lcao/module_gint/gint_k.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-
+#include "module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp"
 namespace hamilt
 {
 
@@ -27,6 +27,7 @@ class HamiltLCAO : public Hamilt<TK>
     HamiltLCAO(Gint_Gamma* GG_in,
                Gint_k* GK_in,
                LCAO_Matrix* LM_in,
+               const Parallel_Orbitals* paraV,
                elecstate::Potential* pot_in,
                const K_Vectors& kv_in,
                const TwoCenterBundle& two_center_bundle,
@@ -35,7 +36,7 @@ class HamiltLCAO : public Hamilt<TK>
     /**
      * @brief Constructor of vacuum Operators, only HR and SR will be initialed as empty HContainer
      */
-    HamiltLCAO(LCAO_Matrix* LM_in, const K_Vectors& kv_in, const TwoCenterIntegrator& intor_overlap_orb);
+    HamiltLCAO(const Parallel_Orbitals* paraV, const K_Vectors& kv_in, const TwoCenterIntegrator& intor_overlap_orb);
 
     ~HamiltLCAO()
     {
@@ -45,18 +46,20 @@ class HamiltLCAO : public Hamilt<TK>
         }
         delete this->hR;
         delete this->sR;
-    };
+        delete this->hsk;
+    }
 
     /// get pointer of Operator<TK> ops
     Operator<TK>*& getOperator();
-    /// get hk-pointer of std::vector<TK>, the return will be LM->Hloc or LM->Hloc2
-    std::vector<TK>& getHk(LCAO_Matrix* LM);
-    /// get sk-pointer of std::vector<TK>, the return will be this->sk
-    std::vector<TK>& getSk(LCAO_Matrix* LM);
+    /// get hk-pointer
+    TK* getHk() const{return this->hsk->get_hk();}
+    /// get sk-pointer
+    TK* getSk() const{return this->hsk->get_sk();}
+    int get_size_hsk() const{return this->hsk->get_size();}
     /// get HR pointer of *this->hR, which is a HContainer<TR> and contains H(R)
-    HContainer<TR>*& getHR();
+    HContainer<TR>*& getHR(){return this->hR;}
     /// get SR pointer of *this->sR, which is a HContainer<TR> and contains S(R)
-    HContainer<TR>*& getSR();
+    HContainer<TR>*& getSR(){return this->sR;}
     /// refresh the status of HR
     void refresh() override;
 
@@ -70,7 +73,7 @@ class HamiltLCAO : public Hamilt<TK>
      * @param hk_type 0: SK is row-major, 1: SK is collumn-major
      * @return void
      */
-    void updateSk(const int ik, LCAO_Matrix* LM_in, const int hk_type = 0);
+    void updateSk(const int ik, const int hk_type = 0);
 
     // core function: return H(k) and S(k) matrixs for direct solving eigenvalues.
     // not used in PW base
@@ -84,6 +87,7 @@ class HamiltLCAO : public Hamilt<TK>
     HContainer<TR>* hR = nullptr;
     HContainer<TR>* sR = nullptr;
 
+    HS_Matrix_K<TK>* hsk = nullptr;
     // special case for NSPIN=2 , data of HR should be separated into two parts
     // save them in this->hRS2;
     std::vector<TR> hRS2;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp
new file mode 100644
index 0000000000..495af32841
--- /dev/null
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp
@@ -0,0 +1,34 @@
+#ifndef ABACUS_HS_MATRIX_K_HPP
+#define ABACUS_HS_MATRIX_K_HPP
+
+#include "module_basis/module_ao/parallel_orbitals.h"
+
+#include <vector>
+namespace hamilt
+{
+    template <typename TK>
+    class HS_Matrix_K
+    {
+        public:
+            HS_Matrix_K(const Parallel_Orbitals* paraV, bool no_s=false){
+                hk.resize(paraV->nloc);
+                if(!no_s) 
+                {
+                    sk.resize(paraV->nloc);
+                }
+                this->pv = paraV;
+            }
+            TK* get_hk() {return hk.data();}
+            TK* get_sk() {return sk.data();}
+            int get_size() {return hk.size();}
+            void set_zero_hk() {hk.assign(hk.size(), 0);}
+            void set_zero_sk() {sk.assign(sk.size(), 0);}
+            const Parallel_Orbitals* get_pv() const {return this->pv;}
+        private:
+            std::vector<TK> hk;
+            std::vector<TK> sk;
+            const Parallel_Orbitals* pv = nullptr;
+    };
+}
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index 9706494dda..28565ecdaf 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -15,21 +15,20 @@ namespace hamilt
 {
 
 template <typename TK, typename TR>
-DeePKS<OperatorLCAO<TK, TR>>::DeePKS(LCAO_Matrix* LM_in,
+DeePKS<OperatorLCAO<TK, TR>>::DeePKS(HS_Matrix_K<TK>* hsk_in,
                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                      HContainer<TR>* hR_in,
-                                     std::vector<TK>* hK_in,
                                      const UnitCell* ucell_in,
                                      Grid_Driver* GridD_in,
                                      const TwoCenterIntegrator* intor_orb_alpha,
                                      const int& nks_in,
                                      elecstate::DensityMatrix<TK, double>* DM_in)
-    : nks(nks_in), ucell(ucell_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), DM(DM_in),
+    : nks(nks_in), ucell(ucell_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), DM(DM_in),
       intor_orb_alpha_(intor_orb_alpha)
 {
     this->cal_type = calculation_type::lcao_deepks;
 #ifdef __DEEPKS
-    this->initialize_HR(GridD_in, LM_in->ParaV);
+    this->initialize_HR(GridD_in);
 #endif
 }
 
@@ -45,11 +44,14 @@ DeePKS<OperatorLCAO<TK, TR>>::~DeePKS()
 #ifdef __DEEPKS
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     ModuleBase::TITLE("DeePKS", "initialize_HR");
     ModuleBase::timer::tick("DeePKS", "initialize_HR");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
     // this->H_V_delta = new HContainer<TR>(paraV);
     if (std::is_same<TK, double>::value)
     {
@@ -153,7 +155,7 @@ void DeePKS<OperatorLCAO<double, double>>::contributeHR()
     if (GlobalC::ld.get_hr_cal())
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
-        const Parallel_Orbitals* pv = this->LM->ParaV;
+        const Parallel_Orbitals* pv = this->hsk->get_pv();
         GlobalC::ld.cal_projected_DM(this->DM, *this->ucell, GlobalC::ORB, GlobalC::GridD);
         GlobalC::ld.cal_descriptor(this->ucell->nat);
         GlobalC::ld.cal_gedm(this->ucell->nat);
@@ -245,7 +247,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(
     const int iat0,
     std::vector<std::unordered_map<int, std::vector<double>>>& nlm_in)
 {
-    const Parallel_Orbitals* paraV = this->LM->ParaV;
+    const Parallel_Orbitals* paraV = this->hR->get_paraV();
     const int npol = this->ucell->get_npol();
     auto tau0 = ucell->get_tau(iat0);
     int T0, I0;
@@ -302,7 +304,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
     }
     ModuleBase::timer::tick("DeePKS", "calculate_HR");
 
-    const Parallel_Orbitals* paraV = this->H_V_delta->get_atom_pair(0).get_paraV();
+    const Parallel_Orbitals* paraV = this->H_V_delta->get_paraV();
     const int npol = this->ucell->get_npol();
 
     const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
@@ -511,16 +513,16 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
     TK* h_delta_k = nullptr;
     get_h_delta_k(ik, h_delta_k);
     // set SK to zero and then calculate SK for each k vector
-    ModuleBase::GlobalFunc::ZEROS(h_delta_k, this->hK->size());
+    ModuleBase::GlobalFunc::ZEROS(h_delta_k, this->hsk->get_size());
 
     if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
     {
-        const int nrow = this->LM->ParaV->get_row_size();
+        const int nrow = this->hsk->get_pv()->get_row_size();
         hamilt::folding_HR(*this->H_V_delta, h_delta_k, this->kvec_d[ik], nrow, 1);
     }
     else
     {
-        const int ncol = this->LM->ParaV->get_col_size();
+        const int ncol = this->hsk->get_pv()->get_col_size();
         hamilt::folding_HR(*this->H_V_delta, h_delta_k, this->kvec_d[ik], ncol, 0);
     }
     ModuleBase::timer::tick("DeePKS", "contributeHk");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.h
index bfb82a1483..57bccacf85 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.h
@@ -30,10 +30,9 @@ template <typename TK, typename TR>
 class DeePKS<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    DeePKS<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    DeePKS<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                  const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                  HContainer<TR>* hR_in,
-                                 std::vector<TK>* hK_in,
                                  const UnitCell* ucell_in,
                                  Grid_Driver* GridD_in,
                                  const TwoCenterIntegrator* intor_orb_alpha,
@@ -72,7 +71,7 @@ class DeePKS<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the DeePKS real space Hamiltonian correction with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* GridD);
 
     /**
      * @brief calculate the DeePKS correction matrix with specific <I,J,R> atom-pairs
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
index e7fbf8c0ac..4d1c1bec2b 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "dftu_lcao.h"
 #include "module_base/parallel_reduce.h"
+#include "module_base/timer.h"
 
 namespace hamilt
 {
@@ -19,8 +20,9 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
     // try to get the density matrix, if the density matrix is empty, skip the calculation and return
     const hamilt::HContainer<double>* dmR_tmp[this->nspin];
     dmR_tmp[0] = this->dftu->get_dmr(0);
-    if (this->nspin == 2)
+    if (this->nspin == 2) {
         dmR_tmp[1] = this->dftu->get_dmr(1);
+    }
     if (dmR_tmp[0]->size_atom_pairs() == 0)
     {
         return;
@@ -28,7 +30,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
     // begin the calculation of force and stress
     ModuleBase::timer::tick("DFTU", "cal_force_stress");
 
-    const Parallel_Orbitals* paraV = dmR_tmp[0]->get_atom_pair(0).get_paraV();
+    const Parallel_Orbitals* paraV = dmR_tmp[0]->get_paraV();
     const int npol = this->ucell->get_npol();
     std::vector<double> stress_tmp(6, 0);
     if (cal_force)
@@ -45,8 +47,9 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
         const int target_L = this->dftu->orbital_corr[T0];
-        if (target_L == -1)
+        if (target_L == -1) {
             continue;
+        }
         const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
@@ -161,7 +164,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
                 if (tmp[0] != nullptr)
                 {
                     // calculate force
-                    if (cal_force)
+                    if (cal_force) {
                         this->cal_force_IJR(iat1,
                                             iat2,
                                             paraV,
@@ -172,9 +175,10 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
                                             this->nspin,
                                             force_tmp1,
                                             force_tmp2);
+                    }
 
                     // calculate stress
-                    if (cal_stress)
+                    if (cal_stress) {
                         this->cal_stress_IJR(iat1,
                                              iat2,
                                              paraV,
@@ -186,6 +190,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
                                              dis1,
                                              dis2,
                                              stress_tmp.data());
+                    }
                 }
             }
         }
@@ -251,8 +256,9 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
     const int m_size2 = m_size * m_size;
     // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
     std::vector<int> step_trace(npol, 0);
-    if (npol == 2)
+    if (npol == 2) {
         step_trace[1] = col_indexes.size() + 1;
+    }
     double tmp[3];
     // calculate the local matrix
     for (int is = 0; is < nspin; is++)
@@ -319,8 +325,9 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int& iat1,
     const int m_size2 = m_size * m_size;
     // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
     std::vector<int> step_trace(npol, 0);
-    if (npol == 2)
+    if (npol == 2) {
         step_trace[1] = col_indexes.size() + 1;
+    }
     // calculate the local matrix
     for (int is = 0; is < nspin; is++)
     {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
index f239091bcb..6d40d8ffef 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
@@ -11,16 +11,14 @@
 #include "module_base/parallel_reduce.h"
 
 template <typename TK, typename TR>
-hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(LCAO_Matrix* LM_in,
+hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(HS_Matrix_K<TK>* hsk_in,
                                                  const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                                  hamilt::HContainer<TR>* hR_in,
-                                                 std::vector<TK>* hK_in,
                                                  const UnitCell& ucell_in,
                                                  Grid_Driver* GridD_in,
                                                  const TwoCenterIntegrator* intor,
-                                                 ModuleDFTU::DFTU* dftu_in,
-                                                 const Parallel_Orbitals& paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
+                                                 ModuleDFTU::DFTU* dftu_in)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_dftu;
     this->ucell = &ucell_in;
@@ -29,7 +27,7 @@ hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(LCAO_Matrix* LM_in,
     assert(this->ucell != nullptr);
 #endif
     // initialize HR to allocate sparse Nonlocal matrix memory
-    this->initialize_HR(GridD_in, &paraV);
+    this->initialize_HR(GridD_in);
     // set nspin
     this->nspin = GlobalV::NSPIN;
 }
@@ -42,11 +40,14 @@ hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::~DFTU()
 
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     ModuleBase::TITLE("DFTU", "initialize_HR");
     ModuleBase::timer::tick("DFTU", "initialize_HR");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
     this->adjs_all.clear();
     this->adjs_all.reserve(this->ucell->nat);
     for (int iat0 = 0; iat0 < ucell->nat; iat0++)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
index 64f64a6d5b..f17471ac91 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
@@ -26,15 +26,13 @@ template <typename TK, typename TR>
 class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    DFTU<OperatorLCAO<TK, TR>>(LCAO_Matrix* lm_in,
+    DFTU<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                hamilt::HContainer<TR>* hR_in,
-                               std::vector<TK>* hK_in,
                                const UnitCell& ucell_in,
                                Grid_Driver* gridD_in,
                                const TwoCenterIntegrator* intor,
-                               ModuleDFTU::DFTU* dftu_in,
-                               const Parallel_Orbitals& paraV);
+                               ModuleDFTU::DFTU* dftu_in);
     ~DFTU<OperatorLCAO<TK, TR>>();
 
     /**
@@ -56,8 +54,6 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     hamilt::HContainer<TR>* HR = nullptr;
 
-    TK* HK_pointer = nullptr;
-
     const TwoCenterIntegrator* intor_ = nullptr;
 
     /// @brief the number of spin components, 1 for no-spin, 2 for collinear spin case and 4 for non-collinear spin case
@@ -70,7 +66,7 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * the size of HR will not change in DFTU,
      * because I don't want to expand HR larger than Nonlocal operator caused by DFTU
      */
-    void initialize_HR(Grid_Driver* gridD_in, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* gridD_in);
 
     /**
      * @brief calculate the <phi|alpha^I> overlap values and save them in this->nlm_tot
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
index 6ece09f001..0702a14d77 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
@@ -9,23 +9,22 @@
 // Constructor
 template <typename TK, typename TR>
 hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::EkineticNew(
-    LCAO_Matrix* LM_in,
+    HS_Matrix_K<TK>* hsk_in,
     const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
     hamilt::HContainer<TR>* hR_in,
-    std::vector<TK>* hK_in,
     const UnitCell* ucell_in,
     Grid_Driver* GridD_in,
-    const TwoCenterIntegrator* intor,
-    const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
+    const TwoCenterIntegrator* intor)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_fixed;
     this->ucell = ucell_in;
 #ifdef __DEBUG
     assert(this->ucell != nullptr);
+    assert(this->hsk != nullptr);
 #endif
     // initialize HR to allocate sparse Ekinetic matrix memory
-    this->initialize_HR(GridD_in, paraV);
+    this->initialize_HR(GridD_in);
 }
 
 // destructor
@@ -40,12 +39,14 @@ hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::~EkineticNew()
 
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
-                                                                      const Parallel_Orbitals* paraV)
+void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     ModuleBase::TITLE("EkineticNew", "initialize_HR");
     ModuleBase::timer::tick("EkineticNew", "initialize_HR");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
     for (int iat1 = 0; iat1 < ucell->nat; iat1++)
     {
         auto tau1 = ucell->get_tau(iat1);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
index 35e827fc3d..2cb05b53eb 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
@@ -39,14 +39,12 @@ class EkineticNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     /**
      * @brief Construct a new EkineticNew object
      */
-    EkineticNew<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    EkineticNew<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                       const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                       HContainer<TR>* hR_in,
-                                      std::vector<TK>* hK_in,
                                       const UnitCell* ucell_in,
                                       Grid_Driver* GridD_in,
-                                      const TwoCenterIntegrator* intor,
-                                      const Parallel_Orbitals* paraV);
+                                      const TwoCenterIntegrator* intor);
 
     /**
      * @brief Destroy the EkineticNew object
@@ -77,7 +75,7 @@ class EkineticNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the electronic kinetic matrix with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* GridD_in);
 
     /**
      * @brief calculate the electronic kinetic matrix with specific <I,J,R> atom-pairs
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.cpp
index fb2e5e4e6e..de325a998f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.cpp
@@ -12,59 +12,4 @@ template class Meta<OperatorLCAO<std::complex<double>, double>>;
 
 template class Meta<OperatorLCAO<std::complex<double>, std::complex<double>>>;
 
-template<>
-Meta<OperatorLCAO<double, double>>::~Meta()
-{
-}
-
-template<>
-Meta<OperatorLCAO<std::complex<double>, double>>::~Meta()
-{
-}
-
-template<>
-Meta<OperatorLCAO<std::complex<double>, std::complex<double>>>::~Meta()
-{
-}
-
-//nothing to do in LCAO base for meta operator
-template<>
-void Meta<OperatorLCAO<double, double>>::contributeHR()
-{
-    return;
-}
-
-//nothing to do in LCAO base for meta operator
-template<>
-void Meta<OperatorLCAO<std::complex<double>, double>>::contributeHR()
-{
-    return;
-}
-//nothing to do in LCAO base for meta operator
-template<>
-void Meta<OperatorLCAO<std::complex<double>, std::complex<double>>>::contributeHR()
-{
-    return;
-}
-
-//nothing to do in LCAO base for meta operator
-template<>
-void Meta<OperatorLCAO<double, double>>::contributeHk(int ik)
-{
-    return;
-}
-
-//nothing to do in LCAO base for meta operator
-template<>
-void Meta<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik)
-{
-    return;
-}
-
-template<>
-void Meta<OperatorLCAO<std::complex<double>, std::complex<double>>>::contributeHk(int ik)
-{
-    return;
-}
-
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.h
index c33429e79b..587ea036b8 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/meta_lcao.h
@@ -23,46 +23,22 @@ template <typename TK, typename TR>
 class Meta<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    Meta<OperatorLCAO<TK, TR>>(Gint_k* GK_in,
-                          Local_Orbital_Charge* loc_in,
-                          LCAO_Matrix* LM_in,
+    Meta<OperatorLCAO<TK, TR>>(
+                          HS_Matrix_K<TK>* hsk_in,
                           const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-                          HContainer<TR>* hR_in,
-                          std::vector<TK>* hK_in)
-        : GK(GK_in),
-          loc(loc_in),
-          OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
-    {
-        this->cal_type = calculation_type::lcao_gint;
-    }
-    Meta<OperatorLCAO<TK, TR>>(Gint_Gamma* GG_in,
-                          Local_Orbital_Charge* loc_in,
-                          LCAO_Matrix* LM_in,
-                          const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-                          HContainer<TR>* hR_in,
-                          std::vector<TK>* hK_in)
-        : GG(GG_in),
-          loc(loc_in),
-          OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+                          HContainer<TR>* hR_in)
+        : OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_gint;
     }
 
-    ~Meta<OperatorLCAO<TK, TR>>();
+    ~Meta<OperatorLCAO<TK, TR>>(){};
 
-    virtual void contributeHR() override;
+    virtual void contributeHR() override{}//do nothing now
 
-    virtual void contributeHk(int ik) override;
+    virtual void contributeHk(int ik) override{};//do nothing now
 
   private:
-    // used for k-dependent grid integration.
-    Gint_k* GK = nullptr;
-
-    // used for gamma only algorithms.
-    Gint_Gamma* GG = nullptr;
-
-    // Charge calculating method in LCAO base and contained grid base calculation: DM_R, DM, pvpR_reduced
-    Local_Orbital_Charge* loc = nullptr;
 };
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
index bf8bd4683f..7714cea230 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
@@ -11,15 +11,13 @@
 
 template <typename TK, typename TR>
 hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::NonlocalNew(
-    LCAO_Matrix* LM_in,
+    HS_Matrix_K<TK>* hsk_in,
     const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
     hamilt::HContainer<TR>* hR_in,
-    std::vector<TK>* hK_in,
     const UnitCell* ucell_in,
     Grid_Driver* GridD_in,
-    const TwoCenterIntegrator* intor,
-    const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
+    const TwoCenterIntegrator* intor)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_fixed;
     this->ucell = ucell_in;
@@ -27,7 +25,7 @@ hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::NonlocalNew(
     assert(this->ucell != nullptr);
 #endif
     // initialize HR to allocate sparse Nonlocal matrix memory
-    this->initialize_HR(GridD_in, paraV);
+    this->initialize_HR(GridD_in);
 }
 
 // destructor
@@ -42,12 +40,14 @@ hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::~NonlocalNew()
 
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
-                                                                      const Parallel_Orbitals* paraV)
+void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     ModuleBase::TITLE("NonlocalNew", "initialize_HR");
     ModuleBase::timer::tick("NonlocalNew", "initialize_HR");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
     this->adjs_all.clear();
     this->adjs_all.reserve(this->ucell->nat);
     for (int iat0 = 0; iat0 < ucell->nat; iat0++)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
index 6dbcdd9ca3..92fdc19bde 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
@@ -38,14 +38,12 @@ template <typename TK, typename TR>
 class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    NonlocalNew<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    NonlocalNew<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                       const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                       hamilt::HContainer<TR>* hR_in,
-                                      std::vector<TK>* hK_in,
                                       const UnitCell* ucell_in,
                                       Grid_Driver* GridD_in,
-                                      const TwoCenterIntegrator* intor,
-                                      const Parallel_Orbitals* paraV);
+                                      const TwoCenterIntegrator* intor);
     ~NonlocalNew<OperatorLCAO<TK, TR>>();
 
     /**
@@ -59,8 +57,6 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
   private:
     const UnitCell* ucell = nullptr;
 
-    hamilt::HContainer<TR>* HR = nullptr;
-
     hamilt::HContainer<TR>* HR_fixed = nullptr;
 
     // the following variable is introduced temporarily during LCAO refactoring
@@ -68,8 +64,6 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     bool allocated = false;
 
-    TK* HK_pointer = nullptr;
-
     bool HR_fixed_done = false;
 
     /**
@@ -77,7 +71,7 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* GridD_in);
 
     /**
      * @brief calculate the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.cpp
index a1a210711e..924dcf1764 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.cpp
@@ -26,12 +26,13 @@ void OperatorDFTU<OperatorLCAO<double, double>>::contributeHk(int ik)
     ModuleBase::TITLE("OperatorDFTU", "contributeHk");
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
     // Effective potential of DFT+U is added to total Hamiltonian here; Quxin adds on 20201029
-    std::vector<double> eff_pot(this->LM->ParaV->nloc);
-    GlobalC::dftu.cal_eff_pot_mat_real(ik, &eff_pot[0], isk);
+    std::vector<double> eff_pot(this->hsk->get_pv()->nloc);
+    GlobalC::dftu.cal_eff_pot_mat_real(ik, &eff_pot[0], isk, this->hsk->get_sk());
+    double* hk = this->hsk->get_hk();
 
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    for (int irc = 0; irc < this->hsk->get_pv()->nloc; irc++)
     {
-        this->LM->Hloc[irc] += eff_pot[irc];
+        hk[irc] += eff_pot[irc];
     }
 
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
@@ -43,12 +44,13 @@ void OperatorDFTU<OperatorLCAO<std::complex<double>, double>>::contributeHk(int
     ModuleBase::TITLE("OperatorDFTU", "contributeHk");
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
     // Effective potential of DFT+U is added to total Hamiltonian here; Quxin adds on 20201029
-    std::vector<std::complex<double>> eff_pot(this->LM->ParaV->nloc);
-    GlobalC::dftu.cal_eff_pot_mat_complex(ik, &eff_pot[0], isk);
+    std::vector<std::complex<double>> eff_pot(this->hsk->get_pv()->nloc);
+    GlobalC::dftu.cal_eff_pot_mat_complex(ik, &eff_pot[0], isk, this->hsk->get_sk());
+    std::complex<double>* hk = this->hsk->get_hk();
 
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    for (int irc = 0; irc < this->hsk->get_pv()->nloc; irc++)
     {
-        this->LM->Hloc2[irc] += eff_pot[irc];
+        hk[irc] += eff_pot[irc];
     }
 
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
@@ -60,12 +62,13 @@ void OperatorDFTU<OperatorLCAO<std::complex<double>, std::complex<double>>>::con
     ModuleBase::TITLE("OperatorDFTU", "contributeHk");
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
     // Effective potential of DFT+U is added to total Hamiltonian here; Quxin adds on 20201029
-    std::vector<std::complex<double>> eff_pot(this->LM->ParaV->nloc);
-    GlobalC::dftu.cal_eff_pot_mat_complex(ik, &eff_pot[0], isk);
+    std::vector<std::complex<double>> eff_pot(this->hsk->get_pv()->nloc);
+    GlobalC::dftu.cal_eff_pot_mat_complex(ik, &eff_pot[0], isk, this->hsk->get_sk());
 
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    std::complex<double>* hk = this->hsk->get_hk();
+    for (int irc = 0; irc < this->hsk->get_pv()->nloc; irc++)
     {
-        this->LM->Hloc2[irc] += eff_pot[irc];
+        hk[irc] += eff_pot[irc];
     }
 
     ModuleBase::timer::tick("OperatorDFTU", "contributeHk");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.h
index e80c8285c0..de40bce4b3 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_dftu_lcao.h
@@ -20,12 +20,11 @@ template <typename TK, typename TR>
 class OperatorDFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    OperatorDFTU<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    OperatorDFTU<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                   const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                   hamilt::HContainer<TR>* hR_in,
-                                  std::vector<TK>* hK_in,
                                   const std::vector<int>& isk_in)
-        : isk(isk_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+        : isk(isk_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_dftu;
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.cpp
index ff1b6f9ca1..f4418840b5 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.cpp
@@ -8,18 +8,18 @@ namespace hamilt
 template<>
 void OperatorEXX<OperatorLCAO<double, double>>::add_loaded_Hexx(const int ik)
 {
-    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxd_k_load[ik].data(), 1, this->LM->Hloc.data(), 1);
+    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxd_k_load[ik].data(), 1, this->hsk->get_hk(), 1);
 }
 template<>
 void OperatorEXX<OperatorLCAO<std::complex<double>, double>>::add_loaded_Hexx(const int ik)
 {
 
-    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxc_k_load[ik].data(), 1, this->LM->Hloc2.data(), 1);
+    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxc_k_load[ik].data(), 1, this->hsk->get_hk(), 1);
 }
 template<>
 void OperatorEXX<OperatorLCAO<std::complex<double>, std::complex<double>>>::add_loaded_Hexx(const int ik)
 {
-    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxc_k_load[ik].data(), 1, this->LM->Hloc2.data(), 1);
+    BlasConnector::axpy(this->LM->ParaV->get_local_size(), 1.0, this->LM->Hexxc_k_load[ik].data(), 1, this->hsk->get_hk(), 1);
 }
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h
index 4609cb44ee..6cb1e8fde2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h
@@ -6,6 +6,7 @@
 #include <RI/global/Tensor.h>
 #include "operator_lcao.h"
 #include "module_cell/klist.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
 
 namespace hamilt
 {
@@ -25,9 +26,9 @@ class OperatorEXX<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
     using TAC = std::pair<int, std::array<int, 3>>;
 public:
-    OperatorEXX<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    OperatorEXX<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
+        LCAO_Matrix* LM_in,
         hamilt::HContainer<TR>* hR_in,
-        std::vector<TK>* hK_in,
         const K_Vectors& kv_in,
         std::vector<std::map<int, std::map<TAC, RI::Tensor<double>>>>* Hexxd_in = nullptr,
         std::vector<std::map<int, std::map<TAC, RI::Tensor<std::complex<double>>>>>* Hexxc_in = nullptr,
@@ -53,6 +54,8 @@ class OperatorEXX<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
       void add_loaded_Hexx(const int ik);
       const K_Vectors& kv;
+
+      LCAO_Matrix* LM = nullptr;
 };
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
index 5f180bd145..4158abcaf5 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.hpp
@@ -13,56 +13,64 @@ namespace hamilt
 
 template <typename TK, typename TR>
 OperatorEXX<OperatorLCAO<TK, TR>>::OperatorEXX(
+	HS_Matrix_K<TK>* hsk_in,
 	LCAO_Matrix* LM_in,
 	hamilt::HContainer<TR>* hR_in,
-	std::vector<TK>* hK_in,
 	const K_Vectors& kv_in,
 	std::vector<std::map<int, std::map<TAC, RI::Tensor<double>>>>* Hexxd_in,
 	std::vector<std::map<int, std::map<TAC, RI::Tensor<std::complex<double>>>>>* Hexxc_in,
 	int* two_level_step_in,
 	const bool restart_in)
-	: OperatorLCAO<TK, TR>(LM_in, kv_in.kvec_d, hR_in, hK_in),
+	: OperatorLCAO<TK, TR>(hsk_in, kv_in.kvec_d, hR_in),
 	  kv(kv_in),
 	  Hexxd(Hexxd_in),
 	  Hexxc(Hexxc_in), 
 	  two_level_step(two_level_step_in),
-	  restart(restart_in)
+	  restart(restart_in),
+	  LM(LM_in)
 {
-	this->cal_type = calculation_type::lcao_exx;
-	if (this->restart)
-	{///  Now only Hexx depends on DM, so we can directly read Hexx to reduce the computational cost.
-	/// If other operators depends on DM, we can also read DM and then calculate the operators to save the memory to store operator terms.
-		assert(this->two_level_step != nullptr);
-		/// read in Hexx
-		if (std::is_same<TK, double>::value)
-		{
-			this->LM->Hexxd_k_load.resize(this->kv.get_nks());
-			for (int ik = 0; ik < this->kv.get_nks(); ik++)
-			{
-				this->LM->Hexxd_k_load[ik].resize(this->LM->ParaV->get_local_size(), 0.0);
-				this->restart = GlobalC::restart.load_disk(
-					"Hexx", ik,
-					this->LM->ParaV->get_local_size(), this->LM->Hexxd_k_load[ik].data(), false);
-				if (!this->restart) break;
-			}
-		}
-		else
-		{
-			this->LM->Hexxc_k_load.resize(this->kv.get_nks());
-			for (int ik = 0; ik < this->kv.get_nks(); ik++)
-			{
-				this->LM->Hexxc_k_load[ik].resize(this->LM->ParaV->get_local_size(), 0.0);
-				this->restart = GlobalC::restart.load_disk(
-					"Hexx", ik,
-					this->LM->ParaV->get_local_size(), this->LM->Hexxc_k_load[ik].data(), false);
-				if (!this->restart) break;
-			}
-		}
-		if (!this->restart)
-			std::cout << "WARNING: Hexx not found, restart from the non-exx loop." << std::endl
-			          << "If the loaded charge density is EXX-solved, this may lead to poor convergence." << std::endl;
-		GlobalC::restart.info_load.load_H_finish = this->restart;
-	}
+    this->cal_type = calculation_type::lcao_exx;
+    if (this->restart)
+    { ///  Now only Hexx depends on DM, so we can directly read Hexx to reduce the computational cost.
+        /// If other operators depends on DM, we can also read DM and then calculate the operators to save the memory to
+        /// store operator terms.
+        assert(this->two_level_step != nullptr);
+        /// read in Hexx
+        if (std::is_same<TK, double>::value)
+        {
+            this->LM->Hexxd_k_load.resize(this->kv.get_nks());
+            for (int ik = 0; ik < this->kv.get_nks(); ik++)
+            {
+                this->LM->Hexxd_k_load[ik].resize(this->LM->ParaV->get_local_size(), 0.0);
+                this->restart = GlobalC::restart.load_disk("Hexx",
+                                                           ik,
+                                                           this->LM->ParaV->get_local_size(),
+                                                           this->LM->Hexxd_k_load[ik].data(),
+                                                           false);
+                if (!this->restart)
+                    break;
+            }
+        }
+        else
+        {
+            this->LM->Hexxc_k_load.resize(this->kv.get_nks());
+            for (int ik = 0; ik < this->kv.get_nks(); ik++)
+            {
+                this->LM->Hexxc_k_load[ik].resize(this->LM->ParaV->get_local_size(), 0.0);
+                this->restart = GlobalC::restart.load_disk("Hexx",
+                                                           ik,
+                                                           this->LM->ParaV->get_local_size(),
+                                                           this->LM->Hexxc_k_load[ik].data(),
+                                                           false);
+                if (!this->restart)
+                    break;
+            }
+        }
+        if (!this->restart)
+            std::cout << "WARNING: Hexx not found, restart from the non-exx loop." << std::endl
+                      << "If the loaded charge density is EXX-solved, this may lead to poor convergence." << std::endl;
+        GlobalC::restart.info_load.load_H_finish = this->restart;
+    }
 }
 
 template<typename TK, typename TR>
@@ -102,7 +110,7 @@ void OperatorEXX<OperatorLCAO<TK, TR>>::contributeHk(int ik)
 				GlobalC::exx_info.info_global.hybrid_alpha,
 				this->Hexxd == nullptr ? *this->LM->Hexxd : *this->Hexxd,
 				*this->LM->ParaV,
-				*this->hK);
+				this->hsk->get_hk());
 		else
 			RI_2D_Comm::add_Hexx(
 				this->kv,
@@ -110,7 +118,7 @@ void OperatorEXX<OperatorLCAO<TK, TR>>::contributeHk(int ik)
 				GlobalC::exx_info.info_global.hybrid_alpha,
 				this->Hexxc == nullptr ? *this->LM->Hexxc : *this->Hexxc,
 				*this->LM->ParaV,
-				*this->hK);
+				this->hsk->get_hk());
 	}
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
index d47eecf5b4..50937ff6af 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.cpp
@@ -15,16 +15,16 @@ template<>
 void OperatorLCAO<double, double>::get_hs_pointers()
 {
     ModuleBase::timer::tick("OperatorLCAO", "get_hs_pointers");
-    this->hmatrix_k = this->LM->Hloc.data();
+    this->hmatrix_k = this->hsk->get_hk();
     if ((this->new_e_iteration && ik == 0) || hsolver::HSolverLCAO<double>::out_mat_hs[0])
     {
         if (this->smatrix_k == nullptr)
         {
-            this->smatrix_k = new double[this->LM->Sloc.size()];
+            this->smatrix_k = new double[this->hsk->get_size()];
             this->allocated_smatrix = true;
         }
         const int inc = 1;
-        BlasConnector::copy(this->LM->Sloc.size(), this->LM->Sloc.data(), inc, this->smatrix_k, inc);
+        BlasConnector::copy(this->hsk->get_size(), this->hsk->get_sk(), inc, this->smatrix_k, inc);
 #ifdef __ELPA
         hsolver::DiagoElpa<double>::DecomposedState = 0;
 #endif
@@ -36,36 +36,22 @@ void OperatorLCAO<double, double>::get_hs_pointers()
 template<>
 void OperatorLCAO<std::complex<double>, double>::get_hs_pointers()
 {
-    this->hmatrix_k = this->LM->Hloc2.data();
-    this->smatrix_k = this->LM->Sloc2.data();
+    this->hmatrix_k = this->hsk->get_hk();
+    this->smatrix_k = this->hsk->get_sk();
 }
 
 template<>
 void OperatorLCAO<std::complex<double>, std::complex<double>>::get_hs_pointers()
 {
-    this->hmatrix_k = this->LM->Hloc2.data();
-    this->smatrix_k = this->LM->Sloc2.data();
+    this->hmatrix_k = this->hsk->get_hk();
+    this->smatrix_k = this->hsk->get_sk();
 }
 
-template<>
-void OperatorLCAO<double, double>::refresh_h()
-{
-    // Set the matrix 'H' to zero.
-    this->LM->zeros_HSgamma('H');
-}
-
-template<>
-void OperatorLCAO<std::complex<double>, double>::refresh_h()
-{
-    // Set the matrix 'H' to zero.
-    this->LM->zeros_HSk('H');
-}
-
-template<>
-void OperatorLCAO<std::complex<double>, std::complex<double>>::refresh_h()
+template<typename TK, typename TR>
+void OperatorLCAO<TK, TR>::refresh_h()
 {
     // Set the matrix 'H' to zero.
-    this->LM->zeros_HSk('H');
+    this->hsk->set_zero_hk();
 }
 
 template<typename TK, typename TR>
@@ -254,13 +240,13 @@ void OperatorLCAO<TK, TR>::contributeHk(int ik)
     ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
     if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
     {
-        const int nrow = this->LM->ParaV->get_row_size();
-        hamilt::folding_HR(*this->hR, this->hK->data(), this->kvec_d[ik], nrow, 1);
+        const int nrow = this->hsk->get_pv()->get_row_size();
+        hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
     }
     else
     {
-        const int ncol = this->LM->ParaV->get_col_size();
-        hamilt::folding_HR(*this->hR, this->hK->data(), this->kvec_d[ik], ncol, 0);
+        const int ncol = this->hsk->get_pv()->get_col_size();
+        hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
     }
     ModuleBase::timer::tick("OperatorLCAO", "contributeHk");
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h
index b5a7e536b9..5aece587ab 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h
@@ -3,8 +3,8 @@
 #include "module_base/vector3.h"
 #include "module_hamilt_general/matrixblock.h"
 #include "module_hamilt_general/operator.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/hs_matrix_k.hpp"
 
 namespace hamilt
 {
@@ -14,16 +14,15 @@ class OperatorLCAO : public Operator<TK>
 {
   public:
     OperatorLCAO(
-        LCAO_Matrix* LM_in, 
+        HS_Matrix_K<TK>* hsk_in, 
         const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-        HContainer<TR>* hR_in,
-        std::vector<TK>* hK_in)
-        : LM(LM_in), kvec_d(kvec_d_in), hR(hR_in), hK(hK_in){};
+        HContainer<TR>* hR_in)
+        : hsk(hsk_in), kvec_d(kvec_d_in), hR(hR_in){}
     virtual ~OperatorLCAO()
     {
         if (this->allocated_smatrix)
             delete[] this->smatrix_k;
-    };
+    }
 
     /* Function init(k) is used for update HR and HK ,
     data pointers of HR and HK are not passed by this function, but passed by constructors of every derived classes.
@@ -55,16 +54,16 @@ class OperatorLCAO : public Operator<TK>
         this->get_hs_pointers();
 #ifdef __MPI
         hk_in = MatrixBlock<TK>{hmatrix_k,
-                               (size_t)this->LM->ParaV->nrow,
-                               (size_t)this->LM->ParaV->ncol,
-                               this->LM->ParaV->desc};
+                               (size_t)this->hsk->get_pv()->nrow,
+                               (size_t)this->hsk->get_pv()->ncol,
+                               this->hsk->get_pv()->desc};
         sk_in = MatrixBlock<TK>{smatrix_k,
-                               (size_t)this->LM->ParaV->nrow,
-                               (size_t)this->LM->ParaV->ncol,
-                               this->LM->ParaV->desc};
+                               (size_t)this->hsk->get_pv()->nrow,
+                               (size_t)this->hsk->get_pv()->ncol,
+                               this->hsk->get_pv()->desc};
 #else
-        hk_in = MatrixBlock<TK>{hmatrix_k, (size_t)this->LM->ParaV->nrow, (size_t)this->LM->ParaV->ncol, nullptr};
-        sk_in = MatrixBlock<TK>{smatrix_k, (size_t)this->LM->ParaV->nrow, (size_t)this->LM->ParaV->ncol, nullptr};
+        hk_in = MatrixBlock<TK>{hmatrix_k, (size_t)this->hsk->get_pv()->nrow, (size_t)this->hsk->get_pv()->ncol, nullptr};
+        sk_in = MatrixBlock<TK>{smatrix_k, (size_t)this->hsk->get_pv()->nrow, (size_t)this->hsk->get_pv()->ncol, nullptr};
 #endif
     }
 
@@ -88,7 +87,7 @@ class OperatorLCAO : public Operator<TK>
 
     // protected:
     //  Hamiltonian matrix which are stored in LCAO_Matrix and calculated in OperatorLCAO
-    LCAO_Matrix* LM = nullptr;
+    HS_Matrix_K<TK>* hsk = nullptr;
     const std::vector<ModuleBase::Vector3<double>>& kvec_d;
 
   protected:
@@ -97,9 +96,6 @@ class OperatorLCAO : public Operator<TK>
     // Real space Hamiltonian pointer
     hamilt::HContainer<TR>* hR = nullptr;
 
-    // vector of HK matrix for current k point in reciprocal space
-    std::vector<TK>* hK = nullptr;
-
   private:
     void get_hs_pointers();
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index 3d0e6b5397..02737a7c2e 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -7,37 +7,34 @@
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
 
 template <typename TK, typename TR>
-hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::OverlapNew(LCAO_Matrix* LM_in,
+hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::OverlapNew(HS_Matrix_K<TK>* hsk_in,
                                                              const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                                              hamilt::HContainer<TR>* hR_in,
-                                                             std::vector<TK>* hK_in,
                                                              hamilt::HContainer<TR>* SR_in,
-                                                             std::vector<TK>* SK_pointer_in,
                                                              const UnitCell* ucell_in,
                                                              Grid_Driver* GridD_in,
-                                                             const TwoCenterIntegrator* intor,
-                                                             const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
+                                                             const TwoCenterIntegrator* intor)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_overlap;
     this->ucell = ucell_in;
     this->SR = SR_in;
-    this->SK_pointer = SK_pointer_in;
 #ifdef __DEBUG
     assert(this->ucell != nullptr);
     assert(this->SR != nullptr);
-    assert(this->SK_pointer != nullptr);
 #endif
     // initialize SR to allocate sparse overlap matrix memory
-    this->initialize_SR(GridD_in, paraV);
+    this->initialize_SR(GridD_in);
 }
 
 // initialize_SR()
 template <typename TK, typename TR>
-void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::initialize_SR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::initialize_SR(Grid_Driver* GridD)
 {
     ModuleBase::TITLE("OverlapNew", "initialize_SR");
     ModuleBase::timer::tick("OverlapNew", "initialize_SR");
+    auto* paraV = this->SR->get_paraV(); // get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
     for (int iat1 = 0; iat1 < ucell->nat; iat1++)
     {
         auto tau1 = ucell->get_tau(iat1);
@@ -197,16 +194,16 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
     ModuleBase::TITLE("OverlapNew", "contributeHk");
     ModuleBase::timer::tick("OverlapNew", "contributeHk");
     // set SK to zero and then calculate SK for each k vector
-    ModuleBase::GlobalFunc::ZEROS(this->SK_pointer->data(), this->SK_pointer->size());
+    this->hsk->set_zero_sk();
     if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
     {
         const int nrow = this->SR->get_atom_pair(0).get_paraV()->get_row_size();
-        hamilt::folding_HR(*this->SR, this->SK_pointer->data(), this->kvec_d[ik], nrow, 1);
+        hamilt::folding_HR(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], nrow, 1);
     }
     else
     {
         const int ncol = this->SR->get_atom_pair(0).get_paraV()->get_col_size();
-        hamilt::folding_HR(*this->SR, this->SK_pointer->data(), this->kvec_d[ik], ncol, 0);
+        hamilt::folding_HR(*this->SR, this->hsk->get_sk(), this->kvec_d[ik], ncol, 0);
     }
     // update kvec_d_old
     this->kvec_d_old = this->kvec_d[ik];
@@ -217,9 +214,9 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
 template <typename TK, typename TR>
 TK* hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::getSk()
 {
-    if (this->SK_pointer != nullptr)
+    if (this->hsk != nullptr)
     {
-        return this->SK_pointer->data();
+        return this->hsk->get_sk();
     }
     return nullptr;
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
index 84bfb7519a..8f6016cd21 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
@@ -36,16 +36,13 @@ template <typename TK, typename TR>
 class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    OverlapNew<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    OverlapNew<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                      hamilt::HContainer<TR>* hR_in,
-                                     std::vector<TK>* hK_in,
                                      hamilt::HContainer<TR>* SR_in,
-                                     std::vector<TK>* SK_pointer_in,
                                      const UnitCell* ucell_in,
                                      Grid_Driver* GridD_in,
-                                     const TwoCenterIntegrator* intor,
-                                     const Parallel_Orbitals* paraV);
+                                     const TwoCenterIntegrator* intor);
 
     virtual void contributeHR() override;
 
@@ -58,8 +55,6 @@ class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     hamilt::HContainer<TR>* SR = nullptr;
 
-    std::vector<TK>* SK_pointer = nullptr;
-
     const TwoCenterIntegrator* intor_ = nullptr;
 
     bool SR_fixed_done = false;
@@ -69,7 +64,7 @@ class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the overlap matrix with specific <I,J,R> atom-pairs
      * the size of SR will be fixed after initialization
      */
-    void initialize_SR(Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_SR(Grid_Driver* GridD_in);
 
     /**
      * @brief calculate the overlap matrix with specific <I,J,R> atom-pairs
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.cpp
index cea1313e89..2aa96175dd 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.cpp
@@ -20,12 +20,13 @@ void OperatorScLambda<OperatorLCAO<std::complex<double>, std::complex<double>>>:
     ModuleBase::timer::tick("OperatorScLambda", "contributeHk");
     SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc
         = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
-    std::vector<std::complex<double>> h_lambda(this->LM->ParaV->nloc);
+    std::vector<std::complex<double>> h_lambda(this->hsk->get_pv()->nloc);
     std::fill(h_lambda.begin(), h_lambda.end(), std::complex<double>(0, 0));
-    sc.cal_h_lambda(&h_lambda[0], this->LM->Sloc2, ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(), this->isk[ik]);
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    sc.cal_h_lambda(&h_lambda[0], this->hsk->get_sk(), ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(), this->isk[ik]);
+    std::complex<double>* hk = this->hsk->get_hk();
+    for (int irc = 0; irc < this->hsk->get_pv()->nloc; irc++)
     {
-        this->LM->Hloc2[irc] += h_lambda[irc];
+        hk[irc] += h_lambda[irc];
     }
     //std::cout << "OperatorScLambda contributeHk" << std::endl;
     ModuleBase::timer::tick("OperatorScLambda", "contributeHk");
@@ -39,12 +40,13 @@ void OperatorScLambda<OperatorLCAO<std::complex<double>, double>>::contributeHk(
     ModuleBase::timer::tick("OperatorScLambda", "contributeHk");
     SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc
         = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
-    std::vector<std::complex<double>> h_lambda(this->LM->ParaV->nloc);
+    std::vector<std::complex<double>> h_lambda(this->hsk->get_pv()->nloc);
     std::fill(h_lambda.begin(), h_lambda.end(), std::complex<double>(0, 0));
-    sc.cal_h_lambda(&h_lambda[0], this->LM->Sloc2, ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(), this->isk[ik]);
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    sc.cal_h_lambda(&h_lambda[0], this->hsk->get_sk(), ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER(), this->isk[ik]);
+    std::complex<double>* hk = this->hsk->get_hk();
+    for (int irc = 0; irc < this->hsk->get_pv()->nloc; irc++)
     {
-        this->LM->Hloc2[irc] += h_lambda[irc];
+        hk[irc] += h_lambda[irc];
     }
     //std::cout << "OperatorScLambda contributeHk" << std::endl;
     ModuleBase::timer::tick("OperatorScLambda", "contributeHk");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.h
index 9d196db9f0..646c0415fb 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/sc_lambda_lcao.h
@@ -22,12 +22,11 @@ template<typename TK, typename TR>
 class OperatorScLambda<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    OperatorScLambda<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    OperatorScLambda<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                     const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                     hamilt::HContainer<TR>* hR_in,
-                                    std::vector<TK>* hK_in,
                                     const std::vector<int>& isk_in)
-        : isk(isk_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+        : isk(isk_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_sc_lambda;
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
index fd740b3dcb..3c4a111907 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
@@ -14,24 +14,21 @@
 namespace hamilt
 {
 template <typename TK, typename TR>
-TDEkinetic<OperatorLCAO<TK, TR>>::TDEkinetic(LCAO_Matrix* LM_in,
+TDEkinetic<OperatorLCAO<TK, TR>>::TDEkinetic(HS_Matrix_K<TK>* hsk_in,
                                              hamilt::HContainer<TR>* hR_in,
-                                             std::vector<TK>* hK_in,
                                              hamilt::HContainer<TR>* SR_in,
                                              const K_Vectors* kv_in,
                                              const UnitCell* ucell_in,
                                              Grid_Driver* GridD_in,
-                                             const Parallel_Orbitals* paraV,
                                              const TwoCenterIntegrator* intor)
-    : SR(SR_in), kv(kv_in), OperatorLCAO<TK, TR>(LM_in, kv_in->kvec_d, hR_in, hK_in), intor_(intor)
+    : SR(SR_in), kv(kv_in), OperatorLCAO<TK, TR>(hsk_in, kv_in->kvec_d, hR_in), intor_(intor)
 {
-    this->LM = LM_in;
     this->ucell = ucell_in;
     this->cal_type = calculation_type::lcao_tddft_velocity;
     this->Grid = GridD_in;
     this->init_td();
     // initialize HR to get adjs info.
-    this->initialize_HR(Grid, this->LM->ParaV);
+    this->initialize_HR(Grid);
 }
 template <typename TK, typename TR>
 TDEkinetic<OperatorLCAO<TK, TR>>::~TDEkinetic()
@@ -248,7 +245,7 @@ void hamilt::TDEkinetic<hamilt::OperatorLCAO<TK, TR>>::set_HR_fixed(void* hR_tmp
     this->allocated = false;
 }
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     if (elecstate::H_TDDFT_pw::stype != 1)
     {
@@ -257,6 +254,9 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const P
     ModuleBase::TITLE("TDEkinetic", "initialize_HR");
     ModuleBase::timer::tick("TDEkinetic", "initialize_HR");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
+
     this->adjs_all.clear();
     this->adjs_all.reserve(this->ucell->nat);
     for (int iat1 = 0; iat1 < ucell->nat; iat1++)
@@ -294,7 +294,7 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const P
     ModuleBase::timer::tick("TDEkinetic", "initialize_HR");
 }
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR_tmp(const Parallel_Orbitals* paraV)
+void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR_tmp()
 {
     if (elecstate::H_TDDFT_pw::stype != 1)
     {
@@ -303,6 +303,8 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR_tmp(const Parallel_Orbitals
     ModuleBase::TITLE("TDEkinetic", "initialize_HR_tmp");
     ModuleBase::timer::tick("TDEkinetic", "initialize_HR_tmp");
 
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
     for (int i = 0; i < this->hR->size_atom_pairs(); ++i)
     {
         hamilt::AtomPair<TR>& tmp = this->hR->get_atom_pair(i);
@@ -338,9 +340,9 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
         // if this Operator is the first node of the sub_chain, then hR_tmp is nullptr
         if (this->hR_tmp == nullptr)
         {
-            this->hR_tmp = new hamilt::HContainer<std::complex<double>>(paraV);
+            this->hR_tmp = new hamilt::HContainer<std::complex<double>>(this->hR->get_paraV());
             // allocate memory for hR_tmp use the same memory as hR
-            this->initialize_HR_tmp(paraV);
+            this->initialize_HR_tmp();
             this->allocated = true;
         }
         if (this->next_sub_op != nullptr)
@@ -399,12 +401,12 @@ void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik
         if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
         {
             const int nrow = paraV->get_row_size();
-            hamilt::folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], nrow, 1);
+            hamilt::folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
         }
         else
         {
             const int ncol = paraV->get_col_size();
-            hamilt::folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], ncol, 0);
+            hamilt::folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
         }
 
         ModuleBase::timer::tick("TDEkinetic", "contributeHk");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
index 00365e2e56..8723e598ff 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.h
@@ -8,6 +8,7 @@
 #include "module_hamilt_lcao/module_tddft/td_velocity.h"
 #include "operator_lcao.h"
 
+
 namespace hamilt
 {
 
@@ -30,18 +31,16 @@ class TDEkinetic : public T
 /// - TR: data type of real space Hamiltonian
 
 template <typename TK, typename TR>
-class TDEkinetic<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
+class TDEkinetic<OperatorLCAO<TK,TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    TDEkinetic<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
-                                     hamilt::HContainer<TR>* hR_in,
-                                     std::vector<TK>* hK_in,
-                                     hamilt::HContainer<TR>* SR_in,
-                                     const K_Vectors* kv_in,
-                                     const UnitCell* ucell_in,
-                                     Grid_Driver* GridD_in,
-                                     const Parallel_Orbitals* paraV,
-                                     const TwoCenterIntegrator* intor);
+    TDEkinetic<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
+                                 hamilt::HContainer<TR>* hR_in,
+                                 hamilt::HContainer<TR>* SR_in,
+                                 const K_Vectors* kv_in,
+                                 const UnitCell* ucell_in,
+                                 Grid_Driver* GridD_in,
+                                 const TwoCenterIntegrator* intor);
     ~TDEkinetic();
 
     virtual void contributeHR() override;
@@ -54,13 +53,13 @@ class TDEkinetic<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* GridD);
 
     /**
      * @brief initialize HR_tmp
      * Allocate the memory for HR_tmp with the same size as HR
-     */
-    void initialize_HR_tmp(const Parallel_Orbitals* paraV);
+    */
+    void initialize_HR_tmp();
 
     /**
      * @brief calculate the HR local matrix of <I,J,R> atom pair
@@ -85,10 +84,9 @@ class TDEkinetic<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
   private:
     const UnitCell* ucell = nullptr;
-
+    
     HContainer<TR>* SR = nullptr;
-    /// @brief Store real space hamiltonian. TD term should include imaginary part, thus it has to be complex type. Only
-    /// shared between TD operators.
+    /// @brief Store real space hamiltonian. TD term should include imaginary part, thus it has to be complex type. Only shared between TD operators.
     HContainer<std::complex<double>>* hR_tmp = nullptr;
     Grid_Driver* Grid = nullptr;
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
index e62b1e73d7..1d21bc3074 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.cpp
@@ -12,25 +12,22 @@
 #endif
 
 template <typename TK, typename TR>
-hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::TDNonlocal(LCAO_Matrix* LM_in,
+hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::TDNonlocal(HS_Matrix_K<TK>* hsk_in,
                                                              const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                                              hamilt::HContainer<TR>* hR_in,
-                                                             std::vector<TK>* hK_in,
                                                              const UnitCell* ucell_in,
-                                                             Grid_Driver* GridD_in,
-                                                             const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+                                                             Grid_Driver* GridD_in)
+    : hamilt::OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
 {
     this->cal_type = calculation_type::lcao_tddft_velocity;
     this->ucell = ucell_in;
-    this->LM = LM_in;
     this->Grid = GridD_in;
 #ifdef __DEBUG
     assert(this->ucell != nullptr);
 #endif
     // initialize HR to get adjs info.
     this->init_td();
-    this->initialize_HR(Grid, this->LM->ParaV);
+    this->initialize_HR(Grid);
 }
 
 // destructor
@@ -50,7 +47,7 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::init_td()
 }
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
+void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD)
 {
     if (elecstate::H_TDDFT_pw::stype != 1)
     {
@@ -401,9 +398,9 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
     {
         if (this->hR_tmp == nullptr)
         {
-            this->hR_tmp = new hamilt::HContainer<std::complex<double>>(this->LM->ParaV);
+            this->hR_tmp = new hamilt::HContainer<std::complex<double>>(this->hsk->get_pv());
             // allocate memory for hR_tmp use the same memory as hR
-            this->initialize_HR_tmp(this->LM->ParaV);
+            this->initialize_HR_tmp(this->hsk->get_pv());
             this->allocated = true;
         }
         if (this->next_sub_op != nullptr)
@@ -437,13 +434,13 @@ void hamilt::TDNonlocal<hamilt::OperatorLCAO<std::complex<double>, double>>::con
         // folding inside HR to HK
         if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
         {
-            const int nrow = this->LM->ParaV->get_row_size();
-            folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], nrow, 1);
+            const int nrow = this->hsk->get_pv()->get_row_size();
+            folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], nrow, 1);
         }
         else
         {
-            const int ncol = this->LM->ParaV->get_col_size();
-            folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], ncol, 0);
+            const int ncol = this->hsk->get_pv()->get_col_size();
+            folding_HR(*this->hR_tmp, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
         }
 
         ModuleBase::timer::tick("TDNonlocal", "contributeHk");
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
index 1275969cb6..1fbcd48029 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_nonlocal_lcao.h
@@ -34,13 +34,11 @@ template <typename TK, typename TR>
 class TDNonlocal<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
-    TDNonlocal<OperatorLCAO<TK, TR>>(LCAO_Matrix* LM_in,
+    TDNonlocal<OperatorLCAO<TK, TR>>(HS_Matrix_K<TK>* hsk_in,
                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                      hamilt::HContainer<TR>* hR_in,
-                                     std::vector<TK>* hK_in,
                                      const UnitCell* ucell_in,
-                                     Grid_Driver* GridD_in,
-                                     const Parallel_Orbitals* paraV);
+                                     Grid_Driver* GridD_in);
     ~TDNonlocal<OperatorLCAO<TK, TR>>();
 
     /**
@@ -63,8 +61,6 @@ class TDNonlocal<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     bool allocated = false;
 
-    TK* HK_pointer = nullptr;
-
     bool hR_tmp_done = false;
 
     /**
@@ -72,7 +68,7 @@ class TDNonlocal<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_HR(Grid_Driver* GridD_in);
 
     /**
      * @brief initialize HR_tmp
@@ -80,7 +76,7 @@ class TDNonlocal<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      */
     void initialize_HR_tmp(const Parallel_Orbitals* paraV);
     /// @brief init vector potential for td_nonlocal term
-    void init_td(void);
+    void init_td();
     /**
      * @brief calculate the non-local pseudopotential matrix with specific <I,J,R> atom-pairs
      * nearest neighbor atoms don't need to be calculated again
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
index bdb85cf947..45d095a7b2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
@@ -136,27 +136,24 @@ TEST_F(TNLTest, testTVNLcd2cd)
     int npol = ucell.get_npol();
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV, 1);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
     hamilt::Operator<std::complex<double>>* op
-        = new hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(nullptr,
+        = new hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(&hsk,
                                                                                                     kvec_d_in,
                                                                                                     HR,
-                                                                                                    &hk,
                                                                                                     &ucell,
                                                                                                     &gd,
-                                                                                                    &intor_,
-                                                                                                    paraV);
+                                                                                                    &intor_);
     hamilt::Operator<std::complex<double>>* op1
-        = new hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(nullptr,
+        = new hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(&hsk,
                                                                                                     kvec_d_in,
                                                                                                     HR,
-                                                                                                    &hk,
                                                                                                     &ucell,
                                                                                                     &gd,
-                                                                                                    &intor_,
-                                                                                                    paraV);
+                                                                                                    &intor_);
     // merge two Operators to a chain
     op->add(op1);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
@@ -202,6 +199,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
         }
     }
     // check the value of HK of gamma point
+    auto* hk = hsk.get_hk();
     result_ref += 1.0;
     int i = 0;
     for (int irow = 0; irow < paraV->get_row_size(); ++irow)
@@ -223,7 +221,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
     }
     // calculate HK for k point
     start_time = std::chrono::high_resolution_clock::now();
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hsk.set_zero_hk();
     op->init(1);
     end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_time2
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
index 8f61b6919a..28d63c8074 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
@@ -147,7 +147,8 @@ TEST_F(DFTUTest, constructHRd2d)
     // test for nspin=1
     GlobalV::NSPIN = 1;
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
+    hamilt::HS_Matrix_K<double> hsk(paraV, 1);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     // check some input values
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
@@ -160,7 +161,7 @@ TEST_F(DFTUTest, constructHRd2d)
     }
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
     hamilt::DFTU<hamilt::OperatorLCAO<double, double>>
-        op(nullptr, kvec_d_in, HR, &hk, ucell, &gd, &intor_, &GlobalC::dftu, *paraV);
+        op(&hsk, kvec_d_in, HR, ucell, &gd, &intor_, &GlobalC::dftu);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_time
         = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
@@ -197,7 +198,8 @@ TEST_F(DFTUTest, constructHRd2d)
     end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_time2
         = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
-    // check the value of SK
+    // check the value of HK
+    double* hk = hsk.get_hk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_NEAR(hk[i], -10.0 * test_size, 1e-10);
@@ -213,7 +215,8 @@ TEST_F(DFTUTest, constructHRd2cd)
     // test for nspin=2
     GlobalV::NSPIN = 2;
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV, 1);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
     // reset HR and DMR
@@ -224,7 +227,7 @@ TEST_F(DFTUTest, constructHRd2cd)
         HR->get_wrapper()[i] = 0.0;
     }
     hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, double>>
-        op(nullptr, kvec_d_in, HR, &hk, ucell, &gd, &intor_, &GlobalC::dftu, *paraV);
+        op(&hsk, kvec_d_in, HR, ucell, &gd, &intor_, &GlobalC::dftu);
     op.contributeHR();
     // check the occupations of dftu for spin-up
     for (int iat = 0; iat < test_size; iat++)
@@ -251,6 +254,7 @@ TEST_F(DFTUTest, constructHRd2cd)
     // calculate HK for gamma point
     op.contributeHk(0);
     // check the value of HK of gamma point
+    std::complex<double>* hk = hsk.get_hk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_NEAR(hk[i].real(), -10.0 * test_size, 1e-10);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
index 2985317391..0befa32c0d 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
@@ -104,10 +104,11 @@ class EkineticNewTest : public ::testing::Test
 TEST_F(EkineticNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
+    hamilt::HS_Matrix_K<double> hsk(paraV, 1);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     hamilt::EkineticNew<hamilt::OperatorLCAO<double, double>>
-        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, HR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -123,9 +124,10 @@ TEST_F(EkineticNewTest, constructHRd2d)
             EXPECT_EQ(tmp.get_pointer(0)[i], 1.0);
         }
     }
-    // calculate SK
+    // calculate HK
     op.contributeHk(0);
-    // check the value of SK
+    // check the value of HK
+    double* hk = hsk.get_hk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_EQ(hk[i], 1.0);
@@ -152,10 +154,11 @@ TEST_F(EkineticNewTest, constructHRd2cd)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV, 1);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, double>>
-        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, HR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -171,16 +174,17 @@ TEST_F(EkineticNewTest, constructHRd2cd)
             EXPECT_EQ(tmp.get_pointer(0)[i], 1.0);
         }
     }
-    // calculate SK for gamma point
+    // calculate HK for gamma point
     op.contributeHk(0);
-    // check the value of SK of gamma point
+    auto* hk = hsk.get_hk();
+    // check the value of HK of gamma point
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_EQ(hk[i].real(), 1.0);
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate HK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hsk.set_zero_hk();
     op.contributeHk(1);
     // check the value of HK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
index 6aafc318c5..beb5925915 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
@@ -133,14 +133,15 @@ class NonlocalNewTest : public ::testing::Test
 TEST_F(NonlocalNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
+    hamilt::HS_Matrix_K<double> hsk(paraV, true);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     // check some input values
     EXPECT_EQ(ucell.infoNL.Beta[0].get_rcut_max(), 1.0);
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
     hamilt::NonlocalNew<hamilt::OperatorLCAO<double, double>>
-        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, HR, &ucell, &gd, &intor_);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_time
         = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
@@ -169,7 +170,8 @@ TEST_F(NonlocalNewTest, constructHRd2d)
     end_time = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_time2
         = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
-    // check the value of SK
+    // check the value of HK
+    double* hk = hsk.get_hk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_EQ(hk[i], 5.0 * test_size);
@@ -205,10 +207,11 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV);
+    hsk.set_zero_hk();
     Grid_Driver gd(0, 0, 0);
     hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, double>>
-        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, HR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -224,16 +227,17 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
             EXPECT_EQ(tmp.get_pointer(0)[i], 5.0 * test_size);
         }
     }
-    // calculate SK for gamma point
+    // calculate HK for gamma point
     op.contributeHk(0);
-    // check the value of SK of gamma point
+    // check the value of HK of gamma point
+    auto* hk = hsk.get_hk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
         EXPECT_EQ(hk[i].real(), 5.0 * test_size);
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate HK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hsk.set_zero_hk();
     op.contributeHk(1);
     // check the value of HK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
index a7d637371d..298439b074 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
@@ -104,10 +104,11 @@ class OverlapNewTest : public ::testing::Test
 TEST_F(OverlapNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
-    std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
+    hamilt::HS_Matrix_K<double> hsk(paraV);
+    hsk.set_zero_sk();
     Grid_Driver gd(0, 0, 0);
     hamilt::OverlapNew<hamilt::OperatorLCAO<double, double>>
-        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, nullptr, SR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -126,9 +127,10 @@ TEST_F(OverlapNewTest, constructHRd2d)
     // calculate SK
     op.contributeHk(0);
     // check the value of SK
-    for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
+    double* sk = hsk.get_sk();
+    for (int i = 0; i < hsk.get_size(); ++i)
     {
-        EXPECT_EQ(hk[i], 1.0);
+        EXPECT_EQ(sk[i], 1.0);
     }
 }
 
@@ -136,10 +138,11 @@ TEST_F(OverlapNewTest, constructHRd2cd)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV);
+    hsk.set_zero_sk();
     Grid_Driver gd(0, 0, 0);
     hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, double>>
-        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, nullptr, SR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -158,19 +161,20 @@ TEST_F(OverlapNewTest, constructHRd2cd)
     // calculate SK for gamma point
     op.contributeHk(0);
     // check the value of SK of gamma point
+    auto* sk = hsk.get_sk();
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_EQ(hk[i].real(), 1.0);
-        EXPECT_EQ(hk[i].imag(), 0.0);
+        EXPECT_EQ(sk[i].real(), 1.0);
+        EXPECT_EQ(sk[i].imag(), 0.0);
     }
     // calculate SK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hsk.set_zero_sk();
     op.contributeHk(1);
     // check the value of SK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i].real(), -0.80901699437494723, 1e-10);
-        EXPECT_NEAR(hk[i].imag(), -0.58778525229247336, 1e-10);
+        EXPECT_NEAR(sk[i].real(), -0.80901699437494723, 1e-10);
+        EXPECT_NEAR(sk[i].imag(), -0.58778525229247336, 1e-10);
     }
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
index 171f34c2cd..26f51500a6 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
@@ -104,10 +104,11 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
     int npol = ucell.get_npol();
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
-    std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(paraV);
+    hsk.set_zero_sk();
     Grid_Driver gd(0, 0, 0);
     hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>
-        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
+        op(&hsk, kvec_d_in, nullptr, SR, &ucell, &gd, &intor_);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -139,6 +140,7 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
     // calculate SK for gamma point
     op.contributeHk(0);
     // check the value of SK of gamma point
+    auto* sk = hsk.get_sk();
     int i = 0;
     for (int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
@@ -146,19 +148,19 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
         {
             if (irow % npol == icol % npol)
             {
-                EXPECT_NEAR(hk[i].real(), 1.0, 1e-10);
-                EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
+                EXPECT_NEAR(sk[i].real(), 1.0, 1e-10);
+                EXPECT_NEAR(sk[i].imag(), 0.0, 1e-10);
             }
             else
             {
-                EXPECT_NEAR(hk[i].real(), 0.0, 1e-10);
-                EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
+                EXPECT_NEAR(sk[i].real(), 0.0, 1e-10);
+                EXPECT_NEAR(sk[i].imag(), 0.0, 1e-10);
             }
             ++i;
         }
     }
     // calculate SK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
+    hsk.set_zero_sk();
     op.contributeHk(1);
     // check the value of SK
     i = 0;
@@ -168,13 +170,13 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
         {
             if (irow % npol == icol % npol)
             {
-                EXPECT_NEAR(hk[i].real(), -0.80901699437494723, 1e-10);
-                EXPECT_NEAR(hk[i].imag(), -0.58778525229247336, 1e-10);
+                EXPECT_NEAR(sk[i].real(), -0.80901699437494723, 1e-10);
+                EXPECT_NEAR(sk[i].imag(), -0.58778525229247336, 1e-10);
             }
             else
             {
-                EXPECT_NEAR(hk[i].real(), 0.0, 1e-10);
-                EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
+                EXPECT_NEAR(sk[i].real(), 0.0, 1e-10);
+                EXPECT_NEAR(sk[i].imag(), 0.0, 1e-10);
             }
             ++i;
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_sc_lambda_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_sc_lambda_lcao.cpp
index 15efa6c7b8..d37e94efbb 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_sc_lambda_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_sc_lambda_lcao.cpp
@@ -86,17 +86,11 @@ TEST_F(ScLambdaLCAOTest, ContributeHk)
     int ncol = 2;
     paraV.set_serial(nrow, ncol);
     EXPECT_EQ(paraV.nloc, 4);
-    // set LM
-    LCAO_Matrix LM;
-    LM.ParaV = &paraV;
-    EXPECT_EQ(LM.ParaV->nloc, 4);
-    LM.Sloc2 = {
-        std::complex<double>{1.0, 0.0},
-        std::complex<double>{0.0, 0.0},
-        std::complex<double>{0.0, 0.0},
-        std::complex<double>{1.0, 0.0}
-    };
-    LM.Hloc2.resize(LM.ParaV->nloc, 0.0);
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(&paraV);
+    hsk.set_zero_hk();
+    hsk.set_zero_sk();
+    hsk.get_sk()[0] = std::complex<double>{1.0, 0.0};
+    hsk.get_sk()[3] = std::complex<double>{1.0, 0.0};
     // set sc
     SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc
         = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
@@ -122,10 +116,9 @@ TEST_F(ScLambdaLCAOTest, ContributeHk)
     EXPECT_TRUE(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER());
     // set sc_lambda_op
     auto sc_lambda_op
-        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(&LM,
+        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(&hsk,
                                                                                                      this->kvec_d,
                                                                                                      nullptr,
-                                                                                                     nullptr,
                                                                                                      isk);
     sc_lambda_op.contributeHk(0);
     std::vector<std::complex<double>> columnMajor_h_lambda = {
@@ -134,14 +127,14 @@ TEST_F(ScLambdaLCAOTest, ContributeHk)
         std::complex<double>{-1.0, -1.0},
         std::complex<double>{1.0,  0.0}
     };
-    EXPECT_DOUBLE_EQ(LM.Hloc2[0].real(), columnMajor_h_lambda[0].real());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[0].imag(), columnMajor_h_lambda[0].imag());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[1].real(), columnMajor_h_lambda[1].real());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[1].imag(), columnMajor_h_lambda[1].imag());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[2].real(), columnMajor_h_lambda[2].real());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[2].imag(), columnMajor_h_lambda[2].imag());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[3].real(), columnMajor_h_lambda[3].real());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[3].imag(), columnMajor_h_lambda[3].imag());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[0].real(), columnMajor_h_lambda[0].real());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[0].imag(), columnMajor_h_lambda[0].imag());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[1].real(), columnMajor_h_lambda[1].real());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[1].imag(), columnMajor_h_lambda[1].imag());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[2].real(), columnMajor_h_lambda[2].real());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[2].imag(), columnMajor_h_lambda[2].imag());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[3].real(), columnMajor_h_lambda[3].real());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[3].imag(), columnMajor_h_lambda[3].imag());
 }
 
 TEST_F(ScLambdaLCAOTest, ContributeHkS2)
@@ -154,13 +147,9 @@ TEST_F(ScLambdaLCAOTest, ContributeHkS2)
     paraV.set_serial(nrow, ncol);
     EXPECT_EQ(paraV.nloc, 1);
     // set LM
-    LCAO_Matrix LM;
-    LM.ParaV = &paraV;
-    EXPECT_EQ(LM.ParaV->nloc, 1);
-    LM.Sloc2 = {
-        std::complex<double>{1.0, 0.0}
-    };
-    LM.Hloc2.resize(LM.ParaV->nloc, 0.0);
+    hamilt::HS_Matrix_K<std::complex<double>> hsk(&paraV);
+    hsk.set_zero_hk();
+    hsk.get_sk()[0] = std::complex<double>{1.0, 0.0};
     // set sc
     SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>& sc
         = SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::getScInstance();
@@ -186,17 +175,16 @@ TEST_F(ScLambdaLCAOTest, ContributeHkS2)
     EXPECT_TRUE(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER());
     // set sc_lambda_op
     auto sc_lambda_op
-        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, double>>(&LM,
-                                                                                                     this->kvec_d,
-                                                                                                     nullptr,
-                                                                                                     nullptr,
-                                                                                                     isk);
+        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, double>>(&hsk,
+                                                                                    this->kvec_d,
+                                                                                    nullptr,
+                                                                                    isk);
     sc_lambda_op.contributeHk(0);
     std::vector<std::complex<double>> columnMajor_h_lambda = {
         std::complex<double>{-1.0,  0.0}
     };
-    EXPECT_DOUBLE_EQ(LM.Hloc2[0].real(), columnMajor_h_lambda[0].real());
-    EXPECT_DOUBLE_EQ(LM.Hloc2[0].imag(), columnMajor_h_lambda[0].imag());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[0].real(), columnMajor_h_lambda[0].real());
+    EXPECT_DOUBLE_EQ(hsk.get_hk()[0].imag(), columnMajor_h_lambda[0].imag());
 }
 
 TEST_F(ScLambdaLCAOTest, TemplateHelpers)
@@ -205,15 +193,13 @@ TEST_F(ScLambdaLCAOTest, TemplateHelpers)
         = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(nullptr,
                                                                                                      this->kvec_d,
                                                                                                      nullptr,
-                                                                                                     nullptr,
                                                                                                      isk);
     auto sc_lambda_op1 = hamilt::OperatorScLambda<hamilt::OperatorLCAO<std::complex<double>, double>>(nullptr,
                                                                                                       this->kvec_d,
                                                                                                       nullptr,
-                                                                                                      nullptr,
                                                                                                       isk);
     auto sc_lambda_op2
-        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<double, double>>(nullptr, this->kvec_d, nullptr, nullptr, isk);
+        = hamilt::OperatorScLambda<hamilt::OperatorLCAO<double, double>>(nullptr, this->kvec_d, nullptr, isk);
     EXPECT_NO_THROW(sc_lambda_op.contributeHR());
     EXPECT_NO_THROW(sc_lambda_op1.contributeHR());
     EXPECT_NO_THROW(sc_lambda_op2.contributeHR());
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
index db6e0a02bb..1664bc0ff2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
@@ -127,7 +127,7 @@ void hamilt::OperatorLCAO<TK, TR>::contributeHk(int ik)
     else
     {
         const int ncol = this->hR->get_atom_pair(0).get_paraV()->get_col_size();
-        hamilt::folding_HR(*this->hR, this->hK->data(), this->kvec_d[ik], ncol, 0);
+        hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
     }
 }
 template <typename TK, typename TR>
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp
index c0f3f58590..39e91219a0 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp
@@ -10,13 +10,14 @@ namespace hamilt
 // initialize_HR()
 template <typename TK, typename TR>
 void Veff<OperatorLCAO<TK, TR>>::initialize_HR(const UnitCell* ucell_in,
-                                        Grid_Driver* GridD,
-                                        const Parallel_Orbitals* paraV)
+                                        Grid_Driver* GridD)
 {
     ModuleBase::TITLE("Veff", "initialize_HR");
     ModuleBase::timer::tick("Veff", "initialize_HR");
 
     this->nspin = GlobalV::NSPIN;
+    auto* paraV = this->hR->get_paraV();// get parallel orbitals from HR
+    // TODO: if paraV is nullptr, AtomPair can not use paraV for constructor, I will repair it in the future.
 
     for (int iat1 = 0; iat1 < ucell_in->nat; iat1++)
     {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h
index f8c3aaa49d..9c97e1020f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h
@@ -1,12 +1,12 @@
 #ifndef VEFFLCAO_H
 #define VEFFLCAO_H
 #include "module_base/timer.h"
-#include "module_cell/module_neighbor/sltk_grid_driver.h"
-#include "module_cell/unitcell.h"
 #include "module_elecstate/potentials/potential_new.h"
 #include "module_hamilt_lcao/module_gint/gint_gamma.h"
 #include "module_hamilt_lcao/module_gint/gint_k.h"
 #include "operator_lcao.h"
+#include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_cell/unitcell.h"
 
 namespace hamilt
 {
@@ -24,8 +24,8 @@ class Veff : public T
 /// @brief Effective potential class, used for calculating Hamiltonian with grid integration tools
 /// If user want to separate the contribution of V_{eff} into V_{H} and V_{XC} and V_{local pseudopotential} and so on,
 /// the user can separate the Potential class into different parts, and construct different Veff class for each part.
-/// @tparam TK
-/// @tparam TR
+/// @tparam TK 
+/// @tparam TR 
 template <typename TK, typename TR>
 class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
@@ -33,40 +33,37 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     /**
      * @brief Construct a new Veff object for multi-kpoint calculation
      * @param GK_in: the pointer of Gint_k object, used for grid integration
-     */
+    */
     Veff<OperatorLCAO<TK, TR>>(Gint_k* GK_in,
-                               LCAO_Matrix* LM_in,
+                               HS_Matrix_K<TK>* hsk_in,
                                const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                elecstate::Potential* pot_in,
                                hamilt::HContainer<TR>* hR_in,
-                               std::vector<TK>* hK_in,
                                const UnitCell* ucell_in,
-                               Grid_Driver* GridD_in,
-                               const Parallel_Orbitals* paraV)
-        : GK(GK_in), pot(pot_in), ucell(ucell_in), gd(GridD_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+                               Grid_Driver* GridD_in)
+        : GK(GK_in), pot(pot_in), ucell(ucell_in),
+          gd(GridD_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_gint;
 
-        this->initialize_HR(ucell_in, GridD_in, paraV);
+        this->initialize_HR(ucell_in, GridD_in);
         GK_in->initialize_pvpR(*ucell_in, GridD_in);
     }
     /**
      * @brief Construct a new Veff object for Gamma-only calculation
      * @param GG_in: the pointer of Gint_Gamma object, used for grid integration
-     */
+    */
     Veff<OperatorLCAO<TK, TR>>(Gint_Gamma* GG_in,
-                               LCAO_Matrix* LM_in,
+                               HS_Matrix_K<TK>* hsk_in,
                                const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
                                elecstate::Potential* pot_in,
                                hamilt::HContainer<TR>* hR_in,
-                               std::vector<TK>* hK_in,
                                const UnitCell* ucell_in,
-                               Grid_Driver* GridD_in,
-                               const Parallel_Orbitals* paraV)
-        : GG(GG_in), pot(pot_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in)
+                               Grid_Driver* GridD_in)
+        : GG(GG_in), pot(pot_in), OperatorLCAO<TK, TR>(hsk_in, kvec_d_in, hR_in)
     {
         this->cal_type = calculation_type::lcao_gint;
-        this->initialize_HR(ucell_in, GridD_in, paraV);
+        this->initialize_HR(ucell_in, GridD_in);
 
         GG_in->initialize_pvpR(*ucell_in, GridD_in);
     }
@@ -76,14 +73,13 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     /**
      * @brief contributeHR() is used to calculate the HR matrix
      * <phi_{\mu, 0}|V_{eff}|phi_{\nu, R}>
-     * the contribution of V_{eff} is calculated by the contribution of V_{H} and V_{XC} and V_{local pseudopotential}
-     * and so on. grid integration is used to calculate the contribution Hamiltonian of effective potential
+     * the contribution of V_{eff} is calculated by the contribution of V_{H} and V_{XC} and V_{local pseudopotential} and so on.
+     * grid integration is used to calculate the contribution Hamiltonian of effective potential
      */
     virtual void contributeHR() override;
-
-    const UnitCell* ucell;
-    Grid_Driver* gd;
-
+  
+  const UnitCell* ucell;
+  Grid_Driver* gd;
   private:
     // used for k-dependent grid integration.
     Gint_k* GK = nullptr;
@@ -103,7 +99,8 @@ class Veff<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      * HContainer is used to store the electronic kinetic matrix with specific <I,J,R> atom-pairs
      * the size of HR will be fixed after initialization
      */
-    void initialize_HR(const UnitCell* ucell_in, Grid_Driver* GridD_in, const Parallel_Orbitals* paraV);
+    void initialize_HR(const UnitCell* ucell_in, Grid_Driver* GridD_in);
+
 };
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp
index 2387d9bb46..ec5ad82983 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp
@@ -60,7 +60,7 @@ void sparse_format::cal_dH(LCAO_Matrix& lm,
 
     LCAO_domain::build_Nonlocal_mu_new(lm,
                                        fsr_dh,
-                                       lm.Hloc_fixed.data(),
+                                       nullptr,
                                        true,
                                        GlobalC::ucell,
                                        GlobalC::ORB,
diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_h_lambda.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_h_lambda.cpp
index 6bf1265386..be60643477 100644
--- a/source/module_hamilt_lcao/module_deltaspin/cal_h_lambda.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/cal_h_lambda.cpp
@@ -7,7 +7,7 @@
 template <>
 void SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>::cal_h_lambda(
     std::complex<double>* h_lambda,
-    const std::vector<std::complex<double>>& Sloc2,
+    const std::complex<double>* Sloc2,
     bool column_major,
     int isk)
 {
diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
index 8701fe5cd8..fb88b8eb57 100644
--- a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
@@ -19,13 +19,16 @@ ModuleBase::matrix SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>:
     ModuleBase::matrix MecMulP(this->nspin_, nlocal, true), orbMulP(this->nspin_, nlocal, true);
     for(size_t ik = 0; ik != this->kv_.get_nks(); ++ik)
     {
+        std::complex<double> *sk = nullptr;
         if (this->nspin_ == 4)
         {
-            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt)->updateSk(ik, LM, 1);
+            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt)->updateSk(ik, 1);
+            sk = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt)->getSk();
         }
         else
         {
-            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->updateSk(ik, LM, 1);
+            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->updateSk(ik, 1);
+            sk = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt)->getSk();
         }
         ModuleBase::ComplexMatrix mud(this->ParaV->ncol, this->ParaV->nrow, true);
 #ifdef __MPI
@@ -43,7 +46,7 @@ ModuleBase::matrix SpinConstrain<std::complex<double>, base_device::DEVICE_CPU>:
                 &one_int,
                 &one_int,
                 this->ParaV->desc,
-                LM->Sloc2.data(),
+                sk,
                 &one_int,
                 &one_int,
                 this->ParaV->desc,
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
index 382ed88686..24151fb833 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
@@ -44,7 +44,7 @@ class SpinConstrain
                elecstate::ElecState* pelec_in);
 
   /// calculate h_lambda operator for spin-constrained DFT
-  void cal_h_lambda(std::complex<double>* h_lambda, const std::vector<std::complex<double>>& Sloc2, bool column_major, int isk);
+  void cal_h_lambda(std::complex<double>* h_lambda, const std::complex<double>* Sloc2, bool column_major, int isk);
 
   void cal_MW(const int& step, LCAO_Matrix* LM, bool print = false);
 
diff --git a/source/module_hamilt_lcao/module_deltaspin/template_helpers.cpp b/source/module_hamilt_lcao/module_deltaspin/template_helpers.cpp
index 5afd7e98a9..98721bd87b 100644
--- a/source/module_hamilt_lcao/module_deltaspin/template_helpers.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/template_helpers.cpp
@@ -2,7 +2,7 @@
 
 template <>
 void SpinConstrain<double, base_device::DEVICE_CPU>::cal_h_lambda(std::complex<double>* h_lambda,
-                                                                  const std::vector<std::complex<double>>& Sloc2,
+                                                                  const std::complex<double>* Sloc2,
                                                                   bool column_major,
                                                                   int isk)
 {
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/cal_h_lambda_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/cal_h_lambda_test.cpp
index 19984eb01b..f3312716b7 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/cal_h_lambda_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/cal_h_lambda_test.cpp
@@ -56,7 +56,7 @@ TEST_F(SpinConstrainTest, CalHLambda)
     sc_lambda[0][2] = 1.0;
     sc.set_sc_lambda(sc_lambda, 1);
     // column_major = true
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, true, 0);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), true, 0);
     // h_lambda = - [lambda_x * sigma_x + lambda_y * sigma_y + lambda_z * sigma_z] * Sloc2
     std::vector<std::complex<double>> columnMajor_h_lambda = {
         std::complex<double>{-1.0, 0.0 },
@@ -74,7 +74,7 @@ TEST_F(SpinConstrainTest, CalHLambda)
     EXPECT_DOUBLE_EQ(h_lambda[3].imag(), columnMajor_h_lambda[3].imag());
     // column_major = false
     delete[] sc_lambda;
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, false, 0);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), false, 0);
     std::vector<std::complex<double>> rowMajor_h_lambda = {
         std::complex<double>{-1.0, 0.0 },
         std::complex<double>{-1.0, -1.0},
@@ -122,14 +122,14 @@ TEST_F(SpinConstrainTest, CalHLambdaS2)
     sc_lambda[0][2] = 2.0;
     sc.set_sc_lambda(sc_lambda, 1);
     // column_major = true
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, true, 0);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), true, 0);
     // h_lambda = -Sloc2[icc] * this->lambda_[iat2][2]
     std::vector<std::complex<double>> columnMajor_h_lambda = {
         std::complex<double>{-2.0, 0.0 },
     };
     EXPECT_DOUBLE_EQ(h_lambda[0].real(), columnMajor_h_lambda[0].real());
     EXPECT_DOUBLE_EQ(h_lambda[0].imag(), columnMajor_h_lambda[0].imag());
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, true, 1);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), true, 1);
     // h_lambda = -Sloc2[icc] * (-this->lambda_[iat2][2])
     std::vector<std::complex<double>> columnMajor_h_lambda1 = {
         std::complex<double>{2.0, 0.0 },
@@ -139,14 +139,14 @@ TEST_F(SpinConstrainTest, CalHLambdaS2)
     // column_major = false
     delete[] sc_lambda;
     // h_lambda = -Sloc2[icc] * this->lambda_[iat2][2]
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, false, 0);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), false, 0);
     std::vector<std::complex<double>> rowMajor_h_lambda = {
         std::complex<double>{-2.0, 0.0 },
     };
     EXPECT_DOUBLE_EQ(h_lambda[0].real(), rowMajor_h_lambda[0].real());
     EXPECT_DOUBLE_EQ(h_lambda[0].imag(), rowMajor_h_lambda[0].imag());
     // h_lambda = -Sloc2[icc] * (-this->lambda_[iat2][2])
-    sc.cal_h_lambda(&h_lambda[0], Sloc2, false, 1);
+    sc.cal_h_lambda(&h_lambda[0], Sloc2.data(), false, 1);
     std::vector<std::complex<double>> rowMajor_h_lambda1 = {
         std::complex<double>{2.0, 0.0 },
     };
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/template_helpers_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/template_helpers_test.cpp
index d6b0f627ed..993a65184a 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/template_helpers_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/template_helpers_test.cpp
@@ -34,7 +34,7 @@ TEST_F(SpinConstrainTest, TemplatHelpers)
 {
     // this is a trivial test as the double version is not used
     std::vector<std::complex<double>> Sloc2;
-    EXPECT_NO_THROW(sc.cal_h_lambda(nullptr, Sloc2, true, 0));
+    EXPECT_NO_THROW(sc.cal_h_lambda(nullptr, Sloc2.data(), true, 0));
     EXPECT_NO_THROW(sc.cal_mw_from_lambda(0));
     EXPECT_NO_THROW(sc.cal_MW_k(nullptr, std::vector<std::vector<std::complex<double>>>(0)));
     EXPECT_NO_THROW(sc.cal_MW(0, nullptr, false));
diff --git a/source/module_hamilt_lcao/module_dftu/dftu.cpp b/source/module_hamilt_lcao/module_dftu/dftu.cpp
index 14be04406e..282b9fba32 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu.cpp
@@ -36,7 +36,7 @@ DFTU::~DFTU()
 }
 
 void DFTU::init(UnitCell& cell, // unitcell class
-                LCAO_Matrix& lm,
+                const Parallel_Orbitals* pv,
                 const int& nks)
 {
     ModuleBase::TITLE("DFTU", "init");
@@ -46,7 +46,7 @@ void DFTU::init(UnitCell& cell, // unitcell class
     exit(0);
 #endif
 
-    this->LM = &lm;
+    this->paraV = pv;
 
     // needs reconstructions in future
     // global parameters, need to be removed in future
diff --git a/source/module_hamilt_lcao/module_dftu/dftu.h b/source/module_hamilt_lcao/module_dftu/dftu.h
index ee8d1ff7cc..c673b36eb9 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu.h
+++ b/source/module_hamilt_lcao/module_dftu/dftu.h
@@ -8,7 +8,6 @@
 #include "module_cell/klist.h"
 #include "module_cell/unitcell.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
 #include "module_elecstate/module_charge/charge_mixing.h"
 #include "module_hamilt_general/hamilt.h"
 #include "module_elecstate/elecstate.h"
@@ -39,7 +38,7 @@ class DFTU
   public:
     // allocate relevant data strcutures
     void init(UnitCell& cell, // unitcell class
-              LCAO_Matrix& lm,
+              const Parallel_Orbitals* pv,
               const int& nks);
 
     // calculate the energy correction
@@ -57,7 +56,7 @@ class DFTU
 
     double EU; //+U energy
   private:
-    LCAO_Matrix* LM;
+    const Parallel_Orbitals* paraV = nullptr;
     int cal_type = 3; // 1:dftu_tpye=1, dc=1; 2:dftu_type=1, dc=2; 3:dftu_tpye=2, dc=1; 4:dftu_tpye=2, dc=2;
     
     // transform between iwt index and it, ia, L, N and m index
@@ -69,8 +68,8 @@ class DFTU
     // For calculating contribution to Hamiltonian matrices
     //=============================================================
   public:
-    void cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot, const std::vector<int>& isk);
-    void cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk);
+    void cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot, const std::vector<int>& isk, const std::complex<double>* sk);
+    void cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk, const double* sk);
     void cal_eff_pot_mat_R_double(const int ispin, double* SR, double* HR);
     void cal_eff_pot_mat_R_complex_double(const int ispin, std::complex<double>* SR, std::complex<double>* HR);
 
@@ -82,7 +81,7 @@ class DFTU
   public:
     // calculate the local occupation number matrix
     void cal_occup_m_k(const int iter, const std::vector<std::vector<std::complex<double>>>& dm_k, const K_Vectors& kv, const double& mixing_beta, hamilt::Hamilt<std::complex<double>>* p_ham);
-    void cal_occup_m_gamma(const int iter, const std::vector<std::vector<double>>& dm_gamma, const double& mixing_beta);
+    void cal_occup_m_gamma(const int iter, const std::vector<std::vector<double>>& dm_gamma, const double& mixing_beta, hamilt::Hamilt<double>* p_ham);
 
     // dftu can be calculated only after locale has been initialed
     bool initialed_locale = false;
@@ -162,7 +161,6 @@ class DFTU
   public:
 
    void force_stress(const elecstate::ElecState* pelec,
-		   LCAO_Matrix& lm,
 		   const Parallel_Orbitals& pv,
 		   ForceStressArrays& fsr,
 		   ModuleBase::matrix& force_dftu,
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
index 8b58dd8344..0abaac6ef7 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
@@ -282,17 +282,17 @@ void DFTU::folding_matrix_k_new(const int ik,
     // get SR and fold to mat_k
     if(GlobalV::GAMMA_ONLY_LOCAL)
     {
-        dynamic_cast<hamilt::HamiltLCAO<double, double>*>(p_ham)->updateSk(ik, this->LM, hk_type);
+        dynamic_cast<hamilt::HamiltLCAO<double, double>*>(p_ham)->updateSk(ik, hk_type);
     }
     else
     {
         if(GlobalV::NSPIN != 4)
         {
-            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(ik, this->LM, hk_type);
+            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(ik, hk_type);
         }
         else
         {
-            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(ik, this->LM, hk_type);
+            dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(ik, hk_type);
         }
     }
 }
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
index bc73bf878d..4354f3b9ac 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
@@ -72,7 +72,6 @@ namespace ModuleDFTU
 {
 
 void DFTU::force_stress(const elecstate::ElecState* pelec,
-                        LCAO_Matrix& lm,
                         const Parallel_Orbitals& pv,
                         ForceStressArrays& fsr, // mohan add 2024-06-16
                         ModuleBase::matrix& force_dftu,
@@ -84,8 +83,6 @@ void DFTU::force_stress(const elecstate::ElecState* pelec,
 
     const int nlocal = GlobalV::NLOCAL;
 
-    this->LM = &lm;
-
     if (GlobalV::CAL_FORCE)
     {
         force_dftu.zero_out();
@@ -643,14 +640,14 @@ void DFTU::cal_stress_gamma(const UnitCell& ucell,
                     pv.desc);
 #endif
 
-            for (int ir = 0; ir < this->LM->ParaV->nrow; ir++)
+            for (int ir = 0; ir < this->paraV->nrow; ir++)
             {
-                const int iwt1 = this->LM->ParaV->local2global_row(ir);
+                const int iwt1 = this->paraV->local2global_row(ir);
 
-                for (int ic = 0; ic < this->LM->ParaV->ncol; ic++)
+                for (int ic = 0; ic < this->paraV->ncol; ic++)
                 {
-                    const int iwt2 = this->LM->ParaV->local2global_col(ic);
-                    const int irc = ic * this->LM->ParaV->nrow + ir;
+                    const int iwt2 = this->paraV->local2global_col(ic);
+                    const int irc = ic * this->paraV->nrow + ir;
 
                     if (iwt1 == iwt2)
                         stress_dftu(dim1, dim2) += 2.0 * dm_VU_sover[irc];
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
index 15b7f844c2..717dcb853c 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
@@ -6,7 +6,7 @@
 namespace ModuleDFTU
 {
 
-void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot, const std::vector<int>& isk)
+void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot, const std::vector<int>& isk, const std::complex<double>* sk)
 {
     ModuleBase::TITLE("DFTU", "cal_eff_pot_mat");
     ModuleBase::timer::tick("DFTU", "cal_eff_pot_mat");
@@ -18,7 +18,7 @@ void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot,
 
     int spin = isk[ik];
 
-    ModuleBase::GlobalFunc::ZEROS(eff_pot, this->LM->ParaV->nloc);
+    ModuleBase::GlobalFunc::ZEROS(eff_pot, this->paraV->nloc);
 
     //=============================================================
     //   PART2: call pblas to calculate effective potential matrix
@@ -29,35 +29,35 @@ void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot,
     const std::complex<double> half = 0.5;
     const std::complex<double> zero = 0.0;
 
-    std::vector<std::complex<double>> VU(this->LM->ParaV->nloc);
+    std::vector<std::complex<double>> VU(this->paraV->nloc);
     this->cal_VU_pot_mat_complex(spin, true, &VU[0]);
 
 #ifdef __MPI
 	pzgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc,
-            this->LM->Sloc2.data(), &one_int, &one_int, this->LM->ParaV->desc,
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc,
+            sk, &one_int, &one_int, this->paraV->desc,
             &zero,
-            eff_pot, &one_int, &one_int, this->LM->ParaV->desc);
+            eff_pot, &one_int, &one_int, this->paraV->desc);
 #endif
 
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    for (int irc = 0; irc < this->paraV->nloc; irc++)
         VU[irc] = eff_pot[irc];
 
 #ifdef __MPI
   	pztranc_(&GlobalV::NLOCAL, &GlobalV::NLOCAL, 
             &one, 
-            &VU[0], &one_int, &one_int, this->LM->ParaV->desc, 
+            &VU[0], &one_int, &one_int, this->paraV->desc, 
             &one, 
-            eff_pot, &one_int, &one_int, this->LM->ParaV->desc);
+            eff_pot, &one_int, &one_int, this->paraV->desc);
 #endif
 
     ModuleBase::timer::tick("DFTU", "cal_eff_pot_mat");
     return;
 }
 
-void DFTU::cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk)
+void DFTU::cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk, const double* sk)
 {
     ModuleBase::TITLE("DFTU", "cal_eff_pot_mat");
     ModuleBase::timer::tick("DFTU", "cal_eff_pot_mat");
@@ -69,7 +69,7 @@ void DFTU::cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector
 
     int spin = isk[ik];
 
-    ModuleBase::GlobalFunc::ZEROS(eff_pot, this->LM->ParaV->nloc);
+    ModuleBase::GlobalFunc::ZEROS(eff_pot, this->paraV->nloc);
 
     //=============================================================
     //   PART2: call pblas to calculate effective potential matrix
@@ -78,28 +78,28 @@ void DFTU::cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector
     int one_int = 1;
     double alpha = 1.0, beta = 0.0, half = 0.5, one = 1.0;
 
-    std::vector<double> VU(this->LM->ParaV->nloc);
+    std::vector<double> VU(this->paraV->nloc);
     this->cal_VU_pot_mat_real(spin, 1, &VU[0]);
 
 #ifdef __MPI
 	pdgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc, 
-            this->LM->Sloc.data(), &one_int, &one_int, this->LM->ParaV->desc,
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc, 
+            sk, &one_int, &one_int, this->paraV->desc,
             &beta,
-            eff_pot, &one_int, &one_int, this->LM->ParaV->desc);
+            eff_pot, &one_int, &one_int, this->paraV->desc);
 #endif
 
-    for (int irc = 0; irc < this->LM->ParaV->nloc; irc++)
+    for (int irc = 0; irc < this->paraV->nloc; irc++)
         VU[irc] = eff_pot[irc];
 
 #ifdef __MPI
 	pdtran_(&GlobalV::NLOCAL, &GlobalV::NLOCAL, 
             &one, 
-            &VU[0], &one_int, &one_int, this->LM->ParaV->desc, 
+            &VU[0], &one_int, &one_int, const_cast<int*>(this->paraV->desc), 
             &one, 
-            eff_pot, &one_int, &one_int, this->LM->ParaV->desc);
+            eff_pot, &one_int, &one_int, const_cast<int*>(this->paraV->desc));
 #endif
 
     ModuleBase::timer::tick("DFTU", "cal_eff_pot_mat");
@@ -112,25 +112,25 @@ void DFTU::cal_eff_pot_mat_R_double(const int ispin, double* SR, double* HR)
     const int one_int = 1;
     const double alpha = 1.0, beta = 0.0, one = 1.0, half = 0.5;
 
-    std::vector<double> VU(this->LM->ParaV->nloc);
+    std::vector<double> VU(this->paraV->nloc);
     this->cal_VU_pot_mat_real(ispin, 1, &VU[0]);
 
 #ifdef __MPI
     pdgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc, 
-            SR, &one_int, &one_int, this->LM->ParaV->desc,
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc, 
+            SR, &one_int, &one_int, this->paraV->desc,
             &beta,
-            HR, &one_int, &one_int, this->LM->ParaV->desc);
+            HR, &one_int, &one_int, this->paraV->desc);
 
     pdgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            SR, &one_int, &one_int, this->LM->ParaV->desc, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc,
+            SR, &one_int, &one_int, this->paraV->desc, 
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc,
             &one,
-            HR, &one_int, &one_int, this->LM->ParaV->desc);
+            HR, &one_int, &one_int, this->paraV->desc);
 #endif
 
     return;
@@ -142,25 +142,25 @@ void DFTU::cal_eff_pot_mat_R_complex_double(const int ispin, std::complex<double
     const int one_int = 1;
     const std::complex<double> zero = 0.0, one = 1.0, half = 0.5;
 
-    std::vector<std::complex<double>> VU(this->LM->ParaV->nloc);
+    std::vector<std::complex<double>> VU(this->paraV->nloc);
     this->cal_VU_pot_mat_complex(ispin, 1, &VU[0]);
 
 #ifdef __MPI
     pzgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc,
-            SR, &one_int, &one_int, this->LM->ParaV->desc,
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc,
+            SR, &one_int, &one_int, this->paraV->desc,
             &zero,
-            HR, &one_int, &one_int, this->LM->ParaV->desc);
+            HR, &one_int, &one_int, this->paraV->desc);
 
     pzgemm_(&transN, &transN,
             &GlobalV::NLOCAL, &GlobalV::NLOCAL, &GlobalV::NLOCAL,
             &half, 
-            SR, &one_int, &one_int, this->LM->ParaV->desc, 
-            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->LM->ParaV->desc,
+            SR, &one_int, &one_int, this->paraV->desc, 
+            ModuleBase::GlobalFunc::VECTOR_TO_PTR(VU), &one_int, &one_int, this->paraV->desc,
             &one,
-            HR, &one_int, &one_int, this->LM->ParaV->desc);
+            HR, &one_int, &one_int, this->paraV->desc);
 #endif
 
     return;
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp b/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
index 5242dd96f3..ae710d93dd 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
@@ -1,6 +1,7 @@
 #include "dftu.h"
 #include "module_base/timer.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
 
 extern "C"
 {
@@ -154,13 +155,21 @@ void DFTU::cal_occup_m_k(const int iter,
     const int one_int = 1;
     const std::complex<double> beta(0.0,0.0), alpha(1.0,0.0);
 
-    std::vector<std::complex<double>> srho(this->LM->ParaV->nloc);
+    std::vector<std::complex<double>> srho(this->paraV->nloc);
 
     for (int ik = 0; ik < kv.get_nks(); ik++)
     {
         // srho(mu,nu) = \sum_{iw} S(mu,iw)*dm_k(iw,nu)
         this->folding_matrix_k_new(ik, p_ham);
-        std::complex<double>* s_k_pointer = this->LM->Sloc2.data();
+        std::complex<double>* s_k_pointer = nullptr;
+        if(GlobalV::NSPIN != 4)
+        {
+            s_k_pointer = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->getSk();
+        }
+        else
+        {
+            s_k_pointer = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->getSk();
+        }
 
 #ifdef __MPI
         pzgemm_(&transN,
@@ -172,17 +181,17 @@ void DFTU::cal_occup_m_k(const int iter,
                 s_k_pointer,
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc,
+                this->paraV->desc,
                 dm_k[ik].data(),
                 //dm_k[ik].c,
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc,
+                this->paraV->desc,
                 &beta,
                 &srho[0],
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc);
+                this->paraV->desc);
 #endif
 
         const int spin = kv.isk[ik];
@@ -217,19 +226,19 @@ void DFTU::cal_occup_m_k(const int iter,
                             for (int ipol0 = 0; ipol0 < GlobalV::NPOL; ipol0++)
                             {
                                 const int iwt0 = this->iatlnmipol2iwt[iat][l][n][m0][ipol0];
-                                const int mu = this->LM->ParaV->global2local_row(iwt0);
-                                const int mu_prime = this->LM->ParaV->global2local_col(iwt0);
+                                const int mu = this->paraV->global2local_row(iwt0);
+                                const int mu_prime = this->paraV->global2local_col(iwt0);
 
                                 for (int m1 = 0; m1 < 2 * l + 1; m1++)
                                 {
                                     for (int ipol1 = 0; ipol1 < GlobalV::NPOL; ipol1++)
                                     {
                                         const int iwt1 = this->iatlnmipol2iwt[iat][l][n][m1][ipol1];
-                                        const int nu = this->LM->ParaV->global2local_col(iwt1);
-                                        const int nu_prime = this->LM->ParaV->global2local_row(iwt1);
+                                        const int nu = this->paraV->global2local_col(iwt1);
+                                        const int nu_prime = this->paraV->global2local_row(iwt1);
 
-                                        const int irc = nu * this->LM->ParaV->nrow + mu;
-                                        const int irc_prime = mu_prime * this->LM->ParaV->nrow + nu_prime;
+                                        const int irc = nu * this->paraV->nrow + mu;
+                                        const int irc_prime = mu_prime * this->paraV->nrow + nu_prime;
 
                                         const int m0_all = m0 + ipol0 * (2 * l + 1);
                                         const int m1_all = m1 + ipol1 * (2 * l + 1);
@@ -344,7 +353,7 @@ void DFTU::cal_occup_m_k(const int iter,
     return;
 }
 
-void DFTU::cal_occup_m_gamma(const int iter, const std::vector<std::vector<double>> &dm_gamma, const double& mixing_beta)
+void DFTU::cal_occup_m_gamma(const int iter, const std::vector<std::vector<double>> &dm_gamma, const double& mixing_beta, hamilt::Hamilt<double>* p_ham)
 {
     ModuleBase::TITLE("DFTU", "cal_occup_m_gamma");
     ModuleBase::timer::tick("DFTU", "cal_occup_m_gamma");
@@ -357,11 +366,11 @@ void DFTU::cal_occup_m_gamma(const int iter, const std::vector<std::vector<doubl
     const int one_int = 1;
     const double alpha = 1.0, beta = 0.0;
 
-    std::vector<double> srho(this->LM->ParaV->nloc);
+    std::vector<double> srho(this->paraV->nloc);
     for (int is = 0; is < GlobalV::NSPIN; is++)
     {
         // srho(mu,nu) = \sum_{iw} S(mu,iw)*dm_gamma(iw,nu)
-        double* s_gamma_pointer = this->LM->Sloc.data();
+        double* s_gamma_pointer = dynamic_cast<hamilt::HamiltLCAO<double, double>*>(p_ham)->getSk();
 
 #ifdef __MPI
         pdgemm_(&transN,
@@ -373,17 +382,17 @@ void DFTU::cal_occup_m_gamma(const int iter, const std::vector<std::vector<doubl
                 s_gamma_pointer,
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc,
+                this->paraV->desc,
                 dm_gamma[is].data(),
                 //dm_gamma[is].c,
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc,
+                this->paraV->desc,
                 &beta,
                 &srho[0],
                 &one_int,
                 &one_int,
-                this->LM->ParaV->desc);
+                this->paraV->desc);
 #endif
 
         for (int it = 0; it < GlobalC::ucell.ntype; it++)
@@ -413,19 +422,19 @@ void DFTU::cal_occup_m_gamma(const int iter, const std::vector<std::vector<doubl
                             for (int ipol0 = 0; ipol0 < GlobalV::NPOL; ipol0++)
                             {
                                 const int iwt0 = this->iatlnmipol2iwt[iat][l][n][m0][ipol0];
-                                const int mu = this->LM->ParaV->global2local_row(iwt0);
-                                const int mu_prime = this->LM->ParaV->global2local_col(iwt0);
+                                const int mu = this->paraV->global2local_row(iwt0);
+                                const int mu_prime = this->paraV->global2local_col(iwt0);
 
                                 for (int m1 = 0; m1 < 2 * l + 1; m1++)
                                 {
                                     for (int ipol1 = 0; ipol1 < GlobalV::NPOL; ipol1++)
                                     {
                                         const int iwt1 = this->iatlnmipol2iwt[iat][l][n][m1][ipol1];
-                                        const int nu = this->LM->ParaV->global2local_col(iwt1);
-                                        const int nu_prime = this->LM->ParaV->global2local_row(iwt1);
+                                        const int nu = this->paraV->global2local_col(iwt1);
+                                        const int nu_prime = this->paraV->global2local_row(iwt1);
 
-                                        const int irc = nu * this->LM->ParaV->nrow + mu;
-                                        const int irc_prime = mu_prime * this->LM->ParaV->nrow + nu_prime;
+                                        const int irc = nu * this->paraV->nrow + mu;
+                                        const int irc_prime = mu_prime * this->paraV->nrow + nu_prime;
 
                                         if ((nu >= 0) && (mu >= 0))
                                         {
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
index c48cfd925b..9efbdb224f 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
@@ -8,7 +8,7 @@ namespace ModuleDFTU
 void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::complex<double>* VU)
 {
     ModuleBase::TITLE("DFTU", "cal_VU_pot_mat_complex");
-    ModuleBase::GlobalFunc::ZEROS(VU, this->LM->ParaV->nloc);
+    ModuleBase::GlobalFunc::ZEROS(VU, this->paraV->nloc);
 
     for (int it = 0; it < GlobalC::ucell.ntype; ++it)
     {
@@ -31,7 +31,7 @@ void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::com
                     {
                         for (int ipol1 = 0; ipol1 < GlobalV::NPOL; ipol1++)
                         {
-                            const int mu = this->LM->ParaV->global2local_row(this->iatlnmipol2iwt[iat][L][n][m1][ipol1]);
+                            const int mu = this->paraV->global2local_row(this->iatlnmipol2iwt[iat][L][n][m1][ipol1]);
                             if (mu < 0)
                                 continue;
 
@@ -40,7 +40,7 @@ void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::com
                                 for (int ipol2 = 0; ipol2 < GlobalV::NPOL; ipol2++)
                                 {
                                     const int nu
-                                        = this->LM->ParaV->global2local_col(this->iatlnmipol2iwt[iat][L][n][m2][ipol2]);
+                                        = this->paraV->global2local_col(this->iatlnmipol2iwt[iat][L][n][m2][ipol2]);
                                     if (nu < 0)
                                         continue;
 
@@ -48,7 +48,7 @@ void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::com
                                     int m2_all = m2 + (2 * L + 1) * ipol2;
 
                                     double val = get_onebody_eff_pot(it, iat, L, n, spin, m1_all, m2_all, newlocale);
-                                    VU[nu * this->LM->ParaV->nrow + mu] = std::complex<double>(val, 0.0);
+                                    VU[nu * this->paraV->nrow + mu] = std::complex<double>(val, 0.0);
                                 } // ipol2
                             } // m2
                         } // ipol1
@@ -64,7 +64,7 @@ void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::com
 void DFTU::cal_VU_pot_mat_real(const int spin, const bool newlocale, double* VU)
 {
     ModuleBase::TITLE("DFTU", "cal_VU_pot_mat_real");
-    ModuleBase::GlobalFunc::ZEROS(VU, this->LM->ParaV->nloc);
+    ModuleBase::GlobalFunc::ZEROS(VU, this->paraV->nloc);
 
     for (int it = 0; it < GlobalC::ucell.ntype; ++it)
     {
@@ -87,7 +87,7 @@ void DFTU::cal_VU_pot_mat_real(const int spin, const bool newlocale, double* VU)
                     {
                         for (int ipol1 = 0; ipol1 < GlobalV::NPOL; ipol1++)
                         {
-                            const int mu = this->LM->ParaV->global2local_row(this->iatlnmipol2iwt[iat][L][n][m1][ipol1]);
+                            const int mu = this->paraV->global2local_row(this->iatlnmipol2iwt[iat][L][n][m1][ipol1]);
                             if (mu < 0)
                                 continue;
                             for (int m2 = 0; m2 < 2 * L + 1; m2++)
@@ -95,14 +95,14 @@ void DFTU::cal_VU_pot_mat_real(const int spin, const bool newlocale, double* VU)
                                 for (int ipol2 = 0; ipol2 < GlobalV::NPOL; ipol2++)
                                 {
                                     const int nu
-                                        = this->LM->ParaV->global2local_col(this->iatlnmipol2iwt[iat][L][n][m2][ipol2]);
+                                        = this->paraV->global2local_col(this->iatlnmipol2iwt[iat][L][n][m2][ipol2]);
                                     if (nu < 0)
                                         continue;
 
                                     int m1_all = m1 + (2 * L + 1) * ipol1;
                                     int m2_all = m2 + (2 * L + 1) * ipol2;
 
-                                    VU[nu * this->LM->ParaV->nrow + mu]
+                                    VU[nu * this->paraV->nrow + mu]
                                         = this->get_onebody_eff_pot(it, iat, L, n, spin, m1_all, m2_all, newlocale);
 
                                 } // ipol2
diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h
index 269c4ff93f..79105ce340 100644
--- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h
+++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h
@@ -419,6 +419,12 @@ class HContainer
         return paraV->get_global_row_size();
     }
 
+    /**
+     * @brief get parallel orbital pointer to check parallel information
+     * @return const Parallel_Orbitals* , if return is nullptr, it means HContainer is not in parallel mode
+     */
+    const Parallel_Orbitals* get_paraV() const{return paraV;}
+
   private:
     // i-j atom pairs, sorted by matrix of (atom_i, atom_j)
     std::vector<AtomPair<T>> atom_pairs;
diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index 517fad4d0c..b8abf642d2 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -1,207 +1,204 @@
 #include "diago_cusolver.h"
-#include "module_base/global_variable.h"
-#include "module_base/timer.h"
+
 #include "module_base/blacs_connector.h"
+#include "module_base/global_variable.h"
 #include "module_base/scalapack_connector.h"
+#include "module_base/timer.h"
 
+#include <memory>
 #include <type_traits>
 #include <vector>
-#include <memory>
 
 using complex = std::complex<double>;
 
 // Namespace for the diagonalization solver
 namespace hsolver
 {
-    // this struct is used for collecting matrices from all processes to root process
-    template <typename T> struct Matrix_g
-    {
-        std::shared_ptr<T> p;
-        size_t row;
-        size_t col;
-        std::shared_ptr<int> desc;
-    };
-
-    // Initialize the DecomposedState variable for real and complex numbers
-    template <typename T>
-    int DiagoCusolver<T>::DecomposedState = 0;
-
-    template <typename T>
-    DiagoCusolver<T>::DiagoCusolver(const Parallel_Orbitals* ParaV)
-    {
-        this->ParaV = ParaV;
-    }
+// this struct is used for collecting matrices from all processes to root process
+template <typename T>
+struct Matrix_g
+{
+    std::shared_ptr<T> p;
+    size_t row;
+    size_t col;
+    std::shared_ptr<int> desc;
+};
+
+// Initialize the DecomposedState variable for real and complex numbers
+template <typename T>
+int DiagoCusolver<T>::DecomposedState = 0;
+
+template <typename T>
+DiagoCusolver<T>::DiagoCusolver(const Parallel_Orbitals* ParaV)
+{
+    this->ParaV = ParaV;
+}
+
+template <typename T>
+DiagoCusolver<T>::~DiagoCusolver()
+{
+}
+
+// Wrapper for pdgemr2d and pzgemr2d
+// static inline void Cpxgemr2d(
+//    const int M, const int N,
+//    double *a, const int ia, const int ja, const int *desca,
+//    double *b, const int ib, const int jb, const int *descb,
+//    const int blacs_ctxt)
+//{
+//    pdgemr2d_(&M, &N,
+//              a, &ia, &ja, desca,
+//              b, &ib, &jb, descb,
+//              &blacs_ctxt);
+//}
+//
+// static inline void Cpxgemr2d(
+//    const int M, const int N,
+//    complex *a, const int ia, const int ja, const int *desca,
+//    complex *b, const int ib, const int jb, const int *descb,
+//    const int blacs_ctxt)
+//{
+//    pzgemr2d_(&M, &N,
+//              a, &ia, &ja, desca,
+//              b, &ib, &jb, descb,
+//              &blacs_ctxt);
+//}
+
+// Use Cpxgemr2d to collect matrices from all processes to root process
+template <typename mat, typename matg>
+static void gatherMatrix(const int myid, const int root_proc, const mat& mat_l, matg& mat_g)
+{
+    auto a = mat_l.p;
+    const int* desca = mat_l.desc;
+    int ctxt = desca[1];
+    int nrows = desca[2];
+    int ncols = desca[3];
 
-    template <typename T>
-    DiagoCusolver<T>::~DiagoCusolver()
+    if (myid == root_proc)
     {
+        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
     }
-    
-    // Wrapper for pdgemr2d and pzgemr2d
-    //static inline void Cpxgemr2d(
-    //    const int M, const int N,
-    //    double *a, const int ia, const int ja, const int *desca,
-    //    double *b, const int ib, const int jb, const int *descb,
-    //    const int blacs_ctxt) 
-    //{
-    //    pdgemr2d_(&M, &N,
-    //              a, &ia, &ja, desca,
-    //              b, &ib, &jb, descb,
-    //              &blacs_ctxt);
-    //}
-//
-    //static inline void Cpxgemr2d(
-    //    const int M, const int N,
-    //    complex *a, const int ia, const int ja, const int *desca,
-    //    complex *b, const int ib, const int jb, const int *descb,
-    //    const int blacs_ctxt) 
-    //{
-    //    pzgemr2d_(&M, &N,
-    //              a, &ia, &ja, desca,
-    //              b, &ib, &jb, descb,
-    //              &blacs_ctxt);
-    //}
-
-    // Use Cpxgemr2d to collect matrices from all processes to root process
-    template <typename mat, typename matg>
-    static void gatherMatrix(const int myid,
-                             const int root_proc,
-                             const mat& mat_l,
-                             matg& mat_g)
+    else
     {
-        auto a = mat_l.p;
-        int* desca = mat_l.desc;
-        int ctxt = desca[1];
-        int nrows = desca[2];
-        int ncols = desca[3];
-
-        if (myid == root_proc)
-            {
-                mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[nrows * ncols]);
-            }
-        else
-            {
-                mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
-            }
+        mat_g.p.reset(new typename std::remove_reference<decltype(*a)>::type[1]);
+    }
 
-        // Set descb, which has all elements in the only block in the root process
-        mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
+    // Set descb, which has all elements in the only block in the root process
+    mat_g.desc.reset(new int[9]{1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows});
 
-        mat_g.row = nrows;
-        mat_g.col = ncols;
+    mat_g.row = nrows;
+    mat_g.col = ncols;
 
-        Cpxgemr2d(nrows, ncols, a, 1, 1, desca, mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
-    }
+    Cpxgemr2d(nrows, ncols, a, 1, 1, const_cast<int*>(desca), mat_g.p.get(), 1, 1, mat_g.desc.get(), ctxt);
+}
 
-    // Convert the Psi to a 2D block storage format
-    template <typename T>
-    static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
-    {
-        int ctxt = desc_psi[1];
-        int nrows = desc_psi[2];
-        int ncols = desc_psi[3];
-        int rsrc = desc_psi[6];
-        int csrc = desc_psi[7];
+// Convert the Psi to a 2D block storage format
+template <typename T>
+static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
+{
+    int ctxt = desc_psi[1];
+    int nrows = desc_psi[2];
+    int ncols = desc_psi[3];
+    int rsrc = desc_psi[6];
+    int csrc = desc_psi[7];
 
-        int descg[9] = {1, ctxt, nrows, ncols, nrows, ncols, rsrc, csrc, nrows};
-        int descl[9];
+    int descg[9] = {1, ctxt, nrows, ncols, nrows, ncols, rsrc, csrc, nrows};
+    int descl[9];
 
-        std::copy(desc_psi, desc_psi + 9, descl);
+    std::copy(desc_psi, desc_psi + 9, descl);
 
-        Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
-    }
+    Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
+}
 
-    // Diagonalization function
-    template <typename T>
-    void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in,
-                                psi::Psi<T>& psi,
-                                Real* eigenvalue_in)
-    {
-        // Output the title for the current operation
-        ModuleBase::TITLE("DiagoCusolver", "diag");
+// Diagonalization function
+template <typename T>
+void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in, psi::Psi<T>& psi, Real* eigenvalue_in)
+{
+    // Output the title for the current operation
+    ModuleBase::TITLE("DiagoCusolver", "diag");
 
-        // Create matrices for the Hamiltonian and overlap
-        hamilt::MatrixBlock<T> h_mat;
-        hamilt::MatrixBlock<T> s_mat;
-        phm_in->matrix(h_mat, s_mat);
+    // Create matrices for the Hamiltonian and overlap
+    hamilt::MatrixBlock<T> h_mat;
+    hamilt::MatrixBlock<T> s_mat;
+    phm_in->matrix(h_mat, s_mat);
 
 #ifdef __MPI
-        // global matrix
-        Matrix_g<T> h_mat_g;
-        Matrix_g<T> s_mat_g;
-
-        // get the context and process information
-        int ctxt = ParaV->blacs_ctxt;
-        int nprows = 0;
-        int npcols = 0;
-        int myprow = 0;
-        int mypcol = 0;
-        Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
-        const int num_procs = nprows * npcols;
-        const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
-        const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
+    // global matrix
+    Matrix_g<T> h_mat_g;
+    Matrix_g<T> s_mat_g;
+
+    // get the context and process information
+    int ctxt = ParaV->blacs_ctxt;
+    int nprows = 0;
+    int npcols = 0;
+    int myprow = 0;
+    int mypcol = 0;
+    Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
+    const int num_procs = nprows * npcols;
+    const int myid = Cblacs_pnum(ctxt, myprow, mypcol);
+    const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
 #endif
 
-        // Allocate memory for eigenvalues
-        std::vector<double> eigen(GlobalV::NLOCAL, 0.0);
+    // Allocate memory for eigenvalues
+    std::vector<double> eigen(GlobalV::NLOCAL, 0.0);
 
-        // Start the timer for the cusolver operation
-        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
+    // Start the timer for the cusolver operation
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
 #ifdef __MPI
-        if(num_procs > 1)
-        {
-            // gather matrices from processes to root process
-            gatherMatrix(myid, root_proc, h_mat, h_mat_g);
-            gatherMatrix(myid, root_proc, s_mat, s_mat_g);
-        }
+    if (num_procs > 1)
+    {
+        // gather matrices from processes to root process
+        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
+        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+    }
 #endif
 
-        // Call the dense diagonalization routine
+    // Call the dense diagonalization routine
 #ifdef __MPI
-        if(num_procs > 1)
+    if (num_procs > 1)
+    {
+        MPI_Barrier(MPI_COMM_WORLD);
+        // global psi for distribute
+        int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
+        std::vector<T> psi_g(psi_len);
+        if (myid == root_proc)
         {
-            MPI_Barrier(MPI_COMM_WORLD);
-            // global psi for distribute
-            int psi_len = myid == root_proc ? h_mat_g.row * h_mat_g.col : 1;
-            std::vector<T> psi_g(psi_len);
-            if (myid == root_proc)
-            {
-                this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
-            }
-
-            MPI_Barrier(MPI_COMM_WORLD);
-            
-            // broadcast eigenvalues to all processes
-            MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
-
-            // distribute psi to all processes
-            distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
-        }
-        else
-        {   
-            // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
-            std::vector<T> eigenvectors(h_mat.row * h_mat.col);
-            this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-            const int size = psi.get_nbands() * psi.get_nbasis();
-            BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
+            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p.get(), s_mat_g.p.get(), eigen.data(), psi_g.data());
         }
+
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        // broadcast eigenvalues to all processes
+        MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+
+        // distribute psi to all processes
+        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g.data());
+    }
+    else
+    {
+        // Be careful that h_mat.row * h_mat.col != psi.get_nbands() * psi.get_nbasis() under multi-k situation
+        std::vector<T> eigenvectors(h_mat.row * h_mat.col);
+        this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
+        const int size = psi.get_nbands() * psi.get_nbasis();
+        BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
+    }
 #else
-            std::vector<T> eigenvectors(h_mat.row * h_mat.col);
-            this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
-            const int size = psi.get_nbands() * psi.get_nbasis();
-            BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
+    std::vector<T> eigenvectors(h_mat.row * h_mat.col);
+    this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors.data());
+    const int size = psi.get_nbands() * psi.get_nbasis();
+    BlasConnector::copy(size, eigenvectors.data(), 1, psi.get_pointer(), 1);
 #endif
-        // Stop the timer for the cusolver operation
-        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
+    // Stop the timer for the cusolver operation
+    ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
-        // Copy the eigenvalues to the output arrays
-        const int inc = 1;
-        BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
-    }
+    // Copy the eigenvalues to the output arrays
+    const int inc = 1;
+    BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
+}
 
-    // Explicit instantiation of the DiagoCusolver class for real and complex numbers
-    template class DiagoCusolver<double>;
-    template class DiagoCusolver<complex>;
+// Explicit instantiation of the DiagoCusolver class for real and complex numbers
+template class DiagoCusolver<double>;
+template class DiagoCusolver<complex>;
 
 } // namespace hsolver
diff --git a/source/module_io/output_sk.cpp b/source/module_io/output_sk.cpp
index 24776e4f67..96c0663579 100644
--- a/source/module_io/output_sk.cpp
+++ b/source/module_io/output_sk.cpp
@@ -18,7 +18,7 @@ double* Output_Sk<double>::get_Sk(int ik)
     {
         ModuleBase::WARNING_QUIT("Output_Sk::get_sk", "ik out of range");
     }
-    return this->LM_->Sloc.data();
+    return dynamic_cast<hamilt::HamiltLCAO<double, double>*>(this->p_hamilt_)->getSk();
 }
 
 template <>
@@ -31,13 +31,15 @@ std::complex<double>* Output_Sk<std::complex<double>>::get_Sk(int ik)
     if (this->nspin_ == 4)
     {
         dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt_)
-            ->updateSk(ik, this->LM_, 1);
+            ->updateSk(ik, 1);
+        return dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt_)->getSk();
     }
     else
     {
-        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt_)->updateSk(ik, this->LM_, 1);
+        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt_)->updateSk(ik, 1);
+        return dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt_)->getSk();
     }
-    return this->LM_->Sloc2.data();
+    return nullptr;
 }
 
 template class Output_Sk<double>;
diff --git a/source/module_io/write_Vxc.hpp b/source/module_io/write_Vxc.hpp
index 0992d3fca9..1a470cabea 100644
--- a/source/module_io/write_Vxc.hpp
+++ b/source/module_io/write_Vxc.hpp
@@ -306,7 +306,7 @@ void write_Vxc(int nspin,
     for (int is = 0; is < nspin0; ++is)
         vxcs_R_ao[is].set_zero();
     // k (size for each k-point)
-    std::vector<TK> vxc_k_ao(pv->nloc);
+    hamilt::HS_Matrix_K<TK> vxc_k_ao(pv, 1); // only hk is needed, sk is skipped
 
     // 3. allocate operators and contribute HR
     // op (corresponding to hR)
@@ -318,26 +318,24 @@ void write_Vxc(int nspin,
     for (int is = 0; is < nspin0; ++is)
     {
         vxcs_op_ao[is] = new hamilt::Veff<hamilt::OperatorLCAO<TK, TR>>(gint,
-                                                                        &lm,
+                                                                        &vxc_k_ao,
                                                                         kv.kvec_d,
                                                                         potxc,
                                                                         &vxcs_R_ao[is],
-                                                                        &vxc_k_ao,
                                                                         &ucell,
-                                                                        &gd,
-                                                                        pv);
+                                                                        &gd);
 
         vxcs_op_ao[is]->contributeHR();
     }
     std::vector<std::vector<double>> e_orb_locxc; // orbital energy (local XC)
     std::vector<std::vector<double>> e_orb_tot;   // orbital energy (total)
 #ifdef __EXX
-    hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexx_op_ao(&lm, nullptr, &vxc_k_ao, kv);
-    std::vector<TK> vexxonly_k_ao(pv->nloc);
-    hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexxonly_op_ao(&lm, nullptr, &vexxonly_k_ao, kv);
+    hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexx_op_ao(&vxc_k_ao, &lm, nullptr, kv);
+    hamilt::HS_Matrix_K<TK> vexxonly_k_ao(pv, 1); // only hk is needed, sk is skipped
+    hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexxonly_op_ao(&vexxonly_k_ao, &lm, nullptr, kv);
     std::vector<std::vector<double>> e_orb_exx; // orbital energy (EXX)
 #endif
-    hamilt::OperatorDFTU<hamilt::OperatorLCAO<TK, TR>> vdftu_op_ao(&lm, kv.kvec_d, nullptr, &vxc_k_ao, kv.isk);
+    hamilt::OperatorDFTU<hamilt::OperatorLCAO<TK, TR>> vdftu_op_ao(&vxc_k_ao, kv.kvec_d, nullptr, kv.isk);
 
     // 4. calculate and write the MO-matrix Exc
     Parallel_2D p2d;
@@ -349,19 +347,19 @@ void write_Vxc(int nspin,
     // ======test=======
     for (int ik = 0; ik < kv.get_nks(); ++ik)
     {
-        ModuleBase::GlobalFunc::ZEROS(vxc_k_ao.data(), pv->nloc);
+        vxc_k_ao.set_zero_hk();
         int is = kv.isk[ik];
         dynamic_cast<hamilt::OperatorLCAO<TK, TR>*>(vxcs_op_ao[is])->contributeHk(ik);
-        const std::vector<TK>& vlocxc_k_mo = cVc(vxc_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+        const std::vector<TK>& vlocxc_k_mo = cVc(vxc_k_ao.get_hk(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
 
 #ifdef __EXX
         if (GlobalC::exx_info.info_global.cal_exx)
         {
             e_orb_locxc.emplace_back(orbital_energy(ik, nbands, vlocxc_k_mo, p2d));
-            ModuleBase::GlobalFunc::ZEROS(vexxonly_k_ao.data(), pv->nloc);
+            ModuleBase::GlobalFunc::ZEROS(vexxonly_k_ao.get_hk(), pv->nloc);
             vexx_op_ao.contributeHk(ik);
             vexxonly_op_ao.contributeHk(ik);
-            std::vector<TK> vexx_k_mo = cVc(vexxonly_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+            std::vector<TK> vexx_k_mo = cVc(vexxonly_k_ao.get_hk(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
             e_orb_exx.emplace_back(orbital_energy(ik, nbands, vexx_k_mo, p2d));
         }
         // ======test=======
@@ -372,7 +370,7 @@ void write_Vxc(int nspin,
         {
             vdftu_op_ao.contributeHk(ik);
         }
-        const std::vector<TK>& vxc_tot_k_mo = cVc(vxc_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+        const std::vector<TK>& vxc_tot_k_mo = cVc(vxc_k_ao.get_hk(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
         e_orb_tot.emplace_back(orbital_energy(ik, nbands, vxc_tot_k_mo, p2d));
         // write
         ModuleIO::save_mat(-1,
diff --git a/source/module_io/write_dos_lcao.cpp b/source/module_io/write_dos_lcao.cpp
index 9203e9f969..81c30a9462 100644
--- a/source/module_io/write_dos_lcao.cpp
+++ b/source/module_io/write_dos_lcao.cpp
@@ -47,6 +47,9 @@ void ModuleIO::write_dos_lcao(const psi::Psi<double>* psi,
         nspin0 = 2;
     }
 
+    // get the date pointer of SK
+    const double* sk = dynamic_cast<const hamilt::HamiltLCAO<double, double>*>(p_ham)->getSk();
+
     // find the maximal and minimal band energy.
     double emax = ekb(0, 0);
     double emin = ekb(0, 0);
@@ -151,7 +154,7 @@ void ModuleIO::write_dos_lcao(const psi::Psi<double>* psi,
                     &GlobalV::NLOCAL,
                     &GlobalV::NLOCAL,
                     &one_float,
-                    lm.Sloc.data(),
+                    sk,
                     &one_int,
                     &one_int,
                     pv.desc,
@@ -442,16 +445,16 @@ void ModuleIO::write_dos_lcao(const psi::Psi<std::complex<double>>* psi,
             if (is == kv.isk[ik])
             {
                 // calculate SK for current k point
-                // the target matrix is LM->Sloc2 with collumn-major
+                const std::complex<double>* sk = nullptr;
                 if (GlobalV::NSPIN == 4)
                 {
-                    dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(ik,
-                                                                                                                   &lm,
-                                                                                                                   1);
+                    dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(ik, 1);
+                    sk = dynamic_cast<const hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->getSk();
                 }
                 else
                 {
-                    dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(ik, &lm, 1);
+                    dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(ik, 1);
+                    sk = dynamic_cast<const hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->getSk();
                 }
 
                 psi->fix_k(ik);
@@ -489,7 +492,7 @@ void ModuleIO::write_dos_lcao(const psi::Psi<std::complex<double>>* psi,
                             &GlobalV::NLOCAL,
                             &GlobalV::NLOCAL,
                             &one_float[0],
-                            lm.Sloc2.data(),
+                            sk,
                             &one_int,
                             &one_int,
                             pv.desc,
diff --git a/source/module_io/write_proj_band_lcao.cpp b/source/module_io/write_proj_band_lcao.cpp
index 0ac97a0062..32425dc130 100644
--- a/source/module_io/write_proj_band_lcao.cpp
+++ b/source/module_io/write_proj_band_lcao.cpp
@@ -21,6 +21,8 @@ void ModuleIO::write_proj_band_lcao(
     ModuleBase::timer::tick("ModuleIO", "write_proj_band_lcao");
 
     const Parallel_Orbitals* pv = lm.ParaV;
+    // get the date pointer of SK
+    const double* sk = dynamic_cast<const hamilt::HamiltLCAO<double, double>*>(p_ham)->getSk();
 
     int nspin0 = 1;
     if (GlobalV::NSPIN == 2)
@@ -61,7 +63,7 @@ void ModuleIO::write_proj_band_lcao(
                 &GlobalV::NLOCAL,
                 &GlobalV::NLOCAL,
                 &one_float,
-                lm.Sloc.data(),
+                sk,
                 &one_int,
                 &one_int,
                 pv->desc,
@@ -207,16 +209,16 @@ void ModuleIO::write_proj_band_lcao(
                 if (is == kv.isk[ik])
                 {
                     // calculate SK for current k point
-                    // the target matrix is LM->Sloc2 with collumn-major
+                    const std::complex<double>* sk = nullptr;
                     if (GlobalV::NSPIN == 4)
                     {
-                        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(
-                          ik, &lm, 1);
+                        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->updateSk(ik, 1);
+                        sk = dynamic_cast<const hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>*>(p_ham)->getSk();
                     }
                     else
                     {
-                        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(
-                          ik, &lm, 1);
+                        dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->updateSk(ik, 1);
+                        sk = dynamic_cast<const hamilt::HamiltLCAO<std::complex<double>, double>*>(p_ham)->getSk();
                     }
 
                     // calculate Mulk
@@ -241,7 +243,7 @@ void ModuleIO::write_proj_band_lcao(
                             &GlobalV::NLOCAL,
                             &GlobalV::NLOCAL,
                             &one_float[0],
-                            lm.Sloc2.data(),
+                            sk,
                             &one_int,
                             &one_int,
                             pv->desc,
diff --git a/source/module_ri/RI_2D_Comm.h b/source/module_ri/RI_2D_Comm.h
index 1c52b41458..d4be823ddf 100644
--- a/source/module_ri/RI_2D_Comm.h
+++ b/source/module_ri/RI_2D_Comm.h
@@ -43,7 +43,7 @@ namespace RI_2D_Comm
         const double alpha,
         const std::vector<std::map<TA, std::map<TAC, RI::Tensor<Tdata>>>>& Hs,
         const Parallel_Orbitals& pv,
-        std::vector<TK>& Hloc);
+        TK* hk);
 
 	template<typename Tdata>
 	extern std::vector<std::vector<Tdata>> Hexxs_to_Hk(
diff --git a/source/module_ri/RI_2D_Comm.hpp b/source/module_ri/RI_2D_Comm.hpp
index 182c47d032..af77cb9508 100644
--- a/source/module_ri/RI_2D_Comm.hpp
+++ b/source/module_ri/RI_2D_Comm.hpp
@@ -22,7 +22,9 @@ inline RI::Tensor<double> tensor_conj(const RI::Tensor<double>& t) { return t; }
 inline RI::Tensor<std::complex<double>> tensor_conj(const RI::Tensor<std::complex<double>>& t)
 {
     RI::Tensor<std::complex<double>> r(t.shape);
-    for (int i = 0;i < t.data->size();++i)(*r.data)[i] = std::conj((*t.data)[i]);
+    for (int i = 0; i < t.data->size(); ++i) {
+        (*r.data)[i] = std::conj((*t.data)[i]);
+    }
     return r;
 }
 template<typename Tdata, typename Tmatrix>
@@ -51,15 +53,21 @@ auto RI_2D_Comm::split_m2D_ktoR(const K_Vectors &kv, const std::vector<const Tma
 					* RI::Global_Func::convert<Tdata_m>( std::exp(
                         -ModuleBase::TWO_PI * ModuleBase::IMAG_UNIT * (kv.kvec_c[ik] * (RI_Util::array3_to_Vector3(cell) * GlobalC::ucell.latvec))));
                 auto set_mR_2D = [&mR_2D](auto&& mk_frac) {
-                    if (mR_2D.empty())
+                    if (mR_2D.empty()) {
                         mR_2D = RI::Global_Func::convert<Tdata>(mk_frac);
-                    else
-                        mR_2D = mR_2D + RI::Global_Func::convert<Tdata>(mk_frac);
-                    };
-                if (static_cast<int>(std::round(SPIN_multiple * kv.wk[ik] * kv.get_nkstot_full())) == 2)
+                    } else {
+                        mR_2D
+                            = mR_2D + RI::Global_Func::convert<Tdata>(mk_frac);
+                    }
+                };
+                if (static_cast<int>(std::round(SPIN_multiple * kv.wk[ik]
+                                                * kv.get_nkstot_full()))
+                    == 2) {
                     set_mR_2D(mk_2D * (frac * 0.5) + tensor_conj(mk_2D * (frac * 0.5)));
-                else set_mR_2D(mk_2D * frac);
-			}
+                } else {
+                    set_mR_2D(mk_2D * frac);
+                }
+            }
 
 			for(int iwt0_2D=0; iwt0_2D!=mR_2D.shape[0]; ++iwt0_2D)
 			{
@@ -82,9 +90,13 @@ auto RI_2D_Comm::split_m2D_ktoR(const K_Vectors &kv, const std::vector<const Tma
 
 					const int is_b = RI_2D_Comm::get_is_block(is_k, is0_b, is1_b);
 					RI::Tensor<Tdata> &mR_a2D = mRs_a2D[is_b][iat0][{iat1,cell}];
-					if(mR_a2D.empty())
-						mR_a2D = RI::Tensor<Tdata>({static_cast<size_t>(GlobalC::ucell.atoms[it0].nw), static_cast<size_t>(GlobalC::ucell.atoms[it1].nw)});
-					mR_a2D(iw0_b,iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
+                    if (mR_a2D.empty()) {
+                        mR_a2D = RI::Tensor<Tdata>(
+                            {static_cast<size_t>(GlobalC::ucell.atoms[it0].nw),
+                             static_cast<size_t>(
+                                 GlobalC::ucell.atoms[it1].nw)});
+                    }
+                    mR_a2D(iw0_b,iw1_b) = mR_2D(iwt0_2D, iwt1_2D);
 				}
 			}
 		}
@@ -101,7 +113,7 @@ void RI_2D_Comm::add_Hexx(
 	const double alpha,
 	const std::vector<std::map<TA,std::map<TAC,RI::Tensor<Tdata>>>> &Hs,
     const Parallel_Orbitals& pv,
-    std::vector<TK>& Hloc)
+    TK* hk)
 {
 	ModuleBase::TITLE("RI_2D_Comm","add_Hexx");
 	ModuleBase::timer::tick("RI_2D_Comm", "add_Hexx");
@@ -124,12 +136,16 @@ void RI_2D_Comm::add_Hexx(
 				for(size_t iw0_b=0; iw0_b<H.shape[0]; ++iw0_b)
 				{
 					const int iwt0 = RI_2D_Comm::get_iwt(iat0, iw0_b, is0_b);
-                    if (pv.global2local_row(iwt0) < 0)	continue;
-					for(size_t iw1_b=0; iw1_b<H.shape[1]; ++iw1_b)
+                    if (pv.global2local_row(iwt0) < 0) {
+                        continue;
+                    }
+                    for(size_t iw1_b=0; iw1_b<H.shape[1]; ++iw1_b)
 					{
 						const int iwt1 = RI_2D_Comm::get_iwt(iat1, iw1_b, is1_b);
-                        if (pv.global2local_col(iwt1) < 0)	continue;
-                        LCAO_Matrix::set_mat2d(iwt0, iwt1, RI::Global_Func::convert<TK>(H(iw0_b, iw1_b)) * RI::Global_Func::convert<TK>(frac), pv, Hloc.data());
+                        if (pv.global2local_col(iwt1) < 0) {
+                            continue;
+                        }
+                        LCAO_Matrix::set_mat2d(iwt0, iwt1, RI::Global_Func::convert<TK>(H(iw0_b, iw1_b)) * RI::Global_Func::convert<TK>(frac), pv, hk);
 					}
 				}
 			}