From 14ee9e251341687ccb554b014ac01bd5ab0f418d Mon Sep 17 00:00:00 2001 From: CPviolator Date: Thu, 2 Mar 2017 00:07:15 +0100 Subject: [PATCH] Separate out loop calculation functions, update smethod default param, updated README --- README | 5 +- lib/qudaQKXTM_Deflation_Kepler.cpp | 228 ----- lib/qudaQKXTM_Kepler_utils.cpp | 939 +++------------------ lib/qudaQKXTM_Loops_Kepler.cpp | 1265 ++++++++++++++++++++++++++++ qkxtm/QKXTM_util.cpp | 2 +- 5 files changed, 1386 insertions(+), 1053 deletions(-) create mode 100644 lib/qudaQKXTM_Loops_Kepler.cpp diff --git a/README b/README index a23bb08fe2..120951658e 100644 --- a/README +++ b/README @@ -17,6 +17,7 @@ Dependencies: In order to compile this code, one must also have the following installed: QUDA specific + REQUIRED: magma 1.7.0 ETMC-QUDA specific @@ -47,7 +48,7 @@ to the interface_quda.cpp file of mainline QUDA. This is so we can use the internal gauge field structures directly. The class definitions and header files of auxiliary functions are contained in files prepended like so, - QKXTM_.suffix + qudaQKXTM_.suffix There is a single .cu file for kernels, a 'code_pieces' directory in lib/, a single qudaQKXTM_Kepler_utils.cpp and .h file for containers and general @@ -75,7 +76,7 @@ of QUDA. Authors: -The majority of this code base was authored by, +The majority of the QKXTM code base was authored by, Kyriakos Hadjiyiannakou Christos Kallidonis diff --git a/lib/qudaQKXTM_Deflation_Kepler.cpp b/lib/qudaQKXTM_Deflation_Kepler.cpp index 0d48d3a94f..675bd71808 100644 --- a/lib/qudaQKXTM_Deflation_Kepler.cpp +++ b/lib/qudaQKXTM_Deflation_Kepler.cpp @@ -1907,231 +1907,3 @@ projectVector(QKXTM_Vector_Kepler &vec_defl, printfQuda("projectVector: Deflation of the source vector completed succesfully\n"); } -//------------------------------------------------------------------// -//- C.K. Functions to perform and write the exact part of the loop -// -//------------------------------------------------------------------// - -template -void QKXTM_Deflation_Kepler:: -Loop_w_One_Der_FullOp_Exact(int n, QudaInvertParam *param, - void *gen_uloc,void *std_uloc, - void **gen_oneD, - void **std_oneD, - void **gen_csvC, - void **std_csvC){ - - if(!isFullOp) errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: This function only works with the full operator\n"); - - void *h_ctrn, *ctrnS, *ctrnC; - - double t1,t2; - - if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in CPU.\n"); - cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - checkCudaError(); - - //- Set the eigenvector into cudaColorSpinorField format and save to x - bool pc_solve = false; - cudaColorSpinorField *x1 = NULL; - - double *eigVec = (double*) malloc(bytes_total_length_per_NeV); - memcpy(eigVec,&(h_elem[n*total_length_per_NeV]),bytes_total_length_per_NeV); - - QKXTM_Vector_Kepler *Kvec = - new QKXTM_Vector_Kepler(BOTH,VECTOR); - - ColorSpinorParam cpuParam((void*)eigVec,*param,GK_localL,pc_solve); - ColorSpinorParam cudaParam(cpuParam, *param); - cudaParam.create = QUDA_ZERO_FIELD_CREATE; - x1 = new cudaColorSpinorField(cudaParam); - - Kvec->packVector(eigVec); - Kvec->loadVector(); - Kvec->uploadToCuda(x1,pc_solve); - - Float eVal = eigenValues[2*n+0]; - - cudaColorSpinorField *tmp1 = NULL; - cudaColorSpinorField *tmp2 = NULL; - cudaParam.create = QUDA_ZERO_FIELD_CREATE; - tmp1 = new cudaColorSpinorField(cudaParam); - tmp2 = new cudaColorSpinorField(cudaParam); - blas::zero(*tmp1); - blas::zero(*tmp2); - - cudaColorSpinorField &tmp3 = *tmp1; - cudaColorSpinorField &tmp4 = *tmp2; - cudaColorSpinorField &x = *x1; - //------------------------------------------------------------------------ - - DiracParam dWParam; - dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; - dWParam.dagger = QUDA_DAG_NO; - dWParam.gauge = gaugePrecise; - dWParam.kappa = param->kappa; - dWParam.mass = 1./(2.*param->kappa) - 4.; - dWParam.m5 = 0.; - dWParam.mu = 0.; - for(int i=0; i<4; i++) - dWParam.commDim[i] = 1; - - if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH){ - dWParam.type = QUDA_CLOVER_DIRAC; - dWParam.clover = cloverPrecise; - DiracClover *dW = new DiracClover(dWParam); - dW->M(tmp4,x); - delete dW; - } - else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH){ - dWParam.type = QUDA_WILSON_DIRAC; - DiracWilson *dW = new DiracWilson(dWParam); - dW->M(tmp4,x); - delete dW; - } - else{ - errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: One end trick works only for twisted mass fermions\n"); - } - checkCudaError(); - - gamma5Cuda(static_cast(&tmp3.Even()), - static_cast(&tmp4.Even())); - gamma5Cuda(static_cast(&tmp3.Odd()), - static_cast(&tmp4.Odd())); - - long int sizeBuffer; - sizeBuffer = - sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - CovD *cov = new CovD(gaugePrecise, profileCovDev); - - int NN = 16*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - int incx = 1; - int incy = 1; - Float pceval[2] = {1.0/eVal,0.0}; - Float mceval[2] = {-1.0/eVal,0.0}; - - // ULTRA-LOCAL Generalized one-end trick - contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)gen_uloc,incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)gen_uloc,incy); - //------------------------------------------------ - - // ULTRA-LOCAL Standard one-end trick - contract(x, x, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN,(void*)mceval,(void*)h_ctrn,incx,(void*)std_uloc,incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN,(void*)mceval,(void*)h_ctrn,incx,(void*)std_uloc,incy); - //------------------------------------------------ - - cudaDeviceSynchronize(); - - // ONE-DERIVATIVE Generalized one-end trick - for(int mu=0; mu<4; mu++){ - cov->M(tmp4,tmp3,mu); - // Term 0 - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); - - cov->M (tmp4, x, mu+4); - // Term 0 + Term 3 - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - cov->M (tmp4, x, mu); - // Term 0 + Term 3 + Term 2 (C Sum) - contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 (D Dif) - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - - cov->M (tmp4, tmp3, mu+4); - // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(x, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) gen_oneD[mu], incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) gen_oneD[mu], incy); - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) gen_csvC[mu], incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) gen_csvC[mu], incy); - } - - //------------------------------------------------ - - // ONE-DERIVATIVE Standard one-end trick - for(int mu=0; mu<4; mu++){ - cov->M (tmp4, x, mu); - cov->M (tmp3, x, mu+4); - // Term 0 - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); - // Term 0 + Term 3 - contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - // Term 0 + Term 3 + Term 2 (C Sum) - contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 (D Dif) - contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(x, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) std_oneD[mu], incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) std_oneD[mu], incy); - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) std_csvC[mu], incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) std_csvC[mu], incy); - } - - //------------------------------------------------ - - delete Kvec; - delete x1; - delete tmp1; - delete tmp2; - free(eigVec); - - delete cov; - cudaFreeHost(h_ctrn); - cudaFree(ctrnS); - cudaFree(ctrnC); - checkCudaError(); -} diff --git a/lib/qudaQKXTM_Kepler_utils.cpp b/lib/qudaQKXTM_Kepler_utils.cpp index 760393b18f..1fbf2f2d0b 100644 --- a/lib/qudaQKXTM_Kepler_utils.cpp +++ b/lib/qudaQKXTM_Kepler_utils.cpp @@ -62,19 +62,19 @@ extern int GK_timeSize; //Though only some template forward declarations //are needed presently, future development may require the //others, and are therefore placed here for convenience -template class QKXTM_Field_Kepler; -template class QKXTM_Gauge_Kepler; -template class QKXTM_Vector_Kepler; -template class QKXTM_Propagator_Kepler; -template class QKXTM_Propagator3D_Kepler; -template class QKXTM_Vector3D_Kepler; - -template class QKXTM_Field_Kepler; -template class QKXTM_Gauge_Kepler; -template class QKXTM_Vector_Kepler; -template class QKXTM_Propagator_Kepler; -template class QKXTM_Propagator3D_Kepler; -template class QKXTM_Vector3D_Kepler; +// template class QKXTM_Field_Kepler; +// template class QKXTM_Gauge_Kepler; +// template class QKXTM_Vector_Kepler; +// template class QKXTM_Propagator_Kepler; +// template class QKXTM_Propagator3D_Kepler; +// template class QKXTM_Vector3D_Kepler; + +// template class QKXTM_Field_Kepler; +// template class QKXTM_Gauge_Kepler; +// template class QKXTM_Vector_Kepler; +// template class QKXTM_Propagator_Kepler; +// template class QKXTM_Propagator3D_Kepler; +// template class QKXTM_Vector3D_Kepler; static bool exists_file (const char* name) { return ( access( name, F_OK ) != -1 ); @@ -136,7 +136,7 @@ void testGaussSmearing(void **gauge){ } //SOURCE_T RANDOM: Constructs a Z_4 random source using -// usng gsl as RNG. +// gsl as RNG. //SOURCE_T UNITY: Constructs a momentum source with p=0. template void getStochasticRandomSource(void *spinorIn, gsl_rng *rNum, @@ -170,500 +170,45 @@ void getStochasticRandomSource(void *spinorIn, gsl_rng *rNum, errorQuda("Source type not set correctly!! Aborting.\n"); } } - } -/* Quarantined code -template -void getStochasticRandomSource(void *spinorIn, gsl_rng *rNum){ - memset(spinorIn,0,GK_localVolume*12*2*sizeof(Float)); - - for(int i = 0; i -void oneEndTrick_w_One_Der(ColorSpinorField &x, ColorSpinorField &tmp3, - ColorSpinorField &tmp4, QudaInvertParam *param, - void *cnRes_gv,void *cnRes_vv, void **cnD_gv, - void **cnD_vv, void **cnC_gv, void **cnC_vv){ - - void *h_ctrn, *ctrnS, *ctrnC; - - if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in CPU.\n"); - cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - checkCudaError(); - - DiracParam dWParam; - dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; - dWParam.dagger = QUDA_DAG_NO; - dWParam.gauge = gaugePrecise; - dWParam.kappa = param->kappa; - dWParam.mass = 1./(2.*param->kappa) - 4.; - dWParam.m5 = 0.; - dWParam.mu = 0.; - for (int i=0; i<4; i++) - dWParam.commDim[i] = 1; - - if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - dWParam.type = QUDA_CLOVER_DIRAC; - dWParam.clover = cloverPrecise; - DiracClover *dW = new DiracClover(dWParam); - dW->M(tmp4,x); - delete dW; - } - else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH) { - dWParam.type = QUDA_WILSON_DIRAC; - DiracWilson *dW = new DiracWilson(dWParam); - dW->M(tmp4,x); - delete dW; - } - else{ - errorQuda("Error one end trick works only for twisted mass fermions\n"); - } - checkCudaError(); - - gamma5Cuda(static_cast(&tmp3.Even()), - static_cast(&tmp4.Even())); - gamma5Cuda(static_cast(&tmp3.Odd()), - static_cast(&tmp4.Odd())); - - long int sizeBuffer; - sizeBuffer = - sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - - int NN = 16*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - int incx = 1; - int incy = 1; - Float pceval[2] = {1.0,0.0}; - Float mceval[2] = {-1.0,0.0}; - - ///////////////// LOCAL /////////////////////////// - contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) - cblas_caxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)cnRes_gv,incy); - else if( typeid(Float) == typeid(double) ) - cblas_zaxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)cnRes_gv,incy); - - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float*) cnRes_gv)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick - - contract(x, x, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float*) cnRes_vv)[ix] -= ((Float*)h_ctrn)[ix]; // standard one end trick - - if( typeid(Float) == typeid(float) ) { - cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) cnRes_vv, incy); +static int** allocateMomMatrix(int Q_sq){ + int **mom = (int **) malloc(sizeof(int*)*GK_localL[0]*GK_localL[1]*GK_localL[2]); + if(mom == NULL) errorQuda("Error allocate memory for momenta\n"); + for(int ip=0; ipM(tmp4,tmp3,mu); - // Term 0 - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); - - cov->M (tmp4, x, mu+4); - // Term 0 + Term 3 - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - // Term 0 + Term 3 + Term 2 (C Sum) - cov->M (tmp4, x, mu); - contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 (D Dif) - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - - cov->M (tmp4, tmp3, mu+4); - // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(x, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) { - cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) cnD_gv[mu], incy); - } - else if( typeid(Float) == typeid(double) ) { - cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) cnD_gv[mu], incy); - } - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float *) cnD_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) { - cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) cnC_gv[mu], incy); - } - else if( typeid(Float) == typeid(double) ) { - cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, - (void*) cnC_gv[mu], incy); - } - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float *) cnC_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; - } - - for(int mu=0; mu<4; mu++) // for standard one-end trick - { - cov->M (tmp4, x, mu); - cov->M (tmp3, x, mu+4); - // Term 0 - contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); - // Term 0 + Term 3 - contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - // Term 0 + Term 3 + Term 2 (C Sum) - contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 (D Dif) - contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(x, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); - // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) { - cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) cnD_vv[mu], incy); - } - else if( typeid(Float) == typeid(double) ) { - cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) cnD_vv[mu], incy); - } - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float *) cnD_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - if( typeid(Float) == typeid(float) ) { - cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) cnC_vv[mu], incy); - } - else if( typeid(Float) == typeid(double) ) { - cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, - (void*) cnC_vv[mu], incy); - } - - // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - // ((Float *) cnC_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; - } - - delete cov; - cudaFreeHost(h_ctrn); - cudaFree(ctrnS); - cudaFree(ctrnC); - checkCudaError(); -} - - -/* Quarantined Code -template -void oneEndTrick_w_One_Der_2(cudaColorSpinorField &s, - cudaColorSpinorField &x, - cudaColorSpinorField &tmp3, - cudaColorSpinorField &tmp4, - QudaInvertParam *param, - void *cnRes_gv, void *cnRes_vv, - void **cnD_gv, void **cnD_vv, - void **cnC_gv, void **cnC_vv){ - void *h_ctrn, *ctrnS, *ctrnC; - - double t1,t2; - - if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in CPU.\n"); - cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - checkCudaError(); + int momIdx = 0; + int totMom = 0; - DiracParam dWParam; - dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; - dWParam.dagger = QUDA_DAG_NO; - dWParam.gauge = gaugePrecise; - dWParam.kappa = param->kappa; - dWParam.mass = 1./(2.*param->kappa) - 4.; - dWParam.m5 = 0.; - dWParam.mu = 0.; - for (int i=0; i<4; i++) - dWParam.commDim[i] = 1; - - if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - dWParam.type = QUDA_CLOVER_DIRAC; - dWParam.clover = cloverPrecise; - DiracClover *dW = new DiracClover(dWParam); - dW->M(tmp4,x); - delete dW; - } - else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH) { - dWParam.type = QUDA_WILSON_DIRAC; - DiracWilson *dW = new DiracWilson(dWParam); - dW->M(tmp4,x); - delete dW; - } - else{ - errorQuda("Error one end trick works only for twisted mass fermions\n"); - } - checkCudaError(); - - gamma5Cuda(static_cast(&tmp3.Even()), static_cast(&tmp4.Even())); - gamma5Cuda(static_cast(&tmp3.Odd()), static_cast(&tmp4.Odd())); - - long int sizeBuffer; - sizeBuffer = sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - CovD *cov = new CovD(gaugePrecise, profileCovDev); - - ///////////////// LOCAL /////////////////////////// - contract(s, x, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float*) cnRes_vv)[ix] -= ((Float*)h_ctrn)[ix]; // standard one end trick - - - contract(s, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float*) cnRes_gv)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick - - cudaDeviceSynchronize(); - - ////////////////// DERIVATIVES ////////////////////////////// - for(int mu=0; mu<4; mu++) // for generalized one-end trick - { - cov->M(tmp4,tmp3,mu); - contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); // Term 0 - - cov->M (tmp4, s, mu+4); - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - cov->M (tmp4, s, mu); - contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 (C Sum) - contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 (D Dif) - - cov->M (tmp4, tmp3, mu+4); - contract(s, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnD_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnC_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; - } - - for(int mu=0; mu<4; mu++) // for standard one-end trick - { - cov->M (tmp4, x, mu); - cov->M (tmp3, s, mu+4); - - contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); // Term 0 - contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - cov->M (tmp4, s, mu); - contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 (C Sum) - contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 (D Dif) - - cov->M (tmp3, x, mu+4); - contract(s, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) - contract(s, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) - - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnD_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnC_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; - - } - - /////////////// - - delete cov; - cudaFreeHost(h_ctrn); - cudaFree(ctrnS); - cudaFree(ctrnC); - checkCudaError(); -} -*/ + for(int pz = 0; pz < GK_localL[2]; pz++) + for(int py = 0; py < GK_localL[1]; py++) + for(int px = 0; px < GK_localL[0]; px++){ + if (px < GK_localL[0]/2) + mom[momIdx][0] = px; + else + mom[momIdx][0] = px - GK_localL[0]; + if (py < GK_localL[1]/2) + mom[momIdx][1] = py; + else + mom[momIdx][1] = py - GK_localL[1]; -/* Quarantined Code -template -void volumeSource_w_One_Der(ColorSpinorField &x, - ColorSpinorField &xi, - ColorSpinorField &tmp, - QudaInvertParam *param, - void *cn_local, - void **cnD,void **cnC){ - void *h_ctrn, *ctrnS, *ctrnC; - - if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in CPU.\n"); - cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) - errorQuda("Error allocating memory for contraction results in GPU.\n"); - cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); - - - long int sizeBuffer; - sizeBuffer = sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; - CovD *cov = new CovD(gaugePrecise, profileCovDev); - - ///////////////// LOCAL /////////////////////////// - contract(xi, x, ctrnS, QUDA_CONTRACT); - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float*) cn_local)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick - - ////////////////// DERIVATIVES ////////////////////////////// - for(int mu=0; mu<4; mu++) // for standard one-end trick - { - cov->M (tmp, x, mu); // Term 0 - contract(xi, tmp, ctrnS, QUDA_CONTRACT); - cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); - - cov->M (tmp, x, mu+4); // Term 1 - contract(xi, tmp, ctrnS, QUDA_CONTRACT_MINUS); // Term 0 - Term 1 - contract(xi, tmp, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 - - cov->M(tmp, xi, mu); // Term 2 - contract(tmp, x, ctrnS, QUDA_CONTRACT_MINUS); // Term 0 - Term 1 - Term 2 - contract(tmp, x, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 + Term 2 - - cov->M(tmp, xi, mu+4); // Term 3 - contract(tmp, x, ctrnS, QUDA_CONTRACT_PLUS); // Term 0 - Term 1 - Term 2 + Term 3 - contract(tmp, x, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 + Term 2 + Term 3 - - cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnD[mu])[ix] += ((Float*)h_ctrn)[ix]; - - cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); - - for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) - ((Float *) cnC[mu])[ix] += ((Float*)h_ctrn)[ix]; - } - /////////////// + if (pz < GK_localL[2]/2) + mom[momIdx][2] = pz; + else + mom[momIdx][2] = pz - GK_localL[2]; - delete cov; - cudaFreeHost(h_ctrn); - cudaFree(ctrnS); - cudaFree(ctrnC); - checkCudaError(); + if((mom[momIdx][0]*mom[momIdx][0]+ + mom[momIdx][1]*mom[momIdx][1]+ + mom[momIdx][2]*mom[momIdx][2]) <= Q_sq) totMom++; + momIdx++; + } + return mom; } -*/ - -/* Quarantined Code -template -void doCudaFFT(void *cnRes_gv, void *cnRes_vv, void *cnResTmp_gv,void *cnResTmp_vv){ - static cufftHandle fftPlan; - static int init = 0; - int nRank[3] = {GK_localL[0], GK_localL[1], GK_localL[2]}; - const int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - static cudaStream_t streamCuFFT; - cudaStreamCreate(&streamCuFFT); - - if(cufftPlanMany(&fftPlan, 3, nRank, nRank, 1, Vol, nRank, 1, Vol, CUFFT_Z2Z, 16*GK_localL[3]) != CUFFT_SUCCESS) errorQuda("Error in the FFT!!!\n"); - cufftSetCompatibilityMode (fftPlan, CUFFT_COMPATIBILITY_FFTW_PADDING); - cufftSetStream (fftPlan, streamCuFFT); - checkCudaError(); - void* ctrnS; - if((cudaMalloc(&ctrnS, sizeof(Float)*32*Vol*GK_localL[3])) == cudaErrorMemoryAllocation) errorQuda("Error with memory allocation\n"); - cudaMemcpy(ctrnS, cnRes_vv, sizeof(Float)*32*Vol*GK_localL[3], cudaMemcpyHostToDevice); - if(typeid(Float) == typeid(double))if(cufftExecZ2Z(fftPlan, (double2 *) ctrnS, (double2 *) ctrnS, CUFFT_FORWARD) != CUFFT_SUCCESS) errorQuda("Error run cudafft\n"); - if(typeid(Float) == typeid(float))if(cufftExecC2C(fftPlan, (float2 *) ctrnS, (float2 *) ctrnS, CUFFT_FORWARD) != CUFFT_SUCCESS) errorQuda("Error run cudafft\n"); - cudaMemcpy(cnResTmp_vv, ctrnS, sizeof(Float)*32*Vol*GK_localL[3], cudaMemcpyDeviceToHost); - cudaMemcpy(ctrnS, cnRes_gv, sizeof(Float)*32*Vol*GK_localL[3], cudaMemcpyHostToDevice); - if(typeid(Float) == typeid(double))if(cufftExecZ2Z(fftPlan, (double2 *) ctrnS, (double2 *) ctrnS, CUFFT_FORWARD) != CUFFT_SUCCESS) errorQuda("Error run cudafft\n"); - if(typeid(Float) == typeid(float))if(cufftExecC2C(fftPlan, (float2 *) ctrnS, (float2 *) ctrnS, CUFFT_FORWARD) != CUFFT_SUCCESS) errorQuda("Error run cudafft\n"); - cudaMemcpy(cnResTmp_gv, ctrnS, sizeof(Float)*32*Vol*GK_localL[3], cudaMemcpyDeviceToHost); - - - cudaFree(ctrnS); - cufftDestroy (fftPlan); - cudaStreamDestroy (streamCuFFT); - checkCudaError(); -} -*/ template void doCudaFFT_v2(void *cnIn, void *cnOut){ @@ -697,41 +242,6 @@ void doCudaFFT_v2(void *cnIn, void *cnOut){ checkCudaError(); } -static int** allocateMomMatrix(int Q_sq){ - int **mom = (int **) malloc(sizeof(int*)*GK_localL[0]*GK_localL[1]*GK_localL[2]); - if(mom == NULL) errorQuda("Error allocate memory for momenta\n"); - for(int ip=0; ip -void copyLoopToWriteBuf(Float *writeBuf, void *tmpBuf, int iPrint, int Q_sq, int Nmoms, int **mom){ +void copyLoopToWriteBuf(Float *writeBuf, void *tmpBuf, int iPrint, + int Q_sq, int Nmoms, int **mom){ if(GK_nProc[2]==1){ long int SplV = GK_localL[0]*GK_localL[1]*GK_localL[2]; @@ -862,9 +373,49 @@ void copyLoopToWriteBuf(Float *writeBuf, void *tmpBuf, int iPrint, int Q_sq, int } +/* Quarantined code +template +void getStochasticRandomSource(void *spinorIn, gsl_rng *rNum){ + memset(spinorIn,0,GK_localVolume*12*2*sizeof(Float)); + + for(int i = 0; i -void writeLoops_ASCII(Float *writeBuf, const char *Pref, qudaQKXTM_loopInfo loopInfo, int **momQsq, int type, int mu, bool exact_loop, bool useTSM, bool LowPrec){ +void writeLoops_ASCII(Float *writeBuf, const char *Pref, + qudaQKXTM_loopInfo loopInfo, + int **momQsq, int type, + int mu, bool exact_loop, + bool useTSM, bool LowPrec){ if(exact_loop && useTSM) errorQuda("writeLoops_ASCII: Got conflicting options - exact_loop AND useTSM.\n"); @@ -934,8 +485,9 @@ void writeLoops_ASCII(Float *writeBuf, const char *Pref, qudaQKXTM_loopInfo loop }//-if GK_timeRank } +*/ - +/* Moved to qudaQKXTM_Loops_Kepler.cpp //-C.K: Copy the HDF5 dataset chunk into writeBuf template void getLoopWriteBuf(Float *writeBuf, Float *loopBuf, int iPrint, int Nmoms, int imom, bool oneD){ @@ -960,8 +512,9 @@ void getLoopWriteBuf(Float *writeBuf, Float *loopBuf, int iPrint, int Nmoms, int }//-if GK_timeRank } +*/ - +/* Moved to qudaQKXTM_Loops_Kepler.cpp //-C.K: Funtion to write the loops in HDF5 format template void writeLoops_HDF5(Float *buf_std_uloc, Float *buf_gen_uloc, @@ -1115,299 +668,41 @@ void writeLoops_HDF5(Float *buf_std_uloc, Float *buf_gen_uloc, free(writeBuf); } } - -/* Quarantined code -template -void dumpLoop(void *cnRes_gv, void *cnRes_vv, const char *Pref,int accumLevel, int Q_sq){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr_gv; - FILE *ptr_vv; - char file_gv[257]; - char file_vv[257]; - sprintf(file_gv, "%s_dOp.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - sprintf(file_vv, "%s_Scalar.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - ptr_gv = fopen(file_gv,"w"); - ptr_vv = fopen(file_vv,"w"); - if(ptr_gv == NULL || ptr_vv == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf (ptr_gv, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], - ((Float*)cnRes_gv)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cnRes_gv)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); - fprintf (ptr_vv, "%02d %02d %+d %+d %+d %+16.15le %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], - ((Float*)cnRes_vv)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cnRes_vv)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); - } - } - } - printfQuda("data dumped for accumLevel %d\n",accumLevel); - fclose(ptr_gv); - fclose(ptr_vv); - for(int ip=0; ip -void dumpLoop_ultraLocal(void *cn, const char *Pref,int accumLevel, int Q_sq, int flag){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - switch(flag){ - case 0: - sprintf(file_name, "%s_Scalar.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - case 1: - sprintf(file_name, "%s_dOp.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - } - ptr = fopen(file_name,"w"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], - ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); - } - } - } - printfQuda("data dumped for accumLevel %d\n",accumLevel); - fclose(ptr); - for(int ip=0; ip -void dumpLoop_oneD(void *cn, const char *Pref,int accumLevel, int Q_sq, int muDir, int flag){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - switch(flag){ - case 0: - sprintf(file_name, "%s_Loops.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - case 1: - sprintf(file_name, "%s_LpsDw.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - case 2: - sprintf(file_name, "%s_LoopsCv.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - case 3: - sprintf(file_name, "%s_LpsDwCv.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); - break; - } - if(muDir == 0) - ptr = fopen(file_name,"w"); - else - ptr = fopen(file_name,"a"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], - 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); - } - } - } - - printfQuda("data dumped for accumLevel %d\n",accumLevel); - fclose(ptr); - for(int ip=0; ip -void dumpLoop_ultraLocal_v2(void *cn, const char *Pref,int accumLevel, int Q_sq, char *string){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - sprintf(file_name, "%s_%s.loop.%04d.%d_%d",Pref,string,accumLevel,comm_size(), comm_rank()); - ptr = fopen(file_name,"w"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], - ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); - } - } - } - printfQuda("data dumped for accumLevel %d\n",accumLevel); - fclose(ptr); - for(int ip=0; ip +void doCudaFFT(void *cnRes_gv, void *cnRes_vv, void *cnResTmp_gv,void *cnResTmp_vv){ + static cufftHandle fftPlan; + static int init = 0; + int nRank[3] = {GK_localL[0], GK_localL[1], GK_localL[2]}; + const int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + static cudaStream_t streamCuFFT; + cudaStreamCreate(&streamCuFFT); -/* Quarantined code -template -void dumpLoop_oneD_v2(void *cn, const char *Pref,int accumLevel, int Q_sq, int muDir, char *string){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - sprintf(file_name, "%s_%s.loop.%04d.%d_%d",Pref,string,accumLevel,comm_size(), comm_rank()); - - if(muDir == 0) - ptr = fopen(file_name,"w"); - else - ptr = fopen(file_name,"a"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], - 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); - } - } - } + if(cufftPlanMany(&fftPlan, 3, nRank, nRank, 1, Vol, nRank, 1, Vol, CUFFT_Z2Z, 16*GK_localL[3]) != CUFFT_SUCCESS) errorQuda("Error in the FFT!!!\n"); + cufftSetCompatibilityMode (fftPlan, CUFFT_COMPATIBILITY_FFTW_PADDING); + cufftSetStream (fftPlan, streamCuFFT); + checkCudaError(); + void* ctrnS; + if((cudaMalloc(&ctrnS, sizeof(Float)*32*Vol*GK_localL[3])) == cudaErrorMemoryAllocation) errorQuda("Error with memory allocation\n"); - printfQuda("data dumped for accumLevel %d\n",accumLevel); - fclose(ptr); - for(int ip=0; ip -void dumpVector(Float *vec, int is, char file_base[]){ - FILE *ptr; - char file_name[257]; - - sprintf(file_name,"%s.%04d.%d_%d",file_base,is+1,comm_size(), comm_rank()); - ptr = fopen(file_name,"w"); - if(ptr == NULL) errorQuda("Cannot open file %s for deflated source\n",file_name); - - - for(int t=0; t -void dumpLoop_ultraLocal_Exact(void *cn, const char *Pref, int Q_sq, int flag){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - switch(flag){ - case 0: - sprintf(file_name, "%s_Scalar.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - case 1: - sprintf(file_name, "%s_dOp.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - } - ptr = fopen(file_name,"w"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], - ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); - } - } - } - fclose(ptr); - for(int ip=0; ip -void dumpLoop_oneD_Exact(void *cn, const char *Pref, int Q_sq, int muDir, int flag){ - int **mom = allocateMomMatrix(Q_sq); - FILE *ptr; - char file_name[257]; - - switch(flag){ - case 0: - sprintf(file_name, "%s_Loops.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - case 1: - sprintf(file_name, "%s_LpsDw.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - case 2: - sprintf(file_name, "%s_LoopsCv.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - case 3: - sprintf(file_name, "%s_LpsDwCv.loop.%d_%d",Pref,comm_size(), comm_rank()); - break; - } - if(muDir == 0) - ptr = fopen(file_name,"w"); - else - ptr = fopen(file_name,"a"); - - if(ptr == NULL) errorQuda("Error open files to write loops\n"); - long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; - for(int ip=0; ip < Vol; ip++) - for(int lt=0; lt < GK_localL[3]; lt++){ - if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ - int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; - for(int gm=0; gm<16; gm++){ - fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], - 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); - } - } - } - - fclose(ptr); - for(int ip=0; ip +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include //QXKTM: FIXME +#include +#include +#include +#include + +#define PI 3.141592653589793 + +//using namespace quda; +extern Topology *default_topo; + +/* Block for global variables */ +extern float GK_deviceMemory; +extern int GK_nColor; +extern int GK_nSpin; +extern int GK_nDim; +extern int GK_strideFull; +extern double GK_alphaAPE; +extern double GK_alphaGauss; +extern int GK_localVolume; +extern int GK_totalVolume; +extern int GK_nsmearAPE; +extern int GK_nsmearGauss; +extern bool GK_dimBreak[QUDAQKXTM_DIM]; +extern int GK_localL[QUDAQKXTM_DIM]; +extern int GK_totalL[QUDAQKXTM_DIM]; +extern int GK_nProc[QUDAQKXTM_DIM]; +extern int GK_plusGhost[QUDAQKXTM_DIM]; +extern int GK_minusGhost[QUDAQKXTM_DIM]; +extern int GK_surface3D[QUDAQKXTM_DIM]; +extern bool GK_init_qudaQKXTM_Kepler_flag; +extern int GK_Nsources; +extern int GK_sourcePosition[MAX_NSOURCES][QUDAQKXTM_DIM]; +extern int GK_Nmoms; +extern short int GK_moms[MAX_NMOMENTA][3]; +// for mpi use global variables +extern MPI_Group GK_fullGroup , GK_spaceGroup , GK_timeGroup; +extern MPI_Comm GK_spaceComm , GK_timeComm; +extern int GK_localRank; +extern int GK_localSize; +extern int GK_timeRank; +extern int GK_timeSize; + + +#include +#include +#define TIMING_REPORT + +//------------------------------------------------------------------// +//- C.K. Functions to perform and write the exact part of the loop -// +//------------------------------------------------------------------// + +template +void QKXTM_Deflation_Kepler:: +Loop_w_One_Der_FullOp_Exact(int n, QudaInvertParam *param, + void *gen_uloc,void *std_uloc, + void **gen_oneD, + void **std_oneD, + void **gen_csvC, + void **std_csvC){ + + if(!isFullOp) errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: This function only works with the full operator\n"); + + void *h_ctrn, *ctrnS, *ctrnC; + + double t1,t2; + + if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in CPU.\n"); + cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + checkCudaError(); + + //- Set the eigenvector into cudaColorSpinorField format and save to x + bool pc_solve = false; + cudaColorSpinorField *x1 = NULL; + + double *eigVec = (double*) malloc(bytes_total_length_per_NeV); + memcpy(eigVec,&(h_elem[n*total_length_per_NeV]),bytes_total_length_per_NeV); + + QKXTM_Vector_Kepler *Kvec = + new QKXTM_Vector_Kepler(BOTH,VECTOR); + + ColorSpinorParam cpuParam((void*)eigVec,*param,GK_localL,pc_solve); + ColorSpinorParam cudaParam(cpuParam, *param); + cudaParam.create = QUDA_ZERO_FIELD_CREATE; + x1 = new cudaColorSpinorField(cudaParam); + + Kvec->packVector(eigVec); + Kvec->loadVector(); + Kvec->uploadToCuda(x1,pc_solve); + + Float eVal = eigenValues[2*n+0]; + + cudaColorSpinorField *tmp1 = NULL; + cudaColorSpinorField *tmp2 = NULL; + cudaParam.create = QUDA_ZERO_FIELD_CREATE; + tmp1 = new cudaColorSpinorField(cudaParam); + tmp2 = new cudaColorSpinorField(cudaParam); + blas::zero(*tmp1); + blas::zero(*tmp2); + + cudaColorSpinorField &tmp3 = *tmp1; + cudaColorSpinorField &tmp4 = *tmp2; + cudaColorSpinorField &x = *x1; + //------------------------------------------------------------------------ + + DiracParam dWParam; + dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; + dWParam.dagger = QUDA_DAG_NO; + dWParam.gauge = gaugePrecise; + dWParam.kappa = param->kappa; + dWParam.mass = 1./(2.*param->kappa) - 4.; + dWParam.m5 = 0.; + dWParam.mu = 0.; + for(int i=0; i<4; i++) + dWParam.commDim[i] = 1; + + if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH){ + dWParam.type = QUDA_CLOVER_DIRAC; + dWParam.clover = cloverPrecise; + DiracClover *dW = new DiracClover(dWParam); + dW->M(tmp4,x); + delete dW; + } + else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH){ + dWParam.type = QUDA_WILSON_DIRAC; + DiracWilson *dW = new DiracWilson(dWParam); + dW->M(tmp4,x); + delete dW; + } + else{ + errorQuda("oneEndTrick_w_One_Der_FullOp_Exact: One end trick works only for twisted mass fermions\n"); + } + checkCudaError(); + + gamma5Cuda(static_cast(&tmp3.Even()), + static_cast(&tmp4.Even())); + gamma5Cuda(static_cast(&tmp3.Odd()), + static_cast(&tmp4.Odd())); + + long int sizeBuffer; + sizeBuffer = + sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + CovD *cov = new CovD(gaugePrecise, profileCovDev); + + int NN = 16*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + int incx = 1; + int incy = 1; + Float pceval[2] = {1.0/eVal,0.0}; + Float mceval[2] = {-1.0/eVal,0.0}; + + // ULTRA-LOCAL Generalized one-end trick + contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)gen_uloc,incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)gen_uloc,incy); + //------------------------------------------------ + + // ULTRA-LOCAL Standard one-end trick + contract(x, x, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN,(void*)mceval,(void*)h_ctrn,incx,(void*)std_uloc,incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN,(void*)mceval,(void*)h_ctrn,incx,(void*)std_uloc,incy); + //------------------------------------------------ + + cudaDeviceSynchronize(); + + // ONE-DERIVATIVE Generalized one-end trick + for(int mu=0; mu<4; mu++){ + cov->M(tmp4,tmp3,mu); + // Term 0 + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); + + cov->M (tmp4, x, mu+4); + // Term 0 + Term 3 + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + cov->M (tmp4, x, mu); + // Term 0 + Term 3 + Term 2 (C Sum) + contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 (D Dif) + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + + cov->M (tmp4, tmp3, mu+4); + // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(x, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) gen_oneD[mu], incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) gen_oneD[mu], incy); + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) gen_csvC[mu], incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) gen_csvC[mu], incy); + } + + //------------------------------------------------ + + // ONE-DERIVATIVE Standard one-end trick + for(int mu=0; mu<4; mu++){ + cov->M (tmp4, x, mu); + cov->M (tmp3, x, mu+4); + // Term 0 + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); + // Term 0 + Term 3 + contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + // Term 0 + Term 3 + Term 2 (C Sum) + contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 (D Dif) + contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(x, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) std_oneD[mu], incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) std_oneD[mu], incy); + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) std_csvC[mu], incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) std_csvC[mu], incy); + } + + //------------------------------------------------ + + delete Kvec; + delete x1; + delete tmp1; + delete tmp2; + free(eigVec); + + delete cov; + cudaFreeHost(h_ctrn); + cudaFree(ctrnS); + cudaFree(ctrnC); + checkCudaError(); +} + + +template +void oneEndTrick_w_One_Der(ColorSpinorField &x, ColorSpinorField &tmp3, + ColorSpinorField &tmp4, QudaInvertParam *param, + void *cnRes_gv,void *cnRes_vv, void **cnD_gv, + void **cnD_vv, void **cnC_gv, void **cnC_vv){ + + void *h_ctrn, *ctrnS, *ctrnC; + + if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in CPU.\n"); + cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + checkCudaError(); + + DiracParam dWParam; + dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; + dWParam.dagger = QUDA_DAG_NO; + dWParam.gauge = gaugePrecise; + dWParam.kappa = param->kappa; + dWParam.mass = 1./(2.*param->kappa) - 4.; + dWParam.m5 = 0.; + dWParam.mu = 0.; + for (int i=0; i<4; i++) + dWParam.commDim[i] = 1; + + if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + dWParam.type = QUDA_CLOVER_DIRAC; + dWParam.clover = cloverPrecise; + DiracClover *dW = new DiracClover(dWParam); + dW->M(tmp4,x); + delete dW; + } + else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH) { + dWParam.type = QUDA_WILSON_DIRAC; + DiracWilson *dW = new DiracWilson(dWParam); + dW->M(tmp4,x); + delete dW; + } + else{ + errorQuda("Error one end trick works only for twisted mass fermions\n"); + } + checkCudaError(); + + gamma5Cuda(static_cast(&tmp3.Even()), + static_cast(&tmp4.Even())); + gamma5Cuda(static_cast(&tmp3.Odd()), + static_cast(&tmp4.Odd())); + + long int sizeBuffer; + sizeBuffer = + sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + + int NN = 16*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + int incx = 1; + int incy = 1; + Float pceval[2] = {1.0,0.0}; + Float mceval[2] = {-1.0,0.0}; + + ///////////////// LOCAL /////////////////////////// + contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) + cblas_caxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)cnRes_gv,incy); + else if( typeid(Float) == typeid(double) ) + cblas_zaxpy(NN,(void*)pceval,(void*)h_ctrn,incx,(void*)cnRes_gv,incy); + + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float*) cnRes_gv)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick + + contract(x, x, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float*) cnRes_vv)[ix] -= ((Float*)h_ctrn)[ix]; // standard one end trick + + if( typeid(Float) == typeid(float) ) { + cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnRes_vv, incy); + } + else if( typeid(Float) == typeid(double) ) { + cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnRes_vv, incy); + } + cudaDeviceSynchronize(); + + ////////////////// DERIVATIVES ////////////////////////////// + CovD *cov = new CovD(gaugePrecise, profileCovDev); + + // for generalized one-end trick + for(int mu=0; mu<4; mu++) + { + cov->M(tmp4,tmp3,mu); + // Term 0 + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); + + cov->M (tmp4, x, mu+4); + // Term 0 + Term 3 + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + // Term 0 + Term 3 + Term 2 (C Sum) + cov->M (tmp4, x, mu); + contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 (D Dif) + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + + cov->M (tmp4, tmp3, mu+4); + // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(x, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) { + cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) cnD_gv[mu], incy); + } + else if( typeid(Float) == typeid(double) ) { + cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) cnD_gv[mu], incy); + } + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float *) cnD_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) { + cblas_caxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) cnC_gv[mu], incy); + } + else if( typeid(Float) == typeid(double) ) { + cblas_zaxpy(NN, (void*) pceval, (void*) h_ctrn, incx, + (void*) cnC_gv[mu], incy); + } + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float *) cnC_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; + } + + for(int mu=0; mu<4; mu++) // for standard one-end trick + { + cov->M (tmp4, x, mu); + cov->M (tmp3, x, mu+4); + // Term 0 + contract(x, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); + // Term 0 + Term 3 + contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + // Term 0 + Term 3 + Term 2 (C Sum) + contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 (D Dif) + contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(x, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); + // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + contract(x, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) { + cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnD_vv[mu], incy); + } + else if( typeid(Float) == typeid(double) ) { + cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnD_vv[mu], incy); + } + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float *) cnD_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + if( typeid(Float) == typeid(float) ) { + cblas_caxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnC_vv[mu], incy); + } + else if( typeid(Float) == typeid(double) ) { + cblas_zaxpy(NN, (void*) mceval, (void*) h_ctrn, incx, + (void*) cnC_vv[mu], incy); + } + + // for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + // ((Float *) cnC_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; + } + + delete cov; + cudaFreeHost(h_ctrn); + cudaFree(ctrnS); + cudaFree(ctrnC); + checkCudaError(); +} + + +//-C.K. This is a new function to print all the loops in ASCII format +template +void writeLoops_ASCII(Float *writeBuf, const char *Pref, + qudaQKXTM_loopInfo loopInfo, + int **momQsq, int type, + int mu, bool exact_loop, + bool useTSM, bool LowPrec){ + + if(exact_loop && useTSM) errorQuda("writeLoops_ASCII: Got conflicting options - exact_loop AND useTSM.\n"); + + if(GK_timeRank >= 0 && GK_timeRank < GK_nProc[3] ){ + FILE *ptr; + char file_name[512]; + char *lpart,*ptrVal; + int Nprint,Ndump; + int Nmoms = loopInfo.Nmoms; + + if(exact_loop) Nprint = 1; + else{ + if(useTSM){ + if(LowPrec){ + Nprint = loopInfo.TSM_NprintLP; + Ndump = loopInfo.TSM_NdumpLP; + } + else{ + Nprint = loopInfo.TSM_NprintHP; + Ndump = loopInfo.TSM_NdumpHP; + } + } + else{ + Nprint = loopInfo.Nprint; + Ndump = loopInfo.Ndump; + } + } + + for(int iPrint=0;iPrint +void getLoopWriteBuf(Float *writeBuf, Float *loopBuf, int iPrint, int Nmoms, int imom, bool oneD){ + + if(GK_timeRank >= 0 && GK_timeRank < GK_nProc[3] ){ + if(oneD){ + for(int lt=0;lt +void writeLoops_HDF5(Float *buf_std_uloc, Float *buf_gen_uloc, + Float **buf_std_oneD, Float **buf_std_csvC, + Float **buf_gen_oneD, Float **buf_gen_csvC, + char *file_pref, + qudaQKXTM_loopInfo loopInfo, int **momQsq, + bool exact_loop, bool useTSM, bool LowPrec){ + + if(exact_loop && useTSM) errorQuda("writeLoops_HDF5: Got conflicting options - exact_loop AND useTSM.\n"); + + if(GK_timeRank >= 0 && GK_timeRank < GK_nProc[3] ){ + char fname[512]; + int Nprint,Ndump; + + if(exact_loop){ + Nprint = 1; + sprintf(fname,"%s_Qsq%d.h5",file_pref,loopInfo.Qsq); + } + else{ + if(useTSM){ + if(LowPrec){ + Nprint = loopInfo.TSM_NprintLP; + Ndump = loopInfo.TSM_NdumpLP; + sprintf(fname,"%s_NLP%04d_step%04d_Qsq%d.h5",file_pref,loopInfo.TSM_NLP,Ndump,loopInfo.Qsq); + } + else{ + Nprint = loopInfo.TSM_NprintHP; + Ndump = loopInfo.TSM_NdumpHP; + sprintf(fname,"%s_NHP%04d_step%04d_Qsq%d.h5",file_pref,loopInfo.TSM_NHP,Ndump,loopInfo.Qsq); + } + } + else{ + Nprint = loopInfo.Nprint; + Ndump = loopInfo.Ndump; + sprintf(fname,"%s_Ns%04d_step%04d_Qsq%d.h5",file_pref,loopInfo.Nstoch,Ndump,loopInfo.Qsq); + } + } + + double *loopBuf = NULL; + double *writeBuf = (double*) malloc(GK_localL[3]*16*2*sizeof(double)); + + hsize_t start[3] = {GK_timeRank*GK_localL[3], 0, 0}; + + // Dimensions of the dataspace + hsize_t dims[3] = {GK_totalL[3], 16, 2}; // Global + hsize_t ldims[3] = {GK_localL[3], 16, 2}; // Local + + hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS); + H5Pset_fapl_mpio(fapl_id, GK_timeComm, MPI_INFO_NULL); + hid_t file_id = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + if(file_id<0) errorQuda("Cannot open %s. Check that directory exists!\n",fname); + + H5Pclose(fapl_id); + + char *group1_tag; + asprintf(&group1_tag,"conf_%04d",loopInfo.traj); + hid_t group1_id = H5Gcreate(file_id, group1_tag, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + + hid_t group2_id; + hid_t group3_id; + hid_t group4_id; + hid_t group5_id; + + for(int iPrint=0;iPrint +void dumpLoop(void *cnRes_gv, void *cnRes_vv, const char *Pref,int accumLevel, int Q_sq){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr_gv; + FILE *ptr_vv; + char file_gv[257]; + char file_vv[257]; + sprintf(file_gv, "%s_dOp.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + sprintf(file_vv, "%s_Scalar.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + ptr_gv = fopen(file_gv,"w"); + ptr_vv = fopen(file_vv,"w"); + if(ptr_gv == NULL || ptr_vv == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf (ptr_gv, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], + ((Float*)cnRes_gv)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cnRes_gv)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); + fprintf (ptr_vv, "%02d %02d %+d %+d %+d %+16.15le %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], + ((Float*)cnRes_vv)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cnRes_vv)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); + } + } + } + printfQuda("data dumped for accumLevel %d\n",accumLevel); + fclose(ptr_gv); + fclose(ptr_vv); + for(int ip=0; ip +void dumpLoop_ultraLocal(void *cn, const char *Pref,int accumLevel, int Q_sq, int flag){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + switch(flag){ + case 0: + sprintf(file_name, "%s_Scalar.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + case 1: + sprintf(file_name, "%s_dOp.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + } + ptr = fopen(file_name,"w"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], + ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); + } + } + } + printfQuda("data dumped for accumLevel %d\n",accumLevel); + fclose(ptr); + for(int ip=0; ip +void dumpLoop_oneD(void *cn, const char *Pref,int accumLevel, int Q_sq, int muDir, int flag){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + switch(flag){ + case 0: + sprintf(file_name, "%s_Loops.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + case 1: + sprintf(file_name, "%s_LpsDw.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + case 2: + sprintf(file_name, "%s_LoopsCv.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + case 3: + sprintf(file_name, "%s_LpsDwCv.loop.%04d.%d_%d",Pref,accumLevel,comm_size(), comm_rank()); + break; + } + if(muDir == 0) + ptr = fopen(file_name,"w"); + else + ptr = fopen(file_name,"a"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], + 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); + } + } + } + + printfQuda("data dumped for accumLevel %d\n",accumLevel); + fclose(ptr); + for(int ip=0; ip +void dumpLoop_ultraLocal_v2(void *cn, const char *Pref,int accumLevel, int Q_sq, char *string){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + sprintf(file_name, "%s_%s.loop.%04d.%d_%d",Pref,string,accumLevel,comm_size(), comm_rank()); + ptr = fopen(file_name,"w"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], + ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); + } + } + } + printfQuda("data dumped for accumLevel %d\n",accumLevel); + fclose(ptr); + for(int ip=0; ip +void dumpLoop_oneD_v2(void *cn, const char *Pref,int accumLevel, int Q_sq, int muDir, char *string){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + sprintf(file_name, "%s_%s.loop.%04d.%d_%d",Pref,string,accumLevel,comm_size(), comm_rank()); + + if(muDir == 0) + ptr = fopen(file_name,"w"); + else + ptr = fopen(file_name,"a"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], + 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); + } + } + } + + printfQuda("data dumped for accumLevel %d\n",accumLevel); + fclose(ptr); + for(int ip=0; ip +void dumpVector(Float *vec, int is, char file_base[]){ + FILE *ptr; + char file_name[257]; + + sprintf(file_name,"%s.%04d.%d_%d",file_base,is+1,comm_size(), comm_rank()); + ptr = fopen(file_name,"w"); + if(ptr == NULL) errorQuda("Cannot open file %s for deflated source\n",file_name); + + + for(int t=0; t +void dumpLoop_ultraLocal_Exact(void *cn, const char *Pref, int Q_sq, int flag){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + switch(flag){ + case 0: + sprintf(file_name, "%s_Scalar.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + case 1: + sprintf(file_name, "%s_dOp.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + } + ptr = fopen(file_name,"w"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, mom[ip][0], mom[ip][1], mom[ip][2], + ((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm], ((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]); + } + } + } + + fclose(ptr); + for(int ip=0; ip +void dumpLoop_oneD_Exact(void *cn, const char *Pref, int Q_sq, int muDir, int flag){ + int **mom = allocateMomMatrix(Q_sq); + FILE *ptr; + char file_name[257]; + + switch(flag){ + case 0: + sprintf(file_name, "%s_Loops.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + case 1: + sprintf(file_name, "%s_LpsDw.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + case 2: + sprintf(file_name, "%s_LoopsCv.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + case 3: + sprintf(file_name, "%s_LpsDwCv.loop.%d_%d",Pref,comm_size(), comm_rank()); + break; + } + if(muDir == 0) + ptr = fopen(file_name,"w"); + else + ptr = fopen(file_name,"a"); + + if(ptr == NULL) errorQuda("Error open files to write loops\n"); + long int Vol = GK_localL[0]*GK_localL[1]*GK_localL[2]; + for(int ip=0; ip < Vol; ip++) + for(int lt=0; lt < GK_localL[3]; lt++){ + if ((mom[ip][0]*mom[ip][0] + mom[ip][1]*mom[ip][1] + mom[ip][2]*mom[ip][2]) <= Q_sq){ + int t = lt+comm_coords(default_topo)[3]*GK_localL[3]; + for(int gm=0; gm<16; gm++){ + fprintf(ptr, "%02d %02d %02d %+d %+d %+d %+16.15e %+16.15e\n",t, gm, muDir ,mom[ip][0], mom[ip][1], mom[ip][2], + 0.25*(((Float*)cn)[0+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm]), 0.25*(((Float*)cn)[1+2*ip+2*Vol*lt+2*Vol*GK_localL[3]*gm])); + } + } + } + + fclose(ptr); + for(int ip=0; ip +void oneEndTrick_w_One_Der_2(cudaColorSpinorField &s, + cudaColorSpinorField &x, + cudaColorSpinorField &tmp3, + cudaColorSpinorField &tmp4, + QudaInvertParam *param, + void *cnRes_gv, void *cnRes_vv, + void **cnD_gv, void **cnD_vv, + void **cnC_gv, void **cnC_vv){ + void *h_ctrn, *ctrnS, *ctrnC; + + double t1,t2; + + if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in CPU.\n"); + cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + checkCudaError(); + + DiracParam dWParam; + dWParam.matpcType = QUDA_MATPC_EVEN_EVEN; + dWParam.dagger = QUDA_DAG_NO; + dWParam.gauge = gaugePrecise; + dWParam.kappa = param->kappa; + dWParam.mass = 1./(2.*param->kappa) - 4.; + dWParam.m5 = 0.; + dWParam.mu = 0.; + for (int i=0; i<4; i++) + dWParam.commDim[i] = 1; + + if(param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + dWParam.type = QUDA_CLOVER_DIRAC; + dWParam.clover = cloverPrecise; + DiracClover *dW = new DiracClover(dWParam); + dW->M(tmp4,x); + delete dW; + } + else if (param->dslash_type == QUDA_TWISTED_MASS_DSLASH) { + dWParam.type = QUDA_WILSON_DIRAC; + DiracWilson *dW = new DiracWilson(dWParam); + dW->M(tmp4,x); + delete dW; + } + else{ + errorQuda("Error one end trick works only for twisted mass fermions\n"); + } + checkCudaError(); + + gamma5Cuda(static_cast(&tmp3.Even()), static_cast(&tmp4.Even())); + gamma5Cuda(static_cast(&tmp3.Odd()), static_cast(&tmp4.Odd())); + + long int sizeBuffer; + sizeBuffer = sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + CovD *cov = new CovD(gaugePrecise, profileCovDev); + + ///////////////// LOCAL /////////////////////////// + contract(s, x, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float*) cnRes_vv)[ix] -= ((Float*)h_ctrn)[ix]; // standard one end trick + + + contract(s, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float*) cnRes_gv)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick + + cudaDeviceSynchronize(); + + ////////////////// DERIVATIVES ////////////////////////////// + for(int mu=0; mu<4; mu++) // for generalized one-end trick + { + cov->M(tmp4,tmp3,mu); + contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); // Term 0 + + cov->M (tmp4, s, mu+4); + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + cov->M (tmp4, s, mu); + contract(tmp4, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 (C Sum) + contract(tmp4, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 (D Dif) + + cov->M (tmp4, tmp3, mu+4); + contract(s, tmp4, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnD_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnC_gv[mu])[ix] += ((Float*)h_ctrn)[ix]; + } + + for(int mu=0; mu<4; mu++) // for standard one-end trick + { + cov->M (tmp4, x, mu); + cov->M (tmp3, s, mu+4); + + contract(s, tmp4, ctrnS, QUDA_CONTRACT_GAMMA5); // Term 0 + contract(tmp3, x, ctrnS, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + cov->M (tmp4, s, mu); + contract(tmp4, x, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 (C Sum) + contract(tmp4, x, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 (D Dif) + + cov->M (tmp3, x, mu+4); + contract(s, tmp3, ctrnC, QUDA_CONTRACT_GAMMA5_PLUS); // Term 0 + Term 3 + Term 2 + Term 1 (C Sum) + contract(s, tmp3, ctrnS, QUDA_CONTRACT_GAMMA5_MINUS); // Term 0 + Term 3 - Term 2 - Term 1 (D Dif) + + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnD_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnC_vv[mu])[ix] -= ((Float*)h_ctrn)[ix]; + + } + + /////////////// + + delete cov; + cudaFreeHost(h_ctrn); + cudaFree(ctrnS); + cudaFree(ctrnC); + checkCudaError(); +} +*/ + + +/* Quarantined Code +template +void volumeSource_w_One_Der(ColorSpinorField &x, + ColorSpinorField &xi, + ColorSpinorField &tmp, + QudaInvertParam *param, + void *cn_local, + void **cnD,void **cnC){ + void *h_ctrn, *ctrnS, *ctrnC; + + if((cudaMallocHost(&h_ctrn, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in CPU.\n"); + cudaMemset(h_ctrn, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnS, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnS, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + if((cudaMalloc(&ctrnC, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3])) == cudaErrorMemoryAllocation) + errorQuda("Error allocating memory for contraction results in GPU.\n"); + cudaMemset(ctrnC, 0, sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]); + + + long int sizeBuffer; + sizeBuffer = sizeof(Float)*32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; + CovD *cov = new CovD(gaugePrecise, profileCovDev); + + ///////////////// LOCAL /////////////////////////// + contract(xi, x, ctrnS, QUDA_CONTRACT); + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float*) cn_local)[ix] += ((Float*)h_ctrn)[ix]; // generalized one end trick + + ////////////////// DERIVATIVES ////////////////////////////// + for(int mu=0; mu<4; mu++) // for standard one-end trick + { + cov->M (tmp, x, mu); // Term 0 + contract(xi, tmp, ctrnS, QUDA_CONTRACT); + cudaMemcpy(ctrnC, ctrnS, sizeBuffer, cudaMemcpyDeviceToDevice); + + cov->M (tmp, x, mu+4); // Term 1 + contract(xi, tmp, ctrnS, QUDA_CONTRACT_MINUS); // Term 0 - Term 1 + contract(xi, tmp, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 + + cov->M(tmp, xi, mu); // Term 2 + contract(tmp, x, ctrnS, QUDA_CONTRACT_MINUS); // Term 0 - Term 1 - Term 2 + contract(tmp, x, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 + Term 2 + + cov->M(tmp, xi, mu+4); // Term 3 + contract(tmp, x, ctrnS, QUDA_CONTRACT_PLUS); // Term 0 - Term 1 - Term 2 + Term 3 + contract(tmp, x, ctrnC, QUDA_CONTRACT_PLUS); // Term 0 + Term 1 + Term 2 + Term 3 + + cudaMemcpy(h_ctrn, ctrnS, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnD[mu])[ix] += ((Float*)h_ctrn)[ix]; + + cudaMemcpy(h_ctrn, ctrnC, sizeBuffer, cudaMemcpyDeviceToHost); + + for(int ix=0; ix < 32*GK_localL[0]*GK_localL[1]*GK_localL[2]*GK_localL[3]; ix++) + ((Float *) cnC[mu])[ix] += ((Float*)h_ctrn)[ix]; + } + /////////////// + + delete cov; + cudaFreeHost(h_ctrn); + cudaFree(ctrnS); + cudaFree(ctrnC); + checkCudaError(); +} +*/ diff --git a/qkxtm/QKXTM_util.cpp b/qkxtm/QKXTM_util.cpp index 6c134d4c61..0b041a27de 100644 --- a/qkxtm/QKXTM_util.cpp +++ b/qkxtm/QKXTM_util.cpp @@ -1616,7 +1616,7 @@ int TSM_NdumpHP = 0; int TSM_NdumpLP = 0; long int TSM_maxiter = 0; double TSM_tol = 0; -int smethod = 1; +int smethod = 0; bool fullOp_stochEO = false; #ifdef HAVE_ARPACK //- Loop params with ARPACK enabled