From 507ff0dd2bfa45b97c65539315ae0bf376b70022 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Tue, 6 Dec 2022 01:34:02 +0100 Subject: [PATCH 01/19] updated vectorization --- src/lib/Radio/predict_withbeam.c | 61 ++++++++++++++++++++++++-------- src/lib/Radio/stationbeam.c | 37 ++++++++++++++++--- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index 586bebc..6d088c2 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -1028,28 +1028,61 @@ visibilities_threadfn_multifreq(void *data) { } if (t->dobeam==DOBEAM_ELEMENT || t->dobeam==DOBEAM_FULL) { - /* add up terms together with Ejones multiplication */ + complex double *CO=0; + if (posix_memalign((void*)&CO,sizeof(complex double),((size_t)t->carr[cm].N*4*sizeof(complex double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + #pragma omp simd for (cn=0; cncarr[cm].N; cn++) { - complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ - complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ complex double Ph,IIl,QQl,UUl,VVl; Ph=(PHr[cn]+_Complex_I*PHi[cn]); IIl=Ph*II[cn]; QQl=Ph*QQ[cn]; UUl=Ph*UU[cn]; VVl=Ph*VV[cn]; - complex double C0[4]; - C0[0]=IIl+QQl; - C0[1]=UUl+_Complex_I*VVl; - C0[2]=UUl-_Complex_I*VVl; - C0[3]=IIl-QQl; - amb(E1,C0,T1); - ambt(T1,E2,C0); - C[0]+=C0[0]; - C[1]+=C0[1]; - C[2]+=C0[2]; - C[3]+=C0[3]; + CO[0+4*cn]=IIl+QQl; + CO[1+4*cn]=UUl+_Complex_I*VVl; + CO[2+4*cn]=UUl-_Complex_I*VVl; + CO[3+4*cn]=IIl-QQl; + } + #pragma omp simd + for (cn=0; cncarr[cm].N; cn++) { + complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ + complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ + amb(E1,&CO[4*cn],T1); + ambt(T1,E2,&CO[4*cn]); } + #pragma omp simd + for (cn=0; cncarr[cm].N; cn++) { + C[0]+=CO[0+4*cn]; + C[1]+=CO[1+4*cn]; + C[2]+=CO[2+4*cn]; + C[3]+=CO[3+4*cn]; + } + /* add up terms together with Ejones multiplication */ + // for (cn=0; cncarr[cm].N; cn++) { + // complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ + // complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ + // complex double Ph,IIl,QQl,UUl,VVl; + // Ph=(PHr[cn]+_Complex_I*PHi[cn]); + // IIl=Ph*II[cn]; + // QQl=Ph*QQ[cn]; + // UUl=Ph*UU[cn]; + // VVl=Ph*VV[cn]; + // complex double C0[4]; + // C0[0]=IIl+QQl; + // C0[1]=UUl+_Complex_I*VVl; + // C0[2]=UUl-_Complex_I*VVl; + // C0[3]=IIl-QQl; + // amb(E1,C0,T1); + // ambt(T1,E2,C0); + // C[0]+=C0[0]; + // C[1]+=C0[1]; + // C[2]+=C0[2]; + // C[3]+=C0[3]; + // } + free(CO); } else { /* add up terms together */ for (cn=0; cncarr[cm].N; cn++) { diff --git a/src/lib/Radio/stationbeam.c b/src/lib/Radio/stationbeam.c index 2fbfa33..135a025 100644 --- a/src/lib/Radio/stationbeam.c +++ b/src/lib/Radio/stationbeam.c @@ -126,10 +126,10 @@ array_element_beam(double ra, double dec, double ra0, double dec0, double f, dou double *px,*py,*pz; double r1,r2,r3; double sint,cost,sint0,cost0,sinph,cosph,sinph0,cosph0; - double csum,ssum,tmpc,tmps; + double csum,ssum,*tmpc=0,*tmps=0; /* 2*PI/C */ const double tpc=2.0*M_PI/CONST_C; - + double *tmpprod=0; /* iterate over stations */ for (ci=0; ci Date: Tue, 6 Dec 2022 12:28:31 +0100 Subject: [PATCH 02/19] working libmvec --- src/lib/Radio/stationbeam.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lib/Radio/stationbeam.c b/src/lib/Radio/stationbeam.c index 135a025..b26876c 100644 --- a/src/lib/Radio/stationbeam.c +++ b/src/lib/Radio/stationbeam.c @@ -178,17 +178,18 @@ array_element_beam(double ra, double dec, double ra0, double dec0, double f, dou fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); } -//#pragma GCC ivdep #pragma omp simd for (cj=0; cj Date: Wed, 7 Dec 2022 14:03:44 +0100 Subject: [PATCH 03/19] vectorized elementcoeff eval --- src/lib/Radio/elementbeam.c | 85 ++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c index 5c3b2f9..bddbc2a 100644 --- a/src/lib/Radio/elementbeam.c +++ b/src/lib/Radio/elementbeam.c @@ -195,7 +195,7 @@ L_g1(int p, int q, double x) { elementval -eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -234,6 +234,89 @@ eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { } +elementval +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { + /* evaluate r^2/beta^2 */ + double rb=pow(r/ecoeff->beta,2); + /* evaluate e^(-r^2/2beta^2) */ + double ex=exp(-0.5*rb); + + elementval eval; + eval.phi=0.0+_Complex_I*0.0; + eval.theta=0.0+_Complex_I*0.0; + + /* storage for temp data */ + int N=ecoeff->M*(ecoeff->M+1)/2; + double *Lg=0; + if (posix_memalign((void*)&Lg,sizeof(double),((size_t)N*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + + int idx=0; + for (int n=0; nM; n++) { +#pragma omp simd + for (int m=-n; m<=n; m+=2) { + int absm=m>=0?m:-m; /* |m| */ + Lg[idx]=L_g1((n-absm)/2,absm,rb)*pow(M_PI_4+r,(double)absm); + + idx++; + } + } + idx=0; + for (int n=0; nM; n++) { + double *inm=0,*ins=0,*inc=0; + if (posix_memalign((void*)&inm,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + if (posix_memalign((void*)&ins,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + if (posix_memalign((void*)&inc,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + int m=-n; +#pragma omp simd + for (int ci=0; cipreamble[idx]; + double re,im; + /* basis function re+j*im */ + re=pr*inc[ci]; + im=pr*ins[ci]; + + eval.phi+=ecoeff->pattern_phi[idx]*(re+_Complex_I*im); + eval.theta+=ecoeff->pattern_theta[idx]*(re+_Complex_I*im); + idx++; + } + + free(inm); + free(ins); + free(inc); + } + + free(Lg); + + return eval; +} /* Legendre function P(l,m,x) */ static double From 98dd73f865e9127eb30e20a52e68fe9825461c3b Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Wed, 7 Dec 2022 14:30:29 +0100 Subject: [PATCH 04/19] revert to original --- src/lib/Radio/elementbeam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c index bddbc2a..3a85ca7 100644 --- a/src/lib/Radio/elementbeam.c +++ b/src/lib/Radio/elementbeam.c @@ -195,7 +195,7 @@ L_g1(int p, int q, double x) { elementval -eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -235,7 +235,7 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { elementval -eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ From da530ec5861687af721b93540d9a8656babf5070 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Wed, 7 Dec 2022 17:47:04 +0100 Subject: [PATCH 05/19] extract time from first row --- src/MS/data.cpp | 40 +++++++++++++------- src/lib/Radio/elementbeam.c | 73 ++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/src/MS/data.cpp b/src/MS/data.cpp index 32b7626..5d7ec4b 100644 --- a/src/MS/data.cpp +++ b/src/MS/data.cpp @@ -245,18 +245,23 @@ Data::readAuxData(const char *fname, Data::IOData *data, Data::LBeam *binfo) { ROArrayColumn chan_width(_freq, "CHAN_WIDTH"); data->deltaf=(double)data->Nchan*(chan_width(0).data()[0]); - /* UTC time */ - binfo->time_utc=new double[data->tilesz]; - /* no of elements in each station */ - binfo->Nelem=new int[data->N]; - /* positions of stations */ - binfo->sx=new double[data->N]; - binfo->sy=new double[data->N]; - binfo->sz=new double[data->N]; - /* coordinates of elements */ - binfo->xx=new double*[data->N]; - binfo->yy=new double*[data->N]; - binfo->zz=new double*[data->N]; + try { + /* UTC time */ + binfo->time_utc=new double[data->tilesz]; + /* no of elements in each station */ + binfo->Nelem=new int[data->N]; + /* positions of stations */ + binfo->sx=new double[data->N]; + binfo->sy=new double[data->N]; + binfo->sz=new double[data->N]; + /* coordinates of elements */ + binfo->xx=new double*[data->N]; + binfo->yy=new double*[data->N]; + binfo->zz=new double*[data->N]; + } catch (const std::bad_alloc& e) { + cout<<"Allocating memory for data failed. Quitting."<< e.what() << endl; + exit(1); + } Table antfield; bool isDipole=false; @@ -706,10 +711,13 @@ Data::loadData(Table ti, Data::IOData iodata, LBeam binfo, double *fratio) { } /* counters for finding flagged data ratio */ int countgood=0; int countbad=0; + /* get antenna pair of first row for recording time */ + uInt ant_i=a1(0); + uInt ant_j=a2(0); for(int row = 0; row < nrow && row0beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -235,7 +255,7 @@ eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { elementval -eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -258,46 +278,57 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { #pragma omp simd for (int m=-n; m<=n; m+=2) { int absm=m>=0?m:-m; /* |m| */ - Lg[idx]=L_g1((n-absm)/2,absm,rb)*pow(M_PI_4+r,(double)absm); + double dabsm=(double)absm; + Lg[idx]=ex*L_g2((n-absm)/2,dabsm,rb)*pow(M_PI_4+r,dabsm); idx++; } } - idx=0; - for (int n=0; nM; n++) { - double *inm=0,*ins=0,*inc=0; - if (posix_memalign((void*)&inm,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + int *inm=0; + double *inmp=0,*ins=0,*inc=0; + /* Note: we allocate for the largest possible, n=M-1, size n+1=M */ + if ((inm=(int*)malloc((size_t)(ecoeff->M)*sizeof(int)))==0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } - if (posix_memalign((void*)&ins,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + } + if (posix_memalign((void*)&inmp,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } - if (posix_memalign((void*)&inc,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + } + if (posix_memalign((void*)&ins,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } + } + if (posix_memalign((void*)&inc,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + idx=0; + for (int n=0; nM; n++) { int m=-n; #pragma omp simd for (int ci=0; cipreamble[idx]; + double pr=Lg[idx]*ecoeff->preamble[idx]; double re,im; /* basis function re+j*im */ re=pr*inc[ci]; @@ -308,11 +339,11 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { idx++; } - free(inm); - free(ins); - free(inc); } - + free(inm); + free(inmp); + free(ins); + free(inc); free(Lg); return eval; From 2926e0fd0928ba6f80eb3a0b89d713dd312b1f19 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Fri, 9 Dec 2022 01:21:55 +0100 Subject: [PATCH 06/19] bump version --- INSTALL.md | 107 +++-------------------------------------------- src/MPI/main.cpp | 2 +- src/MS/main.cpp | 2 +- 3 files changed, 7 insertions(+), 104 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index a02af48..2217a07 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -12,7 +12,7 @@ Run cmake (with GPU support) for example like mkdir build && cd build cmake .. -DHAVE_CUDA=ON -DCMAKE_CXX_FLAGS='-DMAX_GPU_ID=0' -DCMAKE_CXX_COMPILER=g++-8 -DCMAKE_C_FLAGS='-DMAX_GPU_ID=0' -DCMAKE_C_COMPILER=gcc-8 -DCUDA_NVCC_FLAGS='-gencode arch=compute_75,code=sm_75' -DBLA_VENDOR=OpenBLAS ``` -where *MAX_GPU_ID=0* is when there is only one GPU (ordinal 0). If you have more GPUs, increase this number to 1,2, and so on. This will produce *sagecal_gpu* and *sagecal-mpi_gpu* binary files (after running *make* of course). +where *MAX_GPU_ID=0* is when there is only one GPU (ordinal 0). If you have more GPUs, increase this number to 1,2, and so on. This will produce *sagecal_gpu* and *sagecal-mpi_gpu* binary files (after running *make* of course). You can also use *-DNUM_CPU* to specify the number of GPUs to use, for example *-DNUM_GPU=4*. CPU only version can be build as ``` @@ -31,20 +31,14 @@ to make a symbolic link to libgfortran.so.5 or whatever version that is installe To only build *libdirac* (shared) library, use *-DLIB_ONLY=1* option (also *-DBLA_VENDOR* to select the BLAS flavour). This library can be used with pkg-config using *lib/pkgconfig/libdirac.pc*. -### Requirements for older installations -#### das5 +### Vectorized math operations (New) +SAGECal can use ***libmvec*** vectorized math operations, both in GPU and CPU versions. In order to enable this, use compiler options *-fopenmp -ffast-math -lmvec -lm* for both gcc and g++. Also *-mavx*, *-mavx2* etc. can be added. Here is an example for CPU version -Load the modules below before compiling SageCal. ``` -module load cmake/3.8.2 -module load mpich/ge/gcc/64/3.2 -module load gcc/4.9.3 -module load casacore/2.3.0-gcc-4.9.3 -module load wcslib/5.13-gcc-4.9.3 -module load wcslib/5.16-gcc-4.9.3 -module load cfitsio/3.410-gcc-4.9.3 +cmake .. -DCMAKE_CXX_FLAGS='-g -O3 -Wall -fopenmp -ffast-math -lmvec -lm -mavx2' -DCMAKE_C_FLAGS='-g -O3 -Wall -fopenmp -ffast-math -lmvec -lm -mavx2' ``` +### Requirements for older installations checkout the source code and compile it with the instructions below(in source folder): ``` git clone https://github.com/nlesc-dirac/sagecal.git @@ -111,94 +105,3 @@ MPI support is automatically detected, otherwise, it can be forced with: ``` cmake -DENABLE_MPI=ON ``` - -## GPU Support - -### Loading modules on DAS5 -See scripts folder for the modules. -``` -source ./scripts/load_das5_modules_gcc6.sh -``` - -### Compiling with GPU support -``` -mkdir -p build && cd build -cmake -DCUDA_DEBUG=ON -DDEBUG=ON -DHAVE_CUDA=ON .. -make VERBOSE=1 -``` - - - -## Installation via Anaconda (WIP) -``` - conda install -c sagecal=0.6.0 -``` - - - -## Manual installation -For expert users, and for custom architectures (GPU), the manual install is recommended. -### 1 Prerequisites: - - CASACORE http://casacore.googlecode.com/ - - glib http://developer.gnome.org/glib - - BLAS/LAPACK - Highly recommended is OpenBLAS http://www.openblas.net/ - Also, to avoid any linking issues (and to get best performance), build OpenBLAS from source and link SAGECal with the static library (libopenblas***.a) and NOT libopenblas***.so - - Compilers gcc/g++ or Intel icc/icpc - - If you have NVIDIA GPUs, - -- CUDA/CUBLAS/CUSOLVER and nvcc - -- NVML Nvidia management library - - If you are using Intel Xeon Phi MICs. - -- Intel MKL and other libraries - - Get the source for SAGECal -``` - git clone -b master https://git@github.com/nlesc-dirac/sagecal.git -``` - -### 2 The basic way to build is - 1.a) go to ./src/lib/Dirac and ./src/lib/Radio and run make (which will create libdirac.a and libradio.a) - 1.b) go to ./src/MS and run make (which will create the executable) - - -### 3 Build settings -In ./src/lib/Dirac and ./src/lib/Radio and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are: - - LAPACK: directory where LAPACK/OpenBLAS is installed - - GLIBI/GLIBL: include/lib files for glib - - CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files: - Note with new CASACORE might need two include paths, e.g. - -I/opt/casacore/include/ -I/opt/casacore/include/casacore - - CUDAINC/CUDALIB : where CUDA/CUBLAS/CUSOLVER is installed - - NVML_INC/NVML_LIB : NVML include/lib path - - NVCFLAGS : flags to pass to nvcc, especially -arch option to match your GPU - - MKLROOT : for Intel MKL - - Example makefiles: - Makefile : plain build - Makefile.gpu: with GPU support - Note: Edit ./lib/Radio/Radio.h MAX_GPU_ID to match the number of available GPUs, e.g., for 2 GPUs, MAX_GPU_ID=1 - - - -## SAGECAL-MPI Manual Installation -This is for manually installing the distributed version of sagecal (sagecal-mpi), the cmake build will will work for most cases. -## 1 Prerequsites: - - Same as for SAGECal. - - MPI (e.g. OpenMPI) - -## 2 Build ./src/lib/Dirac ./src/lib/Radio as above (using mpicc -DMPI_BUILD) - -## 3 Build ./src/MPI using mpicc++ - - - -## BUILDSKY Installation - - - See INSTALL in ./src/buildsky - - -## RESTORE Installation - - - See INSTALL in ./src/restore - - - diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp index 7cf07d6..3516654 100644 --- a/src/MPI/main.cpp +++ b/src/MPI/main.cpp @@ -34,7 +34,7 @@ using namespace Data; void print_copyright(void) { - cout<<"SAGECal-MPI 0.7.8 (C) 2011-2022 Sarod Yatawatta"< Date: Fri, 9 Dec 2022 01:24:24 +0100 Subject: [PATCH 07/19] update install doc --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 2217a07..1250fef 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,4 +1,4 @@ -do 24 feb 2022 1:25:28 CET +vr 9 dec 2022 1:23:12 CET # SAGECal Installation ## Cmake Build From 1bf847148f4ae568528976bff327424c5817a8ff Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Tue, 6 Dec 2022 01:34:02 +0100 Subject: [PATCH 08/19] updated vectorization --- src/lib/Radio/predict_withbeam.c | 61 ++++++++++++++++++++++++-------- src/lib/Radio/stationbeam.c | 37 ++++++++++++++++--- 2 files changed, 79 insertions(+), 19 deletions(-) diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index 586bebc..6d088c2 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -1028,28 +1028,61 @@ visibilities_threadfn_multifreq(void *data) { } if (t->dobeam==DOBEAM_ELEMENT || t->dobeam==DOBEAM_FULL) { - /* add up terms together with Ejones multiplication */ + complex double *CO=0; + if (posix_memalign((void*)&CO,sizeof(complex double),((size_t)t->carr[cm].N*4*sizeof(complex double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + #pragma omp simd for (cn=0; cncarr[cm].N; cn++) { - complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ - complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ complex double Ph,IIl,QQl,UUl,VVl; Ph=(PHr[cn]+_Complex_I*PHi[cn]); IIl=Ph*II[cn]; QQl=Ph*QQ[cn]; UUl=Ph*UU[cn]; VVl=Ph*VV[cn]; - complex double C0[4]; - C0[0]=IIl+QQl; - C0[1]=UUl+_Complex_I*VVl; - C0[2]=UUl-_Complex_I*VVl; - C0[3]=IIl-QQl; - amb(E1,C0,T1); - ambt(T1,E2,C0); - C[0]+=C0[0]; - C[1]+=C0[1]; - C[2]+=C0[2]; - C[3]+=C0[3]; + CO[0+4*cn]=IIl+QQl; + CO[1+4*cn]=UUl+_Complex_I*VVl; + CO[2+4*cn]=UUl-_Complex_I*VVl; + CO[3+4*cn]=IIl-QQl; + } + #pragma omp simd + for (cn=0; cncarr[cm].N; cn++) { + complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ + complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ + amb(E1,&CO[4*cn],T1); + ambt(T1,E2,&CO[4*cn]); } + #pragma omp simd + for (cn=0; cncarr[cm].N; cn++) { + C[0]+=CO[0+4*cn]; + C[1]+=CO[1+4*cn]; + C[2]+=CO[2+4*cn]; + C[3]+=CO[3+4*cn]; + } + /* add up terms together with Ejones multiplication */ + // for (cn=0; cncarr[cm].N; cn++) { + // complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ + // complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ + // complex double Ph,IIl,QQl,UUl,VVl; + // Ph=(PHr[cn]+_Complex_I*PHi[cn]); + // IIl=Ph*II[cn]; + // QQl=Ph*QQ[cn]; + // UUl=Ph*UU[cn]; + // VVl=Ph*VV[cn]; + // complex double C0[4]; + // C0[0]=IIl+QQl; + // C0[1]=UUl+_Complex_I*VVl; + // C0[2]=UUl-_Complex_I*VVl; + // C0[3]=IIl-QQl; + // amb(E1,C0,T1); + // ambt(T1,E2,C0); + // C[0]+=C0[0]; + // C[1]+=C0[1]; + // C[2]+=C0[2]; + // C[3]+=C0[3]; + // } + free(CO); } else { /* add up terms together */ for (cn=0; cncarr[cm].N; cn++) { diff --git a/src/lib/Radio/stationbeam.c b/src/lib/Radio/stationbeam.c index 2fbfa33..135a025 100644 --- a/src/lib/Radio/stationbeam.c +++ b/src/lib/Radio/stationbeam.c @@ -126,10 +126,10 @@ array_element_beam(double ra, double dec, double ra0, double dec0, double f, dou double *px,*py,*pz; double r1,r2,r3; double sint,cost,sint0,cost0,sinph,cosph,sinph0,cosph0; - double csum,ssum,tmpc,tmps; + double csum,ssum,*tmpc=0,*tmps=0; /* 2*PI/C */ const double tpc=2.0*M_PI/CONST_C; - + double *tmpprod=0; /* iterate over stations */ for (ci=0; ci Date: Tue, 6 Dec 2022 12:28:31 +0100 Subject: [PATCH 09/19] working libmvec --- src/lib/Radio/stationbeam.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lib/Radio/stationbeam.c b/src/lib/Radio/stationbeam.c index 135a025..b26876c 100644 --- a/src/lib/Radio/stationbeam.c +++ b/src/lib/Radio/stationbeam.c @@ -178,17 +178,18 @@ array_element_beam(double ra, double dec, double ra0, double dec0, double f, dou fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); } -//#pragma GCC ivdep #pragma omp simd for (cj=0; cj Date: Wed, 7 Dec 2022 14:03:44 +0100 Subject: [PATCH 10/19] vectorized elementcoeff eval --- src/lib/Radio/elementbeam.c | 85 ++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c index 5c3b2f9..bddbc2a 100644 --- a/src/lib/Radio/elementbeam.c +++ b/src/lib/Radio/elementbeam.c @@ -195,7 +195,7 @@ L_g1(int p, int q, double x) { elementval -eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -234,6 +234,89 @@ eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { } +elementval +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { + /* evaluate r^2/beta^2 */ + double rb=pow(r/ecoeff->beta,2); + /* evaluate e^(-r^2/2beta^2) */ + double ex=exp(-0.5*rb); + + elementval eval; + eval.phi=0.0+_Complex_I*0.0; + eval.theta=0.0+_Complex_I*0.0; + + /* storage for temp data */ + int N=ecoeff->M*(ecoeff->M+1)/2; + double *Lg=0; + if (posix_memalign((void*)&Lg,sizeof(double),((size_t)N*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + + int idx=0; + for (int n=0; nM; n++) { +#pragma omp simd + for (int m=-n; m<=n; m+=2) { + int absm=m>=0?m:-m; /* |m| */ + Lg[idx]=L_g1((n-absm)/2,absm,rb)*pow(M_PI_4+r,(double)absm); + + idx++; + } + } + idx=0; + for (int n=0; nM; n++) { + double *inm=0,*ins=0,*inc=0; + if (posix_memalign((void*)&inm,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + if (posix_memalign((void*)&ins,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + if (posix_memalign((void*)&inc,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + int m=-n; +#pragma omp simd + for (int ci=0; cipreamble[idx]; + double re,im; + /* basis function re+j*im */ + re=pr*inc[ci]; + im=pr*ins[ci]; + + eval.phi+=ecoeff->pattern_phi[idx]*(re+_Complex_I*im); + eval.theta+=ecoeff->pattern_theta[idx]*(re+_Complex_I*im); + idx++; + } + + free(inm); + free(ins); + free(inc); + } + + free(Lg); + + return eval; +} /* Legendre function P(l,m,x) */ static double From 36eb4284531be1ffe118ff0aa3c4c709bfc4cc5e Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Wed, 7 Dec 2022 14:30:29 +0100 Subject: [PATCH 11/19] revert to original --- src/lib/Radio/elementbeam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c index bddbc2a..3a85ca7 100644 --- a/src/lib/Radio/elementbeam.c +++ b/src/lib/Radio/elementbeam.c @@ -195,7 +195,7 @@ L_g1(int p, int q, double x) { elementval -eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -235,7 +235,7 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { elementval -eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ From 1e07855716a10c98926fac2a97b1bfc41236e083 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Wed, 7 Dec 2022 17:47:04 +0100 Subject: [PATCH 12/19] extract time from first row --- src/MS/data.cpp | 40 +++++++++++++------- src/lib/Radio/elementbeam.c | 73 ++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/src/MS/data.cpp b/src/MS/data.cpp index 32b7626..5d7ec4b 100644 --- a/src/MS/data.cpp +++ b/src/MS/data.cpp @@ -245,18 +245,23 @@ Data::readAuxData(const char *fname, Data::IOData *data, Data::LBeam *binfo) { ROArrayColumn chan_width(_freq, "CHAN_WIDTH"); data->deltaf=(double)data->Nchan*(chan_width(0).data()[0]); - /* UTC time */ - binfo->time_utc=new double[data->tilesz]; - /* no of elements in each station */ - binfo->Nelem=new int[data->N]; - /* positions of stations */ - binfo->sx=new double[data->N]; - binfo->sy=new double[data->N]; - binfo->sz=new double[data->N]; - /* coordinates of elements */ - binfo->xx=new double*[data->N]; - binfo->yy=new double*[data->N]; - binfo->zz=new double*[data->N]; + try { + /* UTC time */ + binfo->time_utc=new double[data->tilesz]; + /* no of elements in each station */ + binfo->Nelem=new int[data->N]; + /* positions of stations */ + binfo->sx=new double[data->N]; + binfo->sy=new double[data->N]; + binfo->sz=new double[data->N]; + /* coordinates of elements */ + binfo->xx=new double*[data->N]; + binfo->yy=new double*[data->N]; + binfo->zz=new double*[data->N]; + } catch (const std::bad_alloc& e) { + cout<<"Allocating memory for data failed. Quitting."<< e.what() << endl; + exit(1); + } Table antfield; bool isDipole=false; @@ -706,10 +711,13 @@ Data::loadData(Table ti, Data::IOData iodata, LBeam binfo, double *fratio) { } /* counters for finding flagged data ratio */ int countgood=0; int countbad=0; + /* get antenna pair of first row for recording time */ + uInt ant_i=a1(0); + uInt ant_j=a2(0); for(int row = 0; row < nrow && row0beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -235,7 +255,7 @@ eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { elementval -eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -258,46 +278,57 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { #pragma omp simd for (int m=-n; m<=n; m+=2) { int absm=m>=0?m:-m; /* |m| */ - Lg[idx]=L_g1((n-absm)/2,absm,rb)*pow(M_PI_4+r,(double)absm); + double dabsm=(double)absm; + Lg[idx]=ex*L_g2((n-absm)/2,dabsm,rb)*pow(M_PI_4+r,dabsm); idx++; } } - idx=0; - for (int n=0; nM; n++) { - double *inm=0,*ins=0,*inc=0; - if (posix_memalign((void*)&inm,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + int *inm=0; + double *inmp=0,*ins=0,*inc=0; + /* Note: we allocate for the largest possible, n=M-1, size n+1=M */ + if ((inm=(int*)malloc((size_t)(ecoeff->M)*sizeof(int)))==0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } - if (posix_memalign((void*)&ins,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + } + if (posix_memalign((void*)&inmp,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } - if (posix_memalign((void*)&inc,sizeof(double),((size_t)(n+1)*sizeof(double)))!=0) { + } + if (posix_memalign((void*)&ins,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); - } + } + if (posix_memalign((void*)&inc,sizeof(double),((size_t)(ecoeff->M)*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + idx=0; + for (int n=0; nM; n++) { int m=-n; #pragma omp simd for (int ci=0; cipreamble[idx]; + double pr=Lg[idx]*ecoeff->preamble[idx]; double re,im; /* basis function re+j*im */ re=pr*inc[ci]; @@ -308,11 +339,11 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { idx++; } - free(inm); - free(ins); - free(inc); } - + free(inm); + free(inmp); + free(ins); + free(inc); free(Lg); return eval; From 5143536ce199084d1d5199d18d53b09483c82242 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Fri, 9 Dec 2022 01:21:55 +0100 Subject: [PATCH 13/19] bump version --- INSTALL.md | 107 +++-------------------------------------------- src/MPI/main.cpp | 2 +- src/MS/main.cpp | 2 +- 3 files changed, 7 insertions(+), 104 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index a02af48..2217a07 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -12,7 +12,7 @@ Run cmake (with GPU support) for example like mkdir build && cd build cmake .. -DHAVE_CUDA=ON -DCMAKE_CXX_FLAGS='-DMAX_GPU_ID=0' -DCMAKE_CXX_COMPILER=g++-8 -DCMAKE_C_FLAGS='-DMAX_GPU_ID=0' -DCMAKE_C_COMPILER=gcc-8 -DCUDA_NVCC_FLAGS='-gencode arch=compute_75,code=sm_75' -DBLA_VENDOR=OpenBLAS ``` -where *MAX_GPU_ID=0* is when there is only one GPU (ordinal 0). If you have more GPUs, increase this number to 1,2, and so on. This will produce *sagecal_gpu* and *sagecal-mpi_gpu* binary files (after running *make* of course). +where *MAX_GPU_ID=0* is when there is only one GPU (ordinal 0). If you have more GPUs, increase this number to 1,2, and so on. This will produce *sagecal_gpu* and *sagecal-mpi_gpu* binary files (after running *make* of course). You can also use *-DNUM_CPU* to specify the number of GPUs to use, for example *-DNUM_GPU=4*. CPU only version can be build as ``` @@ -31,20 +31,14 @@ to make a symbolic link to libgfortran.so.5 or whatever version that is installe To only build *libdirac* (shared) library, use *-DLIB_ONLY=1* option (also *-DBLA_VENDOR* to select the BLAS flavour). This library can be used with pkg-config using *lib/pkgconfig/libdirac.pc*. -### Requirements for older installations -#### das5 +### Vectorized math operations (New) +SAGECal can use ***libmvec*** vectorized math operations, both in GPU and CPU versions. In order to enable this, use compiler options *-fopenmp -ffast-math -lmvec -lm* for both gcc and g++. Also *-mavx*, *-mavx2* etc. can be added. Here is an example for CPU version -Load the modules below before compiling SageCal. ``` -module load cmake/3.8.2 -module load mpich/ge/gcc/64/3.2 -module load gcc/4.9.3 -module load casacore/2.3.0-gcc-4.9.3 -module load wcslib/5.13-gcc-4.9.3 -module load wcslib/5.16-gcc-4.9.3 -module load cfitsio/3.410-gcc-4.9.3 +cmake .. -DCMAKE_CXX_FLAGS='-g -O3 -Wall -fopenmp -ffast-math -lmvec -lm -mavx2' -DCMAKE_C_FLAGS='-g -O3 -Wall -fopenmp -ffast-math -lmvec -lm -mavx2' ``` +### Requirements for older installations checkout the source code and compile it with the instructions below(in source folder): ``` git clone https://github.com/nlesc-dirac/sagecal.git @@ -111,94 +105,3 @@ MPI support is automatically detected, otherwise, it can be forced with: ``` cmake -DENABLE_MPI=ON ``` - -## GPU Support - -### Loading modules on DAS5 -See scripts folder for the modules. -``` -source ./scripts/load_das5_modules_gcc6.sh -``` - -### Compiling with GPU support -``` -mkdir -p build && cd build -cmake -DCUDA_DEBUG=ON -DDEBUG=ON -DHAVE_CUDA=ON .. -make VERBOSE=1 -``` - - - -## Installation via Anaconda (WIP) -``` - conda install -c sagecal=0.6.0 -``` - - - -## Manual installation -For expert users, and for custom architectures (GPU), the manual install is recommended. -### 1 Prerequisites: - - CASACORE http://casacore.googlecode.com/ - - glib http://developer.gnome.org/glib - - BLAS/LAPACK - Highly recommended is OpenBLAS http://www.openblas.net/ - Also, to avoid any linking issues (and to get best performance), build OpenBLAS from source and link SAGECal with the static library (libopenblas***.a) and NOT libopenblas***.so - - Compilers gcc/g++ or Intel icc/icpc - - If you have NVIDIA GPUs, - -- CUDA/CUBLAS/CUSOLVER and nvcc - -- NVML Nvidia management library - - If you are using Intel Xeon Phi MICs. - -- Intel MKL and other libraries - - Get the source for SAGECal -``` - git clone -b master https://git@github.com/nlesc-dirac/sagecal.git -``` - -### 2 The basic way to build is - 1.a) go to ./src/lib/Dirac and ./src/lib/Radio and run make (which will create libdirac.a and libradio.a) - 1.b) go to ./src/MS and run make (which will create the executable) - - -### 3 Build settings -In ./src/lib/Dirac and ./src/lib/Radio and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are: - - LAPACK: directory where LAPACK/OpenBLAS is installed - - GLIBI/GLIBL: include/lib files for glib - - CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files: - Note with new CASACORE might need two include paths, e.g. - -I/opt/casacore/include/ -I/opt/casacore/include/casacore - - CUDAINC/CUDALIB : where CUDA/CUBLAS/CUSOLVER is installed - - NVML_INC/NVML_LIB : NVML include/lib path - - NVCFLAGS : flags to pass to nvcc, especially -arch option to match your GPU - - MKLROOT : for Intel MKL - - Example makefiles: - Makefile : plain build - Makefile.gpu: with GPU support - Note: Edit ./lib/Radio/Radio.h MAX_GPU_ID to match the number of available GPUs, e.g., for 2 GPUs, MAX_GPU_ID=1 - - - -## SAGECAL-MPI Manual Installation -This is for manually installing the distributed version of sagecal (sagecal-mpi), the cmake build will will work for most cases. -## 1 Prerequsites: - - Same as for SAGECal. - - MPI (e.g. OpenMPI) - -## 2 Build ./src/lib/Dirac ./src/lib/Radio as above (using mpicc -DMPI_BUILD) - -## 3 Build ./src/MPI using mpicc++ - - - -## BUILDSKY Installation - - - See INSTALL in ./src/buildsky - - -## RESTORE Installation - - - See INSTALL in ./src/restore - - - diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp index 7cf07d6..3516654 100644 --- a/src/MPI/main.cpp +++ b/src/MPI/main.cpp @@ -34,7 +34,7 @@ using namespace Data; void print_copyright(void) { - cout<<"SAGECal-MPI 0.7.8 (C) 2011-2022 Sarod Yatawatta"< Date: Fri, 9 Dec 2022 01:24:24 +0100 Subject: [PATCH 14/19] update install doc --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 2217a07..1250fef 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,4 +1,4 @@ -do 24 feb 2022 1:25:28 CET +vr 9 dec 2022 1:23:12 CET # SAGECal Installation ## Cmake Build From 0c798a7985176fbe2134a6625e9e04d6fd076418 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Sat, 10 Dec 2022 13:31:30 +0100 Subject: [PATCH 15/19] update docker files --- CMakeLists.txt | 2 +- Docker/ubuntu2004-cpu/Dockerfile | 3 ++- Docker/ubuntu2004-cuda/Dockerfile | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d63dcbe..0afba97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 3.10) project (SAGECal) set(PROJECT_VERSION_MAJOR 0) set(PROJECT_VERSION_MINOR 7) -set(PROJECT_VERSION_PATCH 8) +set(PROJECT_VERSION_PATCH 9) set(PROJECT_VERSION_REVISION 0) set(PROJECT_VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}") diff --git a/Docker/ubuntu2004-cpu/Dockerfile b/Docker/ubuntu2004-cpu/Dockerfile index 1873ecc..500ed35 100644 --- a/Docker/ubuntu2004-cpu/Dockerfile +++ b/Docker/ubuntu2004-cpu/Dockerfile @@ -23,7 +23,8 @@ RUN git clone --depth 1 --branch master \ mkdir build-ubuntu && cd build-ubuntu && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/sagecal \ -DBLA_VENDOR=OpenBLAS \ - -DCMAKE_CXX_FLAGS="-O3" -DCMAKE_C_FLAGS="-O3" .. && \ + -DCMAKE_CXX_FLAGS="-O3 -fopenmp -ffast-math -lmvec -lm" \ + -DCMAKE_C_FLAGS="-O3 -fopenmp -ffast-math -lmvec -lm" .. && \ make -j4 && \ make install RUN ls -alsrt /opt/sagecal && \ diff --git a/Docker/ubuntu2004-cuda/Dockerfile b/Docker/ubuntu2004-cuda/Dockerfile index 20829ce..da57aaa 100644 --- a/Docker/ubuntu2004-cuda/Dockerfile +++ b/Docker/ubuntu2004-cuda/Dockerfile @@ -33,7 +33,8 @@ RUN git clone --depth 1 --branch master \ -DHAVE_CUDA=ON -DCUDA_NVCC_FLAGS="-gencode arch=compute_75,code=sm_75 -O3" \ -DNUM_GPU=2 -DBLA_VENDOR=OpenBLAS \ -DCMAKE_CXX_COMPILER=g++-8 -DCMAKE_C_COMPILER=gcc-8 \ - -DCMAKE_CXX_FLAGS="-O3" -DCMAKE_C_FLAGS="-O3" .. && \ + -DCMAKE_CXX_FLAGS="-O3 -fopenmp -ffast-math -lmvec -lm" \ + -DCMAKE_C_FLAGS="-O3 -fopenmp -ffast-math -lmvec -lm" .. && \ make -j4 && \ make install RUN ls -alsrt /opt/sagecal && \ From 90bc269d266a1f9a55cac46a073d8949e9ed0172 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Sun, 11 Dec 2022 03:13:56 +0100 Subject: [PATCH 16/19] add openmp guard --- src/lib/Radio/elementbeam.c | 12 +++++++++--- src/lib/Radio/stationbeam.c | 9 +++++++++ src/restore/fft.c | 6 ++++++ src/restore/shapelet_lm.c | 6 ++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/lib/Radio/elementbeam.c b/src/lib/Radio/elementbeam.c index e7c60e7..f948c4b 100644 --- a/src/lib/Radio/elementbeam.c +++ b/src/lib/Radio/elementbeam.c @@ -175,6 +175,7 @@ free_elementcoeffs(elementcoeff ecoeff) { /* generalized Laguerre polynomial L_p^q(x) */ /* for calculating L_{n-|m|/2}^|m| (x) */ +#ifndef _OPENMP static double L_g1(int p, int q, double x) { /* max p: (n-|m|)/2 = n/2 */ @@ -192,7 +193,9 @@ L_g1(int p, int q, double x) { } return L_p; } +#endif /* !_OPENMP */ +#ifdef _OPENMP static double L_g2(int p, double q, double x) { /* max p: (n-|m|)/2 = n/2 */ @@ -210,12 +213,12 @@ L_g2(int p, double q, double x) { } return L_p; } +#endif /* _OPENMP */ - - +#ifndef _OPENMP elementval -eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { +eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ double rb=pow(r/ecoeff->beta,2); /* evaluate e^(-r^2/2beta^2) */ @@ -252,8 +255,10 @@ eval_elementcoeffs0(double r, double theta, elementcoeff *ecoeff) { return eval; } +#endif /* !_OPENMP */ +#ifdef _OPENMP elementval eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { /* evaluate r^2/beta^2 */ @@ -348,6 +353,7 @@ eval_elementcoeffs(double r, double theta, elementcoeff *ecoeff) { return eval; } +#endif /* _OPENMP */ /* Legendre function P(l,m,x) */ static double diff --git a/src/lib/Radio/stationbeam.c b/src/lib/Radio/stationbeam.c index b26876c..3e7cfc5 100644 --- a/src/lib/Radio/stationbeam.c +++ b/src/lib/Radio/stationbeam.c @@ -178,19 +178,28 @@ array_element_beam(double ra, double dec, double ra0, double dec0, double f, dou fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); } + +#ifdef _OPENMP #pragma omp simd +#endif /* _OPENMP */ for (cj=0; cj Date: Mon, 12 Dec 2022 15:32:51 +0100 Subject: [PATCH 17/19] cleanup --- src/lib/Radio/predict_withbeam.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index 6d088c2..eac905d 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -1033,7 +1033,10 @@ visibilities_threadfn_multifreq(void *data) { fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); exit(1); } + /* add up terms together with Ejones multiplication */ +#ifdef _OPENMP #pragma omp simd +#endif for (cn=0; cncarr[cm].N; cn++) { complex double Ph,IIl,QQl,UUl,VVl; Ph=(PHr[cn]+_Complex_I*PHi[cn]); @@ -1046,43 +1049,25 @@ visibilities_threadfn_multifreq(void *data) { CO[2+4*cn]=UUl-_Complex_I*VVl; CO[3+4*cn]=IIl-QQl; } +#ifdef _OPENMP #pragma omp simd +#endif for (cn=0; cncarr[cm].N; cn++) { complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ amb(E1,&CO[4*cn],T1); ambt(T1,E2,&CO[4*cn]); } +#ifdef _OPENMP #pragma omp simd +#endif for (cn=0; cncarr[cm].N; cn++) { C[0]+=CO[0+4*cn]; C[1]+=CO[1+4*cn]; C[2]+=CO[2+4*cn]; C[3]+=CO[3+4*cn]; } - /* add up terms together with Ejones multiplication */ - // for (cn=0; cncarr[cm].N; cn++) { - // complex double *E1=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta1*8]; /* 8 values */ - // complex double *E2=(complex double*)&t->elementbeam[cn*(t->N*8*t->tilesz*t->Nchan)+cf*(t->N*8*t->tilesz)+tslot*t->N*8+sta2*8]; /* 8 values */ - // complex double Ph,IIl,QQl,UUl,VVl; - // Ph=(PHr[cn]+_Complex_I*PHi[cn]); - // IIl=Ph*II[cn]; - // QQl=Ph*QQ[cn]; - // UUl=Ph*UU[cn]; - // VVl=Ph*VV[cn]; - // complex double C0[4]; - // C0[0]=IIl+QQl; - // C0[1]=UUl+_Complex_I*VVl; - // C0[2]=UUl-_Complex_I*VVl; - // C0[3]=IIl-QQl; - // amb(E1,C0,T1); - // ambt(T1,E2,C0); - // C[0]+=C0[0]; - // C[1]+=C0[1]; - // C[2]+=C0[2]; - // C[3]+=C0[3]; - // } - free(CO); + free(CO); } else { /* add up terms together */ for (cn=0; cncarr[cm].N; cn++) { From ac7b4d256590c74935f45b89fbf8d49abee3c89c Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Tue, 13 Dec 2022 10:56:57 +0100 Subject: [PATCH 18/19] split to smaller loops --- src/lib/Radio/predict_withbeam.c | 67 ++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index eac905d..51c0139 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -990,36 +990,11 @@ visibilities_threadfn_multifreq(void *data) { } - /* flux of each source, at each freq */ +#ifdef _OPENMP +#pragma omp simd +#endif for (cn=0; cncarr[cm].N; cn++) { - /* coherencies are NOT scaled by 1/2, with spectral index */ - if (t->carr[cm].spec_idx[cn]!=0.0) { - double fratio=log(freq0/t->carr[cm].f0[cn]); - double fratio1=fratio*fratio; - double fratio2=fratio1*fratio; - double tempfr=t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2; - /* catch -ve and 0 sI */ - if (t->carr[cm].sI0[cn]>0.0) { - II[cn]=exp(log(t->carr[cm].sI0[cn])+tempfr); - } else { - II[cn]=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+tempfr)); - } - if (t->carr[cm].sQ0[cn]>0.0) { - QQ[cn]=exp(log(t->carr[cm].sQ0[cn])+tempfr); - } else { - QQ[cn]=(t->carr[cm].sQ0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sQ0[cn])+tempfr)); - } - if (t->carr[cm].sU0[cn]>0.0) { - UU[cn]=exp(log(t->carr[cm].sU0[cn])+tempfr); - } else { - UU[cn]=(t->carr[cm].sU0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sU0[cn])+tempfr)); - } - if (t->carr[cm].sV0[cn]>0.0) { - VV[cn]=exp(log(t->carr[cm].sV0[cn])+tempfr); - } else { - VV[cn]=(t->carr[cm].sV0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sV0[cn])+tempfr)); - } - } else { + if (t->carr[cm].spec_idx[cn]==0.0) { II[cn]=t->carr[cm].sI[cn]; QQ[cn]=t->carr[cm].sQ[cn]; UU[cn]=t->carr[cm].sU[cn]; @@ -1027,6 +1002,40 @@ visibilities_threadfn_multifreq(void *data) { } } + /* flux of each source, at each freq */ + /* coherencies are NOT scaled by 1/2, with spectral index */ + double *tempfr=0; + if (posix_memalign((void*)&tempfr,sizeof(double),((size_t)t->carr[cm].N*sizeof(double)))!=0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } +#ifdef _OPENMP +#pragma omp simd +#endif + for (cn=0; cncarr[cm].N; cn++) { + if (t->carr[cm].spec_idx[cn]!=0.0) { + double fratio=log(freq0/t->carr[cm].f0[cn]); + tempfr[cn]=(t->carr[cm].spec_idx[cn]+(t->carr[cm].spec_idx1[cn]+t->carr[cm].spec_idx2[cn]*fratio)*fratio)*fratio; + } + } +#ifdef _OPENMP +#pragma omp simd +#endif + for (cn=0; cncarr[cm].N; cn++) { + if (t->carr[cm].spec_idx[cn]!=0.0) { + /* catch -ve and 0 sI */ + II[cn]=t->carr[cm].sI0[cn]>0.0?exp(log(t->carr[cm].sI0[cn])+tempfr[cn]) + :(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+tempfr[cn])); + QQ[cn]=t->carr[cm].sQ0[cn]>0.0?exp(log(t->carr[cm].sQ0[cn])+tempfr[cn]) + :(t->carr[cm].sQ0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sQ0[cn])+tempfr[cn])); + UU[cn]=t->carr[cm].sU0[cn]>0.0?exp(log(t->carr[cm].sU0[cn])+tempfr[cn]) + :(t->carr[cm].sU0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sU0[cn])+tempfr[cn])); + VV[cn]=t->carr[cm].sV0[cn]>0.0?exp(log(t->carr[cm].sV0[cn])+tempfr[cn]) + :(t->carr[cm].sV0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sV0[cn])+tempfr[cn])); + } + } + free(tempfr); + if (t->dobeam==DOBEAM_ELEMENT || t->dobeam==DOBEAM_FULL) { complex double *CO=0; if (posix_memalign((void*)&CO,sizeof(complex double),((size_t)t->carr[cm].N*4*sizeof(complex double)))!=0) { From 159a58a51a161705a0924f3a68c2dd9bef076035 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Tue, 27 Dec 2022 16:48:55 +0100 Subject: [PATCH 19/19] catch tail end of time --- src/MS/data.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/MS/data.cpp b/src/MS/data.cpp index 5d7ec4b..1459917 100644 --- a/src/MS/data.cpp +++ b/src/MS/data.cpp @@ -814,6 +814,12 @@ Data::loadData(Table ti, Data::IOData iodata, LBeam binfo, double *fratio) { iodata.flag[row]=1; } + /* also set time to last valid one */ + if (rowt>0 && rowt