diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
index cef70861f27c1..edf6a1f935d59 100644
--- a/HeterogeneousCore/CUDACore/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -8,7 +8,6 @@
-
@@ -22,6 +21,12 @@
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu b/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu
new file mode 100644
index 0000000000000..c34256e2986be
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu
@@ -0,0 +1,1335 @@
+#include <unistd.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+#include <mpi.h>
+//////////////////////////////////////////// C U D A /////////////////////////////////////////
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+//called from the Host and executed on the Device (GPU)
+__global__ void addVectorsGpu(float *vect1, float *vect2, float *vect3, int size, int taskN) {
+  //blockDim.x gives the number of threads in a block, in the x direction.
+  //gridDim.x gives the number of blocks in a grid, in the x direction.
+  //blockDim.x * gridDim.x gives the number of threads in a grid (in the x direction, in this case).
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = 0; i < taskN; ++i) {
+    for (int j = first; j < size; j += stride) {
+      vect3[j] = vect2[j] + vect1[j];
+    }
+  }
+}  //add two vectors and save the result into the third vector.
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////// Global Variables /////////////////////////////////////
+unsigned int sizeVector = 2000;
+int average = 5;
+int task = 1;
+int partsToRun = 1;
+bool printStander = false;
+bool saveFile = false;
+bool help = false;
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////// Data Structure /////////////////////////////////////
+//Data for the user's choices only
+struct UserChoises {
+  // unsigned int sizeVector;
+  unsigned int sizeVectorBytes;  //vector elements default to float.
+  // unsigned int average;
+  unsigned int extra;
+  // unsigned int task;
+  // unsigned int partsToRun;
+  int root;
+  // int numberProcess;
+  int averageVectorSend;
+  std::vector<int> partsToRunVector;  //vector for the user's choice of parts.
+};
+
+//Data for MPI only
+struct MPIData {
+  int num_procs;
+  int rank;
+
+  std::pair<int, int> workSplit;
+  float *mVect1;         //declare vector 1.
+  float *mVect2;         //declare vector 2.
+  float *mVect3;         //declare vector filled only by the root to collect the results from the workers.
+  float *mVectChecking;  //declare vector to verify the results from each process.
+  float *mVectWorker1;   //declare vector 1 for workers only.
+  float *mVectWorker2;   //declare vector 2 for workers only.
+  float *mVectWorker3;   //declare vector 3 for workers only.
+  std::vector<int> displacement;  //declare vector for selecting the location of each element to be sent.
+  std::vector<int> numberToSend;
+};
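+
+//A worked example of the splitting helpers defined further below (splitProcess,
+//numberDataSend, displacmentData), assuming sizeVector = 11 and num_procs = 3
+//(rank 0 is the root and does no work itself):
+//  workSplit    = {11 / 2, 11 % 2} = {5, 1}
+//  numberToSend = {0, 6, 5}  //rank 1 absorbs the remainder
+//  displacement = {0, 0, 6}  //rank 1 handles elements [0,6), rank 2 handles [6,11)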
+
+//Data for CUDA only
+struct Pointers {
+  float *vect1;  //pointers only for the Host.
+  float *vect2;
+  float *vect3;
+
+  float *dVect1;  //pointers only for the Device.
+  float *dVect2;
+  float *dVect3;
+
+  float *dVect1Extra;  //pointers only for the Device.
+  float *dVect2Extra;
+  float *dVect3Extra;
+};
+
+//Data for time measurements only
+struct Timing {
+  int partChosen;
+  int unitChoice;
+  double inputPreparationRoot[2];     //time points at start and end on the Root side.
+  double inputPreparationHost[2];     //time points at start and end on the Host side.
+  double operationOnDeviceByHost[2];  //time duration on the Device from the Host's perspective.
+
+  double outputPreparationRoot[2];
+  double outputPreparationHost[2];
+
+  std::vector<double> timeInputPreparationRoot;  //save the duration times.
+  std::vector<double> timeInputPreparationHost;
+  std::vector<double> timeOperationOnDeviceByRootHost;
+  std::vector<double> timeOutputPreparationRoot;
+  std::vector<double> timeOutputPreparationHost;
+
+  cudaEvent_t start, stop;              //time points on the Device.
+  float operationOnDeviceByDevice = 0;  //time duration on the Device from the Device's perspective.
+  std::vector<float> operationOnDeviceByDeviceAcc;  //accumulated Device durations from the Device's perspective.
+  std::vector<float> averageResults;  //vector of the averages computed on the Hosts and the Device, sent to the Root.
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+const std::vector<int> chooseFunction(int toInteger);
+std::pair<int, int> splitProcess(int works, int numberOfProcess);
+const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks);
+void setupMPIAndVectors(
+    MPIData &mpiData,
+    UserChoises &user);  //initialize the MPI communicator environment and resize vectors, generating random numbers.
+void setupTime(Timing &timing, UserChoises &user);  //resize the timing vectors.
+void calculateTimeDuration(Timing &timing, int i, int &root);
+void addVectorsHost(float *vect1, float *vect2, float *vect3);
+void cleanBuffer(float *vect);
+bool checkingResultsPrintout(float *vectCpu, float *vectGpu);
+void calculateAverageDeviation(Timing &timing, int averg, int &root);
+bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank);
+
+Timing blockSendPart1(MPIData &mpidata, Timing &timing, Pointers &pointer, UserChoises &user);
+Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
+Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
+
+void printTable(std::vector<Timing> &timing, bool standerDeviationPrint);
+int getNumberofDigits(double number);
+void newLineTitle(int line, const std::string &title);
+void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint);
+bool saveToFile(const std::string &name, const Timing &timing);
+
+void printHelp(void);
+int main(int argc, char *argv[]) {
+  cms::cudatest::requireDevices();
+  int c;  //to get the parameters from the user.
+
+  UserChoises user;  //set up the user's input variables.
+  user.extra = 2;
+  user.root = 0;
+  user.averageVectorSend = 8;
+
+  while ((c = getopt(argc, argv, "s:a:t:p:qfh")) != -1) {
+    switch (c) {
+      case 's':
+        try {
+          sizeVector = std::stoll(optarg, nullptr, 0);
+        } catch (std::exception &err) {
+          std::cout << "\n\tError: the argument must be an integer!";
+          std::cout << "\n\t" << err.what() << std::endl;
+          return 0;
+        }
+        break;
+      case 'a':
+        try {
+          average = std::stoll(optarg, nullptr, 0);
+
+        } catch (std::exception &err) {
+          std::cout << "\n\tError: the argument must be an integer!";
+          std::cout << "\n\t" << err.what() << std::endl;
+          return 0;
+        }
+        break;
+      case 't':
+        try {
+          task = std::stoll(optarg, nullptr, 0);
+          //std::cout << "\nNumber of repeated Task is " << task << std::endl;
+        } catch (std::exception &err) {
+          std::cout << "\n\tError: the argument must be an integer!";
+          std::cout << "\n\t" << err.what() << std::endl;
+          return 0;
+        }
+        break;
+      case 'p':
+        try {
+          partsToRun = std::stoll(optarg, nullptr, 0);
+          user.partsToRunVector = chooseFunction(partsToRun);
+          //std::cout << "\nyou have chosen Part ";
+          for (unsigned int j = 0; j < user.partsToRunVector.size(); ++j) {
+            std::cout << user.partsToRunVector[j] << " ,";
+          }
+          std::cout << "\n";
+        } catch (std::exception &err) {
+          std::cout << "\n\tError: the argument must be an integer!";
+          std::cout << "\n\t" << err.what() << std::endl;
+          return 0;
+        }
+        break;
+      case 'q':
+        printStander = true;
+        break;
+      case 'f':
+        saveFile = true;
+        break;
+      case 'h':
+        help = true;
+        break;
+
+      default:
+        abort();
+    }
+  }
+
+  MPIData mpiData;
+  Timing timing;
+  Timing resetTime;
+  Pointers pointer;
+  timing.unitChoice = 1000000;     //1M
+  resetTime.unitChoice = 1000000;  //1M
+
+  std::vector<Timing> allTiming;
+  allTiming.resize(user.partsToRunVector.size());
+
+  MPI_Init(&argc, &argv);  //initialize the communicator environment.
+
+  if (help) {
+    printHelp();
+    MPI::Finalize();
+    exit(0);
+  }
+  setupMPIAndVectors(mpiData, user);
+
+  setupTime(timing, user);
+  setupTime(resetTime, user);
+
+  for (long unsigned int i = 0; i < user.partsToRunVector.size(); ++i) {
+    if (user.partsToRunVector[i] == 1) {
+      //setupTime(allTiming[i], user);
+      //blockSendPart1(mpiData, allTiming[i], pointer, user);
+      allTiming[i] = blockSendPart1(mpiData, timing, pointer, user);
+      timing = resetTime;
+
+    } else if (user.partsToRunVector[i] == 2) {
+      //setupTime(allTiming[i], user);
+      //blockSendPart2(mpiData, allTiming[i], pointer, user);
+      allTiming[i] = blockSendPart2(mpiData, timing, pointer, user);
+      timing = resetTime;
+
+    } else if (user.partsToRunVector[i] == 3) {
+      allTiming[i] = blockSendPart3(mpiData, timing, pointer, user);
+      timing = resetTime;
+      // } else if (user.partsToRunVector[i] == 4) {
+      //   allTiming[i] = cudaTimePart4(timing, vect, dvect, size);
+
+      // } else if (user.partsToRunVector[i] == 5) {
+      //   allTiming[i] = cudaTimePart5(timing, vect, dvect, size);
+
+    } else {
+      std::cout << "\n\n\tError: the user has not chosen a valid part number!\n";
+      break;
+    }
+  }
+
+  if (!mpiData.rank)
+    printTable(allTiming, printStander);
+
+  MPI::Finalize();
+  return 0;
+}
+const std::vector<int> chooseFunction(int toInteger) {
+  std::vector<int> digits(0, 0);
+  std::vector<int> ERROR(0, 0);
+
+  int digit{1};
+
+  while (toInteger > 0) {
+    digit = toInteger % 10;
+    if (digit > 7) {
+      std::cout << "\n\tError: each part number must be an integer <= 7" << std::endl;
+      return ERROR;
+    }
+    digits.push_back(digit);
+    toInteger /= 10;
+  }
+  std::reverse(digits.begin(), digits.end());
+  return digits;
+}
+
+std::pair<int, int> splitProcess(int works, int numberOfProcess) {
+  std::pair<int, int> Return{0, 0};
+  if (numberOfProcess > 1 && numberOfProcess <= works) {
+    Return.first = works / (numberOfProcess - 1);   //number of elements for each worker process.
+    Return.second = works % (numberOfProcess - 1);  //extra elements to distribute among the first workers.
+  } else {
+    std::cout << "\tError: either no workers were found, or there are more processes than the vector length!!!\n";
+  }
+
+  return Return;
+}
+const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks) {
+  std::vector<int> dataSend(numberOfProcess, splitWorks.first);
+  dataSend[0] = 0;
+  for (int i = 1; i < splitWorks.second + 1; i++)  //neglect the root
+  {
+    dataSend[i] += 1;  //one extra element of work for each of the first workers.
+  }
+  return dataSend;
+}
+const std::vector<int> displacmentData(int numberOfProcess,
+                                       std::pair<int, int> splitWorks,
+                                       const std::vector<int> &numberDataSend) {
+  std::vector<int> displacment(numberOfProcess, splitWorks.first);
+
+  displacment[0] = 0;
+  displacment[1] = 0;  //start here.
+
+  for (int i = 2; i < numberOfProcess; i++)  //neglect the root
+  {
+    displacment[i] = numberDataSend[i - 1] + displacment[i - 1];  //starting offset for each subsequent worker.
+  }
+  return displacment;
+}
+void randomGenerator(float *vect) {
+  std::random_device rand;
+  std::default_random_engine gener(rand());
+  std::uniform_real_distribution<> dis(0., 1.);
+  for (unsigned int i = 0; i < sizeVector; ++i) {
+    vect[i] = dis(gener);
+  }
+}
+void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) {
+  mpiData.num_procs = MPI::COMM_WORLD.Get_size();  //get the total number of processes.
+  mpiData.rank = MPI::COMM_WORLD.Get_rank();       //get each process's rank.
+
+  user.sizeVectorBytes = sizeVector * sizeof(float);  //get the vectors' size in bytes.
+
+  mpiData.mVect1 = (float *)malloc(user.sizeVectorBytes);  //initialize size.
+  mpiData.mVect2 = (float *)malloc(user.sizeVectorBytes);
+  mpiData.mVect3 = (float *)malloc(user.sizeVectorBytes);
+  mpiData.mVectChecking = (float *)malloc(user.sizeVectorBytes);
+
+  //mpiData.mVectWorker1 = (float*) malloc(user.sizeVectorBytes);
+  //mpiData.mVectWorker2 = (float*) malloc(user.sizeVectorBytes);
+  mpiData.mVectWorker3 = (float *)malloc(user.sizeVectorBytes);
+
+  mpiData.workSplit = splitProcess(sizeVector, mpiData.num_procs);
+
+  if (!mpiData.workSplit.first) {
+    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+    exit(-1);
+  }
+
+  mpiData.numberToSend = numberDataSend(mpiData.num_procs, mpiData.workSplit);
+  mpiData.displacement = displacmentData(mpiData.num_procs, mpiData.workSplit, mpiData.numberToSend);
+
+  // mpiData.mVectWorker1.resize(mpiData.numberToSend[mpiData.rank]);  //resize each process's buffer to the data it receives.
+  // mpiData.mVectWorker2.resize(mpiData.numberToSend[mpiData.rank]);
+  mpiData.mVectWorker1 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
+  mpiData.mVectWorker2 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
+
+  if (!mpiData.rank)  //Only for the root
+  {
+    randomGenerator(mpiData.mVect1);  //generate random floating-point numbers in (0,1), only on the root.
+    randomGenerator(mpiData.mVect2);
+    std::cout << "\n\tNumber of Processes " << mpiData.num_procs << std::endl;
+    std::cout << "\tNumber of workSplit First " << mpiData.workSplit.first << std::endl;
+    std::cout << "\tNumber of workSplit Second " << mpiData.workSplit.second << std::endl;
+    std::cout << "\tTotal size of a Buffer " << user.sizeVectorBytes << " B" << std::endl;
+  }
+}
+void setupTime(Timing &timing, UserChoises &user) {
+  //Set up the vectors used for the average and the standard deviation.
+  timing.timeInputPreparationRoot.resize(average + user.extra);  //extra slots for saving the average and the deviation.
+  timing.timeInputPreparationHost.resize(average + user.extra);
+  timing.timeOperationOnDeviceByRootHost.resize(average + user.extra);
+  timing.timeOutputPreparationRoot.resize(average + user.extra);
+  timing.timeOutputPreparationHost.resize(average + user.extra);
+  timing.operationOnDeviceByDeviceAcc.resize(average + user.extra);
+  timing.averageResults.resize(user.averageVectorSend);
+}
+
+void calculateTimeDuration(Timing &timing, int i, int &root) {
+  if (!root) {
+    timing.timeInputPreparationRoot[i] =
+        (timing.inputPreparationRoot[1] - timing.inputPreparationRoot[0]);  //duration in seconds; scaled to microseconds later.
+    timing.timeOperationOnDeviceByRootHost[i] = (timing.outputPreparationRoot[0] - timing.inputPreparationRoot[1]);
+    timing.timeOutputPreparationRoot[i] = (timing.outputPreparationRoot[1] - timing.outputPreparationRoot[0]);
+  } else {
+    timing.timeInputPreparationHost[i] = (timing.inputPreparationHost[1] - timing.inputPreparationHost[0]);
+    timing.timeOperationOnDeviceByRootHost[i] =
+        (timing.operationOnDeviceByHost[1] -
+         timing.operationOnDeviceByHost[0]);  //time taken by the Device operation as seen from the Host.
+    cudaEventElapsedTime(&timing.operationOnDeviceByDevice,
+                         timing.start,
+                         timing.stop);  //get the elapsed time of the Device operation from the Device's perspective.
+    timing.operationOnDeviceByDeviceAcc[i] = (timing.operationOnDeviceByDevice * 1000);
+    timing.timeOutputPreparationHost[i] = (timing.outputPreparationHost[1] - timing.outputPreparationHost[0]);
+  }
+}
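+//Units note (describing the code above, no change in behavior): MPI_Wtime() returns seconds,
+//so the root/host durations stay in seconds here and are scaled to microseconds later in
+//calculateAverageDeviation() via unitChoice (1e6); cudaEventElapsedTime() returns milliseconds,
+//so the device-side duration is multiplied by 1000 above to be in microseconds already.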
+void addVectorsHost(float *vect1, float *vect2, float *vect3) {
+  for (unsigned int i = 0; i < sizeVector; ++i) {
+    vect3[i] = vect2[i] + vect1[i];
+  }
+}
+void cleanBuffer(float *vect) {
+  for (unsigned int i = 0; i < sizeVector; ++i) {
+    vect[i] = 0;
+  }
+}
+bool checkingResultsPrintout(float *vectCpu, float *vectGpu) {
+  float percent{0.0};
+  float totalError{0.0};
+
+  for (unsigned int j = 0; j < sizeVector; j++) {
+    percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100;
+    totalError += percent;
+  }
+  if (totalError) {
+    std::cout << "\n------------------------------------\n";
+    std::cout << "| CpuSum | GpuSum | Error | Error %| ";
+    std::cout << "\n------------------------------------\n";
+    //std::cout.precision(4);
+    for (unsigned int j = 0; j < sizeVector; j++) {
+      percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100;  //recompute the error of this element.
+      std::cout.flags(std::ios::fixed | std::ios::showpoint);
+      std::cout.precision(4);
+      std::cout << "| " << vectCpu[j] << " | " << vectGpu[j] << " | " << vectCpu[j] - vectGpu[j] << " | " << percent
+                << " |\n";
+    }
+    std::cout << "-------------------------------------\n";
+    std::cout << "-Total Error is " << totalError << std::endl;
+    return false;
+  }
+  return true;
+}
+void calculateAverageDeviation(Timing &timing, int averg, int &root) {
+  //Average: slot [averg] accumulates the sum of the measurements, then is divided by their count.
+  for (int i = 0; i < averg; ++i) {
+    if (!root) {
+      timing.timeInputPreparationRoot[averg] += timing.timeInputPreparationRoot[i];
+      timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
+      timing.timeOutputPreparationRoot[averg] += timing.timeOutputPreparationRoot[i];
+    } else {
+      timing.timeInputPreparationHost[averg] += timing.timeInputPreparationHost[i];
+      timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
+      timing.timeOutputPreparationHost[averg] += timing.timeOutputPreparationHost[i];
+      timing.operationOnDeviceByDeviceAcc[averg] += timing.operationOnDeviceByDeviceAcc[i];
+    }
+  }
+  if (!root) {
+    timing.timeInputPreparationRoot[averg] = timing.timeInputPreparationRoot[averg] / averg;
+    timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
+
+    timing.timeOutputPreparationRoot[averg] = timing.timeOutputPreparationRoot[averg] / averg;
+
+  } else {
+    timing.timeInputPreparationHost[averg] = timing.timeInputPreparationHost[averg] / averg;
+
+    timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
+
+    timing.timeOutputPreparationHost[averg] = timing.timeOutputPreparationHost[averg] / averg;
+
+    timing.operationOnDeviceByDeviceAcc[averg] = (double)timing.operationOnDeviceByDeviceAcc[averg] / averg;
+  }
+
+  //Standard deviation: sigma = sqrt(sum((x_i - mean)^2) / N), accumulated in slot [averg + 1].
+  for (int i = 0; i < averg; ++i) {
+    if (!root) {
+      timing.timeInputPreparationRoot[i] -= timing.timeInputPreparationRoot[averg];  //take the difference.
+      timing.timeInputPreparationRoot[i] =
+          timing.timeInputPreparationRoot[i] * timing.timeInputPreparationRoot[i];  //square it.
+      timing.timeInputPreparationRoot[averg + 1] +=
+          timing.timeInputPreparationRoot[i];  //add them together; averg+1 is the location of the deviation.
+
+      timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
+      timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
+      timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
+
+      timing.timeOutputPreparationRoot[i] -= timing.timeOutputPreparationRoot[averg];
+      timing.timeOutputPreparationRoot[i] *= timing.timeOutputPreparationRoot[i];
+      timing.timeOutputPreparationRoot[averg + 1] += timing.timeOutputPreparationRoot[i];
+    } else {
+      timing.timeInputPreparationHost[i] -= timing.timeInputPreparationHost[averg];  //take the difference.
+      timing.timeInputPreparationHost[i] =
+          timing.timeInputPreparationHost[i] * timing.timeInputPreparationHost[i];  //square it.
+      timing.timeInputPreparationHost[averg + 1] +=
+          timing.timeInputPreparationHost[i];  //add them together; averg+1 is the location of the deviation.
+
+      timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
+      timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
+      timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
+
+      timing.timeOutputPreparationHost[i] -= timing.timeOutputPreparationHost[averg];
+      timing.timeOutputPreparationHost[i] *= timing.timeOutputPreparationHost[i];
+      timing.timeOutputPreparationHost[averg + 1] += timing.timeOutputPreparationHost[i];
+
+      timing.operationOnDeviceByDeviceAcc[i] -= timing.operationOnDeviceByDeviceAcc[averg];
+      timing.operationOnDeviceByDeviceAcc[i] *= timing.operationOnDeviceByDeviceAcc[i];
+      timing.operationOnDeviceByDeviceAcc[averg + 1] += timing.operationOnDeviceByDeviceAcc[i];
+    }
+  }
+
+  if (!root) {
+    timing.timeInputPreparationRoot[averg + 1] = timing.timeInputPreparationRoot[averg + 1] / averg;
+    timing.timeInputPreparationRoot[averg + 1] = sqrt(timing.timeInputPreparationRoot[averg + 1]);
+
+    timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
+    timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
+
+    timing.timeOutputPreparationRoot[averg + 1] = timing.timeOutputPreparationRoot[averg + 1] / averg;
+    timing.timeOutputPreparationRoot[averg + 1] = sqrt(timing.timeOutputPreparationRoot[averg + 1]);
+
+  } else {
+    timing.timeInputPreparationHost[averg + 1] = timing.timeInputPreparationHost[averg + 1] / averg;  //*1000000
+    timing.timeInputPreparationHost[averg + 1] = sqrt(timing.timeInputPreparationHost[averg + 1]);
+
+    timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
+    timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
+
+    timing.timeOutputPreparationHost[averg + 1] = timing.timeOutputPreparationHost[averg + 1] / averg;
+    timing.timeOutputPreparationHost[averg + 1] = sqrt(timing.timeOutputPreparationHost[averg + 1]);
+
+    timing.operationOnDeviceByDeviceAcc[averg + 1] = (double)timing.operationOnDeviceByDeviceAcc[averg + 1] / averg;
+    timing.operationOnDeviceByDeviceAcc[averg + 1] = sqrt(timing.operationOnDeviceByDeviceAcc[averg + 1]);
+  }
+
+  //Convert the root/host results from seconds to the chosen unit (microseconds).
+  if (!root) {
+    timing.timeInputPreparationRoot[averg] *= timing.unitChoice;
+    timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
+    timing.timeOutputPreparationRoot[averg] *= timing.unitChoice;
+
+    timing.timeInputPreparationRoot[averg + 1] *= timing.unitChoice;
+    timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
+    timing.timeOutputPreparationRoot[averg + 1] *= timing.unitChoice;
+  } else {
+    timing.timeInputPreparationHost[averg] *= timing.unitChoice;
+    timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
+    timing.timeOutputPreparationHost[averg] *= timing.unitChoice;
+
+    timing.timeInputPreparationHost[averg + 1] *= timing.unitChoice;
+    timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
+    timing.timeOutputPreparationHost[averg + 1] *= timing.unitChoice;
+  }
+}
+
+bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) {
+  if (rank) {
+    timing.averageResults[0] = timing.timeInputPreparationHost[average];
+    timing.averageResults[1] = timing.timeInputPreparationHost[average + 1];  //standard deviation.
+
+    timing.averageResults[2] = timing.timeOperationOnDeviceByRootHost[average];
+    timing.averageResults[3] = timing.timeOperationOnDeviceByRootHost[average + 1];
+
+    timing.averageResults[4] = timing.timeOutputPreparationHost[average];
+    timing.averageResults[5] = timing.timeOutputPreparationHost[average + 1];
+
+    timing.averageResults[6] = timing.operationOnDeviceByDeviceAcc[average];
+    timing.averageResults[7] = timing.operationOnDeviceByDeviceAcc[average + 1];
+
+    MPI_Send(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, user.root, 0, MPI_COMM_WORLD);
+
+  } else if (!rank) {
+    MPI_Recv(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+  }
+  return true;
+}
+
+Timing blockSendPart1(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
+  cleanBuffer(mpiData.mVectWorker3);  //clear each of the vector's elements.
+  timing.partChosen = 1;
+
+  if (mpiData.rank)  //Only for Workers
+  {
+    cudaCheck(cudaMalloc((void **)&pointer.dVect1,
+                         user.sizeVectorBytes));  //allocate memory for the vector in the global memory of the Device.
+    cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
+    cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
+  }
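+  //The three variants benchmarked by this test (a summary of the code, with Parts 2 and 3
+  //defined further below): Part 1 stages the data in pageable host memory (malloc) and uses
+  //cudaMemcpy; Part 2 stages it in pinned host memory (cudaMallocHost); Part 3 hands device
+  //pointers directly to MPI_Recv/MPI_Send, which assumes a CUDA-aware MPI implementation.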
+  ///////////////////////////// Start of Average ////////////////////////
+  for (int a = 0; a <= average; ++a) {
+    if (!mpiData.rank)  //Only for the root
+    {
+      ////////////////////////////////// Input Preparation for Root //////////////////////////////////
+      timing.inputPreparationRoot[0] = MPI_Wtime();
+      for (int i = 1; i < mpiData.num_procs; ++i) {
+        MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD);  //Tag is 0
+        MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
+      }
+      timing.inputPreparationRoot[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (mpiData.rank)  //Only for Workers
+    {
+      ////////////////////////////////// Input Preparation for Host //////////////////////////////////
+      MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.inputPreparationHost[0] = MPI_Wtime();
+      MPI_Recv(&mpiData.mVectWorker1[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+      MPI_Recv(&mpiData.mVectWorker2[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+
+      cudaCheck(cudaMemcpy(pointer.dVect1,
+                           mpiData.mVectWorker1,
+                           user.sizeVectorBytes,
+                           cudaMemcpyHostToDevice));  //copy the random vector from the host to the device.
+      cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
+
+      timing.inputPreparationHost[1] = MPI_Wtime();
+      ///////////////////////////////////////////////////////////////////////////////////////
+
+      cudaCheck(cudaEventCreate(&timing.start));  //initialize the Events.
+      cudaCheck(cudaEventCreate(&timing.stop));
+
+      ///////////////////////////// Operation on Device with respect to Host //////////////////
+
+      int threads = 512;                                  //arbitrary number.
+      int blocks = (sizeVector + threads - 1) / threads;  //get the ceiling of the number of blocks.
+      blocks = std::min(blocks, 8);  //cap at 8 blocks, a count that even the lowest-end NVIDIA GPUs can provide.
+
+      ////////////////////////// Call Device Kernel //////////////////////////////////
+      cudaCheck(cudaEventRecord(timing.start));
+      timing.operationOnDeviceByHost[0] = MPI_Wtime();
+
+      addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
+                                         pointer.dVect2,
+                                         pointer.dVect3,
+                                         sizeVector,
+                                         task);  //call the device kernel to add the two vectors into vect3Gpu.
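+      //Launch-geometry example (derived from the defaults above, not a behavior change): with
+      //sizeVector = 2000 and threads = 512, blocks = (2000 + 511) / 512 = 4, so 2048 threads
+      //cover all 2000 elements; for larger vectors blocks is capped at 8 and the grid-stride
+      //loop in addVectorsGpu lets each thread process several elements.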
+
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaDeviceSynchronize());
+      cudaCheck(cudaEventRecord(timing.stop));
+
+      timing.operationOnDeviceByHost[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////
+
+      /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
+      timing.outputPreparationHost[0] = MPI_Wtime();
+      cudaCheck(cudaMemcpy(
+          mpiData.mVectWorker3,
+          pointer.dVect3,
+          user.sizeVectorBytes,
+          cudaMemcpyDeviceToHost));  //copy the result vector of the sum from the Device to the Host.
+
+      MPI_Send(&mpiData.mVectWorker3[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD);  //Tag is 0
+      timing.outputPreparationHost[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (!mpiData.rank)  //Only for the root
+    {
+      /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
+      MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.outputPreparationRoot[0] = MPI_Wtime();
+      //MPI probe
+      for (int i = 1; i < mpiData.num_procs; i++) {
+        MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+      }
+      timing.outputPreparationRoot[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (a > 0)
+      calculateTimeDuration(timing, a - 1, mpiData.rank);
+
+    if (mpiData.rank) {
+      cudaCheck(cudaEventDestroy(timing.start));
+      cudaCheck(cudaEventDestroy(timing.stop));
+    }
+  }
+  ///////////////////////////// End of Average ////////////////////////
+  if (mpiData.rank) {
+    cudaCheck(cudaFree(pointer.dVect1));
+    cudaCheck(cudaFree(pointer.dVect2));
+    cudaCheck(cudaFree(pointer.dVect3));
+  }
+  ///
+  bool test = 0;
+  if (!mpiData.rank)  //Only for the root
+  {
+    addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //the Host adds the vectors too.
+    test = checkingResultsPrintout(mpiData.mVectChecking,
+                                   mpiData.mVectWorker3);  //check the results; on error, print them for the user.
+    if (!test)
+      exit(-1);
+  }
+
+  calculateAverageDeviation(timing, average, mpiData.rank);
+  test = sendAverageToRoot(timing, user, mpiData.rank);
+  if (test && !mpiData.rank) {
+    if (saveFile) {
+      test = saveToFile("dataPart1", timing);
+
+      if (test)
+        std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
+      else
+        std::cout << "Error Saving File!!" << std::endl;
+    }
+    std::cout << "Done Part " << timing.partChosen << std::endl;
+  }
+  return timing;
+}
+
+Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
+  cleanBuffer(mpiData.mVectWorker3);  //clear each of the vector's elements.
+  timing.partChosen = 2;
+
+  if (mpiData.rank)  //Only for Workers
+  {
+    cudaCheck(cudaMallocHost((void **)&pointer.vect1, user.sizeVectorBytes));  //allocate pinned memory on the Host.
+    cudaCheck(cudaMallocHost((void **)&pointer.vect2, user.sizeVectorBytes));
+    cudaCheck(cudaMallocHost((void **)&pointer.vect3, user.sizeVectorBytes));
+    cudaCheck(cudaMalloc((void **)&pointer.dVect1,
+                         user.sizeVectorBytes));  //allocate memory for the vector in the global memory of the Device.
+    cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
+    cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
+  }
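+  //Pinned-memory note (general CUDA behavior): cudaMallocHost returns page-locked host memory
+  //that the device can reach by DMA, so the cudaMemcpy calls below are typically faster than
+  //Part 1's copies from pageable malloc'd buffers; this difference is what Part 2 measures.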
+  ///////////////////////////// Start of Average ////////////////////////
+  for (int a = 0; a <= average; ++a) {
+    if (!mpiData.rank)  //Only for the root
+    {
+      ////////////////////////////////// Input Preparation for Root //////////////////////////////////
+      timing.inputPreparationRoot[0] = MPI_Wtime();
+      for (int i = 1; i < mpiData.num_procs; ++i) {
+        MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD);  //Tag is 0
+        MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
+      }
+      timing.inputPreparationRoot[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (mpiData.rank)  //Only for Workers
+    {
+      ////////////////////////////////// Input Preparation for Host //////////////////////////////////
+      MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.inputPreparationHost[0] = MPI_Wtime();
+
+      MPI_Recv(&pointer.vect1[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+      MPI_Recv(&pointer.vect2[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+
+      cudaCheck(cudaMemcpy(pointer.dVect1,
+                           pointer.vect1,
+                           user.sizeVectorBytes,
+                           cudaMemcpyHostToDevice));  //copy the random vector from the host to the device.
+      cudaCheck(cudaMemcpy(pointer.dVect2, pointer.vect2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
+
+      timing.inputPreparationHost[1] = MPI_Wtime();
+      ///////////////////////////////////////////////////////////////////////////////////////
+
+      cudaCheck(cudaEventCreate(&timing.start));  //initialize the Events.
+      cudaCheck(cudaEventCreate(&timing.stop));
+
+      ///////////////////////////// Operation on Device with respect to Host //////////////////
+
+      int threads = 512;                                  //arbitrary number.
+      int blocks = (sizeVector + threads - 1) / threads;  //get the ceiling of the number of blocks.
+      blocks = std::min(blocks, 8);  //cap at 8 blocks, a count that even the lowest-end NVIDIA GPUs can provide.
+
+      ////////////////////////// Call Device Kernel //////////////////////////////////
+      cudaCheck(cudaEventRecord(timing.start));
+      timing.operationOnDeviceByHost[0] = MPI_Wtime();
+
+      addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
+                                         pointer.dVect2,
+                                         pointer.dVect3,
+                                         sizeVector,
+                                         task);  //call the device kernel to add the two vectors into vect3Gpu.
+
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaDeviceSynchronize());
+      cudaCheck(cudaEventRecord(timing.stop));
+
+      timing.operationOnDeviceByHost[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////
+
+      /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
+      timing.outputPreparationHost[0] = MPI_Wtime();
+
+      cudaCheck(cudaMemcpy(
+          pointer.vect3,
+          pointer.dVect3,
+          user.sizeVectorBytes,
+          cudaMemcpyDeviceToHost));  //copy the result vector of the sum from the Device to the Host.
+
+      MPI_Send(&pointer.vect3[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD);  //Tag is 0
+
+      timing.outputPreparationHost[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (!mpiData.rank)  //Only for the root
+    {
+      /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
+      MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.outputPreparationRoot[0] = MPI_Wtime();
+      //MPI probe
+      for (int i = 1; i < mpiData.num_procs; i++) {
+        MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+      }
+      timing.outputPreparationRoot[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (a > 0)
+      calculateTimeDuration(timing, a - 1, mpiData.rank);
+
+    if (mpiData.rank) {
+      cudaCheck(cudaEventDestroy(timing.start));
+      cudaCheck(cudaEventDestroy(timing.stop));
+    }
+  }
+  ///////////////////////////// End of Average ////////////////////////
+  if (mpiData.rank) {
+    cudaCheck(cudaFreeHost(pointer.vect1));
+    cudaCheck(cudaFreeHost(pointer.vect2));
+    cudaCheck(cudaFreeHost(pointer.vect3));
+    cudaCheck(cudaFree(pointer.dVect1));
+    cudaCheck(cudaFree(pointer.dVect2));
+    cudaCheck(cudaFree(pointer.dVect3));
+  }
+
+  bool test = 0;
+  if (!mpiData.rank)  //Only for the root
+  {
+    addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //the Host adds the vectors too.
+    test = checkingResultsPrintout(mpiData.mVectChecking,
+                                   mpiData.mVectWorker3);  //check the results; on error, print them for the user.
+    if (!test)
+      exit(-1);
+  }
+
+  calculateAverageDeviation(timing, average, mpiData.rank);
+  test = sendAverageToRoot(timing, user, mpiData.rank);
+  if (test && !mpiData.rank) {
+    if (saveFile) {
+      test = saveToFile("dataPart2", timing);
+
+      if (test)
+        std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
+      else
+        std::cout << "Error Saving File!!" << std::endl;
+    }
+    std::cout << "Done Part " << timing.partChosen << std::endl;
+  }
+  return timing;
+}
+
+Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
+  cleanBuffer(mpiData.mVectWorker3);  //clear each of the vector's elements.
+  timing.partChosen = 3;
+
+  if (mpiData.rank)  //Only for Workers
+  {
+    cudaCheck(cudaMalloc((void **)&pointer.dVect1,
+                         user.sizeVectorBytes));  //allocate memory for the vector in the global memory of the Device.
+    cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
+    cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
+  }
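+  //CUDA-aware MPI note (an assumption about the environment, not checked by the code): the
+  //MPI_Recv/MPI_Send calls below operate directly on device pointers (dVect1/dVect2/dVect3),
+  //which requires a CUDA-aware MPI build, e.g. OpenMPI with UCX as in the "-mca pml ucx"
+  //example shown by printHelp().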
+  ///////////////////////////// Start of Average ////////////////////////
+  for (int a = 0; a <= average; ++a) {
+    if (!mpiData.rank)  //Only for the root
+    {
+      ////////////////////////////////// Input Preparation for Root //////////////////////////////////
+      timing.inputPreparationRoot[0] = MPI_Wtime();
+      for (int i = 1; i < mpiData.num_procs; ++i) {
+        MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD);  //Tag is 0
+        MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
+      }
+      timing.inputPreparationRoot[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (mpiData.rank)  //Only for Workers
+    {
+      ////////////////////////////////// Input Preparation for Host //////////////////////////////////
+      MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.inputPreparationHost[0] = MPI_Wtime();
+      MPI_Recv(&pointer.dVect1[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+      MPI_Recv(&pointer.dVect2[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD,
+               MPI_STATUS_IGNORE);
+
+      // cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice));  //copy the random vector from the host to the device.
+      // cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
+
+      timing.inputPreparationHost[1] = MPI_Wtime();
+      ///////////////////////////////////////////////////////////////////////////////////////
+
+      cudaCheck(cudaEventCreate(&timing.start));  //initialize the Events.
+      cudaCheck(cudaEventCreate(&timing.stop));
+
+      ///////////////////////////// Operation on Device with respect to Host //////////////////
+
+      int threads = 512;                                  //arbitrary number.
+      int blocks = (sizeVector + threads - 1) / threads;  //get the ceiling of the number of blocks.
+      blocks = std::min(blocks, 8);  //cap at 8 blocks, a count that even the lowest-end NVIDIA GPUs can provide.
+
+      ////////////////////////// Call Device Kernel //////////////////////////////////
+      cudaCheck(cudaEventRecord(timing.start));
+      timing.operationOnDeviceByHost[0] = MPI_Wtime();
+
+      addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
+                                         pointer.dVect2,
+                                         pointer.dVect3,
+                                         sizeVector,
+                                         task);  //call the device kernel to add the two vectors into vect3Gpu.
+
+      cudaCheck(cudaGetLastError());
+      cudaCheck(cudaDeviceSynchronize());
+      cudaCheck(cudaEventRecord(timing.stop));
+
+      timing.operationOnDeviceByHost[1] = MPI_Wtime();
+      /////////////////////////////////////////////////////////////////////////////////////////////
+
+      /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
+      timing.outputPreparationHost[0] = MPI_Wtime();
+      //cudaCheck(cudaMemcpy(mpiData.mVectWorker3, pointer.dVect3, user.sizeVectorBytes, cudaMemcpyDeviceToHost));  //copy the result vector of the sum from the Device to the Host.
+
+      MPI_Send(&pointer.dVect3[0],
+               mpiData.numberToSend[mpiData.rank],
+               MPI_FLOAT,
+               user.root,
+               0,
+               MPI_COMM_WORLD);  //Tag is 0
+      timing.outputPreparationHost[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (!mpiData.rank)  //Only for the root
+    {
+      /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
+      MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+      timing.outputPreparationRoot[0] = MPI_Wtime();
+      //MPI probe
+      for (int i = 1; i < mpiData.num_procs; i++) {
+        MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
+                 mpiData.numberToSend[i],
+                 MPI_FLOAT,
+                 i,
+                 0,
+                 MPI_COMM_WORLD,
+                 MPI_STATUS_IGNORE);
+      }
+      timing.outputPreparationRoot[1] = MPI_Wtime();
+      ////////////////////////////////////////////////////////////////////////////////////////////////
+    }
+
+    if (a > 0)
+      calculateTimeDuration(timing, a - 1, mpiData.rank);
+
+    if (mpiData.rank) {
+      cudaCheck(cudaEventDestroy(timing.start));
+      cudaCheck(cudaEventDestroy(timing.stop));
+    }
+  }
+  ///////////////////////////// End of Average ////////////////////////
+  if (mpiData.rank) {
+    cudaCheck(cudaFree(pointer.dVect1));
+    cudaCheck(cudaFree(pointer.dVect2));
+    cudaCheck(cudaFree(pointer.dVect3));
+  }
+  ///
+  bool test = 0;
+  if (!mpiData.rank)  //Only for the root
+  {
+    addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //the Host adds the vectors too.
+    test = checkingResultsPrintout(mpiData.mVectChecking,
+                                   mpiData.mVectWorker3);  //check the results; on error, print them for the user.
+    if (!test)
+      exit(-1);
+  }
+
+  calculateAverageDeviation(timing, average, mpiData.rank);
+  test = sendAverageToRoot(timing, user, mpiData.rank);
+  if (test && !mpiData.rank) {
+    if (saveFile) {
+      test = saveToFile("dataPart3", timing);
+
+      if (test)
+        std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
+      else
+        std::cout << "Error Saving File!!" << std::endl;
+    }
+    std::cout << "Done Part " << timing.partChosen << std::endl;
+  }
+  return timing;
+}
+
+void printTable(std::vector<Timing> &timing, bool standerDeviationPrint) {
+  const std::string inPrepatRoot = " Duration Time Read Input Preparations On Root ";
+  const std::string inPrepatHost = " Duration Time Read Input Preparations On Host ";
+  const std::string timeCpuR = " Duration Time operation on Root point View ";
+  const std::string timeCpu = " Duration Time operation on Host point View ";
+  const std::string timeGpu = " Duration Time operation on Device point View";
+  const std::string outPrepatRoot = " Duration Time Read Output Preparations On Root";
+  const std::string outPrepatHost = " Duration Time Read Output Preparations On Host";
+
+  const std::string averageTime = " AverTime ";
+  const std::string standerDeviation = " StDeviation ";
+  const std::string nameTiming = " Name Timing ";
+  const std::string partsNumberall = "Part ";
+
+  int totalFix = 0;
+
+  if (standerDeviationPrint) {
+    totalFix = timeGpu.size() + timing.size() * (averageTime.size() + standerDeviation.size() + 3);
+  } else {
+    totalFix = timeGpu.size() + timing.size() * (averageTime.size() + 3);
+  }
+
+  std::cout.flags(std::ios::fixed | std::ios::showpoint);
+  std::cout.precision(4);
+
+  std::cout << '\n';
+  std::cout.width(totalFix);
+  std::cout.fill('-');
+  std::cout << '-' << '\n';
+  std::cout.fill(' ');
+
+  std::cout << "|";
+  std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
+  std::cout.fill(' ');
+  std::cout << " ";
+  std::cout << nameTiming;
+  std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
+  std::cout.fill(' ');
+  std::cout << " ";
+  std::cout << " |";
+
+  for (unsigned int i = 0; i < timing.size(); ++i) {
+    if (standerDeviationPrint) {
+      std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
+    }  //9
+    else {
+      std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
+    }  //2
+
+    std::cout << " ";
+    std::cout << partsNumberall << timing[i].partChosen;
+
+    if (standerDeviationPrint) {
+      std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
+    }  //9
+    else {
+      std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
+    }
+    //2
+    std::cout << " ";
+    std::cout << "|";
+  }
+
+  std::cout << '\n';
+  std::cout << "|";
+  std::cout.width(inPrepatHost.size() + 3);
+  std::cout.fill(' ');
+  std::cout << "|";
+
+  for (unsigned int i = 0; i < timing.size(); ++i) {
+    std::cout << averageTime;
+    std::cout << "|";
+    if (standerDeviationPrint) {
+      std::cout << standerDeviation;
+      std::cout << "|";
+    }
+  }
+
+  newLineTitle(totalFix, inPrepatRoot);
+  printResultEach(timing, 1, standerDeviationPrint);
+
+  newLineTitle(totalFix, inPrepatHost);
+  printResultEach(timing, 2, standerDeviationPrint);
+
+  newLineTitle(totalFix, timeCpuR);
+  printResultEach(timing, 3, standerDeviationPrint);
+
+  newLineTitle(totalFix, timeCpu);
+  printResultEach(timing, 4, standerDeviationPrint);
+
+  newLineTitle(totalFix, timeGpu);
+  printResultEach(timing, 5, standerDeviationPrint);
+
+  newLineTitle(totalFix, outPrepatRoot);
+  printResultEach(timing, 6, standerDeviationPrint);
+
+  newLineTitle(totalFix, outPrepatHost);
+  printResultEach(timing, 7, standerDeviationPrint);
+
+  std::cout << '\n';
+  std::cout.width(totalFix);
+  std::cout.fill('-');
+  std::cout << '-' << '\n';
+  std::cout.fill(' ');
+}
+int getNumberofDigits(double number) { return ((int)log10(number) + 1) + 4; }
+void newLineTitle(int line, const std::string &title) {
+  std::cout << '\n';
+  std::cout.width(line);
+  std::cout.fill('-');
+  std::cout << '-' << '\n';
+  std::cout.fill(' ');
+
+  std::cout << "| ";
+  std::cout << title;
+  std::cout << " |";
+}
+void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint) {
+  int averageTimeWidth = 10;
+  int standerDeviationWidth = 13;
+
+  for (unsigned int i = 0; i < timing.size(); ++i) {
+    if (type == 1) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].timeInputPreparationRoot[average];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].timeInputPreparationRoot[average + 1];
+        std::cout << "|";
+      }
+    } else if (type == 2) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].averageResults[0];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].averageResults[1];
+        std::cout << "|";
+      }
+    } else if (type == 3) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].timeOperationOnDeviceByRootHost[average];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].timeOperationOnDeviceByRootHost[average + 1];
+        std::cout << "|";
+      }
+    } else if (type == 4) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].averageResults[2];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].averageResults[3];
+        std::cout << "|";
+      }
+    } else if (type == 5) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].averageResults[6];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].averageResults[7];
+        std::cout << "|";
+      }
+    } else if (type == 6) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].timeOutputPreparationRoot[average];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].timeOutputPreparationRoot[average + 1];
+        std::cout << "|";
+      }
+    } else if (type == 7) {
+      std::cout.width(averageTimeWidth);
+      std::cout.fill(' ');
+      std::cout << timing[i].averageResults[4];
+      std::cout << "|";
+      if (standerDeviationPrint) {
+        std::cout.width(standerDeviationWidth);
+        std::cout.fill(' ');
+        std::cout << timing[i].averageResults[5];
+        std::cout << "|";
+      }
+    }
+  }
+}
+bool saveToFile(const std::string &name, const Timing &timing) {
+  std::ofstream file(name + ".txt", std::ios::out | std::ios::app);
+
+  if (!file.is_open()) {
+    std::cout << "\nCannot open or create the file!" << std::endl;
+    return 0;
+  }
+
+  file << sizeVector << std::endl;
+  file << average << std::endl;
+  file << task << std::endl;
+  file << timing.timeInputPreparationRoot[average] << " " << timing.timeInputPreparationRoot[average + 1] << std::endl;
+  file << timing.averageResults[0] << " " << timing.averageResults[1] << std::endl;
+  file << timing.timeOperationOnDeviceByRootHost[average] << " " << timing.timeOperationOnDeviceByRootHost[average + 1]
+       << std::endl;
+  file << timing.averageResults[2] << " " << timing.averageResults[3] << std::endl;
+  file << timing.averageResults[6] << " " << timing.averageResults[7] << std::endl;
+  file << timing.timeOutputPreparationRoot[average] << " " << timing.timeOutputPreparationRoot[average + 1]
+       << std::endl;
+  file << timing.averageResults[4] << " " << timing.averageResults[5] << std::endl;
+
+  file.close();
+  if (!file.good()) {
+    std::cout << "\n*ERROR while writing the " + name + " file!!" << std::endl;
+    return 0;
+  }
+  return 1;
+}
+void printHelp(void) {
+  int rank = MPI::COMM_WORLD.Get_rank();
+  if (!rank) {
+    std::cout << "\n\n\t**************************************\n";
+    std::cout << "\t* This is a Help for Command Options *";
+    std::cout << "\n\t**************************************\n";
+    std::cout << "\n\tYou, as a user, can choose one of two ways to run the program:\n";
+    std::cout << "\n\t1) mpirun -np <processes> mpiCudaGeneric -s <size> -t <tasks> -a <average> "
+                 "-p <parts>\n";
+    std::cout << "\n\t2) cmsenv_mpirun -np <processes> mpiCudaGeneric -s <size> -t <tasks> -a <average> -p <parts>\n";
+    std::cout << "\n\t[-np] is the number of processes or processors that you would like to run.";
+    std::cout
+        << "\n\t[-s] is the size of the vectors that you would like to send; the type is float and there are two vectors.";
+    std::cout << "\n\t[-t] is the number of repetitions of the task on the Device (GPU) side.";
+    std::cout << "\n\t[-a] is the number of repetitions of the part that the user has chosen.";
+    std::cout << "\n\t[-p] is the choice of which parts of the program to run.";
+    std::cout << "\n\t[-q] is to print the Standard Deviation.";
+    std::cout << "\n\t[-f] is to save the results into a file for each part.";
+    std::cout << "\n\n\tExample for the local machine only: ";
+    std::cout << "\n\tcmsenv_mpirun -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1\n";
+    std::cout << "\n\tExample for two connected machines: ";
+    std::cout
+        << "\n\tcmsenv_mpirun -H <hostname1>,<hostname2> -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1";
+    std::cout << "\n\tExample for two connected machines using ucx: ";
+    std::cout << "\n\tcmsenv_mpirun -H <hostname1>,<hostname2> -np 2 -mca pml ucx -- "
+                 "mpiCudaGeneric -p1 -s200 -t1 -a1";
+    std::cout << "\n\n\tFor the parts, this program has 3 parts:";
+    std::cout << "\n\t1) The Root, which does not have a GPU, uses MPI blocking send and receive to reach the Host; the "
+                 "Host, which has a GPU, then:";
+    std::cout << "\n\t uses cudaMalloc and copies the received values to the GPU side. Next, the GPU does the computation.";
+    std::cout << "\n\t Finally, the Host copies the results back from the GPU and sends them to the Root using an MPI "
+                 "blocking send.\n\n";
+  }
+}
\ No newline at end of file