diff --git a/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp new file mode 100644 index 000000000000..19ea89d94b4f --- /dev/null +++ b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp @@ -0,0 +1,151 @@ +// +// GRADGRADStandardAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 7/2/21. +// + +#ifndef Intrepid2_GRADGRADStandardAssembly_hpp +#define Intrepid2_GRADGRADStandardAssembly_hpp + +#include "JacobianFlopEstimate.hpp" + +/** \file GRADGRADStandardAssembly.hpp + \brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure. + */ + +//! Version that uses the classic, generic Intrepid2 paths. +template +Intrepid2::ScalarView performStandardQuadratureGRADGRAD(Intrepid2::CellGeometry &geometry, + const int &polyOrder, int worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using ExecutionSpace = typename DeviceType::execution_space; + int numVertices = 1; + for (int d=0; dstart(); + + using CellTools = Intrepid2::CellTools; + using FunctionSpaceTools = Intrepid2::FunctionSpaceTools; + + using namespace Intrepid2; + + using namespace std; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HGRAD; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + if (worksetSize > numCells) worksetSize = numCells; + + // local stiffness matrices: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + int numPoints = cubature->getNumPoints(); + ScalarView cubaturePoints("cubature points",numPoints,spaceDim); + ScalarView cubatureWeights("cubature weights", numPoints); + + cubature->getCubature(cubaturePoints, cubatureWeights); + + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + // Allocate some intermediate containers + ScalarView basisValues ("basis values", numFields, numPoints ); + ScalarView basisGradValues("basis grad values", numFields, numPoints, spaceDim); + + ScalarView transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim); + ScalarView transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim); + + basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE ); + basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD ); + + const int numNodesPerCell = geometry.numNodesPerCell(); + ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); + Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), + KOKKOS_LAMBDA (const int &cellOrdinal) { + for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); + ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); + ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); + ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); + + initialSetupTimer->stop(); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + int cellOffset = 0; + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + + std::pair cellRange = {startCell, startCell+numCellsInWorkset}; + auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + if (numCellsInWorkset != worksetSize) + { + Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); + Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); + Kokkos::resize(transformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); + } + jacobianAndCellMeasureTimer->start(); + CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. + CellTools::setJacobianInv(jacobianInverse, jacobian); + CellTools::setJacobianDet(jacobianDeterminant, jacobian); + + FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. + fstIntegrateCall->start(); + FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction. + FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each. + + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction. + + cellOffset += worksetSize; + } +// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; + return cellStiffness; +} + +#endif /* GRADGRADStandardAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp new file mode 100644 index 000000000000..7ca60a2b5fd2 --- /dev/null +++ b/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp @@ -0,0 +1,153 @@ +// +// GRADGRADStructuredAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 7/2/21. +// + +#ifndef GRADGRADStructuredAssembly_h +#define GRADGRADStructuredAssembly_h + +#include "JacobianFlopEstimate.hpp" + +/** \file GRADGRADStructuredAssembly.hpp + \brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry. + */ + +//! Version that takes advantage of new structured integration support, including sum factorization. +template +Intrepid2::ScalarView performStructuredQuadratureGRADGRAD(Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + using namespace std; + using FunctionSpaceTools = FunctionSpaceTools; + using IntegrationTools = IntegrationTools; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HGRAD; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + // local stiffness matrix: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + auto tensorCubatureWeights = cubature->allocateCubatureWeights(); + TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); + + cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); + + EOperator op = OPERATOR_GRAD; + BasisValues gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op); + basis->getValues(gradientValues, tensorCubaturePoints, op); + + // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell + + int cellOffset = 0; + + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + + Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); + Data jacobianDet = CellTools::allocateJacobianDet(jacobian); + Data jacobianInv = CellTools::allocateJacobianInv(jacobian); + TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights); + + // lazily-evaluated transformed gradient values (temporary to allow integralData allocation) + auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues); + auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp); + + const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases + + // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + + initialSetupTimer->stop(); + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + int endCell = numCellsInWorkset + startCell; + + jacobianAndCellMeasureTimer->start(); + if (numCellsInWorkset != worksetSize) + { + const int CELL_DIM = 0; // first dimension corresponds to cell + jacobian.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); + jacobianInv.setExtent( CELL_DIM, numCellsInWorkset); + integralData.setExtent(CELL_DIM, numCellsInWorkset); + + // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… + cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); + } + + geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); + CellTools::setJacobianDet(jacobianDet, jacobian); + CellTools::setJacobianInv(jacobianInv, jacobian); + + // lazily-evaluated transformed gradient values: + auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues); + + geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + bool sumInto = false; + double approximateFlopCountIntegrateWorkset = 0; + fstIntegrateCall->start(); + IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) + if (integralData.getUnderlyingViewRank() == 3) + { + std::pair cellRange = {startCell, endCell}; + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + } + else // underlying view rank is 2; copy to each cell in destination stiffness matrix + { + auto integralView2 = integralData.getUnderlyingView2(); + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); + Kokkos::parallel_for("copy uniform data to expanded container", policy, + KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { + cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); + }); + } + + transformIntegrateFlopCount += approximateFlopCountIntegrateWorkset; + + cellOffset += worksetSize; + } + return cellStiffness; +} + +#endif /* GRADGRADStructuredAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp new file mode 100644 index 000000000000..fadaff6111c6 --- /dev/null +++ b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp @@ -0,0 +1,165 @@ +// +// H1StandardAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/1/22. +// + +#ifndef Intrepid2_H1StandardAssembly_hpp +#define Intrepid2_H1StandardAssembly_hpp + +#include "JacobianFlopEstimate.hpp" + +/** \file H1StandardAssembly.hpp + \brief Locally assembles a matrix with the H^1 natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure. + */ + +//! Version that uses the classic, generic Intrepid2 paths. +template +Intrepid2::ScalarView performStandardQuadratureH1(Intrepid2::CellGeometry &geometry, + const int &polyOrder, int worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + + using CellTools = CellTools; + using FunctionSpaceTools = FunctionSpaceTools; + + using namespace std; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HGRAD; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + if (worksetSize > numCells) worksetSize = numCells; + + // local stiffness matrices: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + int numPoints = cubature->getNumPoints(); + ScalarView cubaturePoints("cubature points",numPoints,spaceDim); + ScalarView cubatureWeights("cubature weights", numPoints); + + cubature->getCubature(cubaturePoints, cubatureWeights); + + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + // Allocate some intermediate containers + ScalarView basisValues ("basis values", numFields, numPoints ); + ScalarView basisGradValues("basis grad values", numFields, numPoints, spaceDim); + + ScalarView transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim); + ScalarView transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim); + + ScalarView transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints); + ScalarView transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints); + + basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE ); + basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD ); + + const int numNodesPerCell = geometry.numNodesPerCell(); + ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); + Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), + KOKKOS_LAMBDA (const int &cellOrdinal) { + for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); + ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); + ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); + ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); + + initialSetupTimer->stop(); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + int cellOffset = 0; + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + + std::pair cellRange = {startCell, startCell+numCellsInWorkset}; + auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + if (numCellsInWorkset != worksetSize) + { + Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); + Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); + Kokkos::resize(transformedBasisValues, numCellsInWorkset, numFields, numPoints); + Kokkos::resize(transformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedWeightedBasisValues,numCellsInWorkset, numFields, numPoints); + Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); + } + jacobianAndCellMeasureTimer->start(); + CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. + CellTools::setJacobianInv(jacobianInverse, jacobian); + CellTools::setJacobianDet(jacobianDeterminant, jacobian); + + FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. + fstIntegrateCall->start(); + FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction. + FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each. + + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues); + ExecutionSpace().fence(); + + FunctionSpaceTools::HGRADtransformVALUE(transformedBasisValues, basisValues); + FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues); + bool sumInto = true; // add the (value,value) integral to the (grad,grad) that we've already integrated + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // record flops for (grad,grad) + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1. + // for (value,value): + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction. + + cellOffset += worksetSize; + } +// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; + return cellStiffness; +} + +#endif /* H1StandardAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp b/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp new file mode 100644 index 000000000000..1ec823b406be --- /dev/null +++ b/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp @@ -0,0 +1,166 @@ +// +// H1StructuredAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/1/22. +// + +#ifndef H1StructuredAssembly_h +#define H1StructuredAssembly_h + +#include "JacobianFlopEstimate.hpp" + +/** \file H1StructuredAssembly.hpp + \brief Locally assembles a matrix with the H^1 natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry. + */ + +//! Version that takes advantage of new structured integration support, including sum factorization. +template +Intrepid2::ScalarView performStructuredQuadratureH1(Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + using namespace std; + using FunctionSpaceTools = FunctionSpaceTools; + using IntegrationTools = IntegrationTools; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HGRAD; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + // local stiffness matrix: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + auto tensorCubatureWeights = cubature->allocateCubatureWeights(); + TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); + + cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); + + EOperator opGrad = OPERATOR_GRAD; + BasisValues gradientValues = basis->allocateBasisValues(tensorCubaturePoints, opGrad); + basis->getValues(gradientValues, tensorCubaturePoints, opGrad); + + EOperator opValue = OPERATOR_VALUE; + BasisValues basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue); + basis->getValues(basisValues, tensorCubaturePoints, opValue); + + // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell + + int cellOffset = 0; + + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + + Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); + Data jacobianDet = CellTools::allocateJacobianDet(jacobian); + Data jacobianInv = CellTools::allocateJacobianInv(jacobian); + TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights); + + // lazily-evaluated transformed gradient values (temporary to allow integralData allocation) + auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues); + auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp); + + const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases + + // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + + initialSetupTimer->stop(); + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + int endCell = numCellsInWorkset + startCell; + + jacobianAndCellMeasureTimer->start(); + if (numCellsInWorkset != worksetSize) + { + const int CELL_DIM = 0; // first dimension corresponds to cell + jacobian.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); + jacobianInv.setExtent( CELL_DIM, numCellsInWorkset); + integralData.setExtent(CELL_DIM, numCellsInWorkset); + + // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… + cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); + } + + geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); + CellTools::setJacobianDet(jacobianDet, jacobian); + CellTools::setJacobianInv(jacobianInv, jacobian); + + // lazily-evaluated transformed gradient values: + auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues); + + auto transformedBasisValues = FunctionSpaceTools::getHGRADtransformVALUE(numCellsInWorkset, basisValues); + + geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + bool sumInto = false; + double approximateFlopCountIntegrateWorksetGRADGRAD = 0; + double approximateFlopCountIntegrateWorksetVALUEVALUE = 0; + fstIntegrateCall->start(); + // (grad,grad) + IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorksetGRADGRAD); + + // (value,value) + sumInto = true; // add to what we already accumulated for (grad,grad) + IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE); + + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) + if (integralData.getUnderlyingViewRank() == 3) + { + std::pair cellRange = {startCell, endCell}; + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + } + else // underlying view rank is 2; copy to each cell in destination stiffness matrix + { + auto integralView2 = integralData.getUnderlyingView2(); + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); + Kokkos::parallel_for("copy uniform data to expanded container", policy, + KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { + cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); + }); + } + + transformIntegrateFlopCount += approximateFlopCountIntegrateWorksetGRADGRAD + approximateFlopCountIntegrateWorksetVALUEVALUE; + + cellOffset += worksetSize; + } + return cellStiffness; +} + +#endif /* GRADGRADStructuredAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp new file mode 100644 index 000000000000..2d12f1d21cb3 --- /dev/null +++ b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp @@ -0,0 +1,185 @@ +// +// HCURLStandardAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/24/22. +// + +#ifndef Intrepid2_HCURLStandardAssembly_hpp +#define Intrepid2_HCURLStandardAssembly_hpp + +#include "JacobianFlopEstimate.hpp" + +/** \file HCURLStandardAssembly.hpp + \brief Locally assembles a matrix with the H(curl) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (curl e_i, curl e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure. + */ + +//! Version that uses the classic, generic Intrepid2 paths. +template +Intrepid2::ScalarView performStandardQuadratureHCURL(Intrepid2::CellGeometry &geometry, + const int &polyOrder, int worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + + using CellTools = CellTools; + using FunctionSpaceTools = FunctionSpaceTools; + + using namespace std; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HCURL; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + if (worksetSize > numCells) worksetSize = numCells; + + // local stiffness matrices: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + int numPoints = cubature->getNumPoints(); + ScalarView cubaturePoints("cubature points",numPoints,spaceDim); + ScalarView cubatureWeights("cubature weights", numPoints); + + cubature->getCubature(cubaturePoints, cubatureWeights); + + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + // Allocate some intermediate containers + ScalarView basisValues ("basis values", numFields, numPoints, spaceDim ); + + ScalarView basisCurlValues, transformedCurlValues, transformedWeightedCurlValues; + if (spaceDim == 2) + { + // curl in 2D is a scalar: + basisCurlValues = ScalarView("basis curl values", numFields, numPoints); + transformedCurlValues = ScalarView ("transformed curl values", worksetSize, numFields, numPoints); + transformedWeightedCurlValues = ScalarView ("transformed weighted curl values", worksetSize, numFields, numPoints); + + } + else + { + // curl in 3D is a vector + basisCurlValues = ScalarView("basis curl values", numFields, numPoints, spaceDim); + transformedCurlValues = ScalarView ("transformed curl values", worksetSize, numFields, numPoints, spaceDim); + transformedWeightedCurlValues = ScalarView ("transformed weighted curl values", worksetSize, numFields, numPoints, spaceDim); + } + + ScalarView transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints, spaceDim); + ScalarView transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints, spaceDim); + + basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE ); + basis->getValues(basisCurlValues, cubaturePoints, OPERATOR_CURL ); + + const int numNodesPerCell = geometry.numNodesPerCell(); + ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); + Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), + KOKKOS_LAMBDA (const int &cellOrdinal) { + for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); + ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); + ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); + ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); + + initialSetupTimer->stop(); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + int cellOffset = 0; + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + + std::pair cellRange = {startCell, startCell+numCellsInWorkset}; + auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + if (numCellsInWorkset != worksetSize) + { + Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); + Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); + Kokkos::resize(transformedBasisValues, numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedWeightedBasisValues, numCellsInWorkset, numFields, numPoints, spaceDim); + if (spaceDim == 2) + { + Kokkos::resize(transformedCurlValues, numCellsInWorkset, numFields, numPoints); + Kokkos::resize(transformedWeightedCurlValues, numCellsInWorkset, numFields, numPoints); + } + else + { + Kokkos::resize(transformedCurlValues, numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedWeightedCurlValues, numCellsInWorkset, numFields, numPoints, spaceDim); + } + } + jacobianAndCellMeasureTimer->start(); + CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. + CellTools::setJacobianInv(jacobianInverse, jacobian); + CellTools::setJacobianDet(jacobianDeterminant, jacobian); + + FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. + fstIntegrateCall->start(); + FunctionSpaceTools::HCURLtransformCURL(transformedCurlValues, jacobian, jacobianDeterminant, basisCurlValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction. + FunctionSpaceTools::multiplyMeasure(transformedWeightedCurlValues, cellMeasures, transformedCurlValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedCurlValues: one flop for each. + + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedCurlValues, transformedWeightedCurlValues); + + FunctionSpaceTools::HCURLtransformVALUE(transformedBasisValues, jacobianInverse, basisValues); + FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues); + bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // record flops for (curl,curl): + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1. + // for (value,value): + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1. + + cellOffset += worksetSize; + } +// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; + return cellStiffness; +} + +#endif /* HCURLStandardAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp new file mode 100644 index 000000000000..42323581a559 --- /dev/null +++ b/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp @@ -0,0 +1,188 @@ +// +// HCURLStructuredAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/24/22. +// + +#ifndef HCURLStructuredAssembly_h +#define HCURLStructuredAssembly_h + +#include "JacobianFlopEstimate.hpp" + +/** \file HCURLStructuredAssembly.hpp + \brief Locally assembles a matrix with the H(curl) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (curl e_i, curl e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry. + */ + +//! Version that takes advantage of new structured integration support, including sum factorization. Computes H(curl) norm: (curl, curl) + (value,value). +template +Intrepid2::ScalarView performStructuredQuadratureHCURL(Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + using namespace std; + using FunctionSpaceTools = FunctionSpaceTools; + using IntegrationTools = IntegrationTools; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HCURL; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + // local stiffness matrix: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + auto tensorCubatureWeights = cubature->allocateCubatureWeights(); + TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); + + cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); + + EOperator opCurl = OPERATOR_CURL; + BasisValues curlValues = basis->allocateBasisValues(tensorCubaturePoints, opCurl); + basis->getValues(curlValues, tensorCubaturePoints, opCurl); + + EOperator opValue = OPERATOR_VALUE; + BasisValues basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue); + basis->getValues(basisValues, tensorCubaturePoints, opValue); + + int cellOffset = 0; + + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + + Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); + Data jacobianDet = CellTools::allocateJacobianDet(jacobian); + Data jacobianDetInverse = CellTools::allocateJacobianDet(jacobian); + Data jacobianDividedByJacobianDet = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); // jacobianTimesJacobianDet has same underlying structure as jacobian + Data jacobianInv = CellTools::allocateJacobianInv(jacobian); + TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights); + + Data jacobianDetInverseExtended; // container with same underlying data as jacobianDet, but extended with CONSTANT type to have same logical shape as Jacobian + // setup jacobianDetInverseExtended + { + auto variationTypes = jacobianDetInverse.getVariationTypes(); // defaults to CONSTANT in ranks beyond the rank of the container; this is what we want for our new extents + auto extents = jacobian.getExtents(); + + jacobianDetInverseExtended = jacobianDetInverse.shallowCopy(jacobian.rank(), extents, variationTypes); + } + + // lazily-evaluated transformed curl values (temporary to allow integralData allocation) + auto transformedCurlValuesTemp = FunctionSpaceTools::getHCURLtransformCURL(jacobianDividedByJacobianDet, curlValues); + auto integralData = IntegrationTools::allocateIntegralData(transformedCurlValuesTemp, cellMeasures, transformedCurlValuesTemp); + + const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases + + // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + + initialSetupTimer->stop(); + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + int endCell = numCellsInWorkset + startCell; + + jacobianAndCellMeasureTimer->start(); + if (numCellsInWorkset != worksetSize) + { + const int CELL_DIM = 0; // first dimension corresponds to cell + jacobian.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDetInverse.setExtent( CELL_DIM, numCellsInWorkset); + jacobianInv.setExtent( CELL_DIM, numCellsInWorkset); + integralData.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDividedByJacobianDet.setExtent(CELL_DIM, numCellsInWorkset); + + // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… + cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); + } + + geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); + CellTools::setJacobianDet( jacobianDet, jacobian); + CellTools::setJacobianDetInv(jacobianDetInverse, jacobian); + CellTools::setJacobianInv( jacobianInv, jacobian); + + // compute the jacobian divided by its determinant + jacobianDividedByJacobianDet.storeInPlaceProduct(jacobian,jacobianDetInverseExtended); + + // lazily-evaluated transformed curl, values: + TransformedBasisValues transformedCurlValues; + if (spaceDim == 2) + { + transformedCurlValues = FunctionSpaceTools::getHCURLtransformCURL(jacobianDetInverse, curlValues); + } + else + { + transformedCurlValues = FunctionSpaceTools::getHCURLtransformCURL(jacobianDividedByJacobianDet, curlValues); + } + auto transformedBasisValues = FunctionSpaceTools::getHCURLtransformVALUE(jacobianInv, basisValues); + + geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + bool sumInto = false; + double approximateFlopCountIntegrateWorksetCURLCURL = 0; + double approximateFlopCountIntegrateWorksetVALUEVALUE = 0; + fstIntegrateCall->start(); + // (curl,curl) + IntegrationTools::integrate(integralData, transformedCurlValues, cellMeasures, transformedCurlValues, sumInto, &approximateFlopCountIntegrateWorksetCURLCURL); + + // (value,value) + sumInto = true; // add to what we already accumulated for (curl,curl) + IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE); + + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) + if (integralData.getUnderlyingViewRank() == 3) + { + std::pair cellRange = {startCell, endCell}; + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + } + else // underlying view rank is 2; copy to each cell in destination stiffness matrix + { + auto integralView2 = integralData.getUnderlyingView2(); + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); + Kokkos::parallel_for("copy uniform data to expanded container", policy, + KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { + cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); + }); + } + + transformIntegrateFlopCount += approximateFlopCountIntegrateWorksetCURLCURL + approximateFlopCountIntegrateWorksetVALUEVALUE; + + cellOffset += worksetSize; + } + return cellStiffness; +} + +#endif /* HCURLStructuredAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp new file mode 100644 index 000000000000..4beb1feaf3e0 --- /dev/null +++ b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp @@ -0,0 +1,165 @@ +// +// HDIVStandardAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/7/22. +// + +#ifndef Intrepid2_HDIVStandardAssembly_hpp +#define Intrepid2_HDIVStandardAssembly_hpp + +#include "JacobianFlopEstimate.hpp" + +/** \file HDIVStandardAssembly.hpp + \brief Locally assembles a matrix with the H(div) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (div e_i, div e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure. + */ + +//! Version that uses the classic, generic Intrepid2 paths. +template +Intrepid2::ScalarView performStandardQuadratureHDIV(Intrepid2::CellGeometry &geometry, + const int &polyOrder, int worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + + using CellTools = CellTools; + using FunctionSpaceTools = FunctionSpaceTools; + + using namespace std; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HDIV; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + if (worksetSize > numCells) worksetSize = numCells; + + // local stiffness matrices: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + int numPoints = cubature->getNumPoints(); + ScalarView cubaturePoints("cubature points",numPoints,spaceDim); + ScalarView cubatureWeights("cubature weights", numPoints); + + cubature->getCubature(cubaturePoints, cubatureWeights); + + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + // Allocate some intermediate containers + ScalarView basisValues ("basis values", numFields, numPoints, spaceDim ); + ScalarView basisDivValues("basis div values", numFields, numPoints); + + ScalarView transformedDivValues("transformed div values", worksetSize, numFields, numPoints); + ScalarView transformedWeightedDivValues("transformed weighted div values", worksetSize, numFields, numPoints); + + ScalarView transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints, spaceDim); + ScalarView transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints, spaceDim); + + basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE ); + basis->getValues(basisDivValues, cubaturePoints, OPERATOR_DIV ); + + const int numNodesPerCell = geometry.numNodesPerCell(); + ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); + Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), + KOKKOS_LAMBDA (const int &cellOrdinal) { + for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); + ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); + ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); + ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); + + initialSetupTimer->stop(); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + int cellOffset = 0; + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + + std::pair cellRange = {startCell, startCell+numCellsInWorkset}; + auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + if (numCellsInWorkset != worksetSize) + { + Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); + Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); + Kokkos::resize(transformedBasisValues, numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedDivValues, numCellsInWorkset, numFields, numPoints); + Kokkos::resize(transformedWeightedBasisValues,numCellsInWorkset, numFields, numPoints, spaceDim); + Kokkos::resize(transformedWeightedDivValues, numCellsInWorkset, numFields, numPoints); + } + jacobianAndCellMeasureTimer->start(); + CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. + CellTools::setJacobianInv(jacobianInverse, jacobian); + CellTools::setJacobianDet(jacobianDeterminant, jacobian); + + FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. + fstIntegrateCall->start(); + FunctionSpaceTools::HDIVtransformDIV(transformedDivValues, jacobianDeterminant, basisDivValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction. + FunctionSpaceTools::multiplyMeasure(transformedWeightedDivValues, cellMeasures, transformedDivValues); + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedDivValues: one flop for each. + + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedDivValues, transformedWeightedDivValues); + ExecutionSpace().fence(); + + FunctionSpaceTools::HDIVtransformVALUE(transformedBasisValues, jacobian, jacobianDeterminant, basisValues); + FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues); + bool sumInto = true; // add the (value,value) integral to the (div,div) that we've already integrated + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // record flops for (div,div) + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction. + // for (value,value): + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1. + + cellOffset += worksetSize; + } +// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; + return cellStiffness; +} + +#endif /* HDIVStandardAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp new file mode 100644 index 000000000000..a2eef17f9e35 --- /dev/null +++ b/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp @@ -0,0 +1,180 @@ +// +// HDIVStructuredAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/7/22. +// + +#ifndef HDIVStructuredAssembly_h +#define HDIVStructuredAssembly_h + +#include "JacobianFlopEstimate.hpp" + +/** \file HDIVStructuredAssembly.hpp + \brief Locally assembles a matrix with the H(div) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (div e_i, div e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry. + */ + +//! Version that takes advantage of new structured integration support, including sum factorization. Computes H(div) norm: (div, div) + (value,value). +template +Intrepid2::ScalarView performStructuredQuadratureHDIV(Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + using namespace std; + using FunctionSpaceTools = FunctionSpaceTools; + using IntegrationTools = IntegrationTools; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HDIV; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + // local stiffness matrix: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + auto tensorCubatureWeights = cubature->allocateCubatureWeights(); + TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); + + cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); + + EOperator opDiv = OPERATOR_DIV; + BasisValues divValues = basis->allocateBasisValues(tensorCubaturePoints, opDiv); + basis->getValues(divValues, tensorCubaturePoints, opDiv); + + EOperator opValue = OPERATOR_VALUE; + BasisValues basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue); + basis->getValues(basisValues, tensorCubaturePoints, opValue); + + int cellOffset = 0; + + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + + Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); + Data jacobianDet = CellTools::allocateJacobianDet(jacobian); + Data jacobianDetInverse = CellTools::allocateJacobianDet(jacobian); + Data jacobianDividedByJacobianDet = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); // jacobianTimesJacobianDet has same underlying structure as jacobian + Data jacobianInv = CellTools::allocateJacobianInv(jacobian); + TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights); + + Data jacobianDetInverseExtended; // container with same underlying data as jacobianDet, but extended with CONSTANT type to have same logical shape as Jacobian + // setup jacobianDetInverseExtended + { + auto variationTypes = jacobianDetInverse.getVariationTypes(); // defaults to CONSTANT in ranks beyond the rank of the container; this is what we want for our new extents + auto extents = jacobian.getExtents(); + + jacobianDetInverseExtended = jacobianDetInverse.shallowCopy(jacobian.rank(), extents, variationTypes); + } + + // lazily-evaluated transformed div values (temporary to allow integralData allocation) + auto transformedDivValuesTemp = FunctionSpaceTools::getHDIVtransformDIV(jacobianDetInverse, divValues); + auto integralData = IntegrationTools::allocateIntegralData(transformedDivValuesTemp, cellMeasures, transformedDivValuesTemp); + + const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases + + // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + + initialSetupTimer->stop(); + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + int endCell = numCellsInWorkset + startCell; + + jacobianAndCellMeasureTimer->start(); + if (numCellsInWorkset != worksetSize) + { + const int CELL_DIM = 0; // first dimension corresponds to cell + jacobian.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDetInverse.setExtent( CELL_DIM, numCellsInWorkset); + jacobianInv.setExtent( CELL_DIM, numCellsInWorkset); + integralData.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDividedByJacobianDet.setExtent(CELL_DIM, numCellsInWorkset); + + // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… + cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); + } + + geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); + CellTools::setJacobianDet( jacobianDet, jacobian); + CellTools::setJacobianDetInv(jacobianDetInverse, jacobian); + CellTools::setJacobianInv( jacobianInv, jacobian); + + // compute the jacobian divided by its determinant + jacobianDividedByJacobianDet.storeInPlaceProduct(jacobian,jacobianDetInverseExtended); + + // lazily-evaluated transformed div, values: + auto transformedDivValues = FunctionSpaceTools::getHDIVtransformDIV(jacobianDetInverse, divValues); + auto transformedBasisValues = FunctionSpaceTools::getHDIVtransformVALUE(jacobianDividedByJacobianDet, basisValues); + + geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + bool sumInto = false; + double approximateFlopCountIntegrateWorksetDIVDIV = 0; + double approximateFlopCountIntegrateWorksetVALUEVALUE = 0; + fstIntegrateCall->start(); + // (div,div) + IntegrationTools::integrate(integralData, transformedDivValues, cellMeasures, transformedDivValues, sumInto, &approximateFlopCountIntegrateWorksetDIVDIV); + + // (value,value) + sumInto = true; // add to what we already accumulated for (div,div) + IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE); + + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) + if (integralData.getUnderlyingViewRank() == 3) + { + std::pair cellRange = {startCell, endCell}; + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + } + else // underlying view rank is 2; copy to each cell in destination stiffness matrix + { + auto integralView2 = integralData.getUnderlyingView2(); + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); + Kokkos::parallel_for("copy uniform data to expanded container", policy, + KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { + cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); + }); + } + + transformIntegrateFlopCount += approximateFlopCountIntegrateWorksetDIVDIV + approximateFlopCountIntegrateWorksetVALUEVALUE; + + cellOffset += worksetSize; + } + return cellStiffness; +} + +#endif /* HDIVStructuredAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp new file mode 100644 index 000000000000..630a6cb6924b --- /dev/null +++ b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp @@ -0,0 +1,142 @@ +// +// HVOLStandardAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/30/22. +// + +#ifndef Intrepid2_HVOLStandardAssembly_hpp +#define Intrepid2_HVOLStandardAssembly_hpp + +#include "JacobianFlopEstimate.hpp" + +/** \file HVOLStandardAssembly.hpp + \brief Locally assembles a matrix with the H(vol) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure. + */ + +//! Version that uses the classic, generic Intrepid2 paths. +template +Intrepid2::ScalarView performStandardQuadratureHVOL(Intrepid2::CellGeometry &geometry, + const int &polyOrder, int worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + + using CellTools = CellTools; + using FunctionSpaceTools = FunctionSpaceTools; + + using namespace std; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HVOL; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + if (worksetSize > numCells) worksetSize = numCells; + + // local stiffness matrices: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + int numPoints = cubature->getNumPoints(); + ScalarView cubaturePoints("cubature points",numPoints,spaceDim); + ScalarView cubatureWeights("cubature weights", numPoints); + + cubature->getCubature(cubaturePoints, cubatureWeights); + + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + + // Allocate some intermediate containers + ScalarView basisValues ("basis values", numFields, numPoints ); + + ScalarView transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints); + ScalarView transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints); + + basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE ); + + const int numNodesPerCell = geometry.numNodesPerCell(); + ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); + Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), + KOKKOS_LAMBDA (const int &cellOrdinal) { + for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); + ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); + ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); + + initialSetupTimer->stop(); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + int cellOffset = 0; + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + + std::pair cellRange = {startCell, startCell+numCellsInWorkset}; + auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + if (numCellsInWorkset != worksetSize) + { + Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); + Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); + Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); + Kokkos::resize(transformedBasisValues, numCellsInWorkset, numFields, numPoints); + Kokkos::resize(transformedWeightedBasisValues, numCellsInWorkset, numFields, numPoints); + } + jacobianAndCellMeasureTimer->start(); + CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. + CellTools::setJacobianDet(jacobianDeterminant, jacobian); + + FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. + fstIntegrateCall->start(); + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + + FunctionSpaceTools::HVOLtransformVALUE(transformedBasisValues, jacobianDeterminant, basisValues); + FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues); + bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated + FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto); + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // record flops for (value,value): + transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction. + cellOffset += worksetSize; + } +// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; + return cellStiffness; +} + +#endif /* HVOLStandardAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp new file mode 100644 index 000000000000..275c9d356bf7 --- /dev/null +++ b/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp @@ -0,0 +1,153 @@ +// +// HVOLStructuredAssembly.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 3/30/22. +// + +#ifndef HVOLStructuredAssembly_h +#define HVOLStructuredAssembly_h + +#include "JacobianFlopEstimate.hpp" + +/** \file HVOLStructuredAssembly.hpp + \brief Locally assembles a matrix with the H(vol) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry. + */ + +//! Version that takes advantage of new structured integration support, including sum factorization. Computes H(vol) norm: (value,value). +template +Intrepid2::ScalarView performStructuredQuadratureHVOL(Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + using namespace Intrepid2; + + using ExecutionSpace = typename DeviceType::execution_space; + + int numVertices = 1; + for (int d=0; dstart(); + using namespace std; + using FunctionSpaceTools = FunctionSpaceTools; + using IntegrationTools = IntegrationTools; + // dimensions of the returned view are (C,F,F) + auto fs = FUNCTION_SPACE_HVOL; + + shards::CellTopology cellTopo = geometry.cellTopology(); + + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + + int numFields = basis->getCardinality(); + int numCells = geometry.numCells(); + + // local stiffness matrix: + ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); + + auto cubature = DefaultCubatureFactory::create(cellTopo,polyOrder*2); + auto tensorCubatureWeights = cubature->allocateCubatureWeights(); + TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); + + cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); + + EOperator opValue = OPERATOR_VALUE; + BasisValues basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue); + basis->getValues(basisValues, tensorCubaturePoints, opValue); + + int cellOffset = 0; + + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + + Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); + Data jacobianDet = CellTools::allocateJacobianDet(jacobian); + Data jacobianDetInverse = CellTools::allocateJacobianDet(jacobian); + TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights); + + // lazily-evaluated transformed values (temporary to allow integralData allocation) + auto transformedValuesTemp = FunctionSpaceTools::getHVOLtransformVALUE(jacobianDet, basisValues); + auto integralData = IntegrationTools::allocateIntegralData(transformedValuesTemp, cellMeasures, transformedValuesTemp); + + const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases + + // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) + const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); + const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); + + transformIntegrateFlopCount = 0; + jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself + jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant + jacobianCellMeasureFlopCount += numCells * numPoints; // determinant inverse (one divide per (C,P)) + jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) + + auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + + initialSetupTimer->stop(); + while (cellOffset < numCells) + { + int startCell = cellOffset; + int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; + int endCell = numCellsInWorkset + startCell; + + jacobianAndCellMeasureTimer->start(); + if (numCellsInWorkset != worksetSize) + { + const int CELL_DIM = 0; // first dimension corresponds to cell + jacobian.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); + jacobianDetInverse.setExtent( CELL_DIM, numCellsInWorkset); + integralData.setExtent( CELL_DIM, numCellsInWorkset); + + // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… + cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); + } + + geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); + CellTools::setJacobianDet( jacobianDet, jacobian); + CellTools::setJacobianDetInv(jacobianDetInverse, jacobian); + + // lazily-evaluated transformed values: + auto transformedBasisValues = FunctionSpaceTools::getHVOLtransformVALUE(jacobianDetInverse, basisValues); + + geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); + ExecutionSpace().fence(); + jacobianAndCellMeasureTimer->stop(); + + double approximateFlopCountIntegrateWorksetVALUEVALUE = 0; + fstIntegrateCall->start(); + + // (value,value) + bool sumInto = false; + IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE); + + ExecutionSpace().fence(); + fstIntegrateCall->stop(); + + // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) + if (integralData.getUnderlyingViewRank() == 3) + { + std::pair cellRange = {startCell, endCell}; + auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + } + else // underlying view rank is 2; copy to each cell in destination stiffness matrix + { + auto integralView2 = integralData.getUnderlyingView2(); + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); + Kokkos::parallel_for("copy uniform data to expanded container", policy, + KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { + cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); + }); + } + + transformIntegrateFlopCount += approximateFlopCountIntegrateWorksetVALUEVALUE; + + cellOffset += worksetSize; + } + return cellStiffness; +} + +#endif /* HVOLStructuredAssembly_h */ diff --git a/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp b/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp new file mode 100644 index 000000000000..319cf5069c1c --- /dev/null +++ b/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp @@ -0,0 +1,125 @@ +// +// JacobianFlopEstimate.hpp +// Trilinos +// +// Created by Roberts, Nathan V on 7/7/21. +// + +#ifndef Intrepid2_JacobianFlopEstimate_hpp +#define Intrepid2_JacobianFlopEstimate_hpp + +double flopsPerJacobian(const int &spaceDim, const int &numPoints, const int &numGeometryNodes) +{ + // implementation looks like: +// for (ordinal_type i=0;i::Serial::det(mat); + double totalFlops = flopsPerJacobianDet(spaceDim, numPoints); + + /* + case 1: { + inv(0,0) = 1.0/mat(0,0); + break; + } + case 2: { + inv(0,0) = mat(1,1)/val; + inv(1,1) = mat(0,0)/val; + + inv(1,0) = - mat(1,0)/val; + inv(0,1) = - mat(0,1)/val; + break; + } + */ + + if (spaceDim == 1) + { + // inv(0,0) = 1.0/mat(0,0); // 1 flop: 1 division + totalFlops += 1 * numPoints; + } + else if (spaceDim == 2) + { +// inv(0,0) = mat(1,1)/val; // 1 flop: 1 division +// inv(1,1) = mat(0,0)/val; // 1 flop: 1 division +// +// inv(1,0) = - mat(1,0)/val; // 2 flops: 1 negation, 1 division +// inv(0,1) = - mat(0,1)/val; // 2 flops: 1 negation, 1 division + totalFlops += 6 * numPoints; + } + else if (spaceDim == 3) + { + // val0 = mat(1,1)*mat(2,2) - mat(2,1)*mat(1,2); // 3 flops: 2 mults, 1 subtraction + // val1 = - mat(1,0)*mat(2,2) + mat(2,0)*mat(1,2); // 4 flops: 2 mults, 1 negation, 1 add + // val2 = mat(1,0)*mat(2,1) - mat(2,0)*mat(1,1); // 3 flops: 2 mults, 1 subtraction + // + // inv(0,0) = val0/val; // 1 flop + // inv(1,0) = val1/val; // 1 flop + // inv(2,0) = val2/val; // 1 flop + // + // val0 = mat(2,1)*mat(0,2) - mat(0,1)*mat(2,2); // 3 + // val1 = mat(0,0)*mat(2,2) - mat(2,0)*mat(0,2); // 3 + // val2 = - mat(0,0)*mat(2,1) + mat(2,0)*mat(0,1); // 4 + // + // inv(0,1) = val0/val; // 1 + // inv(1,1) = val1/val; // 1 + // inv(2,1) = val2/val; // 1 + // + // val0 = mat(0,1)*mat(1,2) - mat(1,1)*mat(0,2); // 3 + // val1 = - mat(0,0)*mat(1,2) + mat(1,0)*mat(0,2); // 4 + // val2 = mat(0,0)*mat(1,1) - mat(1,0)*mat(0,1); // 3 + // + // inv(0,2) = val0/val; // 1 + // inv(1,2) = val1/val; // 1 + // inv(2,2) = val2/val; // 1 + totalFlops += 36.0 * numPoints; + } + else + { + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim"); + } + + return totalFlops; +} + +#endif /* GRADGRADStandardAssembly_h */ diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp index 2fadab29d3e3..ed2dbdad09d8 100644 --- a/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp +++ b/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp @@ -58,7 +58,7 @@ #include "Intrepid2_OrientationTools.hpp" #include "Intrepid2_TensorData.hpp" #include "Intrepid2_TensorPoints.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_Utils.hpp" #include "Intrepid2_VectorData.hpp" @@ -116,24 +116,22 @@ namespace Intrepid2 public: /** \brief Notionally-private method that provides a common interface for multiple public-facing allocateJacobianData() methods. (Marked as public due to compiler constraints.) - \param [in] points - if a valid points container, specifies the points at which the Jacobian will be evaluated. (Invalid containers are acceptable for affine CellGeometry.) + \param [in] pointComponentView - typically either the first component of a TensorPoints container, or a View containing all the points, but may be empty. Only used for scalar-type-size-matched construction of new views (important for views of Fad types with hidden dimensions). \param [in] pointsPerCell - the number of points at which the Jacobian will be evaluated in each cell. If points is a valid container, pointsPerCell must match its first dimension. \param [in] startCell - the first cell ordinal for which the Jacobian will be evaluated (used to define worksets smaller than the whole CellGeometry). \param [in] endCell - the first cell ordinal for which the Jacobian will not be evaluated (used to define worksets smaller than the whole CellGeometry); use -1 to indicate that all cells from startCell on should be evaluated. \return a Data object appropriately sized to accomodate the specified Jacobian values. */ - Data allocateJacobianDataPrivate(const TensorPoints &points, const int &pointsPerCell, const int startCell, const int endCell) const; + Data allocateJacobianDataPrivate(const ScalarView &pointComponentView, const int &pointsPerCell, const int startCell, const int endCell) const; /** \brief Notionally-private method that provides a common interface for multiple public-facing setJacobianData() methods. (Marked as public due to compiler constraints.) \param [out] jacobianData - a container, allocated by allocateJacobianData(), into which the evaluated Jacobians will be placed. - \param [in] points - if a valid points container, specifies the points at which the Jacobian will be evaluated. (Invalid containers are acceptable for affine CellGeometry.) \param [in] pointsPerCell - the number of points at which the Jacobian will be evaluated in each cell. If points is a valid container, pointsPerCell must match its first dimension. \param [in] refData - the return from getJacobianRefData(); may be an empty container, depending on details of CellGeometry (e.g. if it is affine) \param [in] startCell - the first cell ordinal for which the Jacobian will be evaluated (used to define worksets smaller than the whole CellGeometry). \param [in] endCell - the first cell ordinal for which the Jacobian will not be evaluated (used to define worksets smaller than the whole CellGeometry); use -1 to indicate that all cells from startCell on should be evaluated. */ - void setJacobianDataPrivate(Data &jacobianData, const TensorPoints &points, const int &pointsPerCell, - const Data &refData, const int startCell, const int endCell) const; + void setJacobianDataPrivate(Data &jacobianData, const int &pointsPerCell, const Data &refData, const int startCell, const int endCell) const; protected: HypercubeNodeOrdering nodeOrdering_; CellGeometryType cellGeometryType_; @@ -321,8 +319,14 @@ namespace Intrepid2 Data allocateJacobianData(const int &numPoints, const int startCell=0, const int endCell=-1) const; /** \brief Computes reference-space data for the specified points, to be used in setJacobian(). - \param [in] points - the points at which the Jacobian will be evaluated. - \return a Data object with any reference-space data required. This may be empty, if no reference-space data is required in setJacobian(). + \param [in] points - the points at which the Jacobian will be evaluated; shape (P,D) or (C,P,D). + \return a Data object with any reference-space data required. This may be empty, if no reference-space data is required in setJacobian(). If filled, will have shape (F,P,D) or (C,F,P,D). + */ + Data getJacobianRefData(const ScalarView &points) const; + + /** \brief Computes reference-space data for the specified points, to be used in setJacobian(). + \param [in] points - the points at which the Jacobian will be evaluated, with shape (P,D). + \return a Data object with any reference-space data required. This may be empty, if no reference-space data is required in setJacobian(). If filled, will have shape (F,P,D). */ Data getJacobianRefData(const TensorPoints &points) const; diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp index f47e8cbf9cfc..ca3403ad4e49 100644 --- a/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp +++ b/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp @@ -181,7 +181,7 @@ namespace Intrepid2 template Data - CellGeometry::allocateJacobianDataPrivate(const TensorPoints &points, const int &pointsPerCell, const int startCell, const int endCell) const + CellGeometry::allocateJacobianDataPrivate(const ScalarView &pointComponentView, const int &pointsPerCell, const int startCell, const int endCell) const { ScalarView data; const int rank = 4; // C,P,D,D @@ -208,7 +208,7 @@ namespace Intrepid2 int cellTypeModulus = uniformJacobianModulus(); - data = getMatchingViewWithLabel(points.getTensorComponent(0), "CellGeometryProvider: Jacobian data", cellTypeModulus, spaceDim, spaceDim); + data = getMatchingViewWithLabel(pointComponentView, "CellGeometryProvider: Jacobian data", cellTypeModulus, spaceDim, spaceDim); } else { @@ -217,7 +217,7 @@ namespace Intrepid2 variationType[D2_DIM] = BLOCK_PLUS_DIAGONAL; blockPlusDiagonalLastNonDiagonal = -1; - data = getMatchingViewWithLabel(points.getTensorComponent(0), "CellGeometryProvider: Jacobian data", spaceDim); + data = getMatchingViewWithLabel(pointComponentView, "CellGeometryProvider: Jacobian data", spaceDim); } } else if (cellGeometryType_ == TENSOR_GRID) @@ -273,9 +273,9 @@ namespace Intrepid2 } template - void CellGeometry::setJacobianDataPrivate(Data &jacobianData, const TensorPoints &points, - const int &pointsPerCell, const Data &refData, - const int startCell, const int endCell) const + void CellGeometry::setJacobianDataPrivate(Data &jacobianData, + const int &pointsPerCell, const Data &refData, + const int startCell, const int endCell) const { const int numCellsWorkset = (endCell == -1) ? (numCells_ - startCell) : (endCell - startCell); @@ -384,12 +384,22 @@ namespace Intrepid2 TEUCHOS_TEST_FOR_EXCEPTION(basisForNodes == Teuchos::null, std::invalid_argument, "basisForNodes must not be null"); TEUCHOS_TEST_FOR_EXCEPTION(dataView.size() == 0, std::invalid_argument, "underlying view is not valid"); - // refData should contain the basis gradients; shape is (F,P,D) + // refData should contain the basis gradients; shape is (F,P,D) or (C,F,P,D) INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!refData.isValid(), std::invalid_argument, "refData should be a valid container for cases with non-affine geometry"); - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.rank() != 3, std::invalid_argument, "refData should have shape (F,P,D)"); - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D)"); - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != points.extent_int(0), std::invalid_argument, "refData should have shape (F,P,D)"); - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE((refData.rank() != 3) && (refData.rank() != 4), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + if (refData.rank() == 3) + { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != pointsPerCell, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + } + else + { + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != numCellsWorkset, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != pointsPerCell, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(3) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)"); + } CellTools::setJacobian(dataView, *this, refData, startCell, endCell); } @@ -744,6 +754,90 @@ namespace Intrepid2 else return GENERAL; } + template + Data CellGeometry::getJacobianRefData(const ScalarView &points) const + { + Data emptyRefData; + if (cellGeometryType_ == UNIFORM_GRID) + { + // no need for basis computations + return emptyRefData; + } + else if (cellGeometryType_ == TENSOR_GRID) + { + // no need for basis values + return emptyRefData; + } + else if ((cellGeometryType_ == FIRST_ORDER) || (cellGeometryType_ == HIGHER_ORDER)) + { + const bool simplex = (spaceDim + 1 == cellToNodes_.extent_int(1)); + if (simplex) + { + // no need for precomputed basis values + return emptyRefData; + } + else + { + auto basisForNodes = this->basisForNodes(); + + if (affine_) + { + // no need for precomputed basis values + return emptyRefData; + } + else + { + // 2 use cases: (P,D) and (C,P,D). + // if (P,D), call the TensorPoints variant + if (points.rank() == 2) + { + TensorPoints tensorPoints(points); + return getJacobianRefData(tensorPoints); + } + else + { + const int numCells = points.extent_int(0); + const int numPoints = points.extent_int(1); + const int numFields = basisForNodes->getCardinality(); + + auto cellBasisGradientsView = getMatchingViewWithLabel(points, "CellGeometryProvider: cellBasisGradients", numCells, numFields, numPoints, spaceDim); + auto basisGradientsView = getMatchingViewWithLabel(points, "CellGeometryProvider: basisGradients", numFields, numPoints, spaceDim); + + for (int cellOrdinal=0; cellOrdinalgetValues(basisGradientsView, refPointsForCell, OPERATOR_GRAD); + + // At some (likely relatively small) memory cost, we copy the BasisGradients into an explicit (F,P,D) container. + // Given that we expect to reuse this for a non-trivial number of cell in the common use case, the extra memory + // cost is likely worth the increased flop count, etc. (One might want to revisit this in cases of high spaceDim + // and/or very high polynomial order.) + + using ExecutionSpace = typename DeviceType::execution_space; + auto policy = Kokkos::MDRangePolicy>({0,0,0},{numFields,numPoints,spaceDim}); + + Kokkos::parallel_for("copy basis gradients", policy, + KOKKOS_LAMBDA (const int &fieldOrdinal, const int &pointOrdinal, const int &d) { + cellBasisGradientsView(cellOrdinal,fieldOrdinal,pointOrdinal,d) = basisGradientsView(fieldOrdinal,pointOrdinal,d); + }); + ExecutionSpace().fence(); + } + Data basisRefData(cellBasisGradientsView); + return basisRefData; + } + } + } + } + else + { + // TODO: handle the other cases + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "support for this CellGeometryType is not yet implemented"); + } + return emptyRefData; + + + } + template Data CellGeometry::getJacobianRefData(const TensorPoints &points) const { @@ -866,11 +960,11 @@ namespace Intrepid2 // then there are as many distinct orientations possible as there are there are cells per grid cell // fill cellNodesHost with sample nodes from grid cell 0 const int numSubdivisions = numCellsPerGridCell(subdivisionStrategy_); // can be up to 6 - const int gridCellOrdinal = 0; #if defined(INTREPID2_COMPILE_DEVICE_CODE) /// do not compile host only code with device #else + const int gridCellOrdinal = 0; auto hostPolicy = Kokkos::MDRangePolicy>({0,0},{numSubdivisions,nodesPerCell}); Kokkos::parallel_for("fill cellNodesHost", hostPolicy, [this,gridCellOrdinal,cellNodesHost] (const int &subdivisionOrdinal, const int &nodeInCell) { @@ -1251,7 +1345,7 @@ namespace Intrepid2 Data CellGeometry::allocateJacobianData(const TensorPoints &points, const int startCell, const int endCell) const { const int pointsPerCell = points.extent_int(0); - return allocateJacobianDataPrivate(points,pointsPerCell,startCell,endCell); + return allocateJacobianDataPrivate(points.getTensorComponent(0),pointsPerCell,startCell,endCell); } template @@ -1260,8 +1354,7 @@ namespace Intrepid2 // if points is rank 3, it has shape (C,P,D). If it's rank 2, (P,D). const int pointDimension = (points.rank() == 3) ? 1 : 0; const int pointsPerCell = points.extent_int(pointDimension); - TensorPoints tensorPoints(points); - return allocateJacobianDataPrivate(tensorPoints,pointsPerCell,startCell,endCell); + return allocateJacobianDataPrivate(points,pointsPerCell,startCell,endCell); } template @@ -1269,7 +1362,7 @@ namespace Intrepid2 { INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!affine_, std::invalid_argument, "this version of allocateJacobianData() is only supported for affine CellGeometry"); - TensorPoints emptyPoints; + ScalarView emptyPoints; return allocateJacobianDataPrivate(emptyPoints,numPoints,startCell,endCell); } @@ -1278,7 +1371,7 @@ namespace Intrepid2 const Data &refData, const int startCell, const int endCell) const { const int pointsPerCell = points.extent_int(0); - setJacobianDataPrivate(jacobianData,points,pointsPerCell,refData,startCell,endCell); + setJacobianDataPrivate(jacobianData,pointsPerCell,refData,startCell,endCell); } template @@ -1288,8 +1381,7 @@ namespace Intrepid2 // if points is rank 3, it has shape (C,P,D). If it's rank 2, (P,D). const int pointDimension = (points.rank() == 3) ? 1 : 0; const int pointsPerCell = points.extent_int(pointDimension); - TensorPoints tensorPoints(points); - setJacobianDataPrivate(jacobianData,tensorPoints,pointsPerCell,refData,startCell,endCell); + setJacobianDataPrivate(jacobianData,pointsPerCell,refData,startCell,endCell); } template @@ -1297,9 +1389,8 @@ namespace Intrepid2 { INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!affine_, std::invalid_argument, "this version of setJacobian() is only supported for affine CellGeometry"); - TensorPoints emptyPoints; Data emptyRefData; - setJacobianDataPrivate(jacobianData,emptyPoints,numPoints,emptyRefData,startCell,endCell); + setJacobianDataPrivate(jacobianData,numPoints,emptyRefData,startCell,endCell); } } // namespace Intrepid2 diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp index fb9b7f2073cb..aaa08d1410e1 100644 --- a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp +++ b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp @@ -378,6 +378,15 @@ namespace Intrepid2 { template static void setJacobianDet( Data & jacobianDet, const Data & jacobian); + + /** \brief Computes reciprocals of determinants corresponding to the Jacobians in the Data container provided + + \param jacobianDet [out] - data with shape (C,P), as returned by CellTools::allocateJacobianDet() + \param jacobian [in] - data with shape (C,P,D,D), as returned by CellGeometry::allocateJacobianData() + */ + template + static void setJacobianDetInv( Data & jacobianDet, + const Data & jacobian); /** \brief Computes determinants corresponding to the Jacobians in the Data container provided diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp index 9f0c32423b5b..ef682870c4fd 100644 --- a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp +++ b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp @@ -459,6 +459,16 @@ namespace Intrepid2 { } } + template + template + void CellTools::setJacobianDetInv( Data &jacobianDetInv, const Data & jacobian ) + { + setJacobianDet(jacobianDetInv, jacobian); + + auto unitData = jacobianDetInv.allocateConstantData(1.0); + jacobianDetInv.storeInPlaceQuotient(unitData, jacobianDetInv); + } + template template void CellTools::setJacobianInv( Data &jacobianInv, const Data & jacobian ) diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp index 4857dff98423..787a6d3a82ad 100644 --- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp +++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp @@ -163,6 +163,25 @@ namespace Intrepid2 } } + //! Returns the field ordinal offset for the specified family. + KOKKOS_INLINE_FUNCTION + int familyFieldOrdinalOffset(const int &familyOrdinal) const + { + if (vectorData_.isValid()) + { + return vectorData_.familyFieldOrdinalOffset(familyOrdinal); + } + else + { + int offset = 0; + for (int i=0; i= numTensorDataFamilies_, std::invalid_argument, "familyOrdinal too large"); INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(familyOrdinal < 0, std::invalid_argument, "familyOrdinal may not be less than 0"); diff --git a/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp b/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp index 8beeaeb32b05..2318839cb77e 100644 --- a/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp +++ b/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp @@ -64,7 +64,7 @@ #include "Intrepid2_CellTools.hpp" #include "Intrepid2_Data.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_VectorData.hpp" #include "Kokkos_Core.hpp" @@ -89,7 +89,7 @@ namespace Intrepid2 { defined on cells in physical space and indexed by (C,F,P,D). The transformations are computed on entry access; algorithms such as sum factorization rely on having access to the reference-space basis values as well as the transformation operator; both are stored in the - returned TransformedVectorData object. + returned TransformedBasisValues object. Computes pullback of gradients of \e HGRAD functions \f$\Phi^*(\nabla\widehat{u}_f) = \left((DF_c)^{-{\sf T}}\cdot\nabla\widehat{u}_f\right)\circ F^{-1}_{c}\f$ @@ -123,13 +123,351 @@ namespace Intrepid2 { \endcode \param jacobianInverse [in] - Input array containing cell Jacobian inverses. - \param inputVals [in] - Input array of reference HGRAD gradients. - \return TransformedVectorData object defined on (C,F,P,D) indices; transformation is computed on access. + \param refBasisGradValues [in] - Input array of reference HGRAD gradients. + \return TransformedBasisValues object defined on (C,F,P,D) indices; transformation is computed on access. */ template - static TransformedVectorData getHGRADtransformGRAD(const Data &jacobianInverse, const VectorData &refBasisGradValues) + static TransformedBasisValues getHGRADtransformGRAD(const Data &jacobianInverse, const BasisValues &refBasisGradValues) { - return TransformedVectorData(jacobianInverse,refBasisGradValues); + return TransformedBasisValues(jacobianInverse,refBasisGradValues); + } + + /** \brief Transformation of a (scalar) value field in the H-grad space, defined at points on a + reference cell, stored in the user-provided container input + and indexed by (F,P), into the output container output, + defined on cells in physical space and indexed by (C,F,P). This transformation is trivial, and the + returned container is logically indexed by (C,F,P), but only contains (F,P) distinct data entries. + + Computes pullback of \e HGRAD functions \f$\Phi^*(\widehat{u}_f) = \widehat{u}_f\circ F^{-1}_{c} \f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the values of the function set \f$\{\widehat{u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + input(f,p) = \widehat{u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + output(c,f,p) + = \widehat{u}_f\circ F^{-1}_{c}(x_{c,p}) + = \widehat{u}_f(\widehat{x}_p) = input(f,p) \qquad 0\le c < C \,, + \f] + i.e., it simply replicates the values in the user-provided container to every cell. + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + |------|----------------------|--------------------------------------------------| + \endcode + + \param input [in] - Input container of reference HGRAD values. + \return TransformedBasisValues object of logical shape (C,F,P). + */ + template + static TransformedBasisValues + getHGRADtransformVALUE(const ordinal_type &numCells, const BasisValues &refBasisValues) + { + return TransformedBasisValues(numCells,refBasisValues); + } + + /** \brief Transformation of a (vector) value field in the H-curl space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P,D), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P,D). + + Computes pullback of \e HCURL functions + \f$\Phi^*(\widehat{\bf u}_f) = \left((DF_c)^{-{\sf T}}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c}\f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the values of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p,*) = \widehat{\bf u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left((DF_c)^{-{\sf T}}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p}) + = (DF_c)^{-{\sf T}}(\widehat{x}_p)\cdot\widehat{\bf u}_f(\widehat{x}_p) \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of native basis | + | P | point | 0 <= P < num. integration points | + | D | space dim | 0 <= D < spatial dimension | + |------|----------------------|--------------------------------------------------| + \endcode + + \param jacobianInverse [in] - Input array containing cell Jacobian inverses. + \param inputVals [in] - Input array of reference HCURL values. + \return lazily-evaluated transformed basis values. + */ + template + static TransformedBasisValues + getHCURLtransformVALUE(const Data &jacobianInverse, + const BasisValues &refBasisValues ) + { + return TransformedBasisValues(jacobianInverse,refBasisValues); + } + + /** \brief Transformation of a 3D curl field in the H-curl space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P,D), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P,D). + + Computes pullback of curls of \e HCURL functions + \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c} DF_{c}\cdot\nabla\times\widehat{\bf u}_f\right)\circ F^{-1}_{c}\f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the curls of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p,*) = \nabla\times\widehat{\bf u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left(J^{-1}_{c} DF_{c}\cdot\nabla\times\widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p}) + = J^{-1}_{c}(\widehat{x}_p) DF_{c}(\widehat{x}_p)\cdot\nabla\times\widehat{\bf u}_f(\widehat{x}_p) + \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + | D | space dim | 0 <= D < spatial dimension | + |------|----------------------|--------------------------------------------------| + \endcode + + \param jacobianDividedByJacobianDet [in] - Input cell Jacobians, divided by their determinant. + \param inputVals [in] - Input container of reference HDIV values. + \return lazily-evaluated transformed basis values. + */ + template + static TransformedBasisValues + getHCURLtransformCURL(const Data &jacobianDividedByJacobianDet, + const BasisValues &refBasisValues ) + { + return TransformedBasisValues(jacobianDividedByJacobianDet,refBasisValues); + } + + /** \brief Transformation of a 2D curl field in the H-curl space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P,D), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P). + + Computes pullback of curls of \e HCURL functions + \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c}\nabla\times\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} \f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the 2d curls of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p) = \nabla\times\widehat{\bf u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left(J^{-1}_{c}\nabla\times\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} (x_{c,p}) + = J^{-1}_{c}(\widehat{x}_p) \nabla\times\widehat{\bf u}_{f} (\widehat{x}_p) + \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + | D | space dim | 0 <= D < spatial dimension | + |------|----------------------|--------------------------------------------------| + \endcode + + \param outputVals [out] - Output array with transformed values + \param jacobianDetInverse [in] - Reciprocals of input cell Jacobian determinants. + \param inputVals [in] - Input array of reference HCURL curls. + */ + + template + static TransformedBasisValues + getHCURLtransformCURL2D(const Data &jacobianDetInverse, + const BasisValues &refBasisValues ) + { + return TransformedBasisValues(jacobianDetInverse,refBasisValues); + } + + /** \brief Transformation of a (vector) value field in the H-div space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P,D), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P,D). + + Computes pullback of \e HDIV functions + \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c} DF_{c}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c} \f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the values of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p,*) = \widehat{\bf u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left(J^{-1}_{c} DF_{c}\cdot \widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p}) + = J^{-1}_{c}(\widehat{x}_p) DF_{c}(\widehat{x}_p)\cdot\widehat{\bf u}_f(\widehat{x}_p) + \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + | D | space dim | 0 <= D < spatial dimension | + |------|----------------------|--------------------------------------------------| + \endcode + + \param jacobianDividedByJacobianDet [in] - Input cell Jacobians, divided by their determinant. + \param inputVals [in] - Input container of reference HDIV values. + \return TransformedBasisValues object of logical shape (C,F,P,D). + */ + + template + static TransformedBasisValues + getHDIVtransformVALUE(const Data &jacobianDividedByJacobianDet, + const BasisValues &refBasisValues ) + { + return TransformedBasisValues(jacobianDividedByJacobianDet,refBasisValues); + } + + /** \brief Transformation of a divergence field in the H-div space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P). + + Computes pullback of the divergence of \e HDIV functions + \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c}\nabla\cdot\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} \f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the divergencies of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p) = \nabla\cdot\widehat{\bf u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left(J^{-1}_{c}\nabla\cdot\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} (x_{c,p}) + = J^{-1}_{c}(\widehat{x}_p) \nabla\cdot\widehat{\bf u}_{f} (\widehat{x}_p) + \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + |------|----------------------|--------------------------------------------------| + \endcode + + \param jacobianDetInverse [in] - Reciprocals of input cell Jacobian determinants. + \param refBasisDivValues [in] - Input container of reference HDIV divergences. + \return TransformedBasisValues object of logical shape (C,F,P). + */ + template + static TransformedBasisValues + getHDIVtransformDIV(const Data &jacobianDetInverse, + const BasisValues &refBasisDivValues ) + { + return TransformedBasisValues(jacobianDetInverse,refBasisDivValues); + } + + /** \brief Transformation of a (scalar) value field in the H-vol space, defined at points on a + reference cell, stored in the user-provided container inputVals + and indexed by (F,P), into the output container outputVals, + defined on cells in physical space and indexed by (C,F,P). + + Computes pullback of \e HVOL functions + \f$\Phi^*(\widehat{u}_f) = \left(J^{-1}_{c}\widehat{u}_{f}\right) \circ F^{-1}_{c} \f$ + for points in one or more physical cells that are images of a given set of points in the reference cell: + \f[ + \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,. + \f] + In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container + should contain the values of the functions in the set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the + reference points: + \f[ + inputVals(f,p) = \widehat{u}_f(\widehat{x}_p) \,. + \f] + The method returns + \f[ + outputVals(c,f,p,*) + = \left(J^{-1}_{c}\widehat{u}_{f}\right) \circ F^{-1}_{c} (x_{c,p}) + = J^{-1}_{c}(\widehat{x}_p) \widehat{u}_{f} (\widehat{x}_p) + \qquad 0\le c < C \,. + \f] + See Section \ref sec_pullbacks for more details about pullbacks. + + \code + |------|----------------------|--------------------------------------------------| + | | Index | Dimension | + |------|----------------------|--------------------------------------------------| + | C | cell | 0 <= C < num. integration domains | + | F | field | 0 <= F < dim. of the basis | + | P | point | 0 <= P < num. integration points | + |------|----------------------|--------------------------------------------------| + \endcode + + \param jacobianDetInverse [in] - Reciprocals of input cell Jacobian determinants. + \param refBasisValues [in] - Input container of reference HVOL values. + \return TransformedBasisValues object of logical shape (C,F,P). + */ + template + static TransformedBasisValues + getHVOLtransformVALUE(const Data &jacobianDetInverse, + const BasisValues &refBasisValues ) + { + return TransformedBasisValues(jacobianDetInverse,refBasisValues); } /** \brief Transformation of a (scalar) value field in the H-grad space, defined at points on a diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp index 727dd28c2a98..2780ad891922 100644 --- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp +++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp @@ -63,7 +63,7 @@ #include "Intrepid2_CellTools.hpp" #include "Intrepid2_Data.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_VectorData.hpp" #include "Kokkos_Core.hpp" @@ -88,9 +88,9 @@ namespace Intrepid2 { \return integrals, a container with logical shape (C,F,F), suitable for passing as the first argument to the integrate() variant that takes an Intrepid2::Data object as its first, integrals, argument. */ template - static Data allocateIntegralData(const TransformedVectorData vectorDataLeft, + static Data allocateIntegralData(const TransformedBasisValues vectorDataLeft, const TensorData cellMeasures, - const TransformedVectorData vectorDataRight); + const TransformedBasisValues vectorDataRight); /** \brief Contracts \a vectorDataLeft and \a vectorDataRight containers on point and space dimensions, weighting each point according to cellMeasures, @@ -104,9 +104,9 @@ namespace Intrepid2 { \param approxFlops [in] - if not NULL, the double pointed to will be set with an estimated number of floating point operations. Intended for performance assessment purposes. */ template - static void integrate(Data integrals, const TransformedVectorData &vectorDataLeft, + static void integrate(Data integrals, const TransformedBasisValues &vectorDataLeft, const TensorData &cellMeasures, - const TransformedVectorData &vectorDataRight, const bool sumInto = false, + const TransformedBasisValues &vectorDataRight, const bool sumInto = false, double* approximateFlops = NULL); }; // end IntegrationTools class diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp index 041ab6d45c49..cf5a8aa68905 100644 --- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp +++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp @@ -1490,7 +1490,7 @@ namespace Intrepid2 { // const int numPointsFirst = leftFirstComponent.extent_int(1); // // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) { -// pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal); +// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal); // }); // // // synchronize threads @@ -1608,7 +1608,7 @@ namespace Intrepid2 { for (int b_component=0; b_component < rightComponentSpan_; b_component++) { // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) { -// pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal); +// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal); // }); flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access @@ -1789,9 +1789,9 @@ namespace Intrepid2 { template template -Data IntegrationTools::allocateIntegralData(const TransformedVectorData vectorDataLeft, +Data IntegrationTools::allocateIntegralData(const TransformedBasisValues vectorDataLeft, const TensorData cellMeasures, - const TransformedVectorData vectorDataRight) + const TransformedBasisValues vectorDataRight) { // allocates a (C,F,F) container for storing integral data @@ -1834,9 +1834,9 @@ Data IntegrationTools::allocateIntegralData(const //! 2. arbitrary mesh: cellMeasures has trivial tensor product structure (one tensorial component). template template -void IntegrationTools::integrate(Data integrals, const TransformedVectorData & vectorDataLeft, +void IntegrationTools::integrate(Data integrals, const TransformedBasisValues & basisValuesLeft, const TensorData & cellMeasures, - const TransformedVectorData & vectorDataRight, const bool sumInto, + const TransformedBasisValues & basisValuesRight, const bool sumInto, double* approximateFlops) { using ExecutionSpace = typename DeviceType::execution_space; @@ -1855,68 +1855,89 @@ void IntegrationTools::integrate(Data integrals, } const int cellDataExtent = integrals.getDataExtent(0); - const int numFieldsLeft = vectorDataLeft.numFields(); - const int numFieldsRight = vectorDataRight.numFields(); - const int spaceDim = vectorDataLeft.spaceDim(); + const int numFieldsLeft = basisValuesLeft.numFields(); + const int numFieldsRight = basisValuesRight.numFields(); + const int spaceDim = basisValuesLeft.spaceDim(); - INTREPID2_TEST_FOR_EXCEPTION(vectorDataLeft.spaceDim() != vectorDataRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension"); + INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension"); - // TODO: add more checks against validity of inputs + const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies(); + const int rightFamilyCount = basisValuesRight.basisValues().numFamilies(); // we require that the number of tensor components in the vectors are the same for each vector entry // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases int numTensorComponentsLeft = -1; - const auto &refVectorLeft = vectorDataLeft.vectorData(); - int numFamiliesLeft = refVectorLeft.numFamilies(); - int numVectorComponentsLeft = refVectorLeft.numComponents(); - Kokkos::Array maxFieldsForComponentLeft {0,0,0,0,0,0,0}; - Kokkos::Array maxFieldsForComponentRight {0,0,0,0,0,0,0}; - for (int familyOrdinal=0; familyOrdinal maxFieldsForComponentLeft {0,0,0,0,0,0,0}; + Kokkos::Array maxFieldsForComponentRight {0,0,0,0,0,0,0}; + for (int familyOrdinal=0; familyOrdinal &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent); - if (tensorData.numTensorComponents() > 0) + for (int vectorComponent=0; vectorComponent &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent); + if (tensorData.numTensorComponents() > 0) { - maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]); + if (numTensorComponentsLeft == -1) + { + numTensorComponentsLeft = tensorData.numTensorComponents(); + } + INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other"); + for (int r=0; r 0) + for (int vectorComponent=0; vectorComponent 0) { - maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]); + if (numTensorComponentsRight == -1) + { + numTensorComponentsRight = tensorData.numTensorComponents(); + } + INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other"); + for (int r=0; r::integrate(Data integrals, pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0); } - Kokkos::Array< Kokkos::Array, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents> componentIntegrals; // these are reference-space integrals; no Jacobians or cell measures applied as yet - - for (int r=0; r & leftComponent = vectorDataLeft.vectorData().getComponent(d).getTensorComponent(r); - const Data & rightComponent = vectorDataRight.vectorData().getComponent(d).getTensorComponent(r); - - // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.) - - const Data & quadratureWeights = cellMeasures.getTensorComponent(r+1); - - int leftComponentCount = leftComponent.extent_int(0); - int rightComponentCount = rightComponent.extent_int(0); - - const bool allocateFadStorage = !std::is_pod::value; - if (allocateFadStorage) - { - auto fad_size_output = dimension_scalar(integrals.getUnderlyingView()); - componentIntegrals[r][d] = ScalarView("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount, fad_size_output); - } - else - { - componentIntegrals[r][d] = ScalarView("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount); - } - - Kokkos::deep_copy(componentIntegrals[r][d], 0.0); // initialize - - const int & numPoints = pointDimensions[r]; - const int leftComponentRank = leftComponent.rank(); - const int rightComponentRank = rightComponent.rank(); - - const int leftComponentDimSpan = leftComponent.extent_int(2); - const int rightComponentDimSpan = rightComponent.extent_int(2); - INTREPID2_TEST_FOR_EXCEPTION(( leftComponentDimSpan != rightComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions."); - -// // TODO: add support for cases where the component ranks are 3, and spaceDim > 1. -// INTREPID2_TEST_FOR_EXCEPTION(( leftComponentRank == 3) && ( leftComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case."); -// INTREPID2_TEST_FOR_EXCEPTION((rightComponentRank == 3) && (rightComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case."); - - auto policy = Kokkos::MDRangePolicy>({0,0},{leftComponentCount,rightComponentCount}); - - for (int a=0; a integralView2; Kokkos::View integralView3; @@ -2023,21 +1964,16 @@ void IntegrationTools::integrate(Data integrals, { integralView2 = integrals.getUnderlyingView2(); } - - const int leftFamilyCount = vectorDataLeft.vectorData().numFamilies(); - const int rightFamilyCount = vectorDataRight.vectorData().numFamilies(); - for (int leftFamilyOrdinal=0; leftFamilyOrdinal leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal) + : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal); if (!leftComponent.isValid()) { a_offset++; // empty components are understood to take up one dimension @@ -2049,15 +1985,14 @@ void IntegrationTools::integrate(Data integrals, for (int rightFamilyOrdinal=0; rightFamilyOrdinal rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal) + : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal); if (!rightComponent.isValid()) { b_offset++; // empty components are understood to take up one dimension @@ -2082,13 +2017,81 @@ void IntegrationTools::integrate(Data integrals, const int d_start = a_offset; const int d_end = d_start + leftDimSpan; - if (haveLaunchedContributionToLeftFamily && haveLaunchedContributionToRightFamily) + using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>; + ComponentIntegralsArray componentIntegrals; + for (int r=0; r quadratureWeights = cellMeasures.getTensorComponent(r+1); + const int numPoints = pointDimensions[r]; + + // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.) + + Data leftTensorComponent = leftComponent.getTensorComponent(r); + Data rightTensorComponent = rightComponent.getTensorComponent(r); + + const int leftTensorComponentRank = leftTensorComponent.rank(); + const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2); + const int leftTensorComponentFields = leftTensorComponent.extent_int(0); + const int rightTensorComponentRank = rightTensorComponent.rank(); + const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2); + const int rightTensorComponentFields = rightTensorComponent.extent_int(0); + + INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions."); + + for (int d=d_start; d::value; + if (allocateFadStorage) + { + auto fad_size_output = dimension_scalar(integrals.getUnderlyingView()); + componentIntegrals[r][d] = ScalarView("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output); + } + else + { + componentIntegrals[r][d] = ScalarView("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields); + } + + auto componentIntegralView = componentIntegrals[r][d]; + + auto policy = Kokkos::MDRangePolicy>({0,0},{leftTensorComponentFields,rightTensorComponentFields}); + + for (int a=0; a upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness. Kokkos::Array lowerBounds {0,0,0}; @@ -2107,8 +2110,8 @@ void IntegrationTools::integrate(Data integrals, Scalar integralSum = 0.0; for (int d=d_start; d::integrate(Data integrals, } } }); - b_offset += rightDimSpan; - } // rightVectorComponentOrdinal loop - } // rightFamilyOrdinal loop - + } // rightVectorComponentOrdinal + } // rightFamilyOrdinal a_offset += leftDimSpan; } // leftVectorComponentOrdinal } // leftFamilyOrdinal @@ -2154,28 +2155,74 @@ void IntegrationTools::integrate(Data integrals, else // general case (not axis-aligned + affine tensor-product structure) { // prepare composed transformation matrices - const Data & leftTransform = vectorDataLeft.transform(); - const Data & rightTransform = vectorDataRight.transform(); + const Data & leftTransform = basisValuesLeft.transform(); + const Data & rightTransform = basisValuesRight.transform(); const bool transposeLeft = true; const bool transposeRight = false; // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat"); // timer->start(); - Data composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform); - composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform); + // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 + const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4); + Data composedTransform; + // invalid/empty transforms are used when the identity is intended. + if (leftTransform.isValid() && rightTransform.isValid()) + { + if (matrixTransform) + { + composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform); + composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform); + + // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount. + if (approximateFlops != NULL) + { + *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2; + } + } + else + { + composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform); + composedTransform.storeInPlaceProduct(leftTransform, rightTransform); + + // re-cast composedTranform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P). + const int newRank = 4; + auto extents = composedTransform.getExtents(); + auto variationTypes = composedTransform.getVariationTypes(); + composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes); + if (approximateFlops != NULL) + { + *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry + } + } + } + else if (leftTransform.isValid()) + { + // rightTransform is the identity + composedTransform = leftTransform; + } + else if (rightTransform.isValid()) + { + // leftTransform is the identity + composedTransform = rightTransform; + } + else + { + // both left and right transforms are identity + Kokkos::Array extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim}; + Kokkos::Array variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL}; + + Kokkos::View identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim); + Kokkos::deep_copy(identityUnderlyingView, 1.0); + composedTransform = Data(identityUnderlyingView,extents,variationTypes); + } + // timer->stop(); // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n"; // timer->reset(); - // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount. - if (approximateFlops != NULL) - { - *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2; - } - - const int leftFamilyCount = vectorDataLeft. vectorData().numFamilies(); - const int rightFamilyCount = vectorDataRight.vectorData().numFamilies(); - const int leftComponentCount = vectorDataLeft. vectorData().numComponents(); - const int rightComponentCount = vectorDataRight.vectorData().numComponents(); + const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies(); + const int rightFamilyCount = basisValuesRight.basisValues().numFamilies(); + const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1; + const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1; int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families for (int leftFamilyOrdinal=0; leftFamilyOrdinal::integrate(Data integrals, bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel. for (int leftComponentOrdinal=0; leftComponentOrdinal & leftComponent = vectorDataLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal); + TensorData leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal) + : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal); if (!leftComponent.isValid()) { // represents zero - a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal); + a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal); continue; } @@ -2203,11 +2251,12 @@ void IntegrationTools::integrate(Data integrals, int b_offset = 0; for (int rightComponentOrdinal=0; rightComponentOrdinal & rightComponent = vectorDataRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal); + TensorData rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal) + : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal); if (!rightComponent.isValid()) { // represents zero - b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal); + b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal); continue; } @@ -2305,20 +2354,20 @@ void IntegrationTools::integrate(Data integrals, } } } - b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal); + b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1; } - rightFieldOrdinalOffset += vectorDataRight.vectorData().numFieldsInFamily(rightFamilyOrdinal); + rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal); } - a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal); + a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1; } - leftFieldOrdinalOffset += vectorDataLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal); + leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal); } } // if (approximateFlops != NULL) // { // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl; // } - ExecutionSpace().fence(); // make sure we've finished writing to integrals container before exiting… + ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return } } // end namespace Intrepid2 diff --git a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp index 4c626f897b1c..e5a1ab39f1ce 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp @@ -367,13 +367,25 @@ class ZeroView { } public: - //! For use with Data object into which a value will be stored. + //! For use with Data object into which a value will be stored. We use passThroughBlockDiagonalArgs = true for storeInPlaceCombination(). + template struct FullArgExtractorWritableData { template static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs) { - return view.getWritableEntry(intArgs...); + return view.getWritableEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...); + } + }; + + //! For use with Data object into which a value will be stored. We use passThroughBlockDiagonalArgs = true for storeInPlaceCombination(). + template + struct FullArgExtractorData + { + template + static KOKKOS_INLINE_FUNCTION const_reference_type get(const ViewType &view, const IntArgs&... intArgs) + { + return view.getEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...); } }; @@ -475,8 +487,8 @@ class ZeroView { const ConstantArgExtractor constArg; const FullArgExtractor fullArgs; - const FullArgExtractor fullArgsConst; - const FullArgExtractorWritableData fullArgsWritable; + const FullArgExtractorData fullArgsData; // true: pass through block diagonal args. This is due to the behavior of dataExtentRangePolicy() for block diagonal args. + const FullArgExtractorWritableData fullArgsWritable; // true: pass through block diagonal args. This is due to the behavior of dataExtentRangePolicy() for block diagonal args. const SingleArgExtractor arg0; const SingleArgExtractor arg1; @@ -539,7 +551,7 @@ class ZeroView { } else // this_full, not B_full: B may have modular data, etc. { - auto BAE = fullArgsConst; + auto BAE = fullArgsData; storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE); } } @@ -567,7 +579,7 @@ class ZeroView { { // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData auto thisAE = fullArgsWritable; - auto BAE = fullArgsConst; + auto BAE = fullArgsData; storeInPlaceCombination(policy, thisData, A_underlying, B, binaryOperator, thisAE, AAE, BAE); } } @@ -590,7 +602,7 @@ class ZeroView { else // this_full, not A_full: A may have modular data, etc. { // use A (the Data object). This could be further optimized by using A's underlying View and an appropriately-defined ArgExtractor. - auto AAE = fullArgsConst; + auto AAE = fullArgsData; storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, thisAE, AAE, BAE); } } @@ -618,7 +630,7 @@ class ZeroView { { // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData auto thisAE = fullArgsWritable; - auto AAE = fullArgsConst; + auto AAE = fullArgsData; storeInPlaceCombination(policy, thisData, A, B_underlying, binaryOperator, thisAE, AAE, BAE); } } @@ -661,12 +673,12 @@ class ZeroView { // B is not full-extent in dimension argThis; use the Data object switch (argThis) { - case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsConst); break; - case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsConst); break; - case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsConst); break; - case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsConst); break; - case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsConst); break; - case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsConst); break; + case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsData); break; + case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsData); break; + case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsData); break; + case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsData); break; + case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsData); break; + case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsData); break; default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); } } @@ -675,12 +687,12 @@ class ZeroView { // A is not full-extent in dimension argThis; use the Data object switch (argThis) { - case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsConst, arg0); break; - case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsConst, arg1); break; - case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsConst, arg2); break; - case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsConst, arg3); break; - case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsConst, arg4); break; - case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsConst, arg5); break; + case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsData, arg0); break; + case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsData, arg1); break; + case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsData, arg2); break; + case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsData, arg3); break; + case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsData, arg4); break; + case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsData, arg5); break; default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index"); } } @@ -715,7 +727,7 @@ class ZeroView { { // A is full; B is not full, but not constant or full-extent 1D // unoptimized in B access: - auto BAE = fullArgsConst; + auto BAE = fullArgsData; storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE); } } @@ -742,7 +754,7 @@ class ZeroView { } else { - auto BAE = fullArgsConst; + auto BAE = fullArgsData; switch (argIndex) { case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg0, BAE); break; @@ -758,8 +770,8 @@ class ZeroView { else // A not full, and not full-extent 1D { // unoptimized in A, B accesses. - auto AAE = fullArgsConst; - auto BAE = fullArgsConst; + auto AAE = fullArgsData; + auto BAE = fullArgsData; storeInPlaceCombination(policy, this_underlying, A, B, binaryOperator, thisAE, AAE, BAE); } } @@ -768,8 +780,8 @@ class ZeroView { { // completely un-optimized case: we use Data objects for this, A, B. auto thisAE = fullArgsWritable; - auto AAE = fullArgsConst; - auto BAE = fullArgsConst; + auto AAE = fullArgsData; + auto BAE = fullArgsData; storeInPlaceCombination(policy, thisData, A, B, binaryOperator, thisAE, AAE, BAE); } } @@ -783,7 +795,7 @@ class ZeroView { auto policy = dataExtentRangePolicy(); using DataType = Data; - using ThisAE = FullArgExtractorWritableData; + using ThisAE = FullArgExtractorWritableData; using AAE = FullArgExtractor; using BAE = FullArgExtractor; @@ -897,108 +909,126 @@ class ZeroView { INTREPID2_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unsupported data rank"); } } - - //! Returns an l-value reference to the specified logical entry in the underlying view. Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location. Intended for Intrepid2 developers and expert users only. + + //! Returns an l-value reference to the specified logical entry in the underlying view. Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location. Intended for Intrepid2 developers and expert users only. If passThroughBlockDiagonalArgs is TRUE, the corresponding arguments are interpreted as entries in the 1D packed matrix rather than as logical 2D matrix row and column. template KOKKOS_INLINE_FUNCTION - reference_type getWritableEntry(const IntArgs... intArgs) const + reference_type getWritableEntryWithPassThroughOption(const bool &passThroughBlockDiagonalArgs, const IntArgs... intArgs) const { -#ifdef INTREPID2_HAVE_DEBUG - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(numArgs != rank_, std::invalid_argument, "getWritableEntry() should have the same number of arguments as the logical rank."); -#endif - constexpr int numArgs = sizeof...(intArgs); - if (underlyingMatchesLogical_) - { - // in this case, we require that numArgs == dataRank_ - return getUnderlyingView()(intArgs...); - } - - // extract the type of the first argument; use that for the arrays below - using int_type = std::tuple_element_t<0, std::tuple>; - - const Kokkos::Array args {intArgs...}; - Kokkos::Array refEntry; - for (int d=0; d()(intArgs...); + } + + // extract the type of the first argument; use that for the arrays below + using int_type = std::tuple_element_t<0, std::tuple>; + + const Kokkos::Array args {intArgs...,0}; // we pad with one extra entry (0) to avoid gcc compiler warnings about references beyond the bounds of the array (the [d+1]'s below) + Kokkos::Array refEntry; + for (int d=0; d= numArgs) - { - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one"); - } - else + case CONSTANT: refEntry[d] = 0; break; + case GENERAL: refEntry[d] = args[d]; break; + case MODULAR: refEntry[d] = args[d] % variationModulus_[d]; break; + case BLOCK_PLUS_DIAGONAL: { - const int_type &j = args[d+1]; - - if ((i > static_cast(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast(blockPlusDiagonalLastNonDiagonal_))) + if (passThroughBlockDiagonalArgs) { - if (i != j) + refEntry[d] = args[d]; + refEntry[d+1] = args[d+1]; // this had better be == 0 + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(args[d+1] != 0, std::invalid_argument, "getWritableEntry() called with passThroughBlockDiagonalArgs = true, but nonzero second matrix argument."); + } + else + { + const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_); + + const int_type &i = args[d]; + if (d+1 >= numArgs) { - // off diagonal: zero - return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance. When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method. (Use methods that directly take advantage of the data packing instead.) + INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one"); } else { - refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i); + const int_type &j = args[d+1]; + + if ((i > static_cast(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast(blockPlusDiagonalLastNonDiagonal_))) + { + if (i != j) + { + // off diagonal: zero + return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance. When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method. (Use methods that directly take advantage of the data packing instead.) + } + else + { + refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i); + } + } + else + { + refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j); + } + + // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above) + refEntry[d+1] = 0; } } - else - { - refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j); - } - - // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above) - refEntry[d+1] = 0; + d++; } - d++; } } - } - // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1). - for (int d=numArgs; d<7; d++) - { - refEntry[d] = 0; - } + // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1). + for (int d=numArgs; d<7; d++) + { + refEntry[d] = 0; + } + + if (dataRank_ == 1) + { + return data1_(refEntry[activeDims_[0]]); + } + else if (dataRank_ == 2) + { + return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]); + } + else if (dataRank_ == 3) + { + return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]); + } + else if (dataRank_ == 4) + { + return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]); + } + else if (dataRank_ == 5) + { + return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], + refEntry[activeDims_[4]]); + } + else if (dataRank_ == 6) + { + return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], + refEntry[activeDims_[4]],refEntry[activeDims_[5]]); + } + else // dataRank_ == 7 + { + return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], + refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]); + } - if (dataRank_ == 1) - { - return data1_(refEntry[activeDims_[0]]); - } - else if (dataRank_ == 2) - { - return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]); - } - else if (dataRank_ == 3) - { - return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]); - } - else if (dataRank_ == 4) - { - return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]); - } - else if (dataRank_ == 5) - { - return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]]); - } - else if (dataRank_ == 6) - { - return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]],refEntry[activeDims_[5]]); - } - else // dataRank_ == 7 - { - return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]], - refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]); - } + } + + //! Returns an l-value reference to the specified logical entry in the underlying view. Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location. Intended for Intrepid2 developers and expert users only. If passThroughBlockDiagonalArgs is TRUE, the corresponding arguments are interpreted as entries in the 1D packed matrix rather than as logical 2D matrix row and column. + template + KOKKOS_INLINE_FUNCTION + reference_type getWritableEntry(const IntArgs... intArgs) const + { + return getWritableEntryWithPassThroughOption(false, intArgs...); } public: //! Generic data copying method to allow construction of Data object from DynRankViews for which deep_copy() to the underlying view would be disallowed. This method made public to allow CUDA compilation (because it contains a Kokkos lambda). @@ -1058,7 +1088,7 @@ class ZeroView { { std::vector dataExtents; - bool blockPlusDiagonalEncountered = true; + bool blockPlusDiagonalEncountered = false; for (int d=0; d + KOKKOS_INLINE_FUNCTION + return_type getEntryWithPassThroughOption(const bool &passThroughBlockDiagonalArgs, const IntArgs&... intArgs) const + { + return getWritableEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...); + } + //! Returns a (read-only) value corresponding to the specified logical data location. template KOKKOS_INLINE_FUNCTION return_type getEntry(const IntArgs&... intArgs) const { - return getWritableEntry(intArgs...); + return getEntryWithPassThroughOption(false, intArgs...); } template struct bool_pack; @@ -1806,6 +1844,14 @@ class ZeroView { return false; // statement should be unreachable; included because compilers don't necessarily recognize that fact... } + //! Constructs a container with extents matching this, with a single-value underlying View, CONSTANT in each dimension. + //! \param value [in] - the constant value. + //! \return A container with the same logical shape as this, with a single-value underlying View. + Data allocateConstantData( const DataScalar &value ) + { + return Data(value, this->getExtents()); + } + //! Constructs a container suitable for storing the result of an in-place combination of the two provided data containers. The two containers must have the same logical shape. //! \see storeInPlaceCombination() //! \param A [in] - the first data container. @@ -2183,6 +2229,23 @@ class ZeroView { return policy; } + //! Creates a new Data object with the same underlying view, but with the specified logical rank, extents, and variation types. + Data shallowCopy(const int rank, const Kokkos::Array &extents, const Kokkos::Array &variationTypes) + { + switch (dataRank_) + { + case 1: return Data(data1_, extents, variationTypes); + case 2: return Data(data2_, extents, variationTypes); + case 3: return Data(data3_, extents, variationTypes); + case 4: return Data(data4_, extents, variationTypes); + case 5: return Data(data5_, extents, variationTypes); + case 6: return Data(data6_, extents, variationTypes); + case 7: return Data(data7_, extents, variationTypes); + default: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled dataRank_"); + } + } + //! Places the result of an in-place combination (e.g., entrywise sum) into this data container. template void storeInPlaceCombination(const Data &A, const Data &B, BinaryOperator binaryOperator) diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp index f42846072a67..15d93372955c 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp @@ -157,6 +157,32 @@ namespace Intrepid2 initialize(); } + /** + \brief Constructor to combine two other TensorData objects + \param [in] first - TensorData object with the components for the first dimension(s) + \param [in] second - TensorData object with the components for the remaining dimension(s). + \param [in] separateFirstComponent - if true, indicates that the first component (from the first TensorData object) will be indexed separately (this is used when the first index corresponds to a cell ordinal) + + When separateFirstComponent is true, all components are required to have rank 1, and TensorData has rank 2, with the first argument reserved for the first component. The second argument is indexed precisely as described above, omitting the first component. + */ + TensorData(const TensorData &first, const TensorData &second, bool separateFirstComponent = false) + : + separateFirstComponent_(separateFirstComponent), + numTensorComponents_(first.numTensorComponents() + second.numTensorComponents()) + { + ordinal_type r = 0; + for (ordinal_type r1=0; r1::reference_type; + void TEST_VALID_POINT_COMPONENTS() + { +#ifdef HAVE_INTREPID2_DEBUG + if (isValid_) + { + for (ordinal_type r=0; r whichDims) + : + numTensorComponents_(whichDims.size()), + isValid_(true) + { + int r = 0; + for (const auto & componentOrdinal : whichDims) + { + pointTensorComponents_[r++] = otherPointsContainer.getTensorComponent(componentOrdinal); + } + + initialize(); + } + /** \brief Constructor with variable-length std::vector argument. \param [in] pointTensorComponents - the components representing the points. diff --git a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp index d76bdc9ef603..981325720f03 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp @@ -566,20 +566,22 @@ The total number of points defined will be a triangular number; if n=numPointsBa } template - typename std::enable_if< !supports_rank::value, void >::type + typename std::enable_if< !(supports_rank::value && supports_rank::value), void >::type testFloatingEquality(const FunctorType1 &functor1, const FunctorType2 &functor2, double relTol, double absTol, Teuchos::FancyOStream &out, bool &success, std::string functor1Name = "Functor 1", std::string functor2Name = "Functor 2") { - INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "testFloatingEquality() called on FunctorType1 that does not support the specified rank.\n"); + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "testFloatingEquality() called on FunctorType1 or FunctorType2 that does not support the specified rank.\n"); } //! this method assumes both functors are accesible on device. template - typename std::enable_if< supports_rank::value, void >::type + typename std::enable_if< (supports_rank::value && supports_rank::value), void >::type testFloatingEquality(const FunctorType1 &functor1, const FunctorType2 &functor2, double relTol, double absTol, Teuchos::FancyOStream &out, bool &success, std::string functor1Name = "Functor 1", std::string functor2Name = "Functor 2") { - static_assert( supports_rank::value, "Both Functor1 and Functor2 must support the specified rank through operator()."); + static_assert( supports_rank::value, "Functor1 must support the specified rank through operator()."); + static_assert( supports_rank::value, "Functor2 must support the specified rank through operator()."); + using Functor1IteratorScalar = FunctorIterator; using Functor2IteratorScalar = FunctorIterator; diff --git a/packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp similarity index 62% rename from packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp rename to packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp index 197e3ed17e21..e103106e3207 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp @@ -41,43 +41,46 @@ // ************************************************************************ // @HEADER -/** \file Intrepid2_TransformedVectorData.hpp - \brief Structure-preserving representation of transformed vector data; reference space values and transformations are stored separately. +/** \file Intrepid2_TransformedBasisValues.hpp + \brief Structure-preserving representation of transformed basis values; reference space values and transformations are stored separately. + + There are effectively two modes: one for vector-valued BasisValues, and one for scalar-valued BasisValues. In the former case the transformation is logically a matrix, with shape (C,P,D,D). In the latter case, the transformation is logically a weight on each physical-space quadrature point, with shape (C,P). If the transform is left unset, it is understood to be the identity. \author Nathan V. Roberts */ -#ifndef Intrepid2_TransformedVectorData_h -#define Intrepid2_TransformedVectorData_h - -#include "Intrepid2_VectorData.hpp" +#ifndef Intrepid2_TransformedBasisValues_h +#define Intrepid2_TransformedBasisValues_h +#include "Intrepid2_BasisValues.hpp" #include "Intrepid2_ScalarView.hpp" namespace Intrepid2 { -/** \class Intrepid2::TransformedVectorData +/** \class Intrepid2::TransformedBasisValues \brief Structure-preserving representation of transformed vector data; reference space values and transformations are stored separately. - TransformedVectorData provides a View-like interface of rank 4, with shape (C,F,P,D). When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation. + TransformedBasisValues provides a View-like interface of rank 4, with shape (C,F,P,D). When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation. */ template - class TransformedVectorData + class TransformedBasisValues { public: - using Transform = Data; + ordinal_type numCells_; - Data transform_; // (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform + Data transform_; // vector case: (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform. Scalar case: (C,P), or unset for identity. - VectorData vectorData_; // notionally (F,P,D) container + BasisValues basisValues_; /** \brief Standard constructor. - \param [in] transform - the transformation matrix, with logical shape (C,P,D,D) - \param [in] vectorData - the reference-space data to be transformed, with logical shape (F,P,D) + \param [in] transform - the transformation (matrix), with logical shape (C,P) or (C,P,D,D) + \param [in] basisValues - the reference-space data to be transformed, with logical shape (F,P) (for scalar values) or (F,P,D) (for vector values) */ - TransformedVectorData(const Data &transform, const VectorData &vectorData) + TransformedBasisValues(const Data &transform, const BasisValues &basisValues) : - transform_(transform), vectorData_(vectorData) + numCells_(transform.extent_int(0)), + transform_(transform), + basisValues_(basisValues) { // sanity check: when transform is diagonal, we expect there to be no pointwise variation. INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(transform_.isDiagonal() && (transform_.getVariationTypes()[1] != CONSTANT), std::invalid_argument, "When transform is diagonal, we assume in various places that there is no pointwise variation; the transform_ Data should have CONSTANT as its variation type in dimension 1."); @@ -87,17 +90,27 @@ namespace Intrepid2 { \brief Constructor for the case of an identity transform. \param [in] vectorData - the reference-space data, with logical shape (F,P,D) */ - TransformedVectorData(const VectorData &vectorData) + TransformedBasisValues(const ordinal_type &numCells, const BasisValues &basisValues) : - vectorData_(vectorData) + numCells_(numCells), + basisValues_(basisValues) {} //! copy-like constructor for differing device types. This may do a deep_copy of underlying views, depending on the memory spaces involved. template::value>::type> - TransformedVectorData(const TransformedVectorData &transformedVectorData) + TransformedBasisValues(const TransformedBasisValues &transformedVectorData) : + numCells_(transformedVectorData.numCells()), transform_(transformedVectorData.transform()), - vectorData_(transformedVectorData.vectorData()) + basisValues_(transformedVectorData.basisValues()) + {} + + /** + \brief Default constructor; an invalid container. Will return -1 for numCells(). + */ + TransformedBasisValues() + : + numCells_(-1) {} //! Returns true if the transformation matrix is diagonal. @@ -113,7 +126,12 @@ namespace Intrepid2 { return transform_.isDiagonal(); } } - + + BasisValues basisValues() const + { + return basisValues_; + } + //! Returns the true data extent in the cell dimension (e.g., will be 1 for transform matrices that do not vary from one cell to the next). KOKKOS_INLINE_FUNCTION int cellDataExtent() const { @@ -129,50 +147,78 @@ namespace Intrepid2 { //! Returns the logical extent in the cell dimension, which is the 0 dimension in this container. KOKKOS_INLINE_FUNCTION int numCells() const { - return transform_.extent_int(0); + return numCells_; } //! Returns the logical extent in the fields dimension, which is the 1 dimension in this container. KOKKOS_INLINE_FUNCTION int numFields() const { - return vectorData_.numFields(); + return basisValues_.extent_int(0); } //! Returns the logical extent in the points dimension, which is the 2 dimension in this container. KOKKOS_INLINE_FUNCTION int numPoints() const { - return vectorData_.numPoints(); + return basisValues_.extent_int(1); } //! Returns the logical extent in the space dimension, which is the 3 dimension in this container. KOKKOS_INLINE_FUNCTION int spaceDim() const { - return vectorData_.spaceDim(); + return basisValues_.extent_int(2); } - //! Accessor, with arguments (C,F,P,D). + //! Scalar accessor, with arguments (C,F,P). + KOKKOS_INLINE_FUNCTION Scalar operator()(const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal) const + { + if (!transform_.isValid()) + { + // null transform is understood as the identity + return basisValues_(fieldOrdinal,pointOrdinal); + } + else + { + return transform_(cellOrdinal,pointOrdinal) * basisValues_(fieldOrdinal,pointOrdinal); + } + } + + //! Vector accessor, with arguments (C,F,P,D). KOKKOS_INLINE_FUNCTION Scalar operator()(const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal, const int &dim) const { if (!transform_.isValid()) { // null transform is understood as the identity - return vectorData_(fieldOrdinal,pointOrdinal,dim); + return basisValues_(fieldOrdinal,pointOrdinal,dim); } else if (transform_.isDiagonal()) { - return transform_(cellOrdinal,pointOrdinal,dim,dim) * vectorData_(fieldOrdinal,pointOrdinal,dim); + return transform_(cellOrdinal,pointOrdinal,dim,dim) * basisValues_(fieldOrdinal,pointOrdinal,dim); } else { Scalar value = 0.0; for (int d2=0; d2 & vectorData() const { - return vectorData_; + return basisValues_.vectorData(); } - //! Returns the rank of the container, which is 4. + //! Returns the rank of the container, which is 3 for scalar values, and 4 for vector values. KOKKOS_INLINE_FUNCTION - constexpr unsigned rank() const + unsigned rank() const { - return 4; // shape is (C,F,P,D) + return basisValues_.rank() + 1; // transformation adds a cell dimension } //! Returns the extent in the specified dimension as an int. @@ -216,10 +262,9 @@ namespace Intrepid2 { else if (r == 3) return spaceDim(); else if (r > 3) return 1; - INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "Unsupported rank"); return -1; // unreachable return; here to avoid compiler warnings. } }; } -#endif /* Intrepid2_TransformedVectorData_h */ +#endif /* Intrepid2_TransformedBasisValues_h */ diff --git a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp index cd362e2e4b68..2e94566365d9 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp @@ -139,6 +139,8 @@ namespace Intrepid2 { static constexpr ordinal_type MaxDerivative = 10; /// Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to be formed by 7 line basis components. static constexpr ordinal_type MaxTensorComponents = 7; + /// Maximum number of components that a VectorData object will store -- 66 corresponds to OPERATOR_D10 on an H^1 hexahedral basis. For now, we limit to 7, matching MaxTensorComponents. + static constexpr ordinal_type MaxVectorComponents = 7; // we do not want to use hard-wired epsilon, threshold and tolerence. // static constexpr double Epsilon = 1.0e-16; diff --git a/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp index 5544c20cee6d..2e3913538794 100644 --- a/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp +++ b/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp @@ -65,17 +65,16 @@ namespace Intrepid2 { class VectorData { public: - using VectorArray = Kokkos::Array< TensorData, Parameters::MaxTensorComponents>; // for axis-aligned case, these correspond entry-wise to the axis with which the vector values align + using VectorArray = Kokkos::Array< TensorData, Parameters::MaxVectorComponents >; // for axis-aligned case, these correspond entry-wise to the axis with which the vector values align using FamilyVectorArray = Kokkos::Array< VectorArray, Parameters::MaxTensorComponents>; FamilyVectorArray vectorComponents_; // outer: family ordinal; inner: component/spatial dimension ordinal bool axialComponents_; // if true, each entry in vectorComponents_ is an axial component vector; for 3D: (f1,0,0); (0,f2,0); (0,0,f3). The 0s are represented by trivial/invalid TensorData objects. In this case, numComponents_ == numFamilies_. int totalDimension_; - Kokkos::Array dimToComponent_; - Kokkos::Array dimToComponentDim_; - - Kokkos::Array numDimsForComponent_; + Kokkos::Array dimToComponent_; + Kokkos::Array dimToComponentDim_; + Kokkos::Array numDimsForComponent_; Kokkos::Array familyFieldUpperBound_; // one higher than the end of family indicated @@ -128,6 +127,7 @@ namespace Intrepid2 { INTREPID2_TEST_FOR_EXCEPTION(!validEntryFoundForFamily, std::invalid_argument, "Each family must have at least one valid TensorData entry"); } + // do a pass through components to determine total component dim (totalDimension_) and size lookups appropriately int currentDim = 0; for (unsigned j=0; j Parameters::MaxTensorComponents, std::invalid_argument, "numFamilies must be less than Parameters::MaxTensorComponents"); - INTREPID2_TEST_FOR_EXCEPTION(numComponents_ > Parameters::MaxTensorComponents, std::invalid_argument, "numComponents must be less than Parameters::MaxTensorComponents"); + INTREPID2_TEST_FOR_EXCEPTION(numFamilies_ > Parameters::MaxTensorComponents, std::invalid_argument, "numFamilies must be at most Parameters::MaxTensorComponents"); + INTREPID2_TEST_FOR_EXCEPTION(numComponents_ > Parameters::MaxVectorComponents, std::invalid_argument, "numComponents must be at most Parameters::MaxVectorComponents"); for (unsigned i=0; i::HGRAD_LINE; using DGBasis = DGHierarchicalBasisFamily::HGRAD_LINE; - std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; + // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis. We can support more by either increasing + // Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of + // components. (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to + // avoid using UVM.) + std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; // these tolerances are selected such that we have a little leeway for architectural differences // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions) @@ -488,7 +492,11 @@ namespace using CGBasis = HierarchicalBasisFamily::HGRAD_QUAD; using DGBasis = DGHierarchicalBasisFamily::HGRAD_QUAD; - std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; + // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis. We can support more by either increasing + // Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of + // components. (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to + // avoid using UVM.) + std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; // these tolerances are selected such that we have a little leeway for architectural differences // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions) @@ -508,7 +516,11 @@ namespace using HierarchicalBasis = HierarchicalBasisFamily::HGRAD_QUAD; using NodalBasis = NodalBasisFamily::HGRAD_QUAD; - std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; + // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis. We can support more by either increasing + // Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of + // components. (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to + // avoid using UVM.) + std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; // these tolerances are selected such that we have a little leeway for architectural differences // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions) @@ -533,7 +545,11 @@ namespace const double relTol=1e-11; // 2e-12 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences const double absTol=1e-11; // 9e-13 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences - std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; + // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis. We can support more by either increasing + // Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of + // components. (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to + // avoid using UVM.) + std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; for (int polyOrder=1; polyOrder<3; polyOrder++) { CGBasis cgBasis(polyOrder); @@ -552,7 +568,11 @@ namespace const double relTol=1e-12; // 2e-13 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences const double absTol=1e-13; // 2e-14 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences - std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; + // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis. We can support more by either increasing + // Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of + // components. (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to + // avoid using UVM.) + std::vector opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5}; for (int polyOrder=1; polyOrder<3; polyOrder++) { HierarchicalBasis hierarchicalBasis(polyOrder); diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp index aba9c289a1c7..34cc645c4292 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp +++ b/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp @@ -58,7 +58,7 @@ #include "Intrepid2_ProjectedGeometry.hpp" #include "Intrepid2_ProjectedGeometryExamples.hpp" #include "Intrepid2_ScalarView.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_Types.hpp" #include "Intrepid2_TestUtils.hpp" @@ -249,7 +249,6 @@ namespace using DeviceType = Intrepid2::DefaultTestDeviceType; using Basis = HierarchicalBasisFamily::HGRAD_QUAD; - // for now, the BasisValues path only supports the standard exact-sequence operators std::vector opsToTest {OPERATOR_VALUE, OPERATOR_GRAD}; const double relTol=1e-13; @@ -285,7 +284,6 @@ namespace using DeviceType = Intrepid2::DefaultTestDeviceType; using Basis = HierarchicalBasisFamily::HGRAD_HEX; - // for now, the BasisValues path only supports the standard exact-sequence operators std::vector opsToTest {OPERATOR_VALUE, OPERATOR_GRAD}; const double relTol=1e-13; @@ -447,7 +445,6 @@ namespace using DeviceType = Intrepid2::DefaultTestDeviceType; using Basis = HierarchicalBasisFamily::HVOL_QUAD; - // for now, the BasisValues path only supports the standard exact-sequence operators std::vector opsToTest {OPERATOR_VALUE}; const double relTol=1e-13; @@ -465,7 +462,6 @@ namespace using DeviceType = Intrepid2::DefaultTestDeviceType; using Basis = HierarchicalBasisFamily::HVOL_HEX; - // for now, the BasisValues path only supports the standard exact-sequence operators std::vector opsToTest {OPERATOR_VALUE}; const double relTol=1e-13; diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt b/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt index 17758a980275..77ac7eb27b1b 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt +++ b/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt @@ -2,7 +2,9 @@ #FILE(GLOB TEST_SOURCES "*.cpp" ${ETI_SOURCES} ) -FILE(GLOB TEST_SOURCES "*.cpp" ) +FILE(GLOB TEST_SOURCES "*.cpp") + +INCLUDE_DIRECTORIES("../../assembly-examples") TRIBITS_ADD_EXECUTABLE_AND_TEST( Intrepid2_Tests diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp index 1305831c79a9..dfe4d42bff2a 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp +++ b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp @@ -297,6 +297,58 @@ namespace testFloatingEquality3(AB_actual, AB_expected, relTol, absTol, out, success); } + TEUCHOS_UNIT_TEST( Data, InPlaceProduct_Matrix ) + { + double relTol = 1e-13; + double absTol = 1e-13; + + // Use two Data objects A and B, each with logical shape (10,3,2,2) -- (C,P,D,D), say. + // with A having variation types of CONSTANT, CONSTANT, and BLOCK_DIAGONAL (with the final two dimensions being purely diagonal) + // and B having variation types of CONSTANT for all entries + // Result should have variation types of CONSTANT, CONSTANT, and BLOCK_DIAGONAL, with the diagonal matrix entries weighted by the value from B. + // (This test is modeled on Jacobians for a uniform geometry being weighted by inverse determinants, as happens when computing inputs to getHDIVtransformVALUE(). + using DeviceType = DefaultTestDeviceType; + using Scalar = double; + + const int rank = 4; + const int cellCount = 10; + const int pointCount = 3; + const int spaceDim = 2; + + const double bValue = M_PI; + + auto AView = getView("A", spaceDim); // diagonal: one entry per dimension + auto ABView = getView("A .* B", spaceDim); // should be same shape as A + + auto AViewHost = Kokkos::create_mirror(AView); + auto ABViewHost = Kokkos::create_mirror(ABView); + for (int d=0; d extents {cellCount, pointCount, spaceDim, spaceDim}; + Kokkos::Array A_variation {CONSTANT, CONSTANT, BLOCK_PLUS_DIAGONAL, BLOCK_PLUS_DIAGONAL}; + + Data A(AView,extents,A_variation); + Data B(bValue,extents); + + // expected variation for A+B: + Kokkos::Array AB_variation = A_variation; + // expected Data object for A+B: + Data AB_expected(ABView,extents,AB_variation); + + auto AB_actual = Data::allocateInPlaceCombinationResult(A, B); + + AB_actual.storeInPlaceProduct(A, B); + + // test AB_actual equals AB_expected. (This will iterate over the logical extents.) + testFloatingEquality4(AB_actual, AB_expected, relTol, absTol, out, success); + } + // #pragma mark Data: MatVec /** \brief Data provides matrix-vector multiplication support. This method checks correctness of the computed mat-vec for a particular case involving a 2x2 matrix and a 2x1 vector. */ diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp index 485087257fba..4303ab4417d4 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp +++ b/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp @@ -53,7 +53,7 @@ #include "Intrepid2_DefaultCubatureFactory.hpp" #include "Intrepid2_TensorData.hpp" #include "Intrepid2_TensorPoints.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_VectorData.hpp" #include "Intrepid2_ScalarView.hpp" #include "Intrepid2_Types.hpp" @@ -190,7 +190,7 @@ namespace } } - TEUCHOS_UNIT_TEST(HostCopy, TransformedVectorData) + TEUCHOS_UNIT_TEST(HostCopy, TransformedBasisValues) { using DeviceType = DefaultTestDeviceType; using Scalar = double; @@ -218,10 +218,9 @@ namespace const int blockPlusDiagonalLastNonDiagonal = -1; // only diagonal Data transform(scalingView,rank,extents,variationTypes,blockPlusDiagonalLastNonDiagonal); - TransformedVectorData transformedVectorData(transform,vectorData); + TransformedBasisValues transformedVectorData(transform,vectorData); - using HostExecSpace = Kokkos::HostSpace::execution_space; - TransformedVectorData transformedVectorDataHost(transformedVectorData); + TransformedBasisValues transformedVectorDataHost(transformedVectorData); TEST_EQUALITY(transformedVectorData.rank(), transformedVectorDataHost.rank()); TEST_EQUALITY(transformedVectorData.extent_int(0), transformedVectorDataHost.extent_int(0)); diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp index a076d361f884..2c54ae18c636 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp +++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp @@ -70,20 +70,236 @@ #include "Intrepid2_Data.hpp" #include "Intrepid2_TensorData.hpp" #include "Intrepid2_TensorPoints.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_VectorData.hpp" #include "Intrepid2_ScalarView.hpp" +#include "GRADGRADStandardAssembly.hpp" +#include "GRADGRADStructuredAssembly.hpp" +#include "H1StandardAssembly.hpp" +#include "H1StructuredAssembly.hpp" +#include "HDIVStandardAssembly.hpp" +#include "HDIVStructuredAssembly.hpp" +#include "HCURLStandardAssembly.hpp" +#include "HCURLStructuredAssembly.hpp" +#include "HVOLStandardAssembly.hpp" +#include "HVOLStructuredAssembly.hpp" + namespace { using namespace Intrepid2; + enum FormulationChoice + { + Poisson, // (grad, grad) + Hgrad, // (grad, grad) + (value, value) + Hdiv, // (div, div) + (value, value) + Hcurl, // (curl, curl) + (value, value) + L2 // (value, value) + }; + + enum AlgorithmChoice + { + Standard, + AffineNonTensor, + NonAffineTensor, + AffineTensor, + DiagonalJacobian, + Uniform + }; + + enum BasisFamilyChoice + { + Nodal, + Hierarchical, + Serendipity + }; + + // tags to allow us to use templated Teuchos tests + class PoissonFormulation { + public: + static const FormulationChoice formulation = Poisson; + }; + class HgradFormulation { + public: + static const FormulationChoice formulation = Hgrad; + }; + class HdivFormulation { + public: + static const FormulationChoice formulation = Hdiv; + }; + class HcurlFormulation { + public: + static const FormulationChoice formulation = Hcurl; + }; + class L2Formulation { + public: + static const FormulationChoice formulation = L2; + }; +// StandardAlgorithm is not actually used in our test templates: Standard is the baseline we compare against. +// We therefore comment it out here; the unused static member "algorithm" below generates compilation errors on +// some platforms (with warnings as errors turned on). +// class StandardAlgorithm +// { +// public: +// static const AlgorithmChoice algorithm = Standard; +// }; + class AffineNonTensorAlgorithm + { + public: + static const AlgorithmChoice algorithm = AffineNonTensor; + }; + class NonAffineTensorAlgorithm + { + public: + static const AlgorithmChoice algorithm = NonAffineTensor; + }; + class AffineTensorAlgorithm + { + public: + static const AlgorithmChoice algorithm = AffineTensor; + }; + class UniformAlgorithm + { + public: + static const AlgorithmChoice algorithm = Uniform; + }; +// DiagonalJacobianAlgorithm is not yet used in our test templates: getMesh() does not support DiagonalJacobian yet. +// (We hope soon to add support for orthogonal extrusions in CellGeometry, which would give us those +// diagonal Jacobians in a natural way.) +// We therefore comment out the class out here; the unused static member "algorithm" below generates compilation errors on +// some platforms (with warnings as errors turned on). +// class DiagonalJacobianAlgorithm // note that DiagonalJacobian is not yet supported by getMesh() +// { +// public: +// static const AlgorithmChoice algorithm = DiagonalJacobian; +// }; + class D1 + { + public: + static const int spaceDim = 1; + }; + class D2 + { + public: + static const int spaceDim = 2; + }; + class D3 + { + public: + static const int spaceDim = 3; + }; + class P1 + { + public: + static const int polyOrder = 1; + }; + class P2 + { + public: + static const int polyOrder = 2; + }; + class P3 + { + public: + static const int polyOrder = 3; + }; + class P4 + { + public: + static const int polyOrder = 4; + }; + + using namespace Intrepid2; + + template< typename PointScalar, int spaceDim, typename DeviceType > + inline + CellGeometry getMesh(AlgorithmChoice algorithmChoice, const Kokkos::Array &gridCellCounts) + { + Kokkos::Array domainExtents; + for (int d=0; d(domainExtents, gridCellCounts); + + switch (algorithmChoice) + { + case Standard: + case NonAffineTensor: + { + // Standard and non-affine tensor use the same geometry; the difference is how this is used in assembly + const bool copyAffineness = false; + auto genericGeometry = getNodalCellGeometry(uniformTensorGeometry, copyAffineness); + return genericGeometry; + } + case Uniform: + return uniformTensorGeometry; + case AffineNonTensor: + case AffineTensor: + { + const bool copyAffineness = true; + auto affineNonTensorGeometry = getNodalCellGeometry(uniformTensorGeometry, copyAffineness); + return affineNonTensorGeometry; + } + case DiagonalJacobian: + { + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "DiagonalJacobian case not yet implemented"); + } + } + return uniformTensorGeometry; // this line should be unreachable; included to avoid compiler warnings from nvcc + } + + template + Intrepid2::ScalarView performStandardQuadrature(FormulationChoice formulation, + Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) + { + switch (formulation) + { + case Poisson: + return performStandardQuadratureGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hgrad: + return performStandardQuadratureH1(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hdiv: + return performStandardQuadratureHDIV(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hcurl: + return performStandardQuadratureHCURL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case L2: + return performStandardQuadratureHVOL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + default: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation"); + } + } + + template + Intrepid2::ScalarView performStructuredQuadrature(FormulationChoice formulation, + Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) + { + switch (formulation) + { + case Poisson: + return performStructuredQuadratureGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hgrad: + return performStructuredQuadratureH1(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hdiv: + return performStructuredQuadratureHDIV(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hcurl: + return performStructuredQuadratureHCURL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case L2: + return performStructuredQuadratureHVOL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + default: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation"); + } + } + //! version of integrate that performs a standard integration for affine meshes; does not take advantage of the tensor product structure at all //! this version can be used to verify correctness of other versions template -void integrate_baseline(Data integrals, const TransformedVectorData vectorDataLeft, - const TensorData cellMeasures, const TransformedVectorData vectorDataRight) +void integrate_baseline(Data integrals, const TransformedBasisValues vectorDataLeft, + const TensorData cellMeasures, const TransformedBasisValues vectorDataRight) { const int spaceDim = vectorDataLeft.spaceDim(); @@ -135,120 +351,11 @@ void integrate_baseline(Data integrals, const TransformedVect integralView(fieldOrdinalLeft,fieldOrdinalRight) = integral; } }); -// const int numFieldsLeft = vectorDataLeft.numFields(); -// const int numFieldsRight = vectorDataRight.numFields(); -// int approximateFlopCount = (spaceDim*2 + 2) * numPoints * numFieldsLeft * numFieldsRight * cellDataExtent; -// printView(integralView, std::cout, "stiffness in " + std::to_string(spaceDim) + "D"); -// std::cout << "\n\nApproximate flop count (baseline): " << approximateFlopCount << std::endl; -} - -//! version that uses the classic Intrepid2 paths -template -ScalarView performStandardQuadratureHypercube(int meshWidth, int polyOrder, int worksetSize) -{ - using ExecutionSpace = typename DeviceType::execution_space; - using CellTools = Intrepid2::CellTools; - using FunctionSpaceTools = Intrepid2::FunctionSpaceTools; - - using namespace std; - // dimensions of the returned view are (C,F,F) - auto fs = Intrepid2::FUNCTION_SPACE_HGRAD; - - shards::CellTopology lineTopo = shards::getCellTopologyData< shards::Line<> >(); - shards::CellTopology cellTopo; - if (spaceDim == 1) cellTopo = shards::getCellTopologyData< shards::Line<> >(); - else if (spaceDim == 2) cellTopo = shards::getCellTopologyData< shards::Quadrilateral<> >(); - else if (spaceDim == 3) cellTopo = shards::getCellTopologyData< shards::Hexahedron<> >(); - - auto basis = Intrepid2::getBasis< Intrepid2::NodalBasisFamily >(cellTopo, fs, polyOrder); - - int numFields = basis->getCardinality(); - int numHypercubes = 1; - for (int d=0; d numCells) worksetSize = numCells; - - // local stiffness matrix: - ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); - - auto cubature = Intrepid2::DefaultCubatureFactory::create(cellTopo,polyOrder*2); - int numPoints = cubature->getNumPoints(); - ScalarView cubaturePoints("cubature points",numPoints,spaceDim); - ScalarView cubatureWeights("cubature weights", numPoints); - - cubature->getCubature(cubaturePoints, cubatureWeights); - - // Allocate some intermediate containers - ScalarView basisValues ("basis values", numFields, numPoints ); - ScalarView basisGradValues("basis grad values", numFields, numPoints, spaceDim); - - ScalarView transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim); - ScalarView transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim); - - basis->getValues(basisValues, cubaturePoints, Intrepid2::OPERATOR_VALUE ); - basis->getValues(basisGradValues, cubaturePoints, Intrepid2::OPERATOR_GRAD ); - - CellGeometry cellNodes = uniformCartesianMesh(1.0, meshWidth); - - const int numNodesPerCell = cellNodes.numNodesPerCell(); - ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); - using ExecutionSpace = typename DeviceType::execution_space; - auto policy = Kokkos::MDRangePolicy>({0,0},{numCells,numNodesPerCell}); - Kokkos::parallel_for("fill expanded cell nodes", policy, - KOKKOS_LAMBDA (const int &cellOrdinal, const int &nodeOrdinal) - { - for (int d=0; d cellMeasures("cell measures", worksetSize, numPoints); - ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); - ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); - ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); - - int cellOffset = 0; - while (cellOffset < numCells) - { - int startCell = cellOffset; - int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; - - std::pair cellRange = {startCell, startCell+numCellsInWorkset}; - auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); - - // note that the following will not work if numCellsInWorkset != worksetSize - // (we would need to take an appropriate subview of jacobian, etc. containers) - INTREPID2_TEST_FOR_EXCEPTION(numCellsInWorkset != worksetSize, std::invalid_argument, "workset size must evenly divide the number of cells!"); - CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); - CellTools::setJacobianInv(jacobianInverse, jacobian); - CellTools::setJacobianDet(jacobianDeterminant, jacobian); - - FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); - FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues); - FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues); - -// printView(transformedGradValues, std::cout, "transformedGradValues"); -// printView(cellMeasures, std::cout, "cellMeasures"); - - auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); - - FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues); - - cellOffset += worksetSize; - } - return cellStiffness; } template - void testIntegrateMatchesBaseline(const TransformedVectorData vectorDataLeft, - const TensorData cellMeasures, const TransformedVectorData vectorDataRight, + void testIntegrateMatchesBaseline(const TransformedBasisValues vectorDataLeft, + const TensorData cellMeasures, const TransformedBasisValues vectorDataRight, Teuchos::FancyOStream &out, bool &success) { const double relTol = 1e-12; @@ -264,15 +371,6 @@ ScalarView performStandardQuadratureHypercube(int meshWidth, const int integralsBaselineViewRank = integralsBaseline.getUnderlyingViewRank(); -// std::cout << "integralsBaselineView.extent_int(0) = " << integralsBaselineView.extent_int(0) << std::endl; -// std::cout << "integralsBaselineView.extent_int(1) = " << integralsBaselineView.extent_int(1) << std::endl; -// std::cout << "integralsBaselineView.extent_int(2) = " << integralsBaselineView.extent_int(2) << std::endl; -// -// std::cout << "integralsBaselineView.rank() = " << integralsBaselineView.rank() << std::endl; -// -// std::cout << "integralsBaselineView(0,0) = " << integralsBaselineView(0,0) << std::endl; -// std::cout << "integralsIntegrateView(0,0) = " << integralsIntegrateView(0,0) << std::endl; - printFunctor3( integralsBaseline, out, "integralsBaseline"); printFunctor3(integralsIntegrate, out, "integralsIntegrate"); // printView(integralsBaselineView, out); @@ -293,341 +391,199 @@ ScalarView performStandardQuadratureHypercube(int meshWidth, } } - template - void testQuadratureHypercube(bool useAffinePath, int meshWidth, int polyOrder, int worksetSize, Teuchos::FancyOStream &out, bool &success) + template + void testQuadratureHypercube(int meshWidth, int polyOrder, int worksetSize, + const FormulationChoice &formulation, const AlgorithmChoice &algorithm, + const double &relTol, const double &absTol, + Teuchos::FancyOStream &out, bool &success) { - const double relTol = 1e-12; - const double absTol = 1e-12; - using namespace std; - using DeviceType = DefaultTestDeviceType; - - using IntegrationTools = Intrepid2::IntegrationTools; - // dimensions of the returned view are (C,F,F) - auto fs = Intrepid2::FUNCTION_SPACE_HGRAD; - - auto lineBasis = Intrepid2::getLineBasis< Intrepid2::NodalBasisFamily >(fs, polyOrder); - int numFields_1D = lineBasis->getCardinality(); - - int numFields = 1; - int numHypercubes = 1; + Kokkos::Array gridCellCounts; for (int d=0; d numCells) worksetSize = numCells; - - // local stiffness matrix: - ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); - - shards::CellTopology lineTopo = shards::getCellTopologyData< shards::Line<> >(); - shards::CellTopology cellTopo; - if (spaceDim == 1) cellTopo = shards::getCellTopologyData< shards::Line<> >(); - else if (spaceDim == 2) cellTopo = shards::getCellTopologyData< shards::Quadrilateral<> >(); - else if (spaceDim == 3) cellTopo = shards::getCellTopologyData< shards::Hexahedron<> >(); - - auto lineCubature = Intrepid2::DefaultCubatureFactory::create(lineTopo,polyOrder*2); - int numPoints_1D = lineCubature->getNumPoints(); - ScalarView lineCubaturePoints("line cubature points",numPoints_1D,1); - ScalarView lineCubatureWeights("line cubature weights", numPoints_1D); - - lineCubature->getCubature(lineCubaturePoints, lineCubatureWeights); - - // Allocate some intermediate containers - ScalarView lineBasisValues ("line basis values", numFields_1D, numPoints_1D ); - ScalarView lineBasisGradValues("line basis grad values", numFields_1D, numPoints_1D, 1); - - // for now, we use 1D values to build up the 2D or 3D gradients - // eventually, TensorBasis should offer a getValues() variant that returns tensor basis data - lineBasis->getValues(lineBasisValues, lineCubaturePoints, Intrepid2::OPERATOR_VALUE ); - lineBasis->getValues(lineBasisGradValues, lineCubaturePoints, Intrepid2::OPERATOR_GRAD ); - // drop the trivial space dimension in line gradient values: - Kokkos::resize(lineBasisGradValues, numFields_1D, numPoints_1D); - - Kokkos::Array, spaceDim> vectorComponents; - - for (int d=0; d, spaceDim> gradComponent_d; - for (int d2=0; d2(lineBasisGradValues); - else gradComponent_d[d2] = Data(lineBasisValues); - } - vectorComponents[d] = TensorData(gradComponent_d); - } - VectorData gradientValues(vectorComponents, false); // false: not axis-aligned - - CellGeometry cellNodes = uniformCartesianMesh(1.0, meshWidth); - - if (useAffinePath) - { - out << "Testing path with affine, grid-aligned CellGeometry.\n"; - } - else - { - // make a "generic" copy of cellNodes, one that uses the (C,N), (N,D) node specification. This will not know that the geometry is affine, grid-aligned, or uniform. - const bool copyAffineness = false; // want to go through the non-affine geometry path - CellGeometry nodalCellGeometry = getNodalCellGeometry(cellNodes, copyAffineness); - - cellNodes = nodalCellGeometry; - out << "Testing non-affine path.\n"; - } - - auto cubature = Intrepid2::DefaultCubatureFactory::create(cellTopo,polyOrder*2); - auto tensorCubatureWeights = cubature->allocateCubatureWeights(); - auto tensorCubaturePoints = cubature->allocateCubaturePoints(); - - cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); - - // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell - - int pointsPerCell = 1; - for (int d=0; d(algorithm, gridCellCounts); + double flopCountIntegration = 0, flopCountJacobian = 0; + auto standardIntegrals = performStandardQuadrature(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian); - Data jacobian = cellNodes.allocateJacobianData(tensorCubaturePoints); - Data jacobianDet = CellTools::allocateJacobianDet(jacobian); - Data jacobianInv = CellTools::allocateJacobianInv(jacobian); + auto structuredIntegrals = performStructuredQuadrature(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian); - auto refData = cellNodes.getJacobianRefData(tensorCubaturePoints); - cellNodes.setJacobian(jacobian, tensorCubaturePoints, refData); - CellTools::setJacobianDet(jacobianDet, jacobian); - CellTools::setJacobianInv(jacobianInv, jacobian); - - // lazily-evaluated transformed gradient values: - using FunctionSpaceToolsDT = ::Intrepid2::FunctionSpaceTools; // TODO: once FunctionSpaceTools has proper DeviceType support, change the earlier FunctionSpaceTools typedef, and use it here… - auto transformedGradientValues = FunctionSpaceToolsDT::getHGRADtransformGRAD(jacobianInv, gradientValues); - auto standardIntegrals = performStandardQuadratureHypercube(meshWidth, polyOrder, worksetSize); - - TensorData cellMeasures = cellNodes.allocateCellMeasure(jacobianDet, tensorCubatureWeights); - if (!cellNodes.affine()) - { - // if cellNodes is not (known to be) affine, then cellMeasures should not have a separate first component (indicating the cell dimension is separated, thanks to point-invariant cell Jacobian determinant) - TEST_EQUALITY(cellMeasures.separateFirstComponent(), false); - } - cellNodes.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); - - auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValues, cellMeasures, transformedGradientValues); - - IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues); - - auto integralDataBaseline = IntegrationTools::allocateIntegralData(transformedGradientValues, cellMeasures, transformedGradientValues); - - integrate_baseline(integralDataBaseline, transformedGradientValues, cellMeasures, transformedGradientValues); - - out << "Comparing new integration path path with baseline integration…\n"; - testIntegrateMatchesBaseline(transformedGradientValues, cellMeasures, transformedGradientValues, out, success); - - out << "Comparing baseline to standard Intrepid2 integration…\n"; - testFloatingEquality3(standardIntegrals, integralDataBaseline, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral - baseline"); - - out << "Comparing new integration path with standard Intrepid2 integration…\n"; - testFloatingEquality3(standardIntegrals, integralData, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral"); + out << "Comparing standard Intrepid2 integration to new integration path…\n"; + testFloatingEquality3(standardIntegrals, structuredIntegrals, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral - baseline"); } -// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p1_AffinePath - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p1_AffinePath ) + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, QuadratureUniformMesh, FormulationTag, AlgorithmTag, DimTag, PolyOrderTag) { using DataScalar = double; using PointScalar = double; const int meshWidth = 1; - const int polyOrder = 1; - const int spaceDim = 1; + const int spaceDim = DimTag::spaceDim; + const int polyOrder = PolyOrderTag::polyOrder; const int worksetSize = meshWidth; - const bool affinePath = true; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } -// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p1_GeneralPath - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p1_GeneralPath ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 4; - const int polyOrder = 1; - const int spaceDim = 1; - const int worksetSize = meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p4_GeneralPath -TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p4_GeneralPath ) -{ - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 4; - const int polyOrder = 4; - const int spaceDim = 1; - const int worksetSize = meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); -} - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p2 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p2 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 1; - const int polyOrder = 2; - const int spaceDim = 1; - const int worksetSize = meshWidth; - const bool affinePath = true; + using DeviceType = DefaultTestDeviceType; + using BasisFamily = DerivedNodalBasisFamily; - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p1 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p1 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 3; - const int polyOrder = 1; - const int spaceDim = 2; - const int worksetSize = meshWidth * meshWidth; - const bool affinePath = true; + const AlgorithmChoice algorithm = AlgorithmTag::algorithm; + const FormulationChoice formulation = FormulationTag::formulation; - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p1_GeneralPath - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p1_GeneralPath ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 1; - const int spaceDim = 2; - const int worksetSize = meshWidth * meshWidth; - const bool affinePath = false; + double relTol = 1e-12; + double absTol = 1e-12; - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); + testQuadratureHypercube(meshWidth, polyOrder, worksetSize, formulation, algorithm, + relTol, absTol, out, success); } -// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p2_GeneralPath -TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p2_GeneralPath ) -{ - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 2; - const int spaceDim = 2; - const int worksetSize = meshWidth * meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); -} + // comparisons are to Standard algorithm, so we don't instantiate with Standard: + // 1D, p=1 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D1, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D1, P1) + // 1D, p=2 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D1, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D1, P2) + // 1D, p=4 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D1, P4) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D1, P4) -// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p2 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p2 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 3; - const int polyOrder = 2; - const int spaceDim = 2; - const int worksetSize = meshWidth * meshWidth; - const bool affinePath = true; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } + // 2D, p=1 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, NonAffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineNonTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, UniformAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, NonAffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineNonTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, UniformAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D2, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D2, P1) + // 2D, p=2 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, NonAffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineNonTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, UniformAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, NonAffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineNonTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, UniformAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D2, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D2, P2) -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p1 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p1 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 1; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = true; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p1_GeneralPath -TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p1_GeneralPath ) -{ - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 1; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); -} - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p2 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p2 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 2; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = true; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p2_GeneralPath -TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p2_GeneralPath ) -{ - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 1; - const int polyOrder = 2; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); -} - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p3 - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p3 ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 2; - const int polyOrder = 3; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = true; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } - -// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p3_GeneralPath - TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p3_GeneralPath ) - { - using DataScalar = double; - using PointScalar = double; - const int meshWidth = 1; - const int polyOrder = 3; - const int spaceDim = 3; - const int worksetSize = meshWidth * meshWidth * meshWidth; - const bool affinePath = false; - - testQuadratureHypercube(affinePath,meshWidth, polyOrder, worksetSize, out, success); - } + // 3D, p=1 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, NonAffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineNonTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, UniformAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, NonAffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineNonTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, UniformAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D3, P1) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D3, P1) + // 3D, p=2 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, NonAffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineNonTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, UniformAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, NonAffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineNonTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, UniformAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D3, P2) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D3, P2) + // 3D, p=3 tests: + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, NonAffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, AffineNonTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation, UniformAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, NonAffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, AffineNonTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation, UniformAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, NonAffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, AffineNonTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation, UniformAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, NonAffineTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, AffineNonTensorAlgorithm, D3, P3) + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation, UniformAlgorithm, D3, P3) // #pragma mark StructuredIntegration: QuadratureSynthetic_AxisAlignedPath_Case1 TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Case1 ) @@ -661,7 +617,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Ca VectorData unitVectorData(vectorComponents); - TransformedVectorData transformedUnitVectorData(explicitIdentityMatrix,unitVectorData); + TransformedBasisValues transformedUnitVectorData(explicitIdentityMatrix,unitVectorData); Data constantCellMeasuresCellComponent(1.0, Kokkos::Array{numCells}); Data constantCellMeasuresPointComponent(1.0, Kokkos::Array{numComponentPoints}); @@ -712,7 +668,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case1 VectorData unitVectorData(vectorComponents); - TransformedVectorData transformedUnitVectorData(explicitIdentityMatrix,unitVectorData); + TransformedBasisValues transformedUnitVectorData(explicitIdentityMatrix,unitVectorData); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -775,7 +731,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case2 VectorData vectorData(vectorComponents); - TransformedVectorData transformedVectorData(explicitIdentityMatrix,vectorData); + TransformedBasisValues transformedVectorData(explicitIdentityMatrix,vectorData); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -858,7 +814,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case3 VectorData vectorData(vectorComponents); - TransformedVectorData transformedUnitVectorData(explicitIdentityMatrix,vectorData); + TransformedBasisValues transformedUnitVectorData(explicitIdentityMatrix,vectorData); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -929,7 +885,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case4 TEST_EQUALITY(numFieldsPerFamily, vectorData.numFieldsInFamily(0)); TEST_EQUALITY(numFieldsPerFamily, vectorData.numFieldsInFamily(1)); - TransformedVectorData transformedUnitVectorData(explicitIdentityMatrix,vectorData); + TransformedBasisValues transformedUnitVectorData(explicitIdentityMatrix,vectorData); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1011,8 +967,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case5 TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1096,8 +1052,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Ca VectorData vectorDataRight(vectorComponentsRight); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresCellComponent(1.0, Kokkos::Array{numCells}); Data constantCellMeasuresPointComponent(1.0, Kokkos::Array{numComponentPoints}); @@ -1155,44 +1111,26 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case6_ fieldComponentExtents[0] = numFields3; Data fieldComponentData3(fieldComponentDataView3,fieldComponentExtents,fieldComponentVariationTypes); - TensorData tensorDataLeft(std::vector< Data >{fieldComponentData1,fieldComponentData2,fieldComponentData3}); - TensorData tensorDataRight(std::vector< Data >{fieldComponentData1,fieldComponentData3,fieldComponentData2}); + TensorData tensorDataLeft(std::vector< Data >{fieldComponentData1,fieldComponentData2,fieldComponentData3}); + TensorData tensorDataRight(std::vector< Data >{fieldComponentData1,fieldComponentData3,fieldComponentData2}); - auto identityMatrixView = getFixedRankView("identity matrix", spaceDim, spaceDim); - auto identityMatrixViewHost = getHostCopy(identityMatrixView); - - for (int d1=0; d1 transformationExtents {numCells, numPoints, spaceDim, spaceDim}; - Kokkos::Array transformationVariationType {CONSTANT, CONSTANT, GENERAL, GENERAL}; - - Data explicitIdentityMatrix(identityMatrixView, transformationExtents, transformationVariationType); + auto identityMatrixView = getFixedRankView("identity matrix", spaceDim, spaceDim); + auto identityMatrixViewHost = getHostCopy(identityMatrixView); -// const int numFamilies = 3; -// Kokkos::Array, spaceDim > firstFamilyLeft {tensorDataLeft,TensorData(),TensorData()}; -// Kokkos::Array, spaceDim > secondFamilyLeft {TensorData(),tensorDataLeft,TensorData()}; -// Kokkos::Array, spaceDim > thirdFamilyLeft {TensorData(),TensorData(),tensorDataLeft}; -// Kokkos::Array< Kokkos::Array, spaceDim>, numFamilies> vectorComponentsLeft {firstFamilyLeft, secondFamilyLeft, thirdFamilyLeft}; -// -// VectorData vectorDataLeft(vectorComponentsLeft); -// TEST_EQUALITY(numFieldsPerFamilyLeft, vectorDataLeft.numFieldsInFamily(0)); -// TEST_EQUALITY(numFieldsPerFamilyLeft, vectorDataLeft.numFieldsInFamily(1)); -// -// Kokkos::Array, spaceDim > firstFamilyRight {tensorDataRight,TensorData(),TensorData()}; -// Kokkos::Array, spaceDim > secondFamilyRight {TensorData(),tensorDataRight,TensorData()}; -// Kokkos::Array, spaceDim > thirdFamilyRight {TensorData(),TensorData(),tensorDataRight}; -// Kokkos::Array< Kokkos::Array, spaceDim>, numFamilies> vectorComponentsRight {firstFamilyRight, secondFamilyRight, thirdFamilyRight}; -// VectorData vectorDataRight(vectorComponentsRight); -// TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); -// TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1)); + for (int d1=0; d1 transformationExtents {numCells, numPoints, spaceDim, spaceDim}; + Kokkos::Array transformationVariationType {CONSTANT, CONSTANT, GENERAL, GENERAL}; + + Data explicitIdentityMatrix(identityMatrixView, transformationExtents, transformationVariationType); const int numFamilies = 1; Kokkos::Array, spaceDim > firstFamilyLeft {tensorDataLeft,TensorData(),TensorData()}; @@ -1207,8 +1145,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case6_ VectorData vectorDataRight(vectorComponentsRight); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1294,8 +1232,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case7_ VectorData vectorDataRight(vectorComponentsRight); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1385,8 +1323,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case8_ TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1473,8 +1411,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case9_ VectorData vectorDataRight(vectorComponentsRight); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); Data constantCellMeasuresData(1.0, Kokkos::Array{numCells,numPoints}); TensorData constantCellMeasures(constantCellMeasuresData); @@ -1556,8 +1494,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case10 VectorData vectorDataRight(vectorComponentsRight); TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0)); - TransformedVectorData transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); - TransformedVectorData transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); + TransformedBasisValues transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft); + TransformedBasisValues transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight); auto cellMeasures = getFixedRankView("cellMeasures", numCells, numPoints); diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp similarity index 94% rename from packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp rename to packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp index 6f956cf3d2b9..3e9f8729c081 100644 --- a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp +++ b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp @@ -42,7 +42,7 @@ /** \file TransformedVectorDataTests.cpp - \brief Tests against TransformedVectorData. + \brief Tests against TransformedBasisValues. \author Created by Nate Roberts */ @@ -65,7 +65,7 @@ #include "Intrepid2_FunctionSpaceTools.hpp" #include "Intrepid2_TensorData.hpp" #include "Intrepid2_TensorPoints.hpp" -#include "Intrepid2_TransformedVectorData.hpp" +#include "Intrepid2_TransformedBasisValues.hpp" #include "Intrepid2_VectorData.hpp" #include "Intrepid2_ScalarView.hpp" @@ -136,7 +136,8 @@ namespace } vectorComponents[d] = TensorData(gradComponent_d); } - VectorData gradientValues(vectorComponents, false); // false: not axis-aligned + VectorData gradientVectorData(vectorComponents, false); // false: not axis-aligned + BasisValues gradientValues(gradientVectorData); CellGeometry cellNodes = uniformCartesianMesh(1.0, meshWidth); @@ -215,7 +216,7 @@ namespace testFloatingEquality4(transformedGradValues, transformedGradientData, relTol, absTol, out, success); } - TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_1D_p1 ) + TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_1D_p1 ) { const int spaceDim = 1; const int polyOrder = 1; @@ -223,7 +224,7 @@ namespace testVectorTransformation(polyOrder, meshWidth, out, success); } - TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_1D_p2 ) + TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_1D_p2 ) { const int spaceDim = 1; const int polyOrder = 2; @@ -231,7 +232,7 @@ namespace testVectorTransformation(polyOrder, meshWidth, out, success); } - TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_2D_p1 ) + TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_2D_p1 ) { const int spaceDim = 2; const int polyOrder = 1; @@ -239,7 +240,7 @@ namespace testVectorTransformation(polyOrder, meshWidth, out, success); } - TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_2D_p2 ) + TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_2D_p2 ) { const int spaceDim = 2; const int polyOrder = 2; diff --git a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp index aa57b4e2536c..052a8b50da3c 100644 --- a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp +++ b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp @@ -300,14 +300,6 @@ int main( int argc, char* argv[] ) { for (CaseChoice caseChoice2 : caseChoices) { -// { -// // DEBUGGING: -// if ((caseChoice1 != General) && (caseChoice2 == General)) -// { -// cout << "Set breakpoint here.\n"; -// } -// } - // since constant takes so little time (and measurement is therefore noisy), we do a bunch of measurements and use their average const bool bothConstant = (caseChoice1 == Constant) && (caseChoice2 == Constant); const int numMeasurements = bothConstant ? 1000 : 1; diff --git a/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt b/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt index 1b9659237466..86db43427c93 100644 --- a/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt +++ b/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt @@ -2,12 +2,14 @@ SET(SOURCES "") FILE(GLOB SOURCES *.cpp) +INCLUDE_DIRECTORIES("../../../assembly-examples") + SET(LIBRARIES intrepid2) TRIBITS_ADD_EXECUTABLE_AND_TEST( StructuredIntegrationPerformance SOURCES ${SOURCES} - ARGS + ARGS NUM_MPI_PROCS 1 ADD_DIR_TO_NAME ) diff --git a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp index 80e37693833e..f3d0657dc7e1 100644 --- a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp +++ b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp @@ -58,6 +58,27 @@ #include "Intrepid2_DefaultCubatureFactory.hpp" #include "Intrepid2_FunctionSpaceTools.hpp" #include "Intrepid2_IntegrationTools.hpp" +#include "Intrepid2_TestUtils.hpp" + +#include "GRADGRADStandardAssembly.hpp" +#include "GRADGRADStructuredAssembly.hpp" +#include "H1StandardAssembly.hpp" +#include "H1StructuredAssembly.hpp" +#include "HDIVStandardAssembly.hpp" +#include "HDIVStructuredAssembly.hpp" +#include "HCURLStandardAssembly.hpp" +#include "HCURLStructuredAssembly.hpp" +#include "HVOLStandardAssembly.hpp" +#include "HVOLStructuredAssembly.hpp" + +enum FormulationChoice +{ + Poisson, // (grad, grad) + Hgrad, // (grad, grad) + (value, value) + Hdiv, // (div, div) + (value, value) + Hcurl, // (curl, curl) + (value, value) + L2 // (value, value) +}; enum AlgorithmChoice { @@ -69,6 +90,13 @@ enum AlgorithmChoice Uniform }; +enum BasisFamilyChoice +{ + Nodal, + Hierarchical, + Serendipity +}; + std::string to_string(AlgorithmChoice choice) { switch (choice) { @@ -83,20 +111,31 @@ std::string to_string(AlgorithmChoice choice) } } +std::string to_string(FormulationChoice choice) +{ + switch (choice) { + case Poisson: return "Poisson"; + case Hgrad: return "Hgrad"; + case Hdiv: return "Hdiv"; + case Hcurl: return "Hcurl"; + case L2: return "L2"; + + default: return "Unknown FormulationChoice"; + } +} + using namespace Intrepid2; -template< typename PointScalar, int spaceDim, typename ExecutionSpace > +template< typename PointScalar, int spaceDim, typename DeviceType > inline -CellGeometry getMesh(AlgorithmChoice algorithmChoice, const int &meshWidth) +CellGeometry getMesh(AlgorithmChoice algorithmChoice, const Kokkos::Array &gridCellCounts) { Kokkos::Array domainExtents; - Kokkos::Array gridCellCounts; for (int d=0; d(domainExtents, gridCellCounts); + auto uniformTensorGeometry = uniformCartesianMesh(domainExtents, gridCellCounts); switch (algorithmChoice) { @@ -125,352 +164,138 @@ CellGeometry getMesh(AlgorithmChoice algo return uniformTensorGeometry; // this line should be unreachable; included to avoid compiler warnings from nvcc } -double flopsPerJacobian(const int &spaceDim, const int &numPoints, const int &numGeometryNodes) -{ - // implementation looks like: -// for (ordinal_type i=0;i +inline +CellGeometry getMesh(AlgorithmChoice algorithmChoice, const int &meshWidth) { - //implementation in RealSpaceTools: - /*value_type r_val = 0.0; - switch (dim) { - case 3: - r_val = ( inMat(0,0) * inMat(1,1) * inMat(2,2) + // 3 flops: 2 mults, 1 add - inMat(1,0) * inMat(2,1) * inMat(0,2) + // 3 flops: 2 mults, 1 add - inMat(2,0) * inMat(0,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract - inMat(2,0) * inMat(1,1) * inMat(0,2) - // 3 flops: 2 mults, 1 subtract - inMat(0,0) * inMat(2,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract - inMat(1,0) * inMat(0,1) * inMat(2,2) ); // 2 flops: 2 mults - break; - case 2: - r_val = ( inMat(0,0) * inMat(1,1) - - inMat(0,1) * inMat(1,0) ); - break; - case 1: - r_val = ( inMat(0,0) ); - break; - } - return r_val;*/ - int r_val; - switch (spaceDim) { - case 3: r_val = 17.0 * numPoints; break; - case 2: r_val = 3.0 * numPoints; break; - case 1: r_val = 0.0; break; - default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim"); + Kokkos::Array gridCellCounts; + for (int d=0; d(algorithmChoice, gridCellCounts); } -double flopsPerJacobianInverse(const int &spaceDim, const int &numPoints) +//! Returns an Array of roughly isotropic grid dimensions for which (C,F,F) stifness matrix will have at most maxStiffnessEntryCount entries, and at most maxElements total cells. +template +Kokkos::Array +getMeshWidths(int basisCardinality, int maxStiffnessEntryCount, int maxElements) { - // implementation looks like: - // const value_type val = RealSpaceTools<>::Serial::det(mat); - double totalFlops = flopsPerJacobianDet(spaceDim, numPoints); + Kokkos::Array meshWidths; + const int entriesPerElement = basisCardinality * basisCardinality; + const int maxElementCount = std::min(maxStiffnessEntryCount / entriesPerElement, maxElements); - if (spaceDim == 3) + // initialize meshWidths: + for (int d=0; d -ScalarView performStandardQuadratureHypercubeGRADGRAD(CellGeometry &geometry, - const int &polyOrder, int worksetSize, - double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +template +Intrepid2::ScalarView performStandardQuadrature(FormulationChoice formulation, + Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) { - int numVertices = 1; - for (int d=0; d(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hgrad: + return performStandardQuadratureH1(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hdiv: + return performStandardQuadratureHDIV(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hcurl: + return performStandardQuadratureHCURL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case L2: + return performStandardQuadratureHVOL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + default: + return Intrepid2::ScalarView(); } - - auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); - auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); - auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup"); - initialSetupTimer->start(); - - using CellTools = Intrepid2::CellTools; - using FunctionSpaceTools = Intrepid2::FunctionSpaceTools; - - using namespace std; - // dimensions of the returned view are (C,F,F) - auto fs = Intrepid2::FUNCTION_SPACE_HGRAD; - - shards::CellTopology cellTopo = geometry.cellTopology(); - - auto basis = Intrepid2::getBasis< Intrepid2::NodalBasisFamily >(cellTopo, fs, polyOrder); - - int numFields = basis->getCardinality(); - int numCells = geometry.numCells(); - - if (worksetSize > numCells) worksetSize = numCells; - - // local stiffness matrices: - ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); - - using Kokkos::DefaultExecutionSpace; - auto cubature = Intrepid2::DefaultCubatureFactory::create(cellTopo,polyOrder*2); - int numPoints = cubature->getNumPoints(); - ScalarView cubaturePoints("cubature points",numPoints,spaceDim); - ScalarView cubatureWeights("cubature weights", numPoints); - - cubature->getCubature(cubaturePoints, cubatureWeights); - - const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); - const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); - const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); - - // Allocate some intermediate containers - ScalarView basisValues ("basis values", numFields, numPoints ); - ScalarView basisGradValues("basis grad values", numFields, numPoints, spaceDim); - - ScalarView transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim); - ScalarView transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim); - - basis->getValues(basisValues, cubaturePoints, Intrepid2::OPERATOR_VALUE ); - basis->getValues(basisGradValues, cubaturePoints, Intrepid2::OPERATOR_GRAD ); - - const int numNodesPerCell = geometry.numNodesPerCell(); - ScalarView expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim); - Kokkos::parallel_for(Kokkos::RangePolicy(0,numCells), - KOKKOS_LAMBDA (const int &cellOrdinal) { - for (int nodeOrdinal=0; nodeOrdinal cellMeasures("cell measures", worksetSize, numPoints); - ScalarView jacobianDeterminant("jacobian determinant", worksetSize, numPoints); - ScalarView jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim); - ScalarView jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim); +} - initialSetupTimer->stop(); - - transformIntegrateFlopCount = 0; - jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself - jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse - jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant - jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) - - int cellOffset = 0; - while (cellOffset < numCells) +template +Intrepid2::ScalarView performStructuredQuadrature(FormulationChoice formulation, + Intrepid2::CellGeometry &geometry, const int &polyOrder, const int &worksetSize, + double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +{ + switch (formulation) { - int startCell = cellOffset; - int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; - - std::pair cellRange = {startCell, startCell+numCellsInWorkset}; - auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL()); - - if (numCellsInWorkset != worksetSize) - { - Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim); - Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim); - Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints); - Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints); - Kokkos::resize(transformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); - Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim); - } - jacobianAndCellMeasureTimer->start(); - CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell. - CellTools::setJacobianInv(jacobianInverse, jacobian); - CellTools::setJacobianDet(jacobianDeterminant, jacobian); - - FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights); - ExecSpaceType().fence(); - jacobianAndCellMeasureTimer->stop(); - - // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls. - fstIntegrateCall->start(); - FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues); - transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction. - FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues); - transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each. - - auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); - - FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues); - ExecSpaceType().fence(); - fstIntegrateCall->stop(); - - transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction. - - cellOffset += worksetSize; + case Poisson: + return performStructuredQuadratureGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hgrad: + return performStructuredQuadratureH1(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hdiv: + return performStructuredQuadratureHDIV(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case Hcurl: + return performStructuredQuadratureHCURL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + case L2: + return performStructuredQuadratureHVOL(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + default: + return Intrepid2::ScalarView(); } -// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl; - return cellStiffness; } -//! returns an estimated count of the floating point operations performed. -template -void performStructuredQuadratureHypercubeGRADGRAD(CellGeometry &geometry, const int &polyOrder, const int &worksetSize, - double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount) +template +typename BasisFamily::BasisPtr getBasisForFormulation(FormulationChoice formulation, shards::CellTopology &cellTopo, const int polyOrder) { - int numVertices = 1; - for (int d=0; dstart(); - using namespace std; - using FunctionSpaceTools = Intrepid2::FunctionSpaceTools; - using IntegrationTools = Intrepid2::IntegrationTools; - // dimensions of the returned view are (C,F,F) - auto fs = Intrepid2::FUNCTION_SPACE_HGRAD; - - shards::CellTopology cellTopo = geometry.cellTopology(); - - auto basis = Intrepid2::getBasis< Intrepid2::DerivedNodalBasisFamily >(cellTopo, fs, polyOrder); - - int numFields = basis->getCardinality(); - int numCells = geometry.numCells(); - - // local stiffness matrix: - ScalarView cellStiffness("cell stiffness matrices",numCells,numFields,numFields); - - auto cubature = Intrepid2::DefaultCubatureFactory::create(cellTopo,polyOrder*2); - auto tensorCubatureWeights = cubature->allocateCubatureWeights(); - TensorPoints tensorCubaturePoints = cubature->allocateCubaturePoints(); - - cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights); - - EOperator op = OPERATOR_GRAD; - BasisValues gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op); - basis->getValues(gradientValues, tensorCubaturePoints, op); - - // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell - - int cellOffset = 0; - - auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); - auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); - - Data jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); - Data jacobianDet = CellTools::allocateJacobianDet(jacobian); - Data jacobianInv = CellTools::allocateJacobianInv(jacobian); - TensorData cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights); - - // lazily-evaluated transformed gradient values (temporary to allow integralData allocation) - auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues.vectorData()); - auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp); - - const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases - - // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount) - const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices); - const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints); - const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints); - - transformIntegrateFlopCount = 0; - jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself - jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse - jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant - jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P) - - auto refData = geometry.getJacobianRefData(tensorCubaturePoints); + auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder); + return basis; +} + +template +BasisPtr getHypercubeBasisForFormulation(FormulationChoice formulation, BasisFamilyChoice basisFamilyChoice, const int polyOrder) +{ + shards::CellTopology cellTopo; + switch (spaceDim) + { + case 1: cellTopo = shards::CellTopology(shards::getCellTopologyData >()); break; + case 2: cellTopo = shards::CellTopology(shards::getCellTopologyData >()); break; + case 3: cellTopo = shards::CellTopology(shards::getCellTopologyData >()); break; + default: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported spaceDim"); + } - initialSetupTimer->stop(); - while (cellOffset < numCells) + switch (basisFamilyChoice) { - int startCell = cellOffset; - int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell; - int endCell = numCellsInWorkset + startCell; - - jacobianAndCellMeasureTimer->start(); - if (numCellsInWorkset != worksetSize) - { - const int CELL_DIM = 0; // first dimension corresponds to cell - jacobian.setExtent( CELL_DIM, numCellsInWorkset); - jacobianDet.setExtent( CELL_DIM, numCellsInWorkset); - jacobianInv.setExtent( CELL_DIM, numCellsInWorkset); - integralData.setExtent(CELL_DIM, numCellsInWorkset); - - // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension… - cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset); - } - - geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell); - CellTools::setJacobianDet(jacobianDet, jacobian); - CellTools::setJacobianInv(jacobianInv, jacobian); - - // lazily-evaluated transformed gradient values: - auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues.vectorData()); - - geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights); - ExecSpaceType().fence(); - jacobianAndCellMeasureTimer->stop(); - - bool sumInto = false; - double approximateFlopCountIntegrateWorkset = 0; - fstIntegrateCall->start(); - IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset); - ExecSpaceType().fence(); - fstIntegrateCall->stop(); - - // copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.) - if (integralData.getUnderlyingViewRank() == 3) + case Nodal: { - std::pair cellRange = {startCell, endCell}; - auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL()); - Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3()); + using BasisFamily = DerivedNodalBasisFamily; + return getBasisForFormulation(formulation, cellTopo, polyOrder); } - else // underlying view rank is 2; copy to each cell in destination stiffness matrix + break; + case Hierarchical: { - auto integralView2 = integralData.getUnderlyingView2(); - auto policy = Kokkos::MDRangePolicy>({0,0,0},{numCellsInWorkset,numFields,numFields}); - Kokkos::parallel_for("copy uniform data to expanded container", policy, - KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) { - cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal); - }); + using BasisFamily = HierarchicalBasisFamily; + return getBasisForFormulation(formulation, cellTopo, polyOrder); } - - transformIntegrateFlopCount += approximateFlopCountIntegrateWorkset; - - cellOffset += worksetSize; + break; + case Serendipity: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented"); } + + return Teuchos::null; } int main( int argc, char* argv[] ) @@ -485,9 +310,13 @@ int main( int argc, char* argv[] ) using std::vector; { - // For now, we focus on 3D Poisson test. - // The non-affine tensor case is one that, like the Standard case, does *not* make any algorithmic assumptions about the geometry. - // This should be a good proxy for Poisson with unstructured material data on a curvilinear hexahedral mesh. + // Here, we support various 3D "formulations": Poisson, H^1, H(div), H(curl), and L^2 norms. (Poisson is the default.) + // The geometry we use is an axis-aligned, uniform hypercube mesh, but in the non-affine tensor case, the CellGeometry object does + // not express the axis-aligned or uniform structure, and the computations are as they would be in a more general hypercube mesh. + // Similarly, the Affine case only assumes an affine mesh; it need not be uniform or axis-aligned. + + string timingsFilePath; + const int spaceDim = 3; enum Mode @@ -501,10 +330,13 @@ int main( int argc, char* argv[] ) Mode mode; vector allAlgorithmChoices {Standard, NonAffineTensor, AffineTensor, Uniform}; + vector allFormulationChoices {Poisson, Hgrad, Hdiv, Hcurl, L2}; Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options string algorithmChoiceString = "All"; // alternatives: Standard, NonAffineTensor, AffineTensor, Uniform + string formulationChoiceString = "Poisson"; + string basisFamilyChoiceString = "Nodal"; int polyOrderFixed = -1; int polyOrderMin = 1; @@ -512,11 +344,22 @@ int main( int argc, char* argv[] ) string modeChoiceString = "Test"; // alternatives: Calibration, BestSerial, BestCuda + bool saveTimingsToFile = false; + string outputDir = "."; + cmdp.setOption("algorithm", &algorithmChoiceString, "Options: All, Standard, NonAffineTensor, AffineTensor, Uniform"); + cmdp.setOption("formulation", &formulationChoiceString, "Options: Poisson, Hgrad, Hdiv, Hcurl, L2"); cmdp.setOption("polyOrder", &polyOrderFixed, "Single polynomial degree to run at"); cmdp.setOption("minPolyOrder", &polyOrderMin, "Starting polynomial degree to run at"); cmdp.setOption("maxPolyOrder", &polyOrderMax, "Maximum polynomial degree to run at"); cmdp.setOption("mode", &modeChoiceString); + cmdp.setOption("basisFamily", &basisFamilyChoiceString, "Options: Nodal, Hierarchical, Serendipity"); + cmdp.setOption("saveTimings", "dontSaveTimings", &saveTimingsToFile, "Save timings to a file in outputDir."); + cmdp.setOption("outputDir", &outputDir, "Directory for saving timings file"); + + Teuchos::RCP timingsFileStream; + + bool success = true; if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) { @@ -525,6 +368,48 @@ int main( int argc, char* argv[] ) #endif return -1; } + + if (saveTimingsToFile) + { + std::ostringstream fileNameStream; + fileNameStream << outputDir << "/"; + fileNameStream << "timings_" << algorithmChoiceString << "_"; + fileNameStream << formulationChoiceString << "_"; + if (polyOrderFixed != -1) + { + fileNameStream << "p" << polyOrderFixed; + } + else + { + fileNameStream << "p" << polyOrderMin << "_to_p" << polyOrderMax << "_"; + } + fileNameStream << modeChoiceString << "_"; + fileNameStream << basisFamilyChoiceString; + fileNameStream << ".dat"; + + timingsFilePath = fileNameStream.str(); + + timingsFileStream = Teuchos::rcp( new std::ofstream(timingsFilePath, std::ios::out) ); + + *timingsFileStream << "Algorithm\t"; + *timingsFileStream << "p\t"; + *timingsFileStream << "Element Count\t"; + *timingsFileStream << "Workset Size\t"; + *timingsFileStream << "mode\t"; + *timingsFileStream << "Basis Family\t"; + *timingsFileStream << "Core Integration Timing\t"; + *timingsFileStream << "Core Integration Flops\t"; + *timingsFileStream << "Core Integration Throughput\t"; + *timingsFileStream << "Jac. Timing\t"; + *timingsFileStream << "Jac. Flops\t"; + *timingsFileStream << "Jac. Throughput\t"; + *timingsFileStream << "Initialization Timing\t"; + *timingsFileStream << "Other Timing\t"; + *timingsFileStream << "Total Time\t"; + *timingsFileStream << "Total Flops\t"; + *timingsFileStream << "Total Throughput"; + *timingsFileStream << std::endl; + } vector algorithmChoices; if (algorithmChoiceString == "All") @@ -556,6 +441,62 @@ int main( int argc, char* argv[] ) return -1; } + vector formulationChoices; + if (formulationChoiceString == "All") + { + formulationChoices = allFormulationChoices; + } + else if (formulationChoiceString == "Poisson") + { + formulationChoices = vector{Poisson}; + } + else if (formulationChoiceString == "Hgrad") + { + formulationChoices = vector{Hgrad}; + } + else if (formulationChoiceString == "Hdiv") + { + formulationChoices = vector{Hdiv}; + } + else if (formulationChoiceString == "Hcurl") + { + formulationChoices = vector{Hcurl}; + } + else if (formulationChoiceString == "L2") + { + formulationChoices = vector{L2}; + } + else + { + cout << "Unrecognized formulation choice: " << formulationChoiceString << endl; +#ifdef HAVE_MPI + MPI_Finalize(); +#endif + return -1; + } + + vector basisFamilyChoices; + if (basisFamilyChoiceString == "Nodal") + { + basisFamilyChoices = vector{Nodal}; + } + else if (basisFamilyChoiceString == "Hierarchical") + { + basisFamilyChoices = vector{Hierarchical}; + } + else if (basisFamilyChoiceString == "Serendipity") + { + basisFamilyChoices = vector{Serendipity}; + } + else + { + cout << "Unrecognized basis family choice: " << basisFamilyChoiceString << endl; +#ifdef HAVE_MPI + MPI_Finalize(); +#endif + return -1; + } + if (polyOrderFixed > 0) { polyOrderMin = polyOrderFixed; @@ -593,6 +534,7 @@ int main( int argc, char* argv[] ) using Scalar = double; using ExecutionSpace = Kokkos::DefaultExecutionSpace; + using DeviceType = Kokkos::DefaultExecutionSpace::device_type; using std::vector; using std::map; @@ -605,27 +547,45 @@ int main( int argc, char* argv[] ) using WorksetForAlgorithmChoice = map; - vector< tuple > polyOrderMeshWidthWorksetTestCases; + vector< tuple,WorksetForAlgorithmChoice> > polyOrderGridDimsWorksetTestCases; + + const int maxStiffnessGB = 2; + const int maxEntryCount = maxStiffnessGB * 1024 * (1024 * 1024 / sizeof(Scalar)); + + const int maxCellCount = 32768; + + map cellCountForPolyOrder; - const int meshWidth = 16; - vector worksetSizes {1,2,4,8,16,32,64,128,256,512,1024,2048,4096}; + map > gridCellCountsForPolyOrder; + for (int p=polyOrderMin; p<=polyOrderMax; p++) + { + int maxBasisCardinality = 1; + for (auto formulationChoice : formulationChoices) + { + for (auto basisFamilyChoice : basisFamilyChoices) + { + auto basis = getHypercubeBasisForFormulation(formulationChoice, basisFamilyChoice, p); + maxBasisCardinality = std::max(basis->getCardinality(), maxBasisCardinality); + } + } + gridCellCountsForPolyOrder[p] = getMeshWidths(maxBasisCardinality, maxEntryCount, maxCellCount); + + int cellCount = 1; + for (int d=0; d maxWorksetSizeForPolyOrder; - maxWorksetSizeForPolyOrder[1] = 4096; - maxWorksetSizeForPolyOrder[2] = 4096; - maxWorksetSizeForPolyOrder[3] = 4096; - maxWorksetSizeForPolyOrder[4] = 4096; - maxWorksetSizeForPolyOrder[5] = 4096; - maxWorksetSizeForPolyOrder[6] = 2048; - maxWorksetSizeForPolyOrder[7] = 1024; - maxWorksetSizeForPolyOrder[8] = 512; + vector worksetSizes {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768}; map minWorksetSizeForPolyOrder; - minWorksetSizeForPolyOrder[1] = 1; - minWorksetSizeForPolyOrder[2] = 1; - minWorksetSizeForPolyOrder[3] = 1; - minWorksetSizeForPolyOrder[4] = 1; + minWorksetSizeForPolyOrder[1] = 256; + minWorksetSizeForPolyOrder[2] = 64; + minWorksetSizeForPolyOrder[3] = 16; + minWorksetSizeForPolyOrder[4] = 4; minWorksetSizeForPolyOrder[5] = 1; minWorksetSizeForPolyOrder[6] = 1; minWorksetSizeForPolyOrder[7] = 1; @@ -638,7 +598,7 @@ int main( int argc, char* argv[] ) { for (int worksetSize : worksetSizes) { - if (worksetSize > maxWorksetSizeForPolyOrder[polyOrder]) + if (worksetSize > cellCountForPolyOrder[polyOrder]) { continue; } @@ -652,13 +612,16 @@ int main( int argc, char* argv[] ) { worksetForAlgorithmChoice[algorithmChoice] = worksetSize; } - polyOrderMeshWidthWorksetTestCases.push_back(tuple{polyOrder,meshWidth,worksetForAlgorithmChoice} ); + auto gridDims = gridCellCountsForPolyOrder[polyOrder]; + polyOrderGridDimsWorksetTestCases.push_back(tuple,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} ); } } } break; case Test: { + // DEBUGGING -- for ease of debugging, a single test case. +// vector< tuple > testCases { tuple {1,1,1} }; // for test run, use the same modestly-sized tuples for each AlgorithmChoice // (note that meshWidth varies here) vector< tuple > testCases { tuple {1,8,512}, @@ -684,45 +647,189 @@ int main( int argc, char* argv[] ) worksetForAlgorithmChoice[algorithmChoice] = std::get<2>(testCase); } worksetForAlgorithmChoice[Uniform] = numCells; - polyOrderMeshWidthWorksetTestCases.push_back(tuple{polyOrder,meshWidth,worksetForAlgorithmChoice} ); + Kokkos::Array gridDims; + for (int d=0; d,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} ); } } break; case BestSerial: { - // manually calibrated workset sizes on iMac Pro (2.3 GHz Xeon W, 18-core, running in serial) - // (these were calibrated without much tuning for the affine tensor case; if/when that happens, will want to recalibrate.) + if (formulationChoices.size() != 1) + { + std::cout << "BestSerial mode is not supported when running multiple formulations.\n"; + exit(-1); + } - map standardWorksetForPolyOrder; - standardWorksetForPolyOrder[1] = 64; - standardWorksetForPolyOrder[2] = 64; - standardWorksetForPolyOrder[3] = 128; - standardWorksetForPolyOrder[4] = 64; - standardWorksetForPolyOrder[5] = 16; - standardWorksetForPolyOrder[6] = 2; - standardWorksetForPolyOrder[7] = 2; - standardWorksetForPolyOrder[8] = 1; + auto formulationChoice = formulationChoices[0]; - // Non-Affine Tensor - map nonAffineTensorWorksetForPolyOrder; - nonAffineTensorWorksetForPolyOrder[1] = 256; - nonAffineTensorWorksetForPolyOrder[2] = 256; - nonAffineTensorWorksetForPolyOrder[3] = 128; - nonAffineTensorWorksetForPolyOrder[4] = 64; - nonAffineTensorWorksetForPolyOrder[5] = 16; - nonAffineTensorWorksetForPolyOrder[6] = 8; - nonAffineTensorWorksetForPolyOrder[7] = 2; - nonAffineTensorWorksetForPolyOrder[8] = 2; + // manually calibrated workset sizes on Mac Pro (2.5 GHz Xeon W, 28-core, running in serial) + map standardWorksetForPolyOrder; + map nonAffineTensorWorksetForPolyOrder; map affineTensorWorksetForPolyOrder; - affineTensorWorksetForPolyOrder[1] = 256; - affineTensorWorksetForPolyOrder[2] = 128; - affineTensorWorksetForPolyOrder[3] = 16; - affineTensorWorksetForPolyOrder[4] = 4; - affineTensorWorksetForPolyOrder[5] = 2; - affineTensorWorksetForPolyOrder[6] = 1; - affineTensorWorksetForPolyOrder[7] = 1; - affineTensorWorksetForPolyOrder[8] = 1; + + switch(formulationChoice) + { + case Poisson: + { + // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 8192; + standardWorksetForPolyOrder[2] = 4096; + standardWorksetForPolyOrder[3] = 64; + standardWorksetForPolyOrder[4] = 16; + standardWorksetForPolyOrder[5] = 16; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 2048; + nonAffineTensorWorksetForPolyOrder[2] = 256; + nonAffineTensorWorksetForPolyOrder[3] = 128; + nonAffineTensorWorksetForPolyOrder[4] = 16; + nonAffineTensorWorksetForPolyOrder[5] = 2; + nonAffineTensorWorksetForPolyOrder[6] = 1; + nonAffineTensorWorksetForPolyOrder[7] = 1; + nonAffineTensorWorksetForPolyOrder[8] = 1; + + affineTensorWorksetForPolyOrder[1] = 4096; + affineTensorWorksetForPolyOrder[2] = 64; + affineTensorWorksetForPolyOrder[3] = 32; + affineTensorWorksetForPolyOrder[4] = 4; + affineTensorWorksetForPolyOrder[5] = 2; + affineTensorWorksetForPolyOrder[6] = 1; + affineTensorWorksetForPolyOrder[7] = 1; + affineTensorWorksetForPolyOrder[8] = 1; + } + break; + case Hgrad: + { + // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 32768; + standardWorksetForPolyOrder[2] = 16384; + standardWorksetForPolyOrder[3] = 512; + standardWorksetForPolyOrder[4] = 512; + standardWorksetForPolyOrder[5] = 512; + standardWorksetForPolyOrder[6] = 2; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 4096; + nonAffineTensorWorksetForPolyOrder[2] = 512; + nonAffineTensorWorksetForPolyOrder[3] = 128; + nonAffineTensorWorksetForPolyOrder[4] = 32; + nonAffineTensorWorksetForPolyOrder[5] = 16; + nonAffineTensorWorksetForPolyOrder[6] = 1; + nonAffineTensorWorksetForPolyOrder[7] = 1; + nonAffineTensorWorksetForPolyOrder[8] = 1; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 512; + affineTensorWorksetForPolyOrder[3] = 128; + affineTensorWorksetForPolyOrder[4] = 64; + affineTensorWorksetForPolyOrder[5] = 16; + affineTensorWorksetForPolyOrder[6] = 1; + affineTensorWorksetForPolyOrder[7] = 1; + affineTensorWorksetForPolyOrder[8] = 1; + } + break; + case Hdiv: + { + // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8 + standardWorksetForPolyOrder[1] = 256; + standardWorksetForPolyOrder[2] = 64; + standardWorksetForPolyOrder[3] = 64; + standardWorksetForPolyOrder[4] = 16; + standardWorksetForPolyOrder[5] = 4; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 4096; + nonAffineTensorWorksetForPolyOrder[2] = 256; + nonAffineTensorWorksetForPolyOrder[3] = 64; + nonAffineTensorWorksetForPolyOrder[4] = 16; + nonAffineTensorWorksetForPolyOrder[5] = 4; + nonAffineTensorWorksetForPolyOrder[6] = 1; + nonAffineTensorWorksetForPolyOrder[7] = 1; + nonAffineTensorWorksetForPolyOrder[8] = 1; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 512; + affineTensorWorksetForPolyOrder[3] = 64; + affineTensorWorksetForPolyOrder[4] = 16; + affineTensorWorksetForPolyOrder[5] = 8; + affineTensorWorksetForPolyOrder[6] = 1; + affineTensorWorksetForPolyOrder[7] = 1; + affineTensorWorksetForPolyOrder[8] = 1; + } + break; + case Hcurl: + { + // best for Hcurl - these are for meshes that range from 32768 for p=1 to 64 for p=8 + standardWorksetForPolyOrder[1] = 1024; + standardWorksetForPolyOrder[2] = 512; + standardWorksetForPolyOrder[3] = 256; + standardWorksetForPolyOrder[4] = 4; + standardWorksetForPolyOrder[5] = 1; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 512; + nonAffineTensorWorksetForPolyOrder[2] = 64; + nonAffineTensorWorksetForPolyOrder[3] = 16; + nonAffineTensorWorksetForPolyOrder[4] = 4; + nonAffineTensorWorksetForPolyOrder[5] = 1; + nonAffineTensorWorksetForPolyOrder[6] = 1; + nonAffineTensorWorksetForPolyOrder[7] = 1; + nonAffineTensorWorksetForPolyOrder[8] = 1; + + affineTensorWorksetForPolyOrder[1] = 1024; + affineTensorWorksetForPolyOrder[2] = 128; + affineTensorWorksetForPolyOrder[3] = 16; + affineTensorWorksetForPolyOrder[4] = 4; + affineTensorWorksetForPolyOrder[5] = 1; + affineTensorWorksetForPolyOrder[6] = 1; + affineTensorWorksetForPolyOrder[7] = 1; + affineTensorWorksetForPolyOrder[8] = 1; + } + break; + case L2: + { + // best for L^2 - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 1024; + standardWorksetForPolyOrder[2] = 256; + standardWorksetForPolyOrder[3] = 64; + standardWorksetForPolyOrder[4] = 16; + standardWorksetForPolyOrder[5] = 16; + standardWorksetForPolyOrder[6] = 16; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 16384; + nonAffineTensorWorksetForPolyOrder[2] = 512; + nonAffineTensorWorksetForPolyOrder[3] = 256; + nonAffineTensorWorksetForPolyOrder[4] = 64; + nonAffineTensorWorksetForPolyOrder[5] = 16; + nonAffineTensorWorksetForPolyOrder[6] = 8; + nonAffineTensorWorksetForPolyOrder[7] = 2; + nonAffineTensorWorksetForPolyOrder[8] = 1; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 1024; + affineTensorWorksetForPolyOrder[3] = 256; + affineTensorWorksetForPolyOrder[4] = 128; + affineTensorWorksetForPolyOrder[5] = 16; + affineTensorWorksetForPolyOrder[6] = 8; + affineTensorWorksetForPolyOrder[7] = 1; + affineTensorWorksetForPolyOrder[8] = 1; + } + break; + } // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size: // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest. @@ -734,76 +841,207 @@ int main( int argc, char* argv[] ) standardWorksetForPolyOrder[polyOrder] = worksetSize; } - int numCells = 1; - for (int d=0; d{polyOrder,meshWidth,worksetForAlgorithmChoice} ); + polyOrderGridDimsWorksetTestCases.push_back(tuple,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} ); } } break; case BestOpenMP_16: { - // manually calibrated workset sizes on iMac Pro (2.3 GHz Xeon W, 18-core, running with OpenMP, OMP_NUM_THREADS=16) + if (formulationChoices.size() != 1) + { + std::cout << "BestOpenMP_16 mode is not supported when running multiple formulations.\n"; + exit(-1); + } + + auto formulationChoice = formulationChoices[0]; + + // manually calibrated workset sizes on Mac Pro (2.5 GHz Xeon W, 28-core, running with OpenMP, OMP_NUM_THREADS=16) // Calibration for sum factorization cases was run while usePointCacheForRank3Tensor = true. - // (these were calibrated without much tuning for the affine tensor case; if/when that happens, will want to recalibrate.) map standardWorksetForPolyOrder; - standardWorksetForPolyOrder[1] = 1024; - standardWorksetForPolyOrder[2] = 128; - standardWorksetForPolyOrder[3] = 128; - standardWorksetForPolyOrder[4] = 256; - standardWorksetForPolyOrder[5] = 8; - standardWorksetForPolyOrder[6] = 2; - standardWorksetForPolyOrder[7] = 2; - standardWorksetForPolyOrder[8] = 1; - - // Non-Affine Tensor map nonAffineTensorWorksetForPolyOrder; - nonAffineTensorWorksetForPolyOrder[1] = 512; - nonAffineTensorWorksetForPolyOrder[2] = 128; - nonAffineTensorWorksetForPolyOrder[3] = 256; - nonAffineTensorWorksetForPolyOrder[4] = 128; - nonAffineTensorWorksetForPolyOrder[5] = 32; - nonAffineTensorWorksetForPolyOrder[6] = 16; - nonAffineTensorWorksetForPolyOrder[7] = 16; - nonAffineTensorWorksetForPolyOrder[8] = 16; - map affineTensorWorksetForPolyOrder; - affineTensorWorksetForPolyOrder[1] = 4096; - affineTensorWorksetForPolyOrder[2] = 256; - affineTensorWorksetForPolyOrder[3] = 512; - affineTensorWorksetForPolyOrder[4] = 128; - affineTensorWorksetForPolyOrder[5] = 64; - affineTensorWorksetForPolyOrder[6] = 32; - affineTensorWorksetForPolyOrder[7] = 16; - affineTensorWorksetForPolyOrder[8] = 64; - // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size: - // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest. - int worksetSize = 1; - for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++) + switch(formulationChoice) { - nonAffineTensorWorksetForPolyOrder[polyOrder] = worksetSize; - affineTensorWorksetForPolyOrder[polyOrder] = worksetSize; - standardWorksetForPolyOrder[polyOrder] = worksetSize; + case Poisson: + { + // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 4096; + standardWorksetForPolyOrder[2] = 2048; + standardWorksetForPolyOrder[3] = 2048; + standardWorksetForPolyOrder[4] = 2048; + standardWorksetForPolyOrder[5] = 2048; + standardWorksetForPolyOrder[6] = 2048; + standardWorksetForPolyOrder[7] = 4; + standardWorksetForPolyOrder[8] = 2; + + nonAffineTensorWorksetForPolyOrder[1] = 2048; + nonAffineTensorWorksetForPolyOrder[2] = 512; + nonAffineTensorWorksetForPolyOrder[3] = 256; + nonAffineTensorWorksetForPolyOrder[4] = 128; + nonAffineTensorWorksetForPolyOrder[5] = 64; + nonAffineTensorWorksetForPolyOrder[6] = 32; + nonAffineTensorWorksetForPolyOrder[7] = 16; + nonAffineTensorWorksetForPolyOrder[8] = 16; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 4096; + affineTensorWorksetForPolyOrder[3] = 1024; + affineTensorWorksetForPolyOrder[4] = 256; + affineTensorWorksetForPolyOrder[5] = 64; + affineTensorWorksetForPolyOrder[6] = 32; + affineTensorWorksetForPolyOrder[7] = 16; + affineTensorWorksetForPolyOrder[8] = 16; + } + break; + case Hgrad: + { + // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 16384; + standardWorksetForPolyOrder[2] = 8192; + standardWorksetForPolyOrder[3] = 8192; + standardWorksetForPolyOrder[4] = 2048; + standardWorksetForPolyOrder[5] = 512; + standardWorksetForPolyOrder[6] = 512; + standardWorksetForPolyOrder[7] = 512; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 16384; + nonAffineTensorWorksetForPolyOrder[2] = 8192; + nonAffineTensorWorksetForPolyOrder[3] = 256; + nonAffineTensorWorksetForPolyOrder[4] = 256; + nonAffineTensorWorksetForPolyOrder[5] = 64; + nonAffineTensorWorksetForPolyOrder[6] = 32; + nonAffineTensorWorksetForPolyOrder[7] = 16; + nonAffineTensorWorksetForPolyOrder[8] = 16; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 4096; + affineTensorWorksetForPolyOrder[3] = 1024; + affineTensorWorksetForPolyOrder[4] = 256; + affineTensorWorksetForPolyOrder[5] = 64; + affineTensorWorksetForPolyOrder[6] = 32; + affineTensorWorksetForPolyOrder[7] = 16; + affineTensorWorksetForPolyOrder[8] = 16; + } + break; + case Hdiv: + { + // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8 + standardWorksetForPolyOrder[1] = 32768; + standardWorksetForPolyOrder[2] = 32768; + standardWorksetForPolyOrder[3] = 512; + standardWorksetForPolyOrder[4] = 256; + standardWorksetForPolyOrder[5] = 64; + standardWorksetForPolyOrder[6] = 2; + standardWorksetForPolyOrder[7] = 2; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 16384; + nonAffineTensorWorksetForPolyOrder[3] = 8192; + nonAffineTensorWorksetForPolyOrder[4] = 64; + nonAffineTensorWorksetForPolyOrder[5] = 16; + nonAffineTensorWorksetForPolyOrder[6] = 16; + nonAffineTensorWorksetForPolyOrder[7] = 16; + nonAffineTensorWorksetForPolyOrder[8] = 16; + + affineTensorWorksetForPolyOrder[1] = 16384; + affineTensorWorksetForPolyOrder[2] = 4096; + affineTensorWorksetForPolyOrder[3] = 256; + affineTensorWorksetForPolyOrder[4] = 128; + affineTensorWorksetForPolyOrder[5] = 64; + affineTensorWorksetForPolyOrder[6] = 16; + affineTensorWorksetForPolyOrder[7] = 16; + affineTensorWorksetForPolyOrder[8] = 16; + } + break; + case Hcurl: + { + // best for Hcurl - these are for meshes that range from 32768 for p=1 to 64 for p=8 + standardWorksetForPolyOrder[1] = 4096; + standardWorksetForPolyOrder[2] = 128; + standardWorksetForPolyOrder[3] = 128; + standardWorksetForPolyOrder[4] = 32; + standardWorksetForPolyOrder[5] = 4; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 16384; + nonAffineTensorWorksetForPolyOrder[2] = 512; + nonAffineTensorWorksetForPolyOrder[3] = 128; + nonAffineTensorWorksetForPolyOrder[4] = 64; + nonAffineTensorWorksetForPolyOrder[5] = 32; + nonAffineTensorWorksetForPolyOrder[6] = 16; + nonAffineTensorWorksetForPolyOrder[7] = 16; + nonAffineTensorWorksetForPolyOrder[8] = 16; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 4096; + affineTensorWorksetForPolyOrder[3] = 128; + affineTensorWorksetForPolyOrder[4] = 64; + affineTensorWorksetForPolyOrder[5] = 16; + affineTensorWorksetForPolyOrder[6] = 16; + affineTensorWorksetForPolyOrder[7] = 16; + affineTensorWorksetForPolyOrder[8] = 16; + } + break; + case L2: + { + // best for L^2 - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 8192; + standardWorksetForPolyOrder[2] = 512; + standardWorksetForPolyOrder[3] = 32; + standardWorksetForPolyOrder[4] = 32; + standardWorksetForPolyOrder[5] = 32; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 16384; + nonAffineTensorWorksetForPolyOrder[2] = 4096; + nonAffineTensorWorksetForPolyOrder[3] = 1024; + nonAffineTensorWorksetForPolyOrder[4] = 256; + nonAffineTensorWorksetForPolyOrder[5] = 64; + nonAffineTensorWorksetForPolyOrder[6] = 32; + nonAffineTensorWorksetForPolyOrder[7] = 16; + nonAffineTensorWorksetForPolyOrder[8] = 16; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 4096; + affineTensorWorksetForPolyOrder[3] = 1024; + affineTensorWorksetForPolyOrder[4] = 256; + affineTensorWorksetForPolyOrder[5] = 128; + affineTensorWorksetForPolyOrder[6] = 32; + affineTensorWorksetForPolyOrder[7] = 16; + affineTensorWorksetForPolyOrder[8] = 16; + } + break; } - int numCells = 1; - for (int d=0; d 8), we try to choose sensible guesses for workset size: + // Standard: 1 is best for polyOrder 8, so it'll be the best for the rest. + // NonAffineTensor, AffineTensor: we seem to bottom out at the number of OpenMP threads: here, 16. + int standardWorksetSize = 1; + int tensorWorksetSize = 16; + for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++) { - numCells *= meshWidth; + nonAffineTensorWorksetForPolyOrder[polyOrder] = tensorWorksetSize; + affineTensorWorksetForPolyOrder[polyOrder] = tensorWorksetSize; + standardWorksetForPolyOrder[polyOrder] = standardWorksetSize; } for (int polyOrder=polyOrderMin; polyOrder<=polyOrderMax; polyOrder++) @@ -811,56 +1049,196 @@ int main( int argc, char* argv[] ) WorksetForAlgorithmChoice worksetForAlgorithmChoice; worksetForAlgorithmChoice[Standard] = standardWorksetForPolyOrder [polyOrder]; worksetForAlgorithmChoice[NonAffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder]; - worksetForAlgorithmChoice[AffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder]; - worksetForAlgorithmChoice[Uniform] = numCells; + worksetForAlgorithmChoice[AffineTensor] = affineTensorWorksetForPolyOrder[polyOrder]; + worksetForAlgorithmChoice[Uniform] = cellCountForPolyOrder[polyOrder]; + const auto & gridDims = gridCellCountsForPolyOrder[polyOrder]; - polyOrderMeshWidthWorksetTestCases.push_back(tuple{polyOrder,meshWidth,worksetForAlgorithmChoice} ); + polyOrderGridDimsWorksetTestCases.push_back(tuple,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} ); } } break; case BestCuda: { { + if (formulationChoices.size() != 1) + { + std::cout << "BestCuda mode is not supported when running multiple formulations.\n"; + exit(-1); + } + + auto formulationChoice = formulationChoices[0]; + // STANDARD - // manually calibrated workset size on P100 (ride) - best for Standard - // these are for 4096-element meshes + // manually calibrated workset size on P100 (weaver) map standardWorksetForPolyOrder; - standardWorksetForPolyOrder[1] = 4096; - standardWorksetForPolyOrder[2] = 1024; - standardWorksetForPolyOrder[3] = 128; - standardWorksetForPolyOrder[4] = 64; - standardWorksetForPolyOrder[5] = 8; - standardWorksetForPolyOrder[6] = 4; - standardWorksetForPolyOrder[7] = 2; - standardWorksetForPolyOrder[8] = 1; - - // Non-Affine Tensor - // For CUDA, 4096 is the best choice for the PointValueCache algorithm for any polyOrder from 1 to 5. - // This likely means we're not exposing enough parallelism within the cell. map nonAffineTensorWorksetForPolyOrder; - nonAffineTensorWorksetForPolyOrder[1] = 4096; - nonAffineTensorWorksetForPolyOrder[2] = 4096; - nonAffineTensorWorksetForPolyOrder[3] = 4096; - nonAffineTensorWorksetForPolyOrder[4] = 4096; - nonAffineTensorWorksetForPolyOrder[5] = 4096; - nonAffineTensorWorksetForPolyOrder[6] = 2048; - nonAffineTensorWorksetForPolyOrder[7] = 512; - nonAffineTensorWorksetForPolyOrder[8] = 512; + map affineTensorWorksetForPolyOrder; - // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size: - int nonAffineWorksetSize = 256; // divide by 2 for each polyOrder beyond 8 - int standardWorksetSize = 1; // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest. - for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++) + switch(formulationChoice) { - nonAffineTensorWorksetForPolyOrder[polyOrder] = nonAffineWorksetSize; - nonAffineWorksetSize = (nonAffineWorksetSize > 1) ? nonAffineWorksetSize / 2 : 1; - standardWorksetForPolyOrder[polyOrder] = standardWorksetSize; + case Poisson: + { + // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 16384; + standardWorksetForPolyOrder[2] = 512; + standardWorksetForPolyOrder[3] = 128; + standardWorksetForPolyOrder[4] = 8; + standardWorksetForPolyOrder[5] = 4; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 32768; + nonAffineTensorWorksetForPolyOrder[3] = 16384; + nonAffineTensorWorksetForPolyOrder[4] = 8192; + nonAffineTensorWorksetForPolyOrder[5] = 4096; + nonAffineTensorWorksetForPolyOrder[6] = 2048; + nonAffineTensorWorksetForPolyOrder[7] = 256; + nonAffineTensorWorksetForPolyOrder[8] = 256; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 8192; + affineTensorWorksetForPolyOrder[3] = 8192; + affineTensorWorksetForPolyOrder[4] = 8192; + affineTensorWorksetForPolyOrder[5] = 4096; + affineTensorWorksetForPolyOrder[6] = 2048; + affineTensorWorksetForPolyOrder[7] = 256; + affineTensorWorksetForPolyOrder[8] = 128; + } + break; + case Hgrad: + { + // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8 + standardWorksetForPolyOrder[1] = 32768; + standardWorksetForPolyOrder[2] = 512; + standardWorksetForPolyOrder[3] = 128; + standardWorksetForPolyOrder[4] = 16; + standardWorksetForPolyOrder[5] = 4; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 32768; + nonAffineTensorWorksetForPolyOrder[3] = 16384; + nonAffineTensorWorksetForPolyOrder[4] = 8192; + nonAffineTensorWorksetForPolyOrder[5] = 4096; + nonAffineTensorWorksetForPolyOrder[6] = 2048; + nonAffineTensorWorksetForPolyOrder[7] = 256; + nonAffineTensorWorksetForPolyOrder[8] = 256; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 32768; + affineTensorWorksetForPolyOrder[3] = 8192; + affineTensorWorksetForPolyOrder[4] = 8192; + affineTensorWorksetForPolyOrder[5] = 4096; + affineTensorWorksetForPolyOrder[6] = 2048; + affineTensorWorksetForPolyOrder[7] = 256; + affineTensorWorksetForPolyOrder[8] = 256; + } + break; + case Hdiv: + { + // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8 + standardWorksetForPolyOrder[1] = 32768; + standardWorksetForPolyOrder[2] = 512; + standardWorksetForPolyOrder[3] = 32; + standardWorksetForPolyOrder[4] = 4; + standardWorksetForPolyOrder[5] = 1; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 32768; + nonAffineTensorWorksetForPolyOrder[3] = 16384; + nonAffineTensorWorksetForPolyOrder[4] = 4096; + nonAffineTensorWorksetForPolyOrder[5] = 1024; + nonAffineTensorWorksetForPolyOrder[6] = 256; + nonAffineTensorWorksetForPolyOrder[7] = 128; + nonAffineTensorWorksetForPolyOrder[8] = 64; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 32768; + affineTensorWorksetForPolyOrder[3] = 16384; + affineTensorWorksetForPolyOrder[4] = 4096; + affineTensorWorksetForPolyOrder[5] = 1024; + affineTensorWorksetForPolyOrder[6] = 256; + affineTensorWorksetForPolyOrder[7] = 128; + affineTensorWorksetForPolyOrder[8] = 64; + } + break; + case Hcurl: + { + standardWorksetForPolyOrder[1] = 1024; + standardWorksetForPolyOrder[2] = 128; + standardWorksetForPolyOrder[3] = 16; + standardWorksetForPolyOrder[4] = 4; + standardWorksetForPolyOrder[5] = 1; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 32768; + nonAffineTensorWorksetForPolyOrder[3] = 8192; + nonAffineTensorWorksetForPolyOrder[4] = 2048; + nonAffineTensorWorksetForPolyOrder[5] = 512; + nonAffineTensorWorksetForPolyOrder[6] = 256; + nonAffineTensorWorksetForPolyOrder[7] = 128; + nonAffineTensorWorksetForPolyOrder[8] = 64; + + affineTensorWorksetForPolyOrder[1] = 32768; + affineTensorWorksetForPolyOrder[2] = 32768; + affineTensorWorksetForPolyOrder[3] = 8192; + affineTensorWorksetForPolyOrder[4] = 2048; + affineTensorWorksetForPolyOrder[5] = 512; + affineTensorWorksetForPolyOrder[6] = 256; + affineTensorWorksetForPolyOrder[7] = 128; + affineTensorWorksetForPolyOrder[8] = 64; + } + break; + case L2: + { + standardWorksetForPolyOrder[1] = 32768; + standardWorksetForPolyOrder[2] = 1024; + standardWorksetForPolyOrder[3] = 128; + standardWorksetForPolyOrder[4] = 16; + standardWorksetForPolyOrder[5] = 4; + standardWorksetForPolyOrder[6] = 1; + standardWorksetForPolyOrder[7] = 1; + standardWorksetForPolyOrder[8] = 1; + + nonAffineTensorWorksetForPolyOrder[1] = 32768; + nonAffineTensorWorksetForPolyOrder[2] = 32768; + nonAffineTensorWorksetForPolyOrder[3] = 16384; + nonAffineTensorWorksetForPolyOrder[4] = 8192; + nonAffineTensorWorksetForPolyOrder[5] = 4096; + nonAffineTensorWorksetForPolyOrder[6] = 2048; + nonAffineTensorWorksetForPolyOrder[7] = 256; + nonAffineTensorWorksetForPolyOrder[8] = 128; + + affineTensorWorksetForPolyOrder[1] = 8192; + affineTensorWorksetForPolyOrder[2] = 8192; + affineTensorWorksetForPolyOrder[3] = 8192; + affineTensorWorksetForPolyOrder[4] = 8192; + affineTensorWorksetForPolyOrder[5] = 4096; + affineTensorWorksetForPolyOrder[6] = 2048; + affineTensorWorksetForPolyOrder[7] = 256; + affineTensorWorksetForPolyOrder[8] = 128; + } + break; } - int numCells = 1; - for (int d=0; d 8), we try to choose sensible guesses for workset size: + int standardWorksetSize = 1; // 1 is best for polyOrder 8, so it'll be the best for the rest. + // for the rest under CUDA, we observe that in most cases, the optimal workset size for non-affine tensor is the cell count. For affine, it's lower by a factor of 2 or 4 in most cases. + for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++) { - numCells *= meshWidth; + nonAffineTensorWorksetForPolyOrder[polyOrder] = cellCountForPolyOrder[polyOrder]; + affineTensorWorksetForPolyOrder[polyOrder] = cellCountForPolyOrder[polyOrder] / 2; + standardWorksetForPolyOrder[polyOrder] = standardWorksetSize; } for (int polyOrder=polyOrderMin; polyOrder<=polyOrderMax; polyOrder++) @@ -868,10 +1246,12 @@ int main( int argc, char* argv[] ) WorksetForAlgorithmChoice worksetForAlgorithmChoice; worksetForAlgorithmChoice[Standard] = standardWorksetForPolyOrder [polyOrder]; worksetForAlgorithmChoice[NonAffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder]; - worksetForAlgorithmChoice[AffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder]; - worksetForAlgorithmChoice[Uniform] = numCells; + worksetForAlgorithmChoice[AffineTensor] = affineTensorWorksetForPolyOrder[polyOrder]; + worksetForAlgorithmChoice[Uniform] = cellCountForPolyOrder[polyOrder]; + + const auto & gridDims = gridCellCountsForPolyOrder[polyOrder]; - polyOrderMeshWidthWorksetTestCases.push_back(tuple{polyOrder,meshWidth,worksetForAlgorithmChoice} ); + polyOrderGridDimsWorksetTestCases.push_back(tuple,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} ); } } break; @@ -883,139 +1263,292 @@ int main( int argc, char* argv[] ) cout << std::setprecision(2) << std::scientific; - map< AlgorithmChoice, map > > maxAlgorithmThroughputForPolyOrder; // values are (throughput in GFlops/sec, worksetSize) + map< AlgorithmChoice, map > > maxAlgorithmThroughputForPolyOrderCore; // values are (throughput in GFlops/sec, worksetSize) + map< AlgorithmChoice, map > > maxAlgorithmThroughputForPolyOrderTotal; // values are (throughput in GFlops/sec, worksetSize) const int charWidth = 15; - for (auto & testCase : polyOrderMeshWidthWorksetTestCases) + for (auto basisFamilyChoice : basisFamilyChoices) { - int polyOrder = std::get<0>(testCase); - int meshWidth = std::get<1>(testCase); - auto worksetSizeMap = std::get<2>(testCase); - std::cout << "\n\n"; - std::cout << "Running with polyOrder = " << polyOrder << ", meshWidth = " << meshWidth << std::endl; - for (auto algorithmChoice : algorithmChoices) + for (auto formulation : formulationChoices) { - int worksetSize = worksetSizeMap[algorithmChoice]; - auto geometry = getMesh(algorithmChoice, meshWidth); - - // timers recorded in performStructuredQuadratureHypercubeGRADGRAD, performStandardQuadratureHypercubeGRADGRAD - auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); - auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); - auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup"); - - jacobianAndCellMeasureTimer->reset(); - fstIntegrateCall->reset(); - initialSetupTimer->reset(); - - double elapsedTimeSeconds = 0; - double jacobianCellMeasureFlopCount = 0; - double transformIntegrateFlopCount = 0; - - if (algorithmChoice == Standard) - { - // each cell needs on the order of polyOrder^N quadrature points, each of which has a Jacobian of size N * N. - auto timer = Teuchos::TimeMonitor::getNewTimer("Standard Integration"); - timer->start(); - performStandardQuadratureHypercubeGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); - timer->stop(); - elapsedTimeSeconds = timer->totalElapsedTime(); - - cout << "Standard, workset size: " << setw(charWidth) << worksetSize << endl; - - timer->reset(); - } - else if (algorithmChoice == AffineTensor) - { - auto timer = Teuchos::TimeMonitor::getNewTimer("Affine tensor Integration"); - timer->start(); - performStructuredQuadratureHypercubeGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); - timer->stop(); - - elapsedTimeSeconds = timer->totalElapsedTime(); - - cout << "Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl; - - timer->reset(); - } - else if (algorithmChoice == NonAffineTensor) + std::cout << "\n\n***** Formulation: " << to_string(formulation) << " *******\n"; + for (auto & testCase : polyOrderGridDimsWorksetTestCases) { - auto timer = Teuchos::TimeMonitor::getNewTimer("Non-affine tensor Integration"); - timer->start(); - performStructuredQuadratureHypercubeGRADGRAD(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); - timer->stop(); - - elapsedTimeSeconds = timer->totalElapsedTime(); - - cout << "Non-Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl; - - timer->reset(); - } - else if (algorithmChoice == Uniform) - { - // for uniform, override worksetSize: no loss in taking maximal worksetSize - int numCells = 1; + int polyOrder = std::get<0>(testCase); + auto gridDims = std::get<1>(testCase); + auto worksetSizeMap = std::get<2>(testCase); + std::cout << "\n\n"; + std::cout << "Running with polyOrder = " << polyOrder << ", mesh dims = "; for (int d=0; dstart(); - performStructuredQuadratureHypercubeGRADGRAD(geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); - timer->stop(); - - elapsedTimeSeconds = timer->totalElapsedTime(); - cout << "Uniform, workset size: " << setw(charWidth) << worksetSize << endl; + std::map > assembledMatrices; + for (auto algorithmChoice : algorithmChoices) + { + int worksetSize = worksetSizeMap[algorithmChoice]; + if (mode == Calibration) + { + // if this workset size is bigger than the optimal for p-1, skip it -- it's highly + // unlikely that for a larger p, the optimal workset size will be *larger*. + const auto & bestThroughputs = maxAlgorithmThroughputForPolyOrderCore[algorithmChoice]; + if (bestThroughputs.find(polyOrder-1) != bestThroughputs.end() ) + { + int bestWorksetSize = bestThroughputs.find(polyOrder-1)->second.second; + if (bestWorksetSize < worksetSize) + { + continue; + } + } + } + auto geometry = getMesh(algorithmChoice, gridDims); + + // timers recorded in performStructuredQuadratureGRADGRAD, performStandardQuadratureGRADGRAD + auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians"); + auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()"); + auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup"); + + jacobianAndCellMeasureTimer->reset(); + fstIntegrateCall->reset(); + initialSetupTimer->reset(); + + double elapsedTimeSeconds = 0; + double jacobianCellMeasureFlopCount = 0; + double transformIntegrateFlopCount = 0; + + Intrepid2::ScalarView assembledMatrix; + if (algorithmChoice == Standard) + { + // each cell needs on the order of polyOrder^N quadrature points, each of which has a Jacobian of size N * N. + auto timer = Teuchos::TimeMonitor::getNewTimer("Standard Integration"); + timer->start(); + switch (basisFamilyChoice) + { + case Nodal: + { + using BasisFamily = DerivedNodalBasisFamily; + assembledMatrix = performStandardQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Hierarchical: + { + using BasisFamily = HierarchicalBasisFamily; + assembledMatrix = performStandardQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Serendipity: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented"); + } + timer->stop(); + elapsedTimeSeconds = timer->totalElapsedTime(); + + cout << "Standard, workset size: " << setw(charWidth) << worksetSize << endl; + + timer->reset(); + } + else if (algorithmChoice == AffineTensor) + { + auto timer = Teuchos::TimeMonitor::getNewTimer("Affine tensor Integration"); + timer->start(); + switch (basisFamilyChoice) + { + case Nodal: + { + using BasisFamily = DerivedNodalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Hierarchical: + { + using BasisFamily = HierarchicalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Serendipity: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented"); + } + timer->stop(); + + elapsedTimeSeconds = timer->totalElapsedTime(); + + cout << "Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl; + + timer->reset(); + } + else if (algorithmChoice == NonAffineTensor) + { + auto timer = Teuchos::TimeMonitor::getNewTimer("Non-affine tensor Integration"); + timer->start(); + switch (basisFamilyChoice) + { + case Nodal: + { + using BasisFamily = DerivedNodalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Hierarchical: + { + using BasisFamily = HierarchicalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Serendipity: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented"); + } + timer->stop(); + + elapsedTimeSeconds = timer->totalElapsedTime(); + + cout << "Non-Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl; + + timer->reset(); + } + else if (algorithmChoice == Uniform) + { + // for uniform, override worksetSize: no loss in taking maximal worksetSize + int numCells = 1; + for (int d=0; dstart(); + switch (basisFamilyChoice) + { + case Nodal: + { + using BasisFamily = DerivedNodalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Hierarchical: + { + using BasisFamily = HierarchicalBasisFamily; + assembledMatrix = performStructuredQuadrature(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount); + } + break; + case Serendipity: + INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented"); + } + timer->stop(); + + elapsedTimeSeconds = timer->totalElapsedTime(); + + cout << "Uniform, workset size: " << setw(charWidth) << worksetSize << endl; + + timer->reset(); + } + + assembledMatrices[algorithmChoice] = assembledMatrix; + + const double approximateFlopCountTotal = transformIntegrateFlopCount + jacobianCellMeasureFlopCount; + const double overallThroughputInGFlops = approximateFlopCountTotal / elapsedTimeSeconds / 1.0e9; + + const double previousMaxThroughput = maxAlgorithmThroughputForPolyOrderTotal[algorithmChoice][polyOrder].first; + if (overallThroughputInGFlops > previousMaxThroughput) + { + maxAlgorithmThroughputForPolyOrderTotal[algorithmChoice][polyOrder] = make_pair(overallThroughputInGFlops,worksetSize); + } + + // timing details + double integrateCallTime = fstIntegrateCall->totalElapsedTime(); + double integrateCallPercentage = integrateCallTime / elapsedTimeSeconds * 100.0; + double jacobianTime = jacobianAndCellMeasureTimer->totalElapsedTime(); + double jacobianPercentage = jacobianTime / elapsedTimeSeconds * 100.0; + double initialSetupTime = initialSetupTimer->totalElapsedTime(); + double initialSetupPercentage = initialSetupTime / elapsedTimeSeconds * 100.0; + double remainingTime = elapsedTimeSeconds - (integrateCallTime + jacobianTime + initialSetupTime); + double remainingPercentage = remainingTime / elapsedTimeSeconds * 100.0; + + const double transformIntegrateThroughputInGFlops = transformIntegrateFlopCount / integrateCallTime / 1.0e9; + const double jacobiansThroughputInGFlops = jacobianCellMeasureFlopCount / jacobianTime / 1.0e9; + + const double previousMaxThroughputCore = maxAlgorithmThroughputForPolyOrderCore[algorithmChoice][polyOrder].first; + if (transformIntegrateThroughputInGFlops > previousMaxThroughputCore) + { + maxAlgorithmThroughputForPolyOrderCore[algorithmChoice][polyOrder] = make_pair(transformIntegrateThroughputInGFlops,worksetSize); + } + + cout << "Time (core integration) " << setw(charWidth) << std::scientific << integrateCallTime << " seconds (" << std::fixed << integrateCallPercentage << "%)." << endl; + cout << "flop estimate (core): " << setw(charWidth) << std::scientific << transformIntegrateFlopCount << endl; + cout << "estimated throughput (core): " << setw(charWidth) << std::scientific << transformIntegrateThroughputInGFlops << " GFlops" << endl; + cout << "************************************************" << endl; + cout << std::fixed; + cout << "Time (Jacobians) " << setw(charWidth) << std::scientific << jacobianTime << " seconds (" << std::fixed << jacobianPercentage << "%)." << endl; + cout << "flop estimate (Jacobians): " << setw(charWidth) << std::scientific << jacobianCellMeasureFlopCount << endl; + cout << "estimated throughput (Jac.): " << setw(charWidth) << std::scientific << jacobiansThroughputInGFlops << " GFlops" << endl; + cout << "Time (initial setup) " << setw(charWidth) << std::scientific << initialSetupTime << " seconds (" << std::fixed << initialSetupPercentage << "%)." << endl; + cout << "Time (other) " << setw(charWidth) << std::scientific << remainingTime << " seconds (" << std::fixed << remainingPercentage << "%)." << endl; + cout << "Time (total): " << setw(charWidth) << std::scientific << elapsedTimeSeconds << " seconds.\n"; + cout << "flop estimate (total): " << setw(charWidth) << std::scientific << approximateFlopCountTotal << endl; + cout << "estimated throughput (total): " << setw(charWidth) << std::scientific << overallThroughputInGFlops << " GFlops" << endl; + + cout << endl; + + if (saveTimingsToFile) + { + *timingsFileStream << std::scientific; + + *timingsFileStream << to_string(algorithmChoice) << "\t"; + *timingsFileStream << polyOrder << "\t"; + *timingsFileStream << geometry.numCells() << "\t"; + *timingsFileStream << worksetSize << "\t"; + *timingsFileStream << modeChoiceString << "\t"; + *timingsFileStream << basisFamilyChoiceString << "\t"; + *timingsFileStream << integrateCallTime << "\t"; + *timingsFileStream << transformIntegrateFlopCount << "\t"; + *timingsFileStream << transformIntegrateThroughputInGFlops << "\t"; + *timingsFileStream << jacobianTime << "\t"; + *timingsFileStream << jacobianCellMeasureFlopCount << "\t"; + *timingsFileStream << jacobiansThroughputInGFlops << "\t"; + *timingsFileStream << initialSetupTime << "\t"; + *timingsFileStream << remainingTime << "\t"; + *timingsFileStream << elapsedTimeSeconds << "\t"; + *timingsFileStream << approximateFlopCountTotal << "\t"; + *timingsFileStream << overallThroughputInGFlops; + *timingsFileStream << std::endl; + } + } - timer->reset(); - } - - const double approximateFlopCountTotal = transformIntegrateFlopCount + jacobianCellMeasureFlopCount; - const double overallThroughputInGFlops = approximateFlopCountTotal / elapsedTimeSeconds / 1.0e9; - - const double previousMaxThroughput = maxAlgorithmThroughputForPolyOrder[algorithmChoice][polyOrder].first; - if (overallThroughputInGFlops > previousMaxThroughput) - { - maxAlgorithmThroughputForPolyOrder[algorithmChoice][polyOrder] = make_pair(overallThroughputInGFlops,worksetSize); + if (assembledMatrices.size() > 1) + { + // if we have multiple, then let's compare values to make sure they agree. + Teuchos::basic_FancyOStream out(Teuchos::rcp(&std::cout,false)); + const double relTol = 1e-10; // pretty loose tolerances are required, especially for higher-order hierarchical comparisons to Standard + const double absTol = 1e-8; + auto firstAlgorithm = assembledMatrices.begin()->first; + auto firstMatrix = assembledMatrices.begin()->second; + std::string algorithmName1 = to_string(firstAlgorithm); + + for (const auto &entry : assembledMatrices) + { + auto secondAlgorithm = entry.first; + auto secondMatrix = entry.second; + std::string algorithmName2 = to_string(secondAlgorithm); + testViewFloatingEquality(firstMatrix, secondMatrix, relTol, absTol, out, success, algorithmName1, algorithmName2); +// printFunctor3(firstMatrix, std::cout, algorithmName1); +// printFunctor3(secondMatrix, std::cout, algorithmName2); + } + } } - - // timing details - double integrateCallTime = fstIntegrateCall->totalElapsedTime(); - double integrateCallPercentage = integrateCallTime / elapsedTimeSeconds * 100.0; - double jacobianTime = jacobianAndCellMeasureTimer->totalElapsedTime(); - double jacobianPercentage = jacobianTime / elapsedTimeSeconds * 100.0; - double initialSetupTime = initialSetupTimer->totalElapsedTime(); - double initialSetupPercentage = initialSetupTime / elapsedTimeSeconds * 100.0; - double remainingTime = elapsedTimeSeconds - (integrateCallTime + jacobianTime + initialSetupTime); - double remainingPercentage = remainingTime / elapsedTimeSeconds * 100.0; - - const double transformIntegrateThroughputInGFlops = transformIntegrateFlopCount / integrateCallTime / 1.0e9; - const double jacobiansThroughputInGFlops = jacobianCellMeasureFlopCount / jacobianTime / 1.0e9; - cout << "Time (core integration) " << setw(charWidth) << std::scientific << integrateCallTime << " seconds (" << std::fixed << integrateCallPercentage << "%)." << endl; - cout << "flop estimate (core): " << setw(charWidth) << std::scientific << transformIntegrateFlopCount << endl; - cout << "estimated throughput (core): " << setw(charWidth) << std::scientific << transformIntegrateThroughputInGFlops << " GFlops" << endl; - cout << std::fixed; - cout << "Time (Jacobians) " << setw(charWidth) << std::scientific << jacobianTime << " seconds (" << std::fixed << jacobianPercentage << "%)." << endl; - cout << "flop estimate (Jacobians): " << setw(charWidth) << std::scientific << jacobianCellMeasureFlopCount << endl; - cout << "estimated throughput (Jac.): " << setw(charWidth) << std::scientific << jacobiansThroughputInGFlops << " GFlops" << endl; - cout << "Time (initial setup) " << setw(charWidth) << std::scientific << initialSetupTime << " seconds (" << std::fixed << initialSetupPercentage << "%)." << endl; - cout << "Time (other) " << setw(charWidth) << std::scientific << remainingTime << " seconds (" << std::fixed << remainingPercentage << "%)." << endl; - cout << "Time (total): " << setw(charWidth) << std::scientific << elapsedTimeSeconds << " seconds.\n"; - cout << "flop estimate (total): " << setw(charWidth) << std::scientific << approximateFlopCountTotal << endl; - cout << "estimated throughput (total): " << setw(charWidth) << std::scientific << overallThroughputInGFlops << " GFlops" << endl; - - cout << endl; } - } + } // basisFamilyChoices if (mode == Calibration) { + cout << "Best workset sizes (as determined by 'core integration' throughput, which includes basis transforms, but not setup and/or Jacobian computations):\n"; for (auto & algorithmChoice : algorithmChoices) { + if (algorithmChoice == Uniform) continue; // workset size is not meaningful for uniform (workset is always effectively one cell, or all cells, depending on how you choose to frame it). + cout << "Best workset sizes for " << to_string(algorithmChoice) << ":" << endl; - for (auto & maxThroughputEntry : maxAlgorithmThroughputForPolyOrder[algorithmChoice]) + for (auto & maxThroughputEntry : maxAlgorithmThroughputForPolyOrderCore[algorithmChoice]) { int polyOrder = maxThroughputEntry.first; int worksetSize = maxThroughputEntry.second.second; @@ -1024,7 +1557,14 @@ int main( int argc, char* argv[] ) } } } + if (success) + { + return 0; + } + else + { + std::cout << "ERROR: Assembled matrices did *NOT* match across algorithms.\n"; + return -1; + } } - - return 0; } diff --git a/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp b/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp index 18e4fb0010e9..2d328cb44de0 100644 --- a/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp @@ -151,18 +151,37 @@ lclNormImpl (const RV& normsOut, } else { // lclNumRows != 0 if (constantStride) { - if (whichNorm == NORM_INF) { - KokkosBlas::nrminf (normsOut, X); - } - else if (whichNorm == NORM_ONE) { - KokkosBlas::nrm1 (normsOut, X); - } - else if (whichNorm == NORM_TWO) { - KokkosBlas::nrm2_squared (normsOut, X); + if (X.extent(1) == 1) { + auto normsOut_0d = Kokkos::subview (normsOut, 0); + auto X_1d = Kokkos::subview (X, Kokkos::ALL(), 0); + if (whichNorm == NORM_INF) { + KokkosBlas::nrminf (normsOut_0d, X_1d); + } + else if (whichNorm == NORM_ONE) { + KokkosBlas::nrm1 (normsOut_0d, X_1d); + } + else if (whichNorm == NORM_TWO) { + KokkosBlas::nrm2_squared (normsOut_0d, X_1d); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "Should never get here!"); + } } else { - TEUCHOS_TEST_FOR_EXCEPTION - (true, std::logic_error, "Should never get here!"); + if (whichNorm == NORM_INF) { + KokkosBlas::nrminf (normsOut, X); + } + else if (whichNorm == NORM_ONE) { + KokkosBlas::nrm1 (normsOut, X); + } + else if (whichNorm == NORM_TWO) { + KokkosBlas::nrm2_squared (normsOut, X); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, "Should never get here!"); + } } } else { // not constant stride