Skip to content

Commit

Permalink
Merge 'trilinos/Trilinos:trilinos/develop' (13b95f6) into 'EM-Plasma/…
Browse files Browse the repository at this point in the history
…Trilinos:develop' (01a4ee0).

* trilinos/develop:
  Intrepid2: Support Full Exact Sequence in Structured Integration (trilinos#10419)
  Tpetra: Use numVectors==1 specialization for local parts of norms
  • Loading branch information
EMPIRE Jenkins committed Apr 24, 2022
2 parents 01a4ee0 + 13b95f6 commit 9431d42
Show file tree
Hide file tree
Showing 37 changed files with 4,526 additions and 1,491 deletions.
151 changes: 151 additions & 0 deletions packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
//
// GRADGRADStandardAssembly.hpp
// Trilinos
//
// Created by Roberts, Nathan V on 7/2/21.
//

#ifndef Intrepid2_GRADGRADStandardAssembly_hpp
#define Intrepid2_GRADGRADStandardAssembly_hpp

#include "JacobianFlopEstimate.hpp"

/** \file GRADGRADStandardAssembly.hpp
\brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
*/

//! Version that uses the classic, generic Intrepid2 paths.
template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
const int &polyOrder, int worksetSize,
double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
{
using ExecutionSpace = typename DeviceType::execution_space;
int numVertices = 1;
for (int d=0; d<spaceDim; d++)
{
numVertices *= 2;
}

auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
initialSetupTimer->start();

using CellTools = Intrepid2::CellTools<DeviceType>;
using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<DeviceType>;

using namespace Intrepid2;

using namespace std;
// dimensions of the returned view are (C,F,F)
auto fs = FUNCTION_SPACE_HGRAD;

shards::CellTopology cellTopo = geometry.cellTopology();

auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);

int numFields = basis->getCardinality();
int numCells = geometry.numCells();

if (worksetSize > numCells) worksetSize = numCells;

// local stiffness matrices:
ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);

auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
int numPoints = cubature->getNumPoints();
ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);

cubature->getCubature(cubaturePoints, cubatureWeights);

const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices);
const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);

// Allocate some intermediate containers
ScalarView<Scalar,DeviceType> basisValues ("basis values", numFields, numPoints );
ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);

ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);

basis->getValues(basisValues, cubaturePoints, OPERATOR_VALUE );
basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD );

const int numNodesPerCell = geometry.numNodesPerCell();
ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
KOKKOS_LAMBDA (const int &cellOrdinal) {
for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
{
for (int d=0; d<spaceDim; d++)
{
expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
}
}
});

ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);

initialSetupTimer->stop();

transformIntegrateFlopCount = 0;
jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)

int cellOffset = 0;
while (cellOffset < numCells)
{
int startCell = cellOffset;
int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;

std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());

if (numCellsInWorkset != worksetSize)
{
Kokkos::resize(jacobian, numCellsInWorkset, numPoints, spaceDim, spaceDim);
Kokkos::resize(jacobianInverse, numCellsInWorkset, numPoints, spaceDim, spaceDim);
Kokkos::resize(jacobianDeterminant, numCellsInWorkset, numPoints);
Kokkos::resize(cellMeasures, numCellsInWorkset, numPoints);
Kokkos::resize(transformedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
}
jacobianAndCellMeasureTimer->start();
CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
CellTools::setJacobianInv(jacobianInverse, jacobian);
CellTools::setJacobianDet(jacobianDeterminant, jacobian);

FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
ExecutionSpace().fence();
jacobianAndCellMeasureTimer->stop();

// because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
fstIntegrateCall->start();
FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.

auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());

FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
ExecutionSpace().fence();
fstIntegrateCall->stop();

transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction.

cellOffset += worksetSize;
}
// std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
return cellStiffness;
}

#endif /* GRADGRADStandardAssembly_h */
153 changes: 153 additions & 0 deletions packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
//
// GRADGRADStructuredAssembly.hpp
// Trilinos
//
// Created by Roberts, Nathan V on 7/2/21.
//

#ifndef GRADGRADStructuredAssembly_h
#define GRADGRADStructuredAssembly_h

#include "JacobianFlopEstimate.hpp"

/** \file GRADGRADStructuredAssembly.hpp
\brief Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
*/

//! Version that takes advantage of new structured integration support, including sum factorization.
template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
{
using namespace Intrepid2;

using ExecutionSpace = typename DeviceType::execution_space;

int numVertices = 1;
for (int d=0; d<spaceDim; d++)
{
numVertices *= 2;
}

auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
initialSetupTimer->start();
using namespace std;
using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
using IntegrationTools = IntegrationTools<DeviceType>;
// dimensions of the returned view are (C,F,F)
auto fs = FUNCTION_SPACE_HGRAD;

shards::CellTopology cellTopo = geometry.cellTopology();

auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);

int numFields = basis->getCardinality();
int numCells = geometry.numCells();

// local stiffness matrix:
ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);

auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
auto tensorCubatureWeights = cubature->allocateCubatureWeights();
TensorPoints<PointScalar,DeviceType> tensorCubaturePoints = cubature->allocateCubaturePoints();

cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);

EOperator op = OPERATOR_GRAD;
BasisValues<Scalar,DeviceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op);
basis->getValues(gradientValues, tensorCubaturePoints, op);

// goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell

int cellOffset = 0;

auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");

Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);

// lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);

const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases

// TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
const double flopsPerJacobianPerCell = flopsPerJacobian(spaceDim, numPoints, numVertices);
const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);

transformIntegrateFlopCount = 0;
jacobianCellMeasureFlopCount = numCells * flopsPerJacobianPerCell; // jacobian itself
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)

auto refData = geometry.getJacobianRefData(tensorCubaturePoints);

initialSetupTimer->stop();
while (cellOffset < numCells)
{
int startCell = cellOffset;
int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
int endCell = numCellsInWorkset + startCell;

jacobianAndCellMeasureTimer->start();
if (numCellsInWorkset != worksetSize)
{
const int CELL_DIM = 0; // first dimension corresponds to cell
jacobian.setExtent( CELL_DIM, numCellsInWorkset);
jacobianDet.setExtent( CELL_DIM, numCellsInWorkset);
jacobianInv.setExtent( CELL_DIM, numCellsInWorkset);
integralData.setExtent(CELL_DIM, numCellsInWorkset);

// cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
}

geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);

// lazily-evaluated transformed gradient values:
auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);

geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
ExecutionSpace().fence();
jacobianAndCellMeasureTimer->stop();

bool sumInto = false;
double approximateFlopCountIntegrateWorkset = 0;
fstIntegrateCall->start();
IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset);
ExecutionSpace().fence();
fstIntegrateCall->stop();

// copy into cellStiffness container. (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
if (integralData.getUnderlyingViewRank() == 3)
{
std::pair<int,int> cellRange = {startCell, endCell};
auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
}
else // underlying view rank is 2; copy to each cell in destination stiffness matrix
{
auto integralView2 = integralData.getUnderlyingView2();
auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
Kokkos::parallel_for("copy uniform data to expanded container", policy,
KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
});
}

transformIntegrateFlopCount += approximateFlopCountIntegrateWorkset;

cellOffset += worksetSize;
}
return cellStiffness;
}

#endif /* GRADGRADStructuredAssembly_h */
Loading

0 comments on commit 9431d42

Please sign in to comment.