diff --git a/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
new file mode 100644
index 000000000000..19ea89d94b4f
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/GRADGRADStandardAssembly.hpp
@@ -0,0 +1,151 @@
+//
+//  GRADGRADStandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 7/2/21.
+//
+
+#ifndef Intrepid2_GRADGRADStandardAssembly_hpp
+#define Intrepid2_GRADGRADStandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   GRADGRADStandardAssembly.hpp
+    \brief  Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                              const int &polyOrder, int worksetSize,
+                                                                              double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using ExecutionSpace = typename DeviceType::execution_space;
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = Intrepid2::CellTools<DeviceType>;
+  using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<DeviceType>;
+  
+  using namespace Intrepid2;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HGRAD;
+
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints );
+  ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
+
+  ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);
+  
+  basis->getValues(basisValues,     cubaturePoints, OPERATOR_VALUE );
+  basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD  );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                      numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianInverse,               numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,           numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                  numCellsInWorkset, numPoints);
+      Kokkos::resize(transformedGradValues,         numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianInv(jacobianInverse, jacobian);
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
+        
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction.
+    
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* GRADGRADStandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp
new file mode 100644
index 000000000000..7ca60a2b5fd2
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/GRADGRADStructuredAssembly.hpp
@@ -0,0 +1,153 @@
+//
+//  GRADGRADStructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 7/2/21.
+//
+
+#ifndef GRADGRADStructuredAssembly_h
+#define GRADGRADStructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   GRADGRADStructuredAssembly.hpp
+    \brief  Locally assembles a Poisson matrix -- an array of shape (C,F,F), with formulation (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+//! Version that takes advantage of new structured integration support, including sum factorization.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureGRADGRAD(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                             double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using namespace std;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HGRAD;
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+  
+  EOperator op = OPERATOR_GRAD;
+  BasisValues<Scalar,DeviceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op);
+  basis->getValues(gradientValues, tensorCubaturePoints, op);
+  
+  // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell
+    
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);
+  
+  // lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
+  auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);
+  
+  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent(    CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent( CELL_DIM, numCellsInWorkset);
+      jacobianInv.setExtent( CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(CELL_DIM, numCellsInWorkset);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
+    CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);
+    
+    // lazily-evaluated transformed gradient values:
+    auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
+    
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    bool sumInto = false;
+    double approximateFlopCountIntegrateWorkset = 0;
+    fstIntegrateCall->start();
+    IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
+    if (integralData.getUnderlyingViewRank() == 3)
+    {
+      std::pair<int,int> cellRange = {startCell, endCell};
+      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+    }
+    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+    {
+      auto integralView2 = integralData.getUnderlyingView2();
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
+      Kokkos::parallel_for("copy uniform data to expanded container", policy,
+                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
+        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
+      });
+    }
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorkset;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* GRADGRADStructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp
new file mode 100644
index 000000000000..fadaff6111c6
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/H1StandardAssembly.hpp
@@ -0,0 +1,165 @@
+//
+//  H1StandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/1/22.
+//
+
+#ifndef Intrepid2_H1StandardAssembly_hpp
+#define Intrepid2_H1StandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   H1StandardAssembly.hpp
+    \brief  Locally assembles a matrix with the H^1 natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (grad e_i, grad e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureH1(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                     const int &polyOrder, int worksetSize,
+                                                                     double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = CellTools<DeviceType>;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HGRAD;
+
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints );
+  ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
+
+  ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);
+  
+  ScalarView<Scalar,DeviceType> transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints);
+  ScalarView<Scalar,DeviceType> transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints);
+  
+  basis->getValues(basisValues,     cubaturePoints, OPERATOR_VALUE );
+  basis->getValues(basisGradValues, cubaturePoints, OPERATOR_GRAD  );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                      numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianInverse,               numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,           numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                  numCellsInWorkset, numPoints);
+      Kokkos::resize(transformedBasisValues,        numCellsInWorkset, numFields, numPoints);
+      Kokkos::resize(transformedGradValues,         numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedWeightedBasisValues,numCellsInWorkset, numFields, numPoints);
+      Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianInv(jacobianInverse, jacobian);
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
+        
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
+    ExecutionSpace().fence();
+    
+    FunctionSpaceTools::HGRADtransformVALUE(transformedBasisValues, basisValues);
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
+    bool sumInto = true; // add the (value,value) integral to the (grad,grad) that we've already integrated
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // record flops for (grad,grad)
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1.
+    // for (value,value):
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction.
+    
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* H1StandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp b/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp
new file mode 100644
index 000000000000..1ec823b406be
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/H1StructuredAssembly.hpp
@@ -0,0 +1,166 @@
+//
+//  H1StructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/1/22.
+//
+
+#ifndef H1StructuredAssembly_h
+#define H1StructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   H1StructuredAssembly.hpp
+    \brief  Locally assembles a matrix with the H^1 natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (grad e_i, grad e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+//! Version that takes advantage of new structured integration support, including sum factorization.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureH1(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                       double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using namespace std;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HGRAD;
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+  
+  EOperator opGrad = OPERATOR_GRAD;
+  BasisValues<Scalar,DeviceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, opGrad);
+  basis->getValues(gradientValues, tensorCubaturePoints, opGrad);
+  
+  EOperator opValue = OPERATOR_VALUE;
+  BasisValues<Scalar,DeviceType> basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue);
+  basis->getValues(basisValues, tensorCubaturePoints, opValue);
+  
+  // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell
+    
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);
+  
+  // lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
+  auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);
+  
+  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent(    CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent( CELL_DIM, numCellsInWorkset);
+      jacobianInv.setExtent( CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(CELL_DIM, numCellsInWorkset);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
+    CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);
+    
+    // lazily-evaluated transformed gradient values:
+    auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues);
+    
+    auto transformedBasisValues = FunctionSpaceTools::getHGRADtransformVALUE(numCellsInWorkset, basisValues);
+    
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    bool sumInto = false;
+    double approximateFlopCountIntegrateWorksetGRADGRAD = 0;
+    double approximateFlopCountIntegrateWorksetVALUEVALUE = 0;
+    fstIntegrateCall->start();
+    // (grad,grad)
+    IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorksetGRADGRAD);
+    
+    // (value,value)
+    sumInto = true; // add to what we already accumulated for (grad,grad)
+    IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE);
+    
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
+    if (integralData.getUnderlyingViewRank() == 3)
+    {
+      std::pair<int,int> cellRange = {startCell, endCell};
+      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+    }
+    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+    {
+      auto integralView2 = integralData.getUnderlyingView2();
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
+      Kokkos::parallel_for("copy uniform data to expanded container", policy,
+                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
+        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
+      });
+    }
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorksetGRADGRAD + approximateFlopCountIntegrateWorksetVALUEVALUE;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* GRADGRADStructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp
new file mode 100644
index 000000000000..2d12f1d21cb3
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HCURLStandardAssembly.hpp
@@ -0,0 +1,185 @@
+//
+//  HCURLStandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/24/22.
+//
+
+#ifndef Intrepid2_HCURLStandardAssembly_hpp
+#define Intrepid2_HCURLStandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HCURLStandardAssembly.hpp
+    \brief  Locally assembles a matrix with the H(curl) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (curl e_i, curl e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHCURL(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                        const int &polyOrder, int worksetSize,
+                                                                        double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = CellTools<DeviceType>;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HCURL;
+
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints, spaceDim );
+  
+  ScalarView<Scalar,DeviceType> basisCurlValues, transformedCurlValues, transformedWeightedCurlValues;
+  if (spaceDim == 2)
+  {
+    // curl in 2D is a scalar:
+    basisCurlValues = ScalarView<Scalar,DeviceType>("basis curl values", numFields, numPoints);
+    transformedCurlValues = ScalarView<Scalar,DeviceType> ("transformed curl values", worksetSize, numFields, numPoints);
+    transformedWeightedCurlValues = ScalarView<Scalar,DeviceType> ("transformed weighted curl values", worksetSize, numFields, numPoints);
+    
+  }
+  else
+  {
+    // curl in 3D is a vector
+    basisCurlValues = ScalarView<Scalar,DeviceType>("basis curl values", numFields, numPoints, spaceDim);
+    transformedCurlValues = ScalarView<Scalar,DeviceType> ("transformed curl values", worksetSize, numFields, numPoints, spaceDim);
+    transformedWeightedCurlValues = ScalarView<Scalar,DeviceType> ("transformed weighted curl values", worksetSize, numFields, numPoints, spaceDim);
+  }
+  
+  ScalarView<Scalar,DeviceType> transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints, spaceDim);
+  
+  basis->getValues(basisValues,     cubaturePoints, OPERATOR_VALUE );
+  basis->getValues(basisCurlValues, cubaturePoints, OPERATOR_CURL  );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                       numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianInverse,                numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,            numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                   numCellsInWorkset, numPoints);
+      Kokkos::resize(transformedBasisValues,         numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedWeightedBasisValues, numCellsInWorkset, numFields, numPoints, spaceDim);
+      if (spaceDim == 2)
+      {
+        Kokkos::resize(transformedCurlValues,          numCellsInWorkset, numFields, numPoints);
+        Kokkos::resize(transformedWeightedCurlValues,  numCellsInWorkset, numFields, numPoints);
+      }
+      else
+      {
+        Kokkos::resize(transformedCurlValues,          numCellsInWorkset, numFields, numPoints, spaceDim);
+        Kokkos::resize(transformedWeightedCurlValues,  numCellsInWorkset, numFields, numPoints, spaceDim);
+      }
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianInv(jacobianInverse, jacobian);
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+        
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    FunctionSpaceTools::HCURLtransformCURL(transformedCurlValues, jacobian, jacobianDeterminant, basisCurlValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedCurlValues, cellMeasures, transformedCurlValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedCurlValues: one flop for each.
+        
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedCurlValues, transformedWeightedCurlValues);
+    
+    FunctionSpaceTools::HCURLtransformVALUE(transformedBasisValues, jacobianInverse, basisValues);
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
+    bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // record flops for (curl,curl):
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1.
+    // for (value,value):
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1.
+    
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* HCURLStandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp
new file mode 100644
index 000000000000..42323581a559
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HCURLStructuredAssembly.hpp
@@ -0,0 +1,188 @@
+//
+//  HCURLStructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/24/22.
+//
+
+#ifndef HCURLStructuredAssembly_h
+#define HCURLStructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HCURLStructuredAssembly.hpp
+    \brief  Locally assembles a matrix with the H(curl) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (curl e_i, curl e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+//! Version that takes advantage of new structured integration support, including sum factorization.  Computes H(curl) norm: (curl, curl) + (value,value).
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureHCURL(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using namespace std;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HCURL;
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+  
+  EOperator opCurl = OPERATOR_CURL;
+  BasisValues<Scalar,DeviceType> curlValues = basis->allocateBasisValues(tensorCubaturePoints, opCurl);
+  basis->getValues(curlValues, tensorCubaturePoints, opCurl);
+  
+  EOperator opValue = OPERATOR_VALUE;
+  BasisValues<Scalar,DeviceType> basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue);
+  basis->getValues(basisValues, tensorCubaturePoints, opValue);
+  
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet        = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianDetInverse = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianDividedByJacobianDet = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); // jacobianTimesJacobianDet has same underlying structure as jacobian
+  Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights);
+  
+  Data<PointScalar,DeviceType> jacobianDetInverseExtended; // container with same underlying data as jacobianDet, but extended with CONSTANT type to have same logical shape as Jacobian
+  // setup jacobianDetInverseExtended
+  {
+    auto variationTypes = jacobianDetInverse.getVariationTypes(); // defaults to CONSTANT in ranks beyond the rank of the container; this is what we want for our new extents
+    auto extents        = jacobian.getExtents();
+    
+    jacobianDetInverseExtended = jacobianDetInverse.shallowCopy(jacobian.rank(), extents, variationTypes);
+  }
+  
+  // lazily-evaluated transformed curl values (temporary to allow integralData allocation)
+  auto transformedCurlValuesTemp = FunctionSpaceTools::getHCURLtransformCURL(jacobianDividedByJacobianDet, curlValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedCurlValuesTemp, cellMeasures, transformedCurlValuesTemp);
+  
+  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent(                    CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent(                 CELL_DIM, numCellsInWorkset);
+      jacobianDetInverse.setExtent(          CELL_DIM, numCellsInWorkset);
+      jacobianInv.setExtent(                 CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(                CELL_DIM, numCellsInWorkset);
+      jacobianDividedByJacobianDet.setExtent(CELL_DIM, numCellsInWorkset);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(   jacobianDet,        jacobian);
+    CellTools<DeviceType>::setJacobianDetInv(jacobianDetInverse, jacobian);
+    CellTools<DeviceType>::setJacobianInv(   jacobianInv,        jacobian);
+    
+    // compute the jacobian divided by its determinant
+    jacobianDividedByJacobianDet.storeInPlaceProduct(jacobian,jacobianDetInverseExtended);
+    
+    // lazily-evaluated transformed curl, values:
+    TransformedBasisValues<Scalar,DeviceType> transformedCurlValues;
+    if (spaceDim == 2)
+    {
+      transformedCurlValues = FunctionSpaceTools::getHCURLtransformCURL(jacobianDetInverse, curlValues);
+    }
+    else
+    {
+      transformedCurlValues = FunctionSpaceTools::getHCURLtransformCURL(jacobianDividedByJacobianDet, curlValues);
+    }
+    auto transformedBasisValues = FunctionSpaceTools::getHCURLtransformVALUE(jacobianInv, basisValues);
+    
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    bool sumInto = false;
+    double approximateFlopCountIntegrateWorksetCURLCURL = 0;
+    double approximateFlopCountIntegrateWorksetVALUEVALUE = 0;
+    fstIntegrateCall->start();
+    // (curl,curl)
+    IntegrationTools::integrate(integralData, transformedCurlValues, cellMeasures, transformedCurlValues, sumInto, &approximateFlopCountIntegrateWorksetCURLCURL);
+    
+    // (value,value)
+    sumInto = true; // add to what we already accumulated for (curl,curl)
+    IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE);
+    
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
+    if (integralData.getUnderlyingViewRank() == 3)
+    {
+      std::pair<int,int> cellRange = {startCell, endCell};
+      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+    }
+    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+    {
+      auto integralView2 = integralData.getUnderlyingView2();
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
+      Kokkos::parallel_for("copy uniform data to expanded container", policy,
+                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
+        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
+      });
+    }
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorksetCURLCURL + approximateFlopCountIntegrateWorksetVALUEVALUE;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* HCURLStructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp
new file mode 100644
index 000000000000..4beb1feaf3e0
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HDIVStandardAssembly.hpp
@@ -0,0 +1,165 @@
+//
+//  HDIVStandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/7/22.
+//
+
+#ifndef Intrepid2_HDIVStandardAssembly_hpp
+#define Intrepid2_HDIVStandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HDIVStandardAssembly.hpp
+    \brief  Locally assembles a matrix with the H(div) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (div e_i, div e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHDIV(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                       const int &polyOrder, int worksetSize,
+                                                                       double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = CellTools<DeviceType>;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HDIV;
+
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints, spaceDim );
+  ScalarView<Scalar,DeviceType> basisDivValues("basis div values", numFields, numPoints);
+
+  ScalarView<Scalar,DeviceType> transformedDivValues("transformed div values", worksetSize, numFields, numPoints);
+  ScalarView<Scalar,DeviceType> transformedWeightedDivValues("transformed weighted div values", worksetSize, numFields, numPoints);
+  
+  ScalarView<Scalar,DeviceType> transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints, spaceDim);
+  ScalarView<Scalar,DeviceType> transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints, spaceDim);
+  
+  basis->getValues(basisValues,    cubaturePoints, OPERATOR_VALUE );
+  basis->getValues(basisDivValues, cubaturePoints, OPERATOR_DIV  );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                      numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianInverse,               numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,           numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                  numCellsInWorkset, numPoints);
+      Kokkos::resize(transformedBasisValues,        numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedDivValues,          numCellsInWorkset, numFields, numPoints);
+      Kokkos::resize(transformedWeightedBasisValues,numCellsInWorkset, numFields, numPoints, spaceDim);
+      Kokkos::resize(transformedWeightedDivValues,  numCellsInWorkset, numFields, numPoints);
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianInv(jacobianInverse, jacobian);
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+        
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    FunctionSpaceTools::HDIVtransformDIV(transformedDivValues, jacobianDeterminant, basisDivValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedDivValues, cellMeasures, transformedDivValues);
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedDivValues: one flop for each.
+        
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedDivValues, transformedWeightedDivValues);
+    ExecutionSpace().fence();
+    
+    FunctionSpaceTools::HDIVtransformVALUE(transformedBasisValues, jacobian, jacobianDeterminant, basisValues);
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
+    bool sumInto = true; // add the (value,value) integral to the (div,div) that we've already integrated
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // record flops for (div,div)
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction.
+    // for (value,value):
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction; but you do one less add than the number of multiplies, hence the minus 1.
+    
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* HDIVStandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp
new file mode 100644
index 000000000000..a2eef17f9e35
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HDIVStructuredAssembly.hpp
@@ -0,0 +1,180 @@
+//
+//  HDIVStructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/7/22.
+//
+
+#ifndef HDIVStructuredAssembly_h
+#define HDIVStructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HDIVStructuredAssembly.hpp
+    \brief  Locally assembles a matrix with the H(div) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j) + (div e_i, div e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+//! Version that takes advantage of new structured integration support, including sum factorization.  Computes H(div) norm: (div, div) + (value,value).
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureHDIV(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                         double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using namespace std;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HDIV;
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+  
+  EOperator opDiv = OPERATOR_DIV;
+  BasisValues<Scalar,DeviceType> divValues = basis->allocateBasisValues(tensorCubaturePoints, opDiv);
+  basis->getValues(divValues, tensorCubaturePoints, opDiv);
+  
+  EOperator opValue = OPERATOR_VALUE;
+  BasisValues<Scalar,DeviceType> basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue);
+  basis->getValues(basisValues, tensorCubaturePoints, opValue);
+  
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet        = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianDetInverse = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianDividedByJacobianDet = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize); // jacobianTimesJacobianDet has same underlying structure as jacobian
+  Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights);
+  
+  Data<PointScalar,DeviceType> jacobianDetInverseExtended; // container with same underlying data as jacobianDet, but extended with CONSTANT type to have same logical shape as Jacobian
+  // setup jacobianDetInverseExtended
+  {
+    auto variationTypes = jacobianDetInverse.getVariationTypes(); // defaults to CONSTANT in ranks beyond the rank of the container; this is what we want for our new extents
+    auto extents        = jacobian.getExtents();
+    
+    jacobianDetInverseExtended = jacobianDetInverse.shallowCopy(jacobian.rank(), extents, variationTypes);
+  }
+  
+  // lazily-evaluated transformed div values (temporary to allow integralData allocation)
+  auto transformedDivValuesTemp = FunctionSpaceTools::getHDIVtransformDIV(jacobianDetInverse, divValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedDivValuesTemp, cellMeasures, transformedDivValuesTemp);
+  
+  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent(                    CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent(                 CELL_DIM, numCellsInWorkset);
+      jacobianDetInverse.setExtent(          CELL_DIM, numCellsInWorkset);
+      jacobianInv.setExtent(                 CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(                CELL_DIM, numCellsInWorkset);
+      jacobianDividedByJacobianDet.setExtent(CELL_DIM, numCellsInWorkset);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(   jacobianDet,        jacobian);
+    CellTools<DeviceType>::setJacobianDetInv(jacobianDetInverse, jacobian);
+    CellTools<DeviceType>::setJacobianInv(   jacobianInv,        jacobian);
+    
+    // compute the jacobian divided by its determinant
+    jacobianDividedByJacobianDet.storeInPlaceProduct(jacobian,jacobianDetInverseExtended);
+    
+    // lazily-evaluated transformed div, values:
+    auto transformedDivValues   = FunctionSpaceTools::getHDIVtransformDIV(jacobianDetInverse, divValues);
+    auto transformedBasisValues = FunctionSpaceTools::getHDIVtransformVALUE(jacobianDividedByJacobianDet, basisValues);
+    
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    bool sumInto = false;
+    double approximateFlopCountIntegrateWorksetDIVDIV = 0;
+    double approximateFlopCountIntegrateWorksetVALUEVALUE = 0;
+    fstIntegrateCall->start();
+    // (div,div)
+    IntegrationTools::integrate(integralData, transformedDivValues, cellMeasures, transformedDivValues, sumInto, &approximateFlopCountIntegrateWorksetDIVDIV);
+    
+    // (value,value)
+    sumInto = true; // add to what we already accumulated for (div,div)
+    IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE);
+    
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
+    if (integralData.getUnderlyingViewRank() == 3)
+    {
+      std::pair<int,int> cellRange = {startCell, endCell};
+      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+    }
+    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+    {
+      auto integralView2 = integralData.getUnderlyingView2();
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
+      Kokkos::parallel_for("copy uniform data to expanded container", policy,
+                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
+        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
+      });
+    }
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorksetDIVDIV + approximateFlopCountIntegrateWorksetVALUEVALUE;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* HDIVStructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp
new file mode 100644
index 000000000000..630a6cb6924b
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HVOLStandardAssembly.hpp
@@ -0,0 +1,142 @@
+//
+//  HVOLStandardAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/30/22.
+//
+
+#ifndef Intrepid2_HVOLStandardAssembly_hpp
+#define Intrepid2_HVOLStandardAssembly_hpp
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HVOLStandardAssembly.hpp
+    \brief  Locally assembles a matrix with the H(vol) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j), using standard Intrepid2 methods; these do not algorithmically exploit geometric structure.
+ */
+
+//! Version that uses the classic, generic Intrepid2 paths.
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadratureHVOL(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry,
+                                                                       const int &polyOrder, int worksetSize,
+                                                                       double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  
+  using CellTools = CellTools<DeviceType>;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  
+  using namespace std;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HVOL;
+
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+  
+  if (worksetSize > numCells) worksetSize = numCells;
+  
+  // local stiffness matrices:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  int numPoints = cubature->getNumPoints();
+  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
+  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
+  
+  cubature->getCubature(cubaturePoints, cubatureWeights);
+  
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  
+  // Allocate some intermediate containers
+  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints );
+
+  ScalarView<Scalar,DeviceType> transformedBasisValues("transformed basis values", worksetSize, numFields, numPoints);
+  ScalarView<Scalar,DeviceType> transformedWeightedBasisValues("transformed weighted basis values", worksetSize, numFields, numPoints);
+  
+  basis->getValues(basisValues,     cubaturePoints, OPERATOR_VALUE );
+  
+  const int numNodesPerCell = geometry.numNodesPerCell();
+  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0,numCells),
+  KOKKOS_LAMBDA (const int &cellOrdinal) {
+    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
+    {
+      for (int d=0; d<spaceDim; d++)
+      {
+        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
+      }
+    }
+  });
+  
+  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
+  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
+
+  initialSetupTimer->stop();
+  
+  transformIntegrateFlopCount  = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  int cellOffset = 0;
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    
+    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
+    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    if (numCellsInWorkset != worksetSize)
+    {
+      Kokkos::resize(jacobian,                       numCellsInWorkset, numPoints, spaceDim, spaceDim);
+      Kokkos::resize(jacobianDeterminant,            numCellsInWorkset, numPoints);
+      Kokkos::resize(cellMeasures,                   numCellsInWorkset, numPoints);
+      Kokkos::resize(transformedBasisValues,         numCellsInWorkset, numFields, numPoints);
+      Kokkos::resize(transformedWeightedBasisValues, numCellsInWorkset, numFields, numPoints);
+    }
+    jacobianAndCellMeasureTimer->start();
+    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
+    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
+    
+    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+        
+    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
+    fstIntegrateCall->start();
+    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+    
+    FunctionSpaceTools::HVOLtransformVALUE(transformedBasisValues, jacobianDeterminant, basisValues);
+    FunctionSpaceTools::multiplyMeasure(transformedWeightedBasisValues, cellMeasures, transformedBasisValues);
+    bool sumInto = true; // add the (value,value) integral to the (curl,curl) that we've already integrated
+    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedBasisValues, transformedWeightedBasisValues, sumInto);
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // record flops for (value,value):
+    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * 2 - 1); // one multiply, add for each (P) entry in the contraction.
+    cellOffset += worksetSize;
+  }
+//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
+  return cellStiffness;
+}
+
+#endif /* HVOLStandardAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp b/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp
new file mode 100644
index 000000000000..275c9d356bf7
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/HVOLStructuredAssembly.hpp
@@ -0,0 +1,153 @@
+//
+//  HVOLStructuredAssembly.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 3/30/22.
+//
+
+#ifndef HVOLStructuredAssembly_h
+#define HVOLStructuredAssembly_h
+
+#include "JacobianFlopEstimate.hpp"
+
+/** \file   HVOLStructuredAssembly.hpp
+    \brief  Locally assembles a matrix with the H(vol) natural norm -- an array of shape (C,F,F), with formulation (e_i, e_j), using "structured" Intrepid2 methods; these algorithmically exploit geometric structure as expressed in the provided CellGeometry.
+ */
+
+//! Version that takes advantage of new structured integration support, including sum factorization.  Computes H(vol) norm: (value,value).
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadratureHVOL(Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  using namespace Intrepid2;
+  
+  using ExecutionSpace = typename DeviceType::execution_space;
+  
+  int numVertices = 1;
+  for (int d=0; d<spaceDim; d++)
+  {
+    numVertices *= 2;
+  }
+  
+  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+  initialSetupTimer->start();
+  using namespace std;
+  using FunctionSpaceTools = FunctionSpaceTools<DeviceType>;
+  using IntegrationTools   = IntegrationTools<DeviceType>;
+  // dimensions of the returned view are (C,F,F)
+  auto fs = FUNCTION_SPACE_HVOL;
+  
+  shards::CellTopology cellTopo = geometry.cellTopology();
+  
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  
+  int numFields = basis->getCardinality();
+  int numCells = geometry.numCells();
+    
+  // local stiffness matrix:
+  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
+  
+  auto cubature = DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
+  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
+  TensorPoints<PointScalar,DeviceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
+  
+  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
+    
+  EOperator opValue = OPERATOR_VALUE;
+  BasisValues<Scalar,DeviceType> basisValues = basis->allocateBasisValues(tensorCubaturePoints, opValue);
+  basis->getValues(basisValues, tensorCubaturePoints, opValue);
+  
+  int cellOffset = 0;
+  
+  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+  
+  Data<PointScalar,DeviceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
+  Data<PointScalar,DeviceType> jacobianDet        = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  Data<PointScalar,DeviceType> jacobianDetInverse = CellTools<DeviceType>::allocateJacobianDet(jacobian);
+  TensorData<PointScalar,DeviceType> cellMeasures = geometry.allocateCellMeasure(jacobianDetInverse, tensorCubatureWeights);
+  
+  // lazily-evaluated transformed values (temporary to allow integralData allocation)
+  auto transformedValuesTemp = FunctionSpaceTools::getHVOLtransformVALUE(jacobianDet, basisValues);
+  auto integralData = IntegrationTools::allocateIntegralData(transformedValuesTemp, cellMeasures, transformedValuesTemp);
+  
+  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
+  
+  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
+  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
+  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
+  
+  transformIntegrateFlopCount = 0;
+  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
+  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
+  jacobianCellMeasureFlopCount += numCells * numPoints; // determinant inverse (one divide per (C,P))
+  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
+  
+  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  
+  initialSetupTimer->stop();
+  while (cellOffset < numCells)
+  {
+    int startCell         = cellOffset;
+    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
+    int endCell           = numCellsInWorkset + startCell;
+    
+    jacobianAndCellMeasureTimer->start();
+    if (numCellsInWorkset != worksetSize)
+    {
+      const int CELL_DIM = 0; // first dimension corresponds to cell
+      jacobian.setExtent(           CELL_DIM, numCellsInWorkset);
+      jacobianDet.setExtent(        CELL_DIM, numCellsInWorkset);
+      jacobianDetInverse.setExtent( CELL_DIM, numCellsInWorkset);
+      integralData.setExtent(       CELL_DIM, numCellsInWorkset);
+      
+      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
+      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
+    }
+    
+    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
+    CellTools<DeviceType>::setJacobianDet(   jacobianDet,        jacobian);
+    CellTools<DeviceType>::setJacobianDetInv(jacobianDetInverse, jacobian);
+    
+    // lazily-evaluated transformed values:
+    auto transformedBasisValues = FunctionSpaceTools::getHVOLtransformVALUE(jacobianDetInverse, basisValues);
+    
+    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
+    ExecutionSpace().fence();
+    jacobianAndCellMeasureTimer->stop();
+    
+    double approximateFlopCountIntegrateWorksetVALUEVALUE = 0;
+    fstIntegrateCall->start();
+
+    // (value,value)
+    bool sumInto = false;
+    IntegrationTools::integrate(integralData, transformedBasisValues, cellMeasures, transformedBasisValues, sumInto, &approximateFlopCountIntegrateWorksetVALUEVALUE);
+    
+    ExecutionSpace().fence();
+    fstIntegrateCall->stop();
+    
+    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
+    if (integralData.getUnderlyingViewRank() == 3)
+    {
+      std::pair<int,int> cellRange = {startCell, endCell};
+      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
+      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+    }
+    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+    {
+      auto integralView2 = integralData.getUnderlyingView2();
+      auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
+      Kokkos::parallel_for("copy uniform data to expanded container", policy,
+                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
+        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
+      });
+    }
+    
+    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorksetVALUEVALUE;
+    
+    cellOffset += worksetSize;
+  }
+  return cellStiffness;
+}
+
+#endif /* HVOLStructuredAssembly_h */
diff --git a/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp b/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp
new file mode 100644
index 000000000000..319cf5069c1c
--- /dev/null
+++ b/packages/intrepid2/assembly-examples/JacobianFlopEstimate.hpp
@@ -0,0 +1,125 @@
+//
+//  JacobianFlopEstimate.hpp
+//  Trilinos
+//
+//  Created by Roberts, Nathan V on 7/7/21.
+//
+
+#ifndef Intrepid2_JacobianFlopEstimate_hpp
+#define Intrepid2_JacobianFlopEstimate_hpp
+
+double flopsPerJacobian(const int &spaceDim, const int &numPoints, const int &numGeometryNodes)
+{
+  // implementation looks like:
+//  for (ordinal_type i=0;i<dim;++i)
+//  for (ordinal_type j=0;j<dim;++j) {
+//    _jacobian(cell, point, i, j) = 0;
+//    for (ordinal_type bf=0;bf<cardinality;++bf)
+//      _jacobian(cell, point, i, j) += _worksetCells(cell+_startCell, bf, i) * _basisGrads(bf, point, j); // 2 flops: one multiply, one add
+//  }
+  return 2.0 * spaceDim * spaceDim * numPoints * numGeometryNodes;
+}
+
+double flopsPerJacobianDet(const int &spaceDim, const int &numPoints)
+{
+  //implementation in RealSpaceTools:
+  /*value_type r_val = 0.0;
+  switch (dim) {
+  case 3:
+    r_val = ( inMat(0,0) * inMat(1,1) * inMat(2,2) + // 3 flops: 2 mults, 1 add
+              inMat(1,0) * inMat(2,1) * inMat(0,2) + // 3 flops: 2 mults, 1 add
+              inMat(2,0) * inMat(0,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract
+              inMat(2,0) * inMat(1,1) * inMat(0,2) - // 3 flops: 2 mults, 1 subtract
+              inMat(0,0) * inMat(2,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract
+              inMat(1,0) * inMat(0,1) * inMat(2,2) ); // 2 flops: 2 mults
+    break;
+  case 2:
+    r_val = ( inMat(0,0) * inMat(1,1) -
+              inMat(0,1) * inMat(1,0) );
+    break;
+  case 1:
+    r_val = ( inMat(0,0) );
+    break;
+  }
+  return r_val;*/
+  int r_val;
+  switch (spaceDim) {
+    case 3: r_val = 17.0 * numPoints; break;
+    case 2: r_val = 3.0 * numPoints; break;
+    case 1: r_val = 0.0; break;
+    default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim");
+  }
+  return r_val;
+}
+
+double flopsPerJacobianInverse(const int &spaceDim, const int &numPoints)
+{
+  // implementation looks like:
+  // const value_type val = RealSpaceTools<>::Serial::det(mat);
+  double totalFlops = flopsPerJacobianDet(spaceDim, numPoints);
+  
+  /*
+   case 1: {
+     inv(0,0) = 1.0/mat(0,0);
+     break;
+   }
+   case 2: {
+     inv(0,0) =   mat(1,1)/val;
+     inv(1,1) =   mat(0,0)/val;
+
+     inv(1,0) = - mat(1,0)/val;
+     inv(0,1) = - mat(0,1)/val;
+     break;
+   }
+   */
+  
+  if (spaceDim == 1)
+  {
+    // inv(0,0) = 1.0/mat(0,0); // 1 flop: 1 division
+    totalFlops += 1 * numPoints;
+  }
+  else if (spaceDim == 2)
+  {
+//    inv(0,0) =   mat(1,1)/val; // 1 flop: 1 division
+//    inv(1,1) =   mat(0,0)/val; // 1 flop: 1 division
+//
+//    inv(1,0) = - mat(1,0)/val; // 2 flops: 1 negation, 1 division
+//    inv(0,1) = - mat(0,1)/val; // 2 flops: 1 negation, 1 division
+    totalFlops += 6 * numPoints;
+  }
+  else if (spaceDim == 3)
+  {
+  //  val0 =   mat(1,1)*mat(2,2) - mat(2,1)*mat(1,2); // 3 flops: 2 mults, 1 subtraction
+  //  val1 = - mat(1,0)*mat(2,2) + mat(2,0)*mat(1,2); // 4 flops: 2 mults, 1 negation, 1 add
+  //  val2 =   mat(1,0)*mat(2,1) - mat(2,0)*mat(1,1); // 3 flops: 2 mults, 1 subtraction
+  //
+  //  inv(0,0) = val0/val; // 1 flop
+  //  inv(1,0) = val1/val; // 1 flop
+  //  inv(2,0) = val2/val; // 1 flop
+  //
+  //  val0 =   mat(2,1)*mat(0,2) - mat(0,1)*mat(2,2); // 3
+  //  val1 =   mat(0,0)*mat(2,2) - mat(2,0)*mat(0,2); // 3
+  //  val2 = - mat(0,0)*mat(2,1) + mat(2,0)*mat(0,1); // 4
+  //
+  //  inv(0,1) = val0/val; // 1
+  //  inv(1,1) = val1/val; // 1
+  //  inv(2,1) = val2/val; // 1
+  //
+  //  val0 =   mat(0,1)*mat(1,2) - mat(1,1)*mat(0,2); // 3
+  //  val1 = - mat(0,0)*mat(1,2) + mat(1,0)*mat(0,2); // 4
+  //  val2 =   mat(0,0)*mat(1,1) - mat(1,0)*mat(0,1); // 3
+  //
+  //  inv(0,2) = val0/val; // 1
+  //  inv(1,2) = val1/val; // 1
+  //  inv(2,2) = val2/val; // 1
+    totalFlops += 36.0 * numPoints;
+  }
+  else
+  {
+    INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim");
+  }
+  
+  return totalFlops;
+}
+
+#endif /* GRADGRADStandardAssembly_h */
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp
index 2fadab29d3e3..ed2dbdad09d8 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellGeometry.hpp
@@ -58,7 +58,7 @@
 #include "Intrepid2_OrientationTools.hpp"
 #include "Intrepid2_TensorData.hpp"
 #include "Intrepid2_TensorPoints.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_Utils.hpp"
 #include "Intrepid2_VectorData.hpp"
 
@@ -116,24 +116,22 @@ namespace Intrepid2
     
   public:
     /** \brief  Notionally-private method that provides a common interface for multiple public-facing allocateJacobianData() methods.  (Marked as public due to compiler constraints.)
-       \param [in] points - if a valid points container, specifies the points at which the Jacobian will be evaluated.  (Invalid containers are acceptable for affine CellGeometry.)
+       \param [in] pointComponentView - typically either the first component of a TensorPoints container, or a View containing all the points, but may be empty.  Only used for scalar-type-size-matched construction of new views (important for views of Fad types with hidden dimensions).
        \param [in] pointsPerCell - the number of points at which the Jacobian will be evaluated in each cell.  If points is a valid container, pointsPerCell must match its first dimension.
        \param [in] startCell - the first cell ordinal for which the Jacobian will be evaluated (used to define worksets smaller than the whole CellGeometry).
        \param [in] endCell - the first cell ordinal for which the Jacobian will not be evaluated (used to define worksets smaller than the whole CellGeometry); use -1 to indicate that all cells from startCell on should be evaluated.
        \return a Data object appropriately sized to accomodate the specified Jacobian values.
     */
-    Data<PointScalar,DeviceType> allocateJacobianDataPrivate(const TensorPoints<PointScalar,DeviceType> &points, const int &pointsPerCell, const int startCell, const int endCell) const;
+    Data<PointScalar,DeviceType> allocateJacobianDataPrivate(const ScalarView<PointScalar,DeviceType> &pointComponentView, const int &pointsPerCell, const int startCell, const int endCell) const;
     
     /** \brief  Notionally-private method that provides a common interface for multiple public-facing setJacobianData() methods.  (Marked as public due to compiler constraints.)
        \param [out] jacobianData - a container, allocated by allocateJacobianData(), into which the evaluated Jacobians will be placed.
-       \param [in] points - if a valid points container, specifies the points at which the Jacobian will be evaluated.  (Invalid containers are acceptable for affine CellGeometry.)
        \param [in] pointsPerCell - the number of points at which the Jacobian will be evaluated in each cell.  If points is a valid container, pointsPerCell must match its first dimension.
        \param [in] refData - the return from getJacobianRefData(); may be an empty container, depending on details of CellGeometry (e.g. if it is affine)
        \param [in] startCell - the first cell ordinal for which the Jacobian will be evaluated (used to define worksets smaller than the whole CellGeometry).
        \param [in] endCell - the first cell ordinal for which the Jacobian will not be evaluated (used to define worksets smaller than the whole CellGeometry); use -1 to indicate that all cells from startCell on should be evaluated.
     */
-    void setJacobianDataPrivate(Data<PointScalar,DeviceType> &jacobianData, const TensorPoints<PointScalar,DeviceType> &points, const int &pointsPerCell,
-                                const Data<PointScalar,DeviceType> &refData, const int startCell, const int endCell) const;
+    void setJacobianDataPrivate(Data<PointScalar,DeviceType> &jacobianData, const int &pointsPerCell, const Data<PointScalar,DeviceType> &refData, const int startCell, const int endCell) const;
   protected:
     HypercubeNodeOrdering nodeOrdering_;
     CellGeometryType    cellGeometryType_;
@@ -321,8 +319,14 @@ namespace Intrepid2
     Data<PointScalar,DeviceType> allocateJacobianData(const int &numPoints, const int startCell=0, const int endCell=-1) const;
     
     /** \brief Computes reference-space data for the specified points, to be used in setJacobian().
-       \param [in] points - the points at which the Jacobian will be evaluated.
-       \return a Data object with any reference-space data required.  This may be empty, if no reference-space data is required in setJacobian().
+       \param [in] points - the points at which the Jacobian will be evaluated; shape (P,D) or (C,P,D).
+       \return a Data object with any reference-space data required.  This may be empty, if no reference-space data is required in setJacobian().  If filled, will have shape (F,P,D) or (C,F,P,D).
+    */
+    Data<PointScalar,DeviceType> getJacobianRefData(const ScalarView<PointScalar,DeviceType> &points) const;
+    
+    /** \brief Computes reference-space data for the specified points, to be used in setJacobian().
+       \param [in] points - the points at which the Jacobian will be evaluated, with shape (P,D).
+       \return a Data object with any reference-space data required.  This may be empty, if no reference-space data is required in setJacobian().  If filled, will have shape (F,P,D).
     */
     Data<PointScalar,DeviceType> getJacobianRefData(const TensorPoints<PointScalar,DeviceType> &points) const;
     
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp
index f47e8cbf9cfc..ca3403ad4e49 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellGeometryDef.hpp
@@ -181,7 +181,7 @@ namespace Intrepid2
 
   template<class PointScalar, int spaceDim, typename DeviceType>
   Data<PointScalar,DeviceType>
-  CellGeometry<PointScalar,spaceDim,DeviceType>::allocateJacobianDataPrivate(const TensorPoints<PointScalar,DeviceType> &points, const int &pointsPerCell, const int startCell, const int endCell) const
+  CellGeometry<PointScalar,spaceDim,DeviceType>::allocateJacobianDataPrivate(const ScalarView<PointScalar,DeviceType> &pointComponentView, const int &pointsPerCell, const int startCell, const int endCell) const
   {
     ScalarView<PointScalar,DeviceType> data;
     const int rank = 4; // C,P,D,D
@@ -208,7 +208,7 @@ namespace Intrepid2
         
         int cellTypeModulus = uniformJacobianModulus();
         
-        data = getMatchingViewWithLabel(points.getTensorComponent(0), "CellGeometryProvider: Jacobian data", cellTypeModulus, spaceDim, spaceDim);
+        data = getMatchingViewWithLabel(pointComponentView, "CellGeometryProvider: Jacobian data", cellTypeModulus, spaceDim, spaceDim);
       }
       else
       {
@@ -217,7 +217,7 @@ namespace Intrepid2
         variationType[D2_DIM] = BLOCK_PLUS_DIAGONAL;
         blockPlusDiagonalLastNonDiagonal = -1;
         
-        data = getMatchingViewWithLabel(points.getTensorComponent(0), "CellGeometryProvider: Jacobian data", spaceDim);
+        data = getMatchingViewWithLabel(pointComponentView, "CellGeometryProvider: Jacobian data", spaceDim);
       }
     }
     else if (cellGeometryType_ == TENSOR_GRID)
@@ -273,9 +273,9 @@ namespace Intrepid2
   }
   
   template<class PointScalar, int spaceDim, typename DeviceType>
-  void CellGeometry<PointScalar,spaceDim,DeviceType>::setJacobianDataPrivate(Data<PointScalar,DeviceType> &jacobianData, const TensorPoints<PointScalar,DeviceType> &points,
-                                                                                const int &pointsPerCell, const Data<PointScalar,DeviceType> &refData,
-                                                                                const int startCell, const int endCell) const
+  void CellGeometry<PointScalar,spaceDim,DeviceType>::setJacobianDataPrivate(Data<PointScalar,DeviceType> &jacobianData,
+                                                                             const int &pointsPerCell, const Data<PointScalar,DeviceType> &refData,
+                                                                             const int startCell, const int endCell) const
   {
     const int numCellsWorkset = (endCell == -1) ? (numCells_ - startCell) : (endCell - startCell);
     
@@ -384,12 +384,22 @@ namespace Intrepid2
           TEUCHOS_TEST_FOR_EXCEPTION(basisForNodes == Teuchos::null, std::invalid_argument, "basisForNodes must not be null");
           TEUCHOS_TEST_FOR_EXCEPTION(dataView.size() == 0, std::invalid_argument, "underlying view is not valid");
           
-          // refData should contain the basis gradients; shape is (F,P,D)
+          // refData should contain the basis gradients; shape is (F,P,D) or (C,F,P,D)
           INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!refData.isValid(), std::invalid_argument, "refData should be a valid container for cases with non-affine geometry");
-          INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.rank() != 3, std::invalid_argument, "refData should have shape (F,P,D)");
-          INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D)");
-          INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != points.extent_int(0), std::invalid_argument, "refData should have shape (F,P,D)");
-          INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D)");
+          INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE((refData.rank() != 3) && (refData.rank() != 4), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+          if (refData.rank() == 3)
+          {
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != pointsPerCell, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+          }
+          else
+          {
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(0) != numCellsWorkset, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(1) != basisForNodes->getCardinality(), std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(2) != pointsPerCell, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(refData.extent_int(3) != spaceDim, std::invalid_argument, "refData should have shape (F,P,D) or (C,F,P,D)");
+          }
           
           CellTools::setJacobian(dataView, *this, refData, startCell, endCell);
         }
@@ -744,6 +754,90 @@ namespace Intrepid2
     else return GENERAL;
   }
 
+  template<class PointScalar, int spaceDim, typename DeviceType>
+  Data<PointScalar,DeviceType> CellGeometry<PointScalar,spaceDim,DeviceType>::getJacobianRefData(const ScalarView<PointScalar,DeviceType> &points) const
+  {
+    Data<PointScalar,DeviceType> emptyRefData;
+    if (cellGeometryType_ == UNIFORM_GRID)
+    {
+      // no need for basis computations
+      return emptyRefData;
+    }
+    else if (cellGeometryType_ == TENSOR_GRID)
+    {
+      // no need for basis values
+      return emptyRefData;
+    }
+    else if ((cellGeometryType_ == FIRST_ORDER) || (cellGeometryType_ == HIGHER_ORDER))
+    {
+      const bool simplex = (spaceDim + 1 == cellToNodes_.extent_int(1));
+      if (simplex)
+      {
+        // no need for precomputed basis values
+        return emptyRefData;
+      }
+      else
+      {
+        auto basisForNodes = this->basisForNodes();
+        
+        if (affine_)
+        {
+          // no need for precomputed basis values
+          return emptyRefData;
+        }
+        else
+        {
+          // 2 use cases: (P,D) and (C,P,D).
+          // if (P,D), call the TensorPoints variant
+          if (points.rank() == 2)
+          {
+            TensorPoints<PointScalar,DeviceType> tensorPoints(points);
+            return getJacobianRefData(tensorPoints);
+          }
+          else
+          {
+            const int numCells = points.extent_int(0);
+            const int numPoints = points.extent_int(1);
+            const int numFields = basisForNodes->getCardinality();
+            
+            auto cellBasisGradientsView = getMatchingViewWithLabel(points, "CellGeometryProvider: cellBasisGradients", numCells, numFields, numPoints, spaceDim);
+            auto basisGradientsView     = getMatchingViewWithLabel(points, "CellGeometryProvider: basisGradients", numFields, numPoints, spaceDim);
+            
+            for (int cellOrdinal=0; cellOrdinal<numCells; cellOrdinal++)
+            {
+              auto refPointsForCell = Kokkos::subview(points, cellOrdinal, Kokkos::ALL(), Kokkos::ALL());
+              basisForNodes->getValues(basisGradientsView, refPointsForCell, OPERATOR_GRAD);
+              
+              // At some (likely relatively small) memory cost, we copy the BasisGradients into an explicit (F,P,D) container.
+              // Given that we expect to reuse this for a non-trivial number of cell in the common use case, the extra memory
+              // cost is likely worth the increased flop count, etc.  (One might want to revisit this in cases of high spaceDim
+              // and/or very high polynomial order.)
+              
+              using ExecutionSpace = typename DeviceType::execution_space;
+              auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>({0,0,0},{numFields,numPoints,spaceDim});
+              
+              Kokkos::parallel_for("copy basis gradients", policy,
+              KOKKOS_LAMBDA (const int &fieldOrdinal, const int &pointOrdinal, const int &d) {
+                cellBasisGradientsView(cellOrdinal,fieldOrdinal,pointOrdinal,d) = basisGradientsView(fieldOrdinal,pointOrdinal,d);
+              });
+              ExecutionSpace().fence();
+            }
+            Data<PointScalar,DeviceType> basisRefData(cellBasisGradientsView);
+            return basisRefData;
+          }
+        }
+      }
+    }
+    else
+    {
+      // TODO: handle the other cases
+      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "support for this CellGeometryType is not yet implemented");
+    }
+    return emptyRefData;
+    
+    
+  }
+
   template<class PointScalar, int spaceDim, typename DeviceType>
   Data<PointScalar,DeviceType> CellGeometry<PointScalar,spaceDim,DeviceType>::getJacobianRefData(const TensorPoints<PointScalar,DeviceType> &points) const
   {
@@ -866,11 +960,11 @@ namespace Intrepid2
       // then there are as many distinct orientations possible as there are there are cells per grid cell
       // fill cellNodesHost with sample nodes from grid cell 0
       const int numSubdivisions = numCellsPerGridCell(subdivisionStrategy_); // can be up to 6
-      const int gridCellOrdinal = 0;
       
 #if defined(INTREPID2_COMPILE_DEVICE_CODE)
       /// do not compile host only code with device
 #else
+      const int gridCellOrdinal = 0;
       auto hostPolicy = Kokkos::MDRangePolicy<HostExecSpace,Kokkos::Rank<2>>({0,0},{numSubdivisions,nodesPerCell});
       Kokkos::parallel_for("fill cellNodesHost", hostPolicy,
                            [this,gridCellOrdinal,cellNodesHost] (const int &subdivisionOrdinal, const int &nodeInCell) {
@@ -1251,7 +1345,7 @@ namespace Intrepid2
   Data<PointScalar,DeviceType> CellGeometry<PointScalar,spaceDim,DeviceType>::allocateJacobianData(const TensorPoints<PointScalar,DeviceType> &points, const int startCell, const int endCell) const
   {
     const int pointsPerCell = points.extent_int(0);
-    return allocateJacobianDataPrivate(points,pointsPerCell,startCell,endCell);
+    return allocateJacobianDataPrivate(points.getTensorComponent(0),pointsPerCell,startCell,endCell);
   }
   
   template<class PointScalar, int spaceDim, typename DeviceType>
@@ -1260,8 +1354,7 @@ namespace Intrepid2
     // if points is rank 3, it has shape (C,P,D).  If it's rank 2, (P,D).
     const int pointDimension = (points.rank() == 3) ? 1 : 0;
     const int pointsPerCell = points.extent_int(pointDimension);
-    TensorPoints<PointScalar,DeviceType> tensorPoints(points);
-    return allocateJacobianDataPrivate(tensorPoints,pointsPerCell,startCell,endCell);
+    return allocateJacobianDataPrivate(points,pointsPerCell,startCell,endCell);
   }
   
   template<class PointScalar, int spaceDim, typename DeviceType>
@@ -1269,7 +1362,7 @@ namespace Intrepid2
   {
     INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!affine_, std::invalid_argument, "this version of allocateJacobianData() is only supported for affine CellGeometry");
     
-    TensorPoints<PointScalar,DeviceType> emptyPoints;
+    ScalarView<PointScalar,DeviceType> emptyPoints;
     return allocateJacobianDataPrivate(emptyPoints,numPoints,startCell,endCell);
   }
   
@@ -1278,7 +1371,7 @@ namespace Intrepid2
                                                                   const Data<PointScalar,DeviceType> &refData, const int startCell, const int endCell) const
   {
     const int pointsPerCell = points.extent_int(0);
-    setJacobianDataPrivate(jacobianData,points,pointsPerCell,refData,startCell,endCell);
+    setJacobianDataPrivate(jacobianData,pointsPerCell,refData,startCell,endCell);
   }
   
   template<class PointScalar, int spaceDim, typename DeviceType>
@@ -1288,8 +1381,7 @@ namespace Intrepid2
     // if points is rank 3, it has shape (C,P,D).  If it's rank 2, (P,D).
     const int pointDimension = (points.rank() == 3) ? 1 : 0;
     const int pointsPerCell = points.extent_int(pointDimension);
-    TensorPoints<PointScalar,DeviceType> tensorPoints(points);
-    setJacobianDataPrivate(jacobianData,tensorPoints,pointsPerCell,refData,startCell,endCell);
+    setJacobianDataPrivate(jacobianData,pointsPerCell,refData,startCell,endCell);
   }
   
   template<class PointScalar, int spaceDim, typename DeviceType>
@@ -1297,9 +1389,8 @@ namespace Intrepid2
   {
     INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(!affine_, std::invalid_argument, "this version of setJacobian() is only supported for affine CellGeometry");
     
-    TensorPoints<PointScalar,DeviceType> emptyPoints;
     Data<PointScalar,DeviceType> emptyRefData;
-    setJacobianDataPrivate(jacobianData,emptyPoints,numPoints,emptyRefData,startCell,endCell);
+    setJacobianDataPrivate(jacobianData,numPoints,emptyRefData,startCell,endCell);
   }
 } // namespace Intrepid2
 
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
index fb9b7f2073cb..aaa08d1410e1 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellTools.hpp
@@ -378,6 +378,15 @@ namespace Intrepid2 {
     template<class PointScalar>
     static void setJacobianDet( Data<PointScalar,DeviceType> & jacobianDet,
                                const Data<PointScalar,DeviceType> & jacobian);
+    
+    /** \brief  Computes reciprocals of determinants corresponding to the Jacobians in the Data container provided
+
+        \param  jacobianDet   [out]  - data with shape (C,P), as returned by CellTools::allocateJacobianDet()
+        \param  jacobian          [in]    - data with shape (C,P,D,D), as returned by CellGeometry::allocateJacobianData()
+    */
+    template<class PointScalar>
+    static void setJacobianDetInv( Data<PointScalar,DeviceType> & jacobianDet,
+                                  const Data<PointScalar,DeviceType> & jacobian);
 
     /** \brief  Computes determinants corresponding to the Jacobians in the Data container provided
 
diff --git a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp
index 9f0c32423b5b..ef682870c4fd 100644
--- a/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp
+++ b/packages/intrepid2/src/Cell/Intrepid2_CellToolsDefJacobian.hpp
@@ -459,6 +459,16 @@ namespace Intrepid2 {
     }
   }
 
+  template<typename DeviceType>
+  template<class PointScalar>
+  void CellTools<DeviceType>::setJacobianDetInv( Data<PointScalar,DeviceType> &jacobianDetInv, const Data<PointScalar,DeviceType> & jacobian )
+  {
+    setJacobianDet(jacobianDetInv, jacobian);
+    
+    auto unitData = jacobianDetInv.allocateConstantData(1.0);
+    jacobianDetInv.storeInPlaceQuotient(unitData, jacobianDetInv);
+  }
+
   template<typename DeviceType>
   template<class PointScalar>
   void CellTools<DeviceType>::setJacobianInv( Data<PointScalar,DeviceType> &jacobianInv, const Data<PointScalar,DeviceType> & jacobian )
diff --git a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
index 4857dff98423..787a6d3a82ad 100644
--- a/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
+++ b/packages/intrepid2/src/Discretization/Basis/Intrepid2_BasisValues.hpp
@@ -163,6 +163,25 @@ namespace Intrepid2
       }
     }
     
+    //! Returns the field ordinal offset for the specified family.
+    KOKKOS_INLINE_FUNCTION
+    int familyFieldOrdinalOffset(const int &familyOrdinal) const
+    {
+      if (vectorData_.isValid())
+      {
+        return vectorData_.familyFieldOrdinalOffset(familyOrdinal);
+      }
+      else
+      {
+        int offset = 0;
+        for (int i=0; i<familyOrdinal; i++)
+        {
+          offset += tensorDataFamilies_[i].extent_int(0); // (F,P,…)
+        }
+        return offset;
+      }
+    }
+    
     //! TensorData accessor for single-family scalar data
     TensorDataType & tensorData()
     {
@@ -171,7 +190,7 @@ namespace Intrepid2
     }
     
     //! TensorData accessor for multi-family scalar data
-    TensorDataType & tensorData(const int &familyOrdinal)
+    const TensorDataType & tensorData(const int &familyOrdinal) const
     {
       INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(familyOrdinal >= numTensorDataFamilies_, std::invalid_argument, "familyOrdinal too large");
       INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(familyOrdinal < 0, std::invalid_argument, "familyOrdinal may not be less than 0");
diff --git a/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp b/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp
index 8beeaeb32b05..2318839cb77e 100644
--- a/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp
+++ b/packages/intrepid2/src/Discretization/FunctionSpaceTools/Intrepid2_FunctionSpaceTools.hpp
@@ -64,7 +64,7 @@
 #include "Intrepid2_CellTools.hpp"
 
 #include "Intrepid2_Data.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_VectorData.hpp"
 
 #include "Kokkos_Core.hpp"
@@ -89,7 +89,7 @@ namespace Intrepid2 {
           defined on cells in physical space and indexed by (C,F,P,D).  The transformations are
           computed on entry access; algorithms such as sum factorization rely on having access to the
           reference-space basis values as well as the transformation operator; both are stored in the
-          returned TransformedVectorData object.
+          returned TransformedBasisValues object.
 
           Computes pullback of gradients of \e HGRAD functions
           \f$\Phi^*(\nabla\widehat{u}_f) = \left((DF_c)^{-{\sf T}}\cdot\nabla\widehat{u}_f\right)\circ F^{-1}_{c}\f$
@@ -123,13 +123,351 @@ namespace Intrepid2 {
           \endcode
 
           \param  jacobianInverse  [in]  - Input array containing cell Jacobian inverses.
-          \param  inputVals        [in]  - Input array of reference HGRAD gradients.
-     \return TransformedVectorData object defined on (C,F,P,D) indices; transformation is computed on access.
+          \param  refBasisGradValues [in]  - Input array of reference HGRAD gradients.
+     \return TransformedBasisValues object defined on (C,F,P,D) indices; transformation is computed on access.
       */
     template<class Scalar>
-    static TransformedVectorData<Scalar,DeviceType> getHGRADtransformGRAD(const Data<Scalar,DeviceType> &jacobianInverse, const VectorData<Scalar,DeviceType> &refBasisGradValues)
+    static TransformedBasisValues<Scalar,DeviceType> getHGRADtransformGRAD(const Data<Scalar,DeviceType> &jacobianInverse, const BasisValues<Scalar,DeviceType> &refBasisGradValues)
     {
-      return TransformedVectorData<Scalar,DeviceType>(jacobianInverse,refBasisGradValues);
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianInverse,refBasisGradValues);
+    }
+        
+    /** \brief Transformation of a (scalar) value field in the H-grad space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>input</b></var>
+        and indexed by (F,P), into the output container <var><b>output</b></var>,
+        defined on cells in physical space and indexed by (C,F,P).  This transformation is trivial, and the
+        returned container is logically indexed by (C,F,P), but only contains (F,P) distinct data entries.
+   
+        Computes pullback of \e HGRAD functions \f$\Phi^*(\widehat{u}_f) = \widehat{u}_f\circ F^{-1}_{c} \f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the values of the function set \f$\{\widehat{u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        input(f,p) = \widehat{u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        output(c,f,p)
+        = \widehat{u}_f\circ F^{-1}_{c}(x_{c,p})
+        = \widehat{u}_f(\widehat{x}_p) =  input(f,p) \qquad 0\le c < C \,,
+        \f]
+        i.e., it simply replicates the values in the user-provided container to every cell.
+        See Section \ref sec_pullbacks for more details about pullbacks.
+    
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+        \param  input [in]  - Input container of reference HGRAD values.
+        \return TransformedBasisValues object of logical shape (C,F,P).
+    */
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHGRADtransformVALUE(const ordinal_type &numCells, const BasisValues<Scalar,DeviceType> &refBasisValues)
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(numCells,refBasisValues);
+    }
+    
+    /** \brief Transformation of a (vector) value field in the H-curl space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P,D), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P,D).
+
+        Computes pullback of \e HCURL functions
+        \f$\Phi^*(\widehat{\bf u}_f) = \left((DF_c)^{-{\sf T}}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c}\f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the values of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p,*) = \widehat{\bf u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left((DF_c)^{-{\sf T}}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p})
+        = (DF_c)^{-{\sf T}}(\widehat{x}_p)\cdot\widehat{\bf u}_f(\widehat{x}_p) \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of native basis                   |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |   D  |         space dim    |  0 <= D < spatial dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+        \param  jacobianInverse  [in]  - Input array containing cell Jacobian inverses.
+        \param  inputVals        [in]  - Input array of reference HCURL values.
+        \return lazily-evaluated transformed basis values.
+    */
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHCURLtransformVALUE(const Data<Scalar,DeviceType> &jacobianInverse,
+                          const BasisValues<Scalar,DeviceType> &refBasisValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianInverse,refBasisValues);
+    }
+    
+    /** \brief Transformation of a 3D curl field in the H-curl space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P,D), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P,D).
+
+        Computes pullback of curls of \e HCURL functions
+        \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c} DF_{c}\cdot\nabla\times\widehat{\bf u}_f\right)\circ F^{-1}_{c}\f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the curls of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p,*) = \nabla\times\widehat{\bf u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left(J^{-1}_{c} DF_{c}\cdot\nabla\times\widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p})
+        = J^{-1}_{c}(\widehat{x}_p) DF_{c}(\widehat{x}_p)\cdot\nabla\times\widehat{\bf u}_f(\widehat{x}_p)
+        \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+    
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |   D  |         space dim    |  0 <= D < spatial dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+     \param  jacobianDividedByJacobianDet [in]  - Input cell Jacobians, divided by their determinant.
+     \param  inputVals                                           [in]  - Input container of reference HDIV values.
+     \return lazily-evaluated transformed basis values.
+    */
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHCURLtransformCURL(const Data<Scalar,DeviceType> &jacobianDividedByJacobianDet,
+                          const BasisValues<Scalar,DeviceType> &refBasisValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianDividedByJacobianDet,refBasisValues);
+    }
+    
+    /** \brief Transformation of a 2D curl field in the H-curl space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P,D), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P).
+
+        Computes pullback of curls of \e HCURL functions
+        \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c}\nabla\times\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} \f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the 2d curls of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p) = \nabla\times\widehat{\bf u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left(J^{-1}_{c}\nabla\times\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} (x_{c,p})
+        = J^{-1}_{c}(\widehat{x}_p) \nabla\times\widehat{\bf u}_{f} (\widehat{x}_p)
+        \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |   D  |         space dim    |  0 <= D < spatial dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+        \param  outputVals   [out] - Output array with transformed values
+        \param  jacobianDetInverse  [in]  - Reciprocals of input cell Jacobian determinants.
+        \param  inputVals    [in]  - Input array of reference HCURL curls.
+    */
+
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHCURLtransformCURL2D(const Data<Scalar,DeviceType> &jacobianDetInverse,
+                            const BasisValues<Scalar,DeviceType> &refBasisValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianDetInverse,refBasisValues);
+    }
+
+    /** \brief Transformation of a (vector) value field in the H-div space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P,D), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P,D).
+
+        Computes pullback of \e HDIV functions
+        \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c} DF_{c}\cdot\widehat{\bf u}_f\right)\circ F^{-1}_{c} \f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the values of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p,*) = \widehat{\bf u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left(J^{-1}_{c} DF_{c}\cdot \widehat{\bf u}_f\right)\circ F^{-1}_{c}(x_{c,p})
+        = J^{-1}_{c}(\widehat{x}_p) DF_{c}(\widehat{x}_p)\cdot\widehat{\bf u}_f(\widehat{x}_p)
+        \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+    
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |   D  |         space dim    |  0 <= D < spatial dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+        \param  jacobianDividedByJacobianDet [in]  - Input cell Jacobians, divided by their determinant.
+        \param  inputVals                                           [in]  - Input container of reference HDIV values.
+     \return TransformedBasisValues object of logical shape (C,F,P,D).
+    */
+
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHDIVtransformVALUE(const Data<Scalar,DeviceType> &jacobianDividedByJacobianDet,
+                          const BasisValues<Scalar,DeviceType> &refBasisValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianDividedByJacobianDet,refBasisValues);
+    }
+    
+    /** \brief Transformation of a divergence field in the H-div space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P).
+
+        Computes pullback of the divergence of \e HDIV functions
+        \f$\Phi^*(\widehat{\bf u}_f) = \left(J^{-1}_{c}\nabla\cdot\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} \f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the divergencies of the vector function set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p) = \nabla\cdot\widehat{\bf u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left(J^{-1}_{c}\nabla\cdot\widehat{\bf u}_{f}\right) \circ F^{-1}_{c} (x_{c,p})
+        = J^{-1}_{c}(\widehat{x}_p) \nabla\cdot\widehat{\bf u}_{f} (\widehat{x}_p)
+        \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+    
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+     \param  jacobianDetInverse  [in]  - Reciprocals of input cell Jacobian determinants.
+     \param  refBasisDivValues    [in]  - Input container of reference HDIV divergences.
+  \return TransformedBasisValues object of logical shape (C,F,P).
+    */
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHDIVtransformDIV(const Data<Scalar,DeviceType> &jacobianDetInverse,
+                        const BasisValues<Scalar,DeviceType> &refBasisDivValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianDetInverse,refBasisDivValues);
+    }
+    
+    /** \brief Transformation of a (scalar) value field in the H-vol space, defined at points on a
+        reference cell, stored in the user-provided container <var><b>inputVals</b></var>
+        and indexed by (F,P), into the output container <var><b>outputVals</b></var>,
+        defined on cells in physical space and indexed by (C,F,P).
+
+        Computes pullback of \e HVOL functions
+        \f$\Phi^*(\widehat{u}_f) = \left(J^{-1}_{c}\widehat{u}_{f}\right) \circ F^{-1}_{c} \f$
+        for points in one or more physical cells that are images of a given set of points in the reference cell:
+        \f[
+        \{ x_{c,p} \}_{p=0}^P = \{ F_{c} (\widehat{x}_p) \}_{p=0}^{P}\qquad 0\le c < C \,.
+        \f]
+        In this case \f$ F^{-1}_{c}(x_{c,p}) = \widehat{x}_p \f$ and the user-provided container
+        should contain the values of the functions in the set \f$\{\widehat{\bf u}_f\}_{f=0}^{F}\f$ at the
+        reference points:
+        \f[
+        inputVals(f,p) = \widehat{u}_f(\widehat{x}_p) \,.
+        \f]
+        The method returns
+        \f[
+        outputVals(c,f,p,*)
+        = \left(J^{-1}_{c}\widehat{u}_{f}\right) \circ F^{-1}_{c} (x_{c,p})
+        = J^{-1}_{c}(\widehat{x}_p) \widehat{u}_{f} (\widehat{x}_p)
+        \qquad 0\le c < C \,.
+        \f]
+        See Section \ref sec_pullbacks for more details about pullbacks.
+    
+        \code
+        |------|----------------------|--------------------------------------------------|
+        |      |         Index        |                   Dimension                      |
+        |------|----------------------|--------------------------------------------------|
+        |   C  |         cell         |  0 <= C < num. integration domains               |
+        |   F  |         field        |  0 <= F < dim. of the basis                      |
+        |   P  |         point        |  0 <= P < num. integration points                |
+        |------|----------------------|--------------------------------------------------|
+        \endcode
+
+     \param  jacobianDetInverse  [in]  - Reciprocals of input cell Jacobian determinants.
+     \param  refBasisValues    [in]  - Input container of reference HVOL values.
+     \return TransformedBasisValues object of logical shape (C,F,P).
+    */
+    template<class Scalar>
+    static TransformedBasisValues<Scalar,DeviceType>
+    getHVOLtransformVALUE(const Data<Scalar,DeviceType> &jacobianDetInverse,
+                          const BasisValues<Scalar,DeviceType> &refBasisValues )
+    {
+      return TransformedBasisValues<Scalar,DeviceType>(jacobianDetInverse,refBasisValues);
     }
     
     /** \brief Transformation of a (scalar) value field in the H-grad space, defined at points on a
diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp
index 727dd28c2a98..2780ad891922 100644
--- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp
+++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationTools.hpp
@@ -63,7 +63,7 @@
 #include "Intrepid2_CellTools.hpp"
 
 #include "Intrepid2_Data.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_VectorData.hpp"
 
 #include "Kokkos_Core.hpp"
@@ -88,9 +88,9 @@ namespace Intrepid2 {
         \return <b>integrals</b>, a container with logical shape (C,F,F), suitable for passing as the first argument to the integrate() variant that takes an Intrepid2::Data object as its first, <b>integrals</b>, argument.
     */
     template<class Scalar>
-    static Data<Scalar,DeviceType> allocateIntegralData(const TransformedVectorData<Scalar,DeviceType> vectorDataLeft,
+    static Data<Scalar,DeviceType> allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
                                                         const TensorData<Scalar,DeviceType> cellMeasures,
-                                                        const TransformedVectorData<Scalar,DeviceType> vectorDataRight);
+                                                        const TransformedBasisValues<Scalar,DeviceType> vectorDataRight);
     
     /** \brief   Contracts \a <b>vectorDataLeft</b> and \a <b>vectorDataRight</b> containers on
         point and space dimensions, weighting each point according to <b>cellMeasures</b>,
@@ -104,9 +104,9 @@ namespace Intrepid2 {
         \param  approxFlops               [in] - if not NULL, the double pointed to will be set with an estimated number of floating point operations.  Intended for performance assessment purposes.
     */
     template<class Scalar>
-    static void integrate(Data<Scalar,DeviceType> integrals, const TransformedVectorData<Scalar,DeviceType> &vectorDataLeft,
+    static void integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> &vectorDataLeft,
                           const TensorData<Scalar,DeviceType> &cellMeasures,
-                          const TransformedVectorData<Scalar,DeviceType> &vectorDataRight, const bool sumInto = false,
+                          const TransformedBasisValues<Scalar,DeviceType> &vectorDataRight, const bool sumInto = false,
                           double* approximateFlops = NULL);
   }; // end IntegrationTools class
 
diff --git a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
index 041ab6d45c49..cf5a8aa68905 100644
--- a/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
+++ b/packages/intrepid2/src/Discretization/Integration/Intrepid2_IntegrationToolsDef.hpp
@@ -1490,7 +1490,7 @@ namespace Intrepid2 {
 //            const int numPointsFirst = leftFirstComponent.extent_int(1);
 //
 //            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
-//              pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
+//              pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
 //            });
 //
 //            // synchronize threads
@@ -1608,7 +1608,7 @@ namespace Intrepid2 {
           for (int b_component=0; b_component < rightComponentSpan_; b_component++)
           {
 //            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
-//              pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
+//              pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
 //            });
             flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
 
@@ -1789,9 +1789,9 @@ namespace Intrepid2 {
 
 template<typename DeviceType>
 template<class Scalar>
-Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedVectorData<Scalar,DeviceType> vectorDataLeft,
+Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
                                                                            const TensorData<Scalar,DeviceType> cellMeasures,
-                                                                           const TransformedVectorData<Scalar,DeviceType> vectorDataRight)
+                                                                           const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
 {
   // allocates a (C,F,F) container for storing integral data
   
@@ -1834,9 +1834,9 @@ Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const
 //! 2. arbitrary mesh: cellMeasures has trivial tensor product structure (one tensorial component).
 template<typename DeviceType>
 template<class Scalar>
-void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedVectorData<Scalar,DeviceType> & vectorDataLeft,
+void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
                                              const TensorData<Scalar,DeviceType> & cellMeasures,
-                                             const TransformedVectorData<Scalar,DeviceType> & vectorDataRight, const bool sumInto,
+                                             const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
                                              double* approximateFlops)
 {
   using ExecutionSpace = typename DeviceType::execution_space;
@@ -1855,68 +1855,89 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
   }
   
   const int cellDataExtent = integrals.getDataExtent(0);
-  const int numFieldsLeft  = vectorDataLeft.numFields();
-  const int numFieldsRight = vectorDataRight.numFields();
-  const int spaceDim       = vectorDataLeft.spaceDim();
+  const int numFieldsLeft  = basisValuesLeft.numFields();
+  const int numFieldsRight = basisValuesRight.numFields();
+  const int spaceDim       = basisValuesLeft.spaceDim();
   
-  INTREPID2_TEST_FOR_EXCEPTION(vectorDataLeft.spaceDim() != vectorDataRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
+  INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
   
-  // TODO: add more checks against validity of inputs
+  const int leftFamilyCount  =  basisValuesLeft.basisValues().numFamilies();
+  const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
   
   // we require that the number of tensor components in the vectors are the same for each vector entry
   // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
   int numTensorComponentsLeft = -1;
-  const auto &refVectorLeft   = vectorDataLeft.vectorData();
-  int numFamiliesLeft         = refVectorLeft.numFamilies();
-  int numVectorComponentsLeft = refVectorLeft.numComponents();
-  Kokkos::Array<int,7> maxFieldsForComponentLeft  {0,0,0,0,0,0,0};
-  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
-  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
+  const bool isVectorValued = basisValuesLeft.vectorData().isValid();
+  if (isVectorValued)
   {
-    for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
+    const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
+    INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
+    const auto &refVectorLeft   = basisValuesLeft.vectorData();
+    int numFamiliesLeft         = refVectorLeft.numFamilies();
+    int numVectorComponentsLeft = refVectorLeft.numComponents();
+    Kokkos::Array<int,7> maxFieldsForComponentLeft  {0,0,0,0,0,0,0};
+    Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
+    for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
     {
-      const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
-      if (tensorData.numTensorComponents() > 0)
+      for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
       {
-        if (numTensorComponentsLeft == -1)
-        {
-          numTensorComponentsLeft = tensorData.numTensorComponents();
-        }
-        INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
-        for (int r=0; r<numTensorComponentsLeft; r++)
+        const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
+        if (tensorData.numTensorComponents() > 0)
         {
-          maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
+          if (numTensorComponentsLeft == -1)
+          {
+            numTensorComponentsLeft = tensorData.numTensorComponents();
+          }
+          INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
+          for (int r=0; r<numTensorComponentsLeft; r++)
+          {
+            maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
+          }
         }
       }
     }
-  }
-  int numTensorComponentsRight = -1;
-  const auto &refVectorRight   = vectorDataRight.vectorData();
-  int numFamiliesRight         = refVectorRight.numFamilies();
-  int numVectorComponentsRight = refVectorRight.numComponents();
-  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
-  {
-    for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
+    int numTensorComponentsRight = -1;
+    const auto &refVectorRight   = basisValuesRight.vectorData();
+    int numFamiliesRight         = refVectorRight.numFamilies();
+    int numVectorComponentsRight = refVectorRight.numComponents();
+    for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
     {
-      const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
-      if (tensorData.numTensorComponents() > 0)
+      for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
       {
-        if (numTensorComponentsRight == -1)
-        {
-          numTensorComponentsRight = tensorData.numTensorComponents();
-        }
-        INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
-        for (int r=0; r<numTensorComponentsRight; r++)
+        const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
+        if (tensorData.numTensorComponents() > 0)
         {
-          maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
+          if (numTensorComponentsRight == -1)
+          {
+            numTensorComponentsRight = tensorData.numTensorComponents();
+          }
+          INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
+          for (int r=0; r<numTensorComponentsRight; r++)
+          {
+            maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
+          }
         }
       }
     }
+    INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
+  }
+  else
+  {
+    numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
+    for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
+    {
+      INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
+    }
+    
+    // check that right tensor component count also agrees
+    for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
+    {
+      INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
+    }
   }
-  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
   const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
     
-  if ((numPointTensorComponents == numTensorComponentsLeft) && vectorDataLeft.axisAligned() && vectorDataRight.axisAligned())
+  if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
   {
     // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
     
@@ -1932,86 +1953,6 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
       pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
     }
 
-    Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents> componentIntegrals; // these are reference-space integrals; no Jacobians or cell measures applied as yet
-    
-    for (int r=0; r<numPointTensorComponents; r++)
-    {
-      for (int d=0; d<spaceDim; d++)
-      {
-        /*
-         Four cases to consider:
-         1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
-         2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
-         3. First container is axial, second arbitrary.
-         4. First is arbitrary, second axial.
-         
-         But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed.  The actual difference between
-         the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
-         
-         The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
-         while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
-        */
-        
-        const Data<Scalar,DeviceType> &  leftComponent = vectorDataLeft.vectorData().getComponent(d).getTensorComponent(r);
-        const Data<Scalar,DeviceType> & rightComponent = vectorDataRight.vectorData().getComponent(d).getTensorComponent(r);
-        
-        // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well.  (And we could avoid some computations here.)
-        
-        const Data<Scalar,DeviceType> & quadratureWeights = cellMeasures.getTensorComponent(r+1);
-        
-        int leftComponentCount  =  leftComponent.extent_int(0);
-        int rightComponentCount = rightComponent.extent_int(0);
-        
-        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
-        if (allocateFadStorage)
-        {
-          auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
-          componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount, fad_size_output);
-        }
-        else
-        {
-          componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount);
-        }
-        
-        Kokkos::deep_copy(componentIntegrals[r][d], 0.0); // initialize
-        
-        const int & numPoints = pointDimensions[r];
-        const int  leftComponentRank =  leftComponent.rank();
-        const int rightComponentRank = rightComponent.rank();
-        
-        const int  leftComponentDimSpan =  leftComponent.extent_int(2);
-        const int rightComponentDimSpan = rightComponent.extent_int(2);
-        INTREPID2_TEST_FOR_EXCEPTION(( leftComponentDimSpan != rightComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
-        
-//        // TODO: add support for cases where the component ranks are 3, and spaceDim > 1.
-//        INTREPID2_TEST_FOR_EXCEPTION(( leftComponentRank == 3) && ( leftComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case.");
-//        INTREPID2_TEST_FOR_EXCEPTION((rightComponentRank == 3) && (rightComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case.");
-        
-        auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftComponentCount,rightComponentCount});
-        
-        for (int a=0; a<leftComponentDimSpan; a++)
-        {
-          Kokkos::parallel_for("compute componentIntegrals", policy,
-          KOKKOS_LAMBDA (const int &i, const int &j) {
-            Scalar refSpaceIntegral = 0.0;
-            for (int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
-            {
-              const Scalar &  leftValue = ( leftComponentRank == 2) ?  leftComponent(i,ptOrdinal) :  leftComponent(i,ptOrdinal,a);
-              const Scalar & rightValue = (rightComponentRank == 2) ? rightComponent(j,ptOrdinal) : rightComponent(j,ptOrdinal,a);
-              refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
-            }
-            componentIntegrals[r][d](i,j) = refSpaceIntegral;
-          });
-        }
-        
-        if (approximateFlops != NULL)
-        {
-          *approximateFlops += leftComponentCount*rightComponentCount*numPoints*(3); // two multiplies, one add in innermost loop
-        }
-      }
-    }
-    ExecutionSpace().fence(); // ensure that kernels launched above have completed before the kernels below launch.
-
     // only one of these will be a valid container:
     Kokkos::View<Scalar**,  DeviceType> integralView2;
     Kokkos::View<Scalar***, DeviceType> integralView3;
@@ -2023,21 +1964,16 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
     {
       integralView2 = integrals.getUnderlyingView2();
     }
-    
-    const int leftFamilyCount  =  vectorDataLeft.vectorData().numFamilies();
-    const int rightFamilyCount = vectorDataRight.vectorData().numFamilies();
-    
     for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
     {
-      bool haveLaunchedContributionToLeftFamily = false; // for tracking whether we need a fence() below
-      
       int a_offset = 0; // left vector component offset
-      int leftFieldOffset = vectorDataLeft.vectorData().familyFieldOrdinalOffset(leftFamilyOrdinal);
+      int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
       
-      const int leftVectorComponentCount = vectorDataLeft.vectorData().numComponents();
+      const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
       for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
       {
-        const auto & leftComponent = vectorDataLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal);
+        TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
+                                                                     : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
         if (!leftComponent.isValid())
         {
           a_offset++; // empty components are understood to take up one dimension
@@ -2049,15 +1985,14 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
         
         for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
         {
-          bool haveLaunchedContributionToRightFamily = false; // for tracking whether we need a fence() below
-          
           int b_offset = 0; // right vector component offset
-          int rightFieldOffset = vectorDataRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
+          int rightFieldOffset = basisValuesRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
 
-          const int rightVectorComponentCount = vectorDataRight.vectorData().numComponents();
+          const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
           for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
           {
-            const auto & rightComponent = vectorDataRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal);
+            TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
+                                                                          : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
             if (!rightComponent.isValid())
             {
               b_offset++; // empty components are understood to take up one dimension
@@ -2082,13 +2017,81 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
             const int d_start = a_offset;
             const int d_end   = d_start + leftDimSpan;
             
-            if (haveLaunchedContributionToLeftFamily && haveLaunchedContributionToRightFamily)
+            using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
+            ComponentIntegralsArray componentIntegrals;
+            for (int r=0; r<numPointTensorComponents; r++)
             {
-              // accumulation to the same block in integrals container; fence to ensure that that has completed before the next kernel is launched
-              ExecutionSpace().fence();
-            }
-            haveLaunchedContributionToLeftFamily  = true;
-            haveLaunchedContributionToRightFamily = true;
+              /*
+               Four vector data cases to consider:
+               1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
+               2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
+               3. First container is axial, second arbitrary.
+               4. First is arbitrary, second axial.
+               
+               But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed.  The actual difference between
+               the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
+               
+               The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
+               while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
+              */
+                
+              Data<Scalar,DeviceType>  quadratureWeights = cellMeasures.getTensorComponent(r+1);
+              const int numPoints = pointDimensions[r];
+                
+                // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well.  (And we could avoid some computations here.)
+        
+              Data<Scalar,DeviceType>  leftTensorComponent =  leftComponent.getTensorComponent(r);
+              Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
+              
+              const int  leftTensorComponentRank    =  leftTensorComponent.rank();
+              const int  leftTensorComponentDimSpan =  leftTensorComponent.extent_int(2);
+              const int  leftTensorComponentFields  =  leftTensorComponent.extent_int(0);
+              const int rightTensorComponentRank    = rightTensorComponent.rank();
+              const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
+              const int rightTensorComponentFields  = rightTensorComponent.extent_int(0);
+              
+              INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
+            
+              for (int d=d_start; d<d_end; d++)
+              {
+                const bool allocateFadStorage = !std::is_pod<Scalar>::value;
+                if (allocateFadStorage)
+                {
+                  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
+                  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
+                }
+                else
+                {
+                  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
+                }
+            
+                auto componentIntegralView = componentIntegrals[r][d];
+                
+                auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
+                
+                for (int a=0; a<leftTensorComponentDimSpan; a++)
+                {
+                  Kokkos::parallel_for("compute componentIntegrals", policy,
+                  KOKKOS_LAMBDA (const int &i, const int &j) {
+                    Scalar refSpaceIntegral = 0.0;
+                    for (int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
+                    {
+                      const Scalar &  leftValue = ( leftTensorComponentRank == 2) ?  leftTensorComponent(i,ptOrdinal) :  leftTensorComponent(i,ptOrdinal,a);
+                      const Scalar & rightValue = (rightTensorComponentRank == 2) ? rightTensorComponent(j,ptOrdinal) : rightTensorComponent(j,ptOrdinal,a);
+                      refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
+                    }
+                    componentIntegralView(i,j) = refSpaceIntegral;
+                  });
+                }
+                
+                if (approximateFlops != NULL)
+                {
+                  *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
+                }
+              } // d
+            } // r
+            
+            ExecutionSpace().fence();
             
             Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness.
             Kokkos::Array<int,3> lowerBounds {0,0,0};
@@ -2107,8 +2110,8 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
               Scalar integralSum = 0.0;
               for (int d=d_start; d<d_end; d++)
               {
-                const Scalar & transformLeft_d  =  vectorDataLeft.transformWeight(cellDataOrdinal,0,d,d);
-                const Scalar & transformRight_d = vectorDataRight.transformWeight(cellDataOrdinal,0,d,d);
+                const Scalar & transformLeft_d  =  basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
+                const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
                 
                 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
         //            approximateFlopCount++;
@@ -2136,11 +2139,9 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
                 }
               }
             });
-            
             b_offset += rightDimSpan;
-          } // rightVectorComponentOrdinal loop
-        } // rightFamilyOrdinal loop
-        
+          } // rightVectorComponentOrdinal
+        } // rightFamilyOrdinal
         a_offset += leftDimSpan;
       } // leftVectorComponentOrdinal
     } // leftFamilyOrdinal
@@ -2154,28 +2155,74 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
   else // general case (not axis-aligned + affine tensor-product structure)
   {
     // prepare composed transformation matrices
-    const Data<Scalar,DeviceType> & leftTransform  = vectorDataLeft.transform();
-    const Data<Scalar,DeviceType> & rightTransform = vectorDataRight.transform();
+    const Data<Scalar,DeviceType> & leftTransform  = basisValuesLeft.transform();
+    const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
     const bool transposeLeft  = true;
     const bool transposeRight = false;
 //    auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
 //    timer->start();
-    Data<Scalar,DeviceType> composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
-    composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
+    // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
+    const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
+    Data<Scalar,DeviceType> composedTransform;
+    // invalid/empty transforms are used when the identity is intended.
+    if (leftTransform.isValid() && rightTransform.isValid())
+    {
+      if (matrixTransform)
+      {
+        composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
+        composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
+        
+        // if the composedTransform matrices are full, the following is a good estimate.  If they have some diagonal portions, this will overcount.
+        if (approximateFlops != NULL)
+        {
+          *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
+        }
+      }
+      else
+      {
+        composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
+        composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
+        
+        // re-cast composedTranform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
+        const int newRank   = 4;
+        auto extents        = composedTransform.getExtents();
+        auto variationTypes = composedTransform.getVariationTypes();
+        composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
+        if (approximateFlops != NULL)
+        {
+          *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
+        }
+      }
+    }
+    else if (leftTransform.isValid())
+    {
+      // rightTransform is the identity
+      composedTransform = leftTransform;
+    }
+    else if (rightTransform.isValid())
+    {
+      // leftTransform is the identity
+      composedTransform = rightTransform;
+    }
+    else
+    {
+      // both left and right transforms are identity
+      Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
+      Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
+      
+      Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
+      Kokkos::deep_copy(identityUnderlyingView, 1.0);
+      composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
+    }
+    
 //    timer->stop();
 //    std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
 //    timer->reset();
     
-    // if the composedTransform matrices are full, the following is a good estimate.  If they have some diagonal portions, this will overcount.
-    if (approximateFlops != NULL)
-    {
-      *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
-    }
-    
-    const int leftFamilyCount     = vectorDataLeft. vectorData().numFamilies();
-    const int rightFamilyCount    = vectorDataRight.vectorData().numFamilies();
-    const int leftComponentCount  = vectorDataLeft. vectorData().numComponents();
-    const int rightComponentCount = vectorDataRight.vectorData().numComponents();
+    const int leftFamilyCount     = basisValuesLeft. basisValues().numFamilies();
+    const int rightFamilyCount    = basisValuesRight.basisValues().numFamilies();
+    const int leftComponentCount  = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
+    const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
     
     int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
     for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
@@ -2186,11 +2233,12 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
       bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
       for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
       {
-        const TensorData<Scalar,DeviceType> & leftComponent = vectorDataLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal);
+        TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
+                                                                     : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
         if (!leftComponent.isValid())
         {
            // represents zero
-          a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
+          a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
           continue;
         }
            
@@ -2203,11 +2251,12 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
           int b_offset = 0;
           for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
           {
-            const TensorData<Scalar,DeviceType> & rightComponent = vectorDataRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal);
+            TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
+                                                                          : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
             if (!rightComponent.isValid())
             {
                // represents zero
-              b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal);
+              b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
               continue;
             }
             
@@ -2305,20 +2354,20 @@ void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals,
                 }
               }
             }
-            b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal);
+            b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
           }
-          rightFieldOrdinalOffset += vectorDataRight.vectorData().numFieldsInFamily(rightFamilyOrdinal);
+          rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
         }
-        a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
+        a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
       }
-      leftFieldOrdinalOffset += vectorDataLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal);
+      leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
     }
   }
 //  if (approximateFlops != NULL)
 //  {
 //    std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
 //  }
-  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before exiting…
+  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
 }
 
 } // end namespace Intrepid2
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
index 4c626f897b1c..e5a1ab39f1ce 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Data.hpp
@@ -367,13 +367,25 @@ class ZeroView {
     }
 
   public:
-    //! For use with Data object into which a value will be stored.
+    //! For use with Data object into which a value will be stored.  We use passThroughBlockDiagonalArgs = true for storeInPlaceCombination().
+    template<bool passThroughBlockDiagonalArgs>
     struct FullArgExtractorWritableData
     {
       template<class ViewType, class ...IntArgs>
       static KOKKOS_INLINE_FUNCTION reference_type get(const ViewType &view, const IntArgs&... intArgs)
       {
-        return view.getWritableEntry(intArgs...);
+        return view.getWritableEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...);
+      }
+    };
+    
+    //! For use with Data object into which a value will be stored.  We use passThroughBlockDiagonalArgs = true for storeInPlaceCombination().
+    template<bool passThroughBlockDiagonalArgs>
+    struct FullArgExtractorData
+    {
+      template<class ViewType, class ...IntArgs>
+      static KOKKOS_INLINE_FUNCTION const_reference_type get(const ViewType &view, const IntArgs&... intArgs)
+      {
+        return view.getEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...);
       }
     };
     
@@ -475,8 +487,8 @@ class ZeroView {
       const ConstantArgExtractor<reference_type> constArg;
       
       const FullArgExtractor<reference_type> fullArgs;
-      const FullArgExtractor<const_reference_type> fullArgsConst;
-      const FullArgExtractorWritableData fullArgsWritable;
+      const FullArgExtractorData<true> fullArgsData; // true: pass through block diagonal args.  This is due to the behavior of dataExtentRangePolicy() for block diagonal args.
+      const FullArgExtractorWritableData<true> fullArgsWritable; // true: pass through block diagonal args.  This is due to the behavior of dataExtentRangePolicy() for block diagonal args.
       
       const SingleArgExtractor<reference_type,0> arg0;
       const SingleArgExtractor<reference_type,1> arg1;
@@ -539,7 +551,7 @@ class ZeroView {
           }
           else // this_full, not B_full: B may have modular data, etc.
           {
-            auto BAE = fullArgsConst;
+            auto BAE = fullArgsData;
             storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
           }
         }
@@ -567,7 +579,7 @@ class ZeroView {
           {
             // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData
             auto thisAE = fullArgsWritable;
-            auto BAE    = fullArgsConst;
+            auto BAE    = fullArgsData;
             storeInPlaceCombination(policy, thisData, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
           }
         }
@@ -590,7 +602,7 @@ class ZeroView {
           else  // this_full, not A_full: A may have modular data, etc.
           {
             // use A (the Data object).  This could be further optimized by using A's underlying View and an appropriately-defined ArgExtractor.
-            auto AAE = fullArgsConst;
+            auto AAE = fullArgsData;
             storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, thisAE, AAE, BAE);
           }
         }
@@ -618,7 +630,7 @@ class ZeroView {
           {
             // since storing to Data object requires a call to getWritableEntry(), we use FullArgExtractorWritableData
             auto thisAE = fullArgsWritable;
-            auto AAE    = fullArgsConst;
+            auto AAE    = fullArgsData;
             storeInPlaceCombination(policy, thisData, A, B_underlying, binaryOperator, thisAE, AAE, BAE);
           }
         }
@@ -661,12 +673,12 @@ class ZeroView {
             // B is not full-extent in dimension argThis; use the Data object
             switch (argThis)
             {
-              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsConst); break;
-              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsConst); break;
-              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsConst); break;
-              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsConst); break;
-              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsConst); break;
-              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsConst); break;
+              case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg0, arg0, fullArgsData); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg1, arg1, fullArgsData); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg2, arg2, fullArgsData); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg3, arg3, fullArgsData); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg4, arg4, fullArgsData); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, arg5, arg5, fullArgsData); break;
               default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
             }
           }
@@ -675,12 +687,12 @@ class ZeroView {
             // A is not full-extent in dimension argThis; use the Data object
             switch (argThis)
             {
-              case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsConst, arg0); break;
-              case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsConst, arg1); break;
-              case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsConst, arg2); break;
-              case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsConst, arg3); break;
-              case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsConst, arg4); break;
-              case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsConst, arg5); break;
+              case 0: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg0, fullArgsData, arg0); break;
+              case 1: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg1, fullArgsData, arg1); break;
+              case 2: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg2, fullArgsData, arg2); break;
+              case 3: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg3, fullArgsData, arg3); break;
+              case 4: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg4, fullArgsData, arg4); break;
+              case 5: storeInPlaceCombination(policy, this_underlying, A, B_underlying, binaryOperator, arg5, fullArgsData, arg5); break;
               default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid/unexpected arg index");
             }
           }
@@ -715,7 +727,7 @@ class ZeroView {
             {
               // A is full; B is not full, but not constant or full-extent 1D
               // unoptimized in B access:
-              auto BAE = fullArgsConst;
+              auto BAE = fullArgsData;
               storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, AAE, BAE);
             }
           }
@@ -742,7 +754,7 @@ class ZeroView {
               }
               else
               {
-                auto BAE = fullArgsConst;
+                auto BAE = fullArgsData;
                 switch (argIndex)
                 {
                   case 0: storeInPlaceCombination(policy, this_underlying, A_underlying, B, binaryOperator, thisAE, arg0, BAE); break;
@@ -758,8 +770,8 @@ class ZeroView {
             else // A not full, and not full-extent 1D
             {
               // unoptimized in A, B accesses.
-              auto AAE    = fullArgsConst;
-              auto BAE    = fullArgsConst;
+              auto AAE    = fullArgsData;
+              auto BAE    = fullArgsData;
               storeInPlaceCombination(policy, this_underlying, A, B, binaryOperator, thisAE, AAE, BAE);
             }
           }
@@ -768,8 +780,8 @@ class ZeroView {
         {
           // completely un-optimized case: we use Data objects for this, A, B.
           auto thisAE = fullArgsWritable;
-          auto AAE    = fullArgsConst;
-          auto BAE    = fullArgsConst;
+          auto AAE    = fullArgsData;
+          auto BAE    = fullArgsData;
           storeInPlaceCombination(policy, thisData, A, B, binaryOperator, thisAE, AAE, BAE);
         }
       }
@@ -783,7 +795,7 @@ class ZeroView {
       auto policy = dataExtentRangePolicy<rank>();
       
       using DataType = Data<DataScalar,DeviceType>;
-      using ThisAE = FullArgExtractorWritableData;
+      using ThisAE = FullArgExtractorWritableData<true>;
       using AAE    = FullArgExtractor<const_reference_type>;
       using BAE    = FullArgExtractor<const_reference_type>;
       
@@ -897,108 +909,126 @@ class ZeroView {
           INTREPID2_TEST_FOR_EXCEPTION(true,std::invalid_argument,"Unsupported data rank");
       }
     }
-    
-    //! Returns an l-value reference to the specified logical entry in the underlying view.  Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location.  Intended for Intrepid2 developers and expert users only.
+
+    //! Returns an l-value reference to the specified logical entry in the underlying view.  Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location.  Intended for Intrepid2 developers and expert users only.  If passThroughBlockDiagonalArgs is TRUE, the corresponding arguments are interpreted as entries in the 1D packed matrix rather than as logical 2D matrix row and column.
     template<class ...IntArgs>
     KOKKOS_INLINE_FUNCTION
-    reference_type getWritableEntry(const IntArgs... intArgs) const
+    reference_type getWritableEntryWithPassThroughOption(const bool &passThroughBlockDiagonalArgs, const IntArgs... intArgs) const
     {
-#ifdef INTREPID2_HAVE_DEBUG
-      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(numArgs != rank_, std::invalid_argument, "getWritableEntry() should have the same number of arguments as the logical rank.");
-#endif
-      constexpr int numArgs = sizeof...(intArgs);
-      if (underlyingMatchesLogical_)
-      {
-        // in this case, we require that numArgs == dataRank_
-        return getUnderlyingView<numArgs>()(intArgs...);
-      }
-      
-      // extract the type of the first argument; use that for the arrays below
-      using int_type = std::tuple_element_t<0, std::tuple<IntArgs...>>;
-      
-      const Kokkos::Array<int_type, numArgs> args {intArgs...};
-      Kokkos::Array<int_type, 7> refEntry;
-      for (int d=0; d<numArgs; d++)
-      {
-        switch (variationType_[d])
+  #ifdef INTREPID2_HAVE_DEBUG
+        INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(numArgs != rank_, std::invalid_argument, "getWritableEntry() should have the same number of arguments as the logical rank.");
+  #endif
+        constexpr int numArgs = sizeof...(intArgs);
+        if (underlyingMatchesLogical_)
         {
-          case CONSTANT: refEntry[d] = 0;                              break;
-          case GENERAL:  refEntry[d] = args[d];                        break;
-          case MODULAR:  refEntry[d] = args[d] % variationModulus_[d]; break;
-          case BLOCK_PLUS_DIAGONAL:
+          // in this case, we require that numArgs == dataRank_
+          return getUnderlyingView<numArgs>()(intArgs...);
+        }
+        
+        // extract the type of the first argument; use that for the arrays below
+        using int_type = std::tuple_element_t<0, std::tuple<IntArgs...>>;
+        
+        const Kokkos::Array<int_type, numArgs+1> args {intArgs...,0}; // we pad with one extra entry (0) to avoid gcc compiler warnings about references beyond the bounds of the array (the [d+1]'s below)
+        Kokkos::Array<int_type, 7> refEntry;
+        for (int d=0; d<numArgs; d++)
+        {
+          switch (variationType_[d])
           {
-            const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_);
-            
-            const int_type &i = args[d];
-            if (d+1 >= numArgs)
-            {
-              INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one");
-            }
-            else
+            case CONSTANT: refEntry[d] = 0;                              break;
+            case GENERAL:  refEntry[d] = args[d];                        break;
+            case MODULAR:  refEntry[d] = args[d] % variationModulus_[d]; break;
+            case BLOCK_PLUS_DIAGONAL:
             {
-              const int_type &j = args[d+1];
-              
-              if ((i > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)))
+              if (passThroughBlockDiagonalArgs)
               {
-                if (i != j)
+                refEntry[d]   = args[d];
+                refEntry[d+1] = args[d+1]; // this had better be == 0
+                INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(args[d+1] != 0, std::invalid_argument, "getWritableEntry() called with passThroughBlockDiagonalArgs = true, but nonzero second matrix argument.");
+              }
+              else
+              {
+                const int numNondiagonalEntries = blockPlusDiagonalNumNondiagonalEntries(blockPlusDiagonalLastNonDiagonal_);
+                
+                const int_type &i = args[d];
+                if (d+1 >= numArgs)
                 {
-                  // off diagonal: zero
-                  return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance.  When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method.  (Use methods that directly take advantage of the data packing instead.)
+                  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "BLOCK_PLUS_DIAGONAL must be present for two dimensions; here, encountered only one");
                 }
                 else
                 {
-                  refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i);
+                  const int_type &j = args[d+1];
+                  
+                  if ((i > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)) || (j > static_cast<int_type>(blockPlusDiagonalLastNonDiagonal_)))
+                  {
+                    if (i != j)
+                    {
+                      // off diagonal: zero
+                      return zeroView_(0); // NOTE: this branches in an argument-dependent way; this is not great for CUDA performance.  When using BLOCK_PLUS_DIAGONAL, should generally avoid calls to this getEntry() method.  (Use methods that directly take advantage of the data packing instead.)
+                    }
+                    else
+                    {
+                      refEntry[d] = blockPlusDiagonalDiagonalEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i);
+                    }
+                  }
+                  else
+                  {
+                    refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j);
+                  }
+
+                  // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above)
+                  refEntry[d+1] = 0;
                 }
               }
-              else
-              {
-                refEntry[d] = blockPlusDiagonalBlockEntryIndex(blockPlusDiagonalLastNonDiagonal_, numNondiagonalEntries, i, j);
-              }
-
-              // skip next d (this is required also to be BLOCK_PLUS_DIAGONAL, and we've consumed its arg as j above)
-              refEntry[d+1] = 0;
+              d++;
             }
-            d++;
           }
         }
-      }
-       // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1).
-      for (int d=numArgs; d<7; d++)
-      {
-        refEntry[d] = 0;
-      }
+         // refEntry should be zero-filled beyond numArgs, for cases when rank_ < dataRank_ (this only is allowed if the extra dimensions each has extent 1).
+        for (int d=numArgs; d<7; d++)
+        {
+          refEntry[d] = 0;
+        }
+        
+        if (dataRank_ == 1)
+        {
+          return data1_(refEntry[activeDims_[0]]);
+        }
+        else if (dataRank_ == 2)
+        {
+          return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]);
+        }
+        else if (dataRank_ == 3)
+        {
+          return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]);
+        }
+        else if (dataRank_ == 4)
+        {
+          return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]);
+        }
+        else if (dataRank_ == 5)
+        {
+          return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
+                        refEntry[activeDims_[4]]);
+        }
+        else if (dataRank_ == 6)
+        {
+          return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
+                        refEntry[activeDims_[4]],refEntry[activeDims_[5]]);
+        }
+        else // dataRank_ == 7
+        {
+          return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
+                        refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]);
+        }
       
-      if (dataRank_ == 1)
-      {
-        return data1_(refEntry[activeDims_[0]]);
-      }
-      else if (dataRank_ == 2)
-      {
-        return data2_(refEntry[activeDims_[0]],refEntry[activeDims_[1]]);
-      }
-      else if (dataRank_ == 3)
-      {
-        return data3_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]]);
-      }
-      else if (dataRank_ == 4)
-      {
-        return data4_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]]);
-      }
-      else if (dataRank_ == 5)
-      {
-        return data5_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]]);
-      }
-      else if (dataRank_ == 6)
-      {
-        return data6_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]],refEntry[activeDims_[5]]);
-      }
-      else // dataRank_ == 7
-      {
-        return data7_(refEntry[activeDims_[0]],refEntry[activeDims_[1]],refEntry[activeDims_[2]],refEntry[activeDims_[3]],
-                      refEntry[activeDims_[4]],refEntry[activeDims_[5]],refEntry[activeDims_[6]]);
-      }
+    }
+    
+    //! Returns an l-value reference to the specified logical entry in the underlying view.  Note that for variation types other than GENERAL, multiple valid argument sets will refer to the same memory location.  Intended for Intrepid2 developers and expert users only.  If passThroughBlockDiagonalArgs is TRUE, the corresponding arguments are interpreted as entries in the 1D packed matrix rather than as logical 2D matrix row and column.
+    template<class ...IntArgs>
+    KOKKOS_INLINE_FUNCTION
+    reference_type getWritableEntry(const IntArgs... intArgs) const
+    {
+      return getWritableEntryWithPassThroughOption(false, intArgs...);
     }
   public:
     //! Generic data copying method to allow construction of Data object from DynRankViews for which deep_copy() to the underlying view would be disallowed.  This method made public to allow CUDA compilation (because it contains a Kokkos lambda).
@@ -1058,7 +1088,7 @@ class ZeroView {
       {
         std::vector<int> dataExtents;
 
-        bool blockPlusDiagonalEncountered = true;
+        bool blockPlusDiagonalEncountered = false;
         for (int d=0; d<rank_; d++)
         {
           const DimensionInfo & dimInfo = dimInfoVector[d];
@@ -1704,7 +1734,7 @@ class ZeroView {
     //! returns the true extent of the data corresponding to the logical dimension provided; if the data does not vary in that dimension, returns 1
     KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
     {
-      for (unsigned i=0; i<activeDims_.size(); i++)
+      for (int i=0; i<numActiveDims_; i++)
       {
         if (activeDims_[i] == d)
         {
@@ -1742,12 +1772,20 @@ class ZeroView {
       return variationType_;
     }
     
+    //! Returns a (read-only) value corresponding to the specified logical data location.  If passThroughBlockDiagonalArgs is TRUE, the corresponding arguments are interpreted as entries in the 1D packed matrix rather than as logical 2D matrix row and column.
+    template<class ...IntArgs>
+    KOKKOS_INLINE_FUNCTION
+    return_type getEntryWithPassThroughOption(const bool &passThroughBlockDiagonalArgs, const IntArgs&... intArgs) const
+    {
+      return getWritableEntryWithPassThroughOption(passThroughBlockDiagonalArgs, intArgs...);
+    }
+    
     //! Returns a (read-only) value corresponding to the specified logical data location.
     template<class ...IntArgs>
     KOKKOS_INLINE_FUNCTION
     return_type getEntry(const IntArgs&... intArgs) const
     {
-      return getWritableEntry(intArgs...);
+      return getEntryWithPassThroughOption(false, intArgs...);
     }
     
     template <bool...> struct bool_pack;
@@ -1806,6 +1844,14 @@ class ZeroView {
       return false; // statement should be unreachable; included because compilers don't necessarily recognize that fact...
     }
     
+    //! Constructs a container with extents matching this, with a single-value underlying View, CONSTANT in each dimension.
+    //! \param value  [in] - the constant value.
+    //! \return A container with the same logical shape as this, with a single-value underlying View.
+    Data<DataScalar,DeviceType> allocateConstantData( const DataScalar &value )
+    {
+      return Data<DataScalar,DeviceType>(value, this->getExtents());
+    }
+    
     //! Constructs a container suitable for storing the result of an in-place combination of the two provided data containers.  The two containers must have the same logical shape.
     //! \see storeInPlaceCombination()
     //! \param A  [in] - the first data container.
@@ -2183,6 +2229,23 @@ class ZeroView {
       return policy;
     }
     
+    //! Creates a new Data object with the same underlying view, but with the specified logical rank, extents, and variation types.
+    Data shallowCopy(const int rank, const Kokkos::Array<int,7> &extents, const Kokkos::Array<DataVariationType,7> &variationTypes)
+    {
+      switch (dataRank_)
+      {
+        case 1: return Data(data1_, extents, variationTypes);
+        case 2: return Data(data2_, extents, variationTypes);
+        case 3: return Data(data3_, extents, variationTypes);
+        case 4: return Data(data4_, extents, variationTypes);
+        case 5: return Data(data5_, extents, variationTypes);
+        case 6: return Data(data6_, extents, variationTypes);
+        case 7: return Data(data7_, extents, variationTypes);
+        default:
+          INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled dataRank_");
+      }
+    }
+    
     //! Places the result of an in-place combination (e.g., entrywise sum) into this data container.
     template<class BinaryOperator>
     void storeInPlaceCombination(const Data<DataScalar,DeviceType> &A, const Data<DataScalar,DeviceType> &B, BinaryOperator binaryOperator)
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
index f42846072a67..15d93372955c 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TensorData.hpp
@@ -157,6 +157,32 @@ namespace Intrepid2
       initialize();
     }
     
+    /**
+     \brief Constructor to combine two other TensorData objects
+     \param [in] first - TensorData object with the components for the first dimension(s)
+     \param [in] second - TensorData object with the components for the remaining dimension(s).
+     \param [in] separateFirstComponent - if true, indicates that the first component (from the first TensorData object) will be indexed separately (this is used when the first index corresponds to a cell ordinal)
+     
+       When <var>separateFirstComponent</var> is true, all components are required to have rank 1, and TensorData has rank 2, with the first argument reserved for the first component.  The second argument is indexed precisely as described above, omitting the first component.
+    */
+    TensorData(const TensorData &first, const TensorData &second, bool separateFirstComponent = false)
+    :
+    separateFirstComponent_(separateFirstComponent),
+    numTensorComponents_(first.numTensorComponents() + second.numTensorComponents())
+    {
+      ordinal_type r = 0;
+      for (ordinal_type r1=0; r1<first.numTensorComponents(); r1++, r++)
+      {
+        tensorComponents_[r] = first.getTensorComponent(r1);
+      }
+      for (ordinal_type r2=0; r2<second.numTensorComponents(); r2++, r++)
+      {
+        tensorComponents_[r] = second.getTensorComponent(r2);
+      }
+      
+      initialize();
+    }
+    
     /**
      \brief Simple constructor for the case of trivial tensor-product structure (single component)
      \param [in] tensorComponent - the data component.
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TensorPoints.hpp b/packages/intrepid2/src/Shared/Intrepid2_TensorPoints.hpp
index 3dac45917282..778b42a160e8 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TensorPoints.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TensorPoints.hpp
@@ -71,11 +71,28 @@ namespace Intrepid2 {
     bool isValid_;
     using reference_type = typename ScalarView<PointScalar,DeviceType>::reference_type;
     
+    void TEST_VALID_POINT_COMPONENTS()
+    {
+#ifdef HAVE_INTREPID2_DEBUG
+      if (isValid_)
+      {
+        for (ordinal_type r=0; r<numTensorComponents_; r++)
+        {
+          const auto & pointComponent = pointTensorComponents_[r];
+          INTREPID2_TEST_FOR_EXCEPTION(2 != pointComponent.rank(), std::invalid_argument, "Each component must have shape (P,D)");
+          INTREPID2_TEST_FOR_EXCEPTION(pointComponent.extent_int(0) <= 0, std::invalid_argument, "Each component must have at least one point");
+        }
+      }
+#endif
+    }
+    
     /**
      \brief Initialize members based on constructor parameters.
     */
     void initialize()
     {
+      TEST_VALID_POINT_COMPONENTS();
+      
       totalPointCount_ = 1;
       totalDimension_  = 0;
       for (ordinal_type r=0; r<numTensorComponents_; r++)
@@ -132,6 +149,27 @@ namespace Intrepid2 {
       initialize();
     }
     
+    /**
+     \brief Constructor that takes a subset of the tensorial components of another points container.
+     \param [in] otherPointsContainer - the original points container
+     \param [in] whichDims - the tensorial component indices to take from the other container.
+     
+     \note this does not copy the points.
+    */
+    TensorPoints(TensorPoints otherPointsContainer, std::vector<int> whichDims)
+    :
+    numTensorComponents_(whichDims.size()),
+    isValid_(true)
+    {
+      int r = 0;
+      for (const auto & componentOrdinal : whichDims)
+      {
+        pointTensorComponents_[r++] = otherPointsContainer.getTensorComponent(componentOrdinal);
+      }
+      
+      initialize();
+    }
+    
     /**
      \brief Constructor with variable-length std::vector argument.
      \param [in] pointTensorComponents - the components representing the points.
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
index d76bdc9ef603..981325720f03 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TestUtils.hpp
@@ -566,20 +566,22 @@ The total number of points defined will be a triangular number; if n=numPointsBa
   }
 
   template <class FunctorType1, class FunctorType2, int rank, typename Scalar=typename FunctorType1::value_type, class ExecutionSpace = typename FunctorType1::execution_space>
-  typename std::enable_if< !supports_rank<FunctorType1,rank>::value, void >::type
+  typename std::enable_if< !(supports_rank<FunctorType1,rank>::value && supports_rank<FunctorType2,rank>::value), void >::type
   testFloatingEquality(const FunctorType1 &functor1, const FunctorType2 &functor2, double relTol, double absTol, Teuchos::FancyOStream &out, bool &success,
                        std::string functor1Name = "Functor 1", std::string functor2Name = "Functor 2")
   {
-    INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "testFloatingEquality() called on FunctorType1 that does not support the specified rank.\n");
+    INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "testFloatingEquality() called on FunctorType1 or FunctorType2 that does not support the specified rank.\n");
   }
 
   //! this method assumes both functors are accesible on device.
   template <class FunctorType1, class FunctorType2, int rank, typename Scalar=typename FunctorType1::value_type, class ExecutionSpace = typename FunctorType1::execution_space>
-  typename std::enable_if< supports_rank<FunctorType1,rank>::value, void >::type
+  typename std::enable_if< (supports_rank<FunctorType1,rank>::value && supports_rank<FunctorType2,rank>::value), void >::type
   testFloatingEquality(const FunctorType1 &functor1, const FunctorType2 &functor2, double relTol, double absTol, Teuchos::FancyOStream &out, bool &success,
                        std::string functor1Name = "Functor 1", std::string functor2Name = "Functor 2")
   {
-    static_assert( supports_rank<FunctorType2,rank>::value, "Both Functor1 and Functor2 must support the specified rank through operator().");
+    static_assert( supports_rank<FunctorType1,rank>::value, "Functor1 must support the specified rank through operator().");
+    static_assert( supports_rank<FunctorType2,rank>::value, "Functor2 must support the specified rank through operator().");
+    
     using Functor1IteratorScalar = FunctorIterator<FunctorType1, Scalar, rank>;
     using Functor2IteratorScalar = FunctorIterator<FunctorType2, Scalar, rank>;
 
diff --git a/packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
similarity index 62%
rename from packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp
rename to packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
index 197e3ed17e21..e103106e3207 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_TransformedVectorData.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_TransformedBasisValues.hpp
@@ -41,43 +41,46 @@
 // ************************************************************************
 // @HEADER
 
-/** \file   Intrepid2_TransformedVectorData.hpp
-    \brief  Structure-preserving representation of transformed vector data; reference space values and transformations are stored separately.
+/** \file   Intrepid2_TransformedBasisValues.hpp
+    \brief  Structure-preserving representation of transformed basis values; reference space values and transformations are stored separately.
+ 
+ There are effectively two modes: one for vector-valued BasisValues, and one for scalar-valued BasisValues.  In the former case the transformation is logically a matrix, with shape (C,P,D,D).  In the latter case, the transformation is logically a weight on each physical-space quadrature point, with shape (C,P).  If the transform is left unset, it is understood to be the identity.
 
     \author Nathan V. Roberts
 */
 
-#ifndef Intrepid2_TransformedVectorData_h
-#define Intrepid2_TransformedVectorData_h
-
-#include "Intrepid2_VectorData.hpp"
+#ifndef Intrepid2_TransformedBasisValues_h
+#define Intrepid2_TransformedBasisValues_h
 
+#include "Intrepid2_BasisValues.hpp"
 #include "Intrepid2_ScalarView.hpp"
 
 namespace Intrepid2 {
-/** \class Intrepid2::TransformedVectorData
+/** \class Intrepid2::TransformedBasisValues
     \brief Structure-preserving representation of transformed vector data; reference space values and transformations are stored separately.
  
- TransformedVectorData provides a View-like interface of rank 4, with shape (C,F,P,D).  When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation.
+ TransformedBasisValues provides a View-like interface of rank 4, with shape (C,F,P,D).  When the corresponding accessor is used, the transformed value is determined from corresponding reference space values and the transformation.
 */
   template<class Scalar, typename DeviceType>
-  class TransformedVectorData
+  class TransformedBasisValues
   {
   public:
-    using Transform = Data<Scalar,DeviceType>;
+    ordinal_type numCells_;
     
-    Data<Scalar,DeviceType> transform_; // (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform
+    Data<Scalar,DeviceType> transform_; // vector case: (C,P,D,D) jacobian or jacobian inverse; can also be unset for identity transform.  Scalar case: (C,P), or unset for identity.
     
-    VectorData<Scalar, DeviceType> vectorData_; // notionally (F,P,D) container
+    BasisValues<Scalar, DeviceType> basisValues_;
     
     /**
      \brief Standard constructor.
-     \param [in] transform - the transformation matrix, with logical shape (C,P,D,D)
-     \param [in] vectorData - the reference-space data to be transformed, with logical shape (F,P,D)
+     \param [in] transform - the transformation (matrix), with logical shape (C,P) or (C,P,D,D)
+     \param [in] basisValues - the reference-space data to be transformed, with logical shape (F,P) (for scalar values) or (F,P,D) (for vector values)
     */
-    TransformedVectorData(const Data<Scalar,DeviceType> &transform, const VectorData<Scalar,DeviceType> &vectorData)
+    TransformedBasisValues(const Data<Scalar,DeviceType> &transform, const BasisValues<Scalar,DeviceType> &basisValues)
     :
-    transform_(transform), vectorData_(vectorData)
+    numCells_(transform.extent_int(0)),
+    transform_(transform),
+    basisValues_(basisValues)
     {
       // sanity check: when transform is diagonal, we expect there to be no pointwise variation.
       INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(transform_.isDiagonal() && (transform_.getVariationTypes()[1] != CONSTANT), std::invalid_argument, "When transform is diagonal, we assume in various places that there is no pointwise variation; the transform_ Data should have CONSTANT as its variation type in dimension 1.");
@@ -87,17 +90,27 @@ namespace Intrepid2 {
      \brief Constructor for the case of an identity transform.
      \param [in] vectorData - the reference-space data, with logical shape (F,P,D)
     */
-    TransformedVectorData(const VectorData<Scalar, DeviceType> &vectorData)
+    TransformedBasisValues(const ordinal_type &numCells, const BasisValues<Scalar,DeviceType> &basisValues)
     :
-    vectorData_(vectorData)
+    numCells_(numCells),
+    basisValues_(basisValues)
     {}
     
     //! copy-like constructor for differing device types.  This may do a deep_copy of underlying views, depending on the memory spaces involved.
     template<typename OtherDeviceType, class = typename std::enable_if<!std::is_same<DeviceType, OtherDeviceType>::value>::type>
-    TransformedVectorData(const TransformedVectorData<Scalar,OtherDeviceType> &transformedVectorData)
+    TransformedBasisValues(const TransformedBasisValues<Scalar,OtherDeviceType> &transformedVectorData)
     :
+    numCells_(transformedVectorData.numCells()),
     transform_(transformedVectorData.transform()),
-    vectorData_(transformedVectorData.vectorData())
+    basisValues_(transformedVectorData.basisValues())
+    {}
+    
+    /**
+     \brief Default constructor; an invalid container.  Will return -1 for numCells().
+     */
+    TransformedBasisValues()
+    :
+    numCells_(-1)
     {}
     
     //! Returns true if the transformation matrix is diagonal.
@@ -113,7 +126,12 @@ namespace Intrepid2 {
         return transform_.isDiagonal();
       }
     }
-    
+
+    BasisValues<Scalar, DeviceType> basisValues() const
+    {
+      return basisValues_;
+    }
+
     //! Returns the true data extent in the cell dimension (e.g., will be 1 for transform matrices that do not vary from one cell to the next).
     KOKKOS_INLINE_FUNCTION int cellDataExtent() const
     {
@@ -129,50 +147,78 @@ namespace Intrepid2 {
     //! Returns the logical extent in the cell dimension, which is the 0 dimension in this container.
     KOKKOS_INLINE_FUNCTION int numCells() const
     {
-      return transform_.extent_int(0);
+      return numCells_;
     }
     
     //! Returns the logical extent in the fields dimension, which is the 1 dimension in this container.
     KOKKOS_INLINE_FUNCTION int numFields() const
     {
-      return vectorData_.numFields();
+      return basisValues_.extent_int(0);
     }
     
     //! Returns the logical extent in the points dimension, which is the 2 dimension in this container.
     KOKKOS_INLINE_FUNCTION int numPoints() const
     {
-      return vectorData_.numPoints();
+      return basisValues_.extent_int(1);
     }
     
     //! Returns the logical extent in the space dimension, which is the 3 dimension in this container.
     KOKKOS_INLINE_FUNCTION int spaceDim() const
     {
-      return vectorData_.spaceDim();
+      return basisValues_.extent_int(2);
     }
     
-    //! Accessor, with arguments (C,F,P,D).
+    //! Scalar accessor, with arguments (C,F,P).
+    KOKKOS_INLINE_FUNCTION Scalar operator()(const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal) const
+    {
+      if (!transform_.isValid())
+      {
+        // null transform is understood as the identity
+        return basisValues_(fieldOrdinal,pointOrdinal);
+      }
+      else
+      {
+        return transform_(cellOrdinal,pointOrdinal) * basisValues_(fieldOrdinal,pointOrdinal);
+      }
+    }
+    
+    //! Vector accessor, with arguments (C,F,P,D).
     KOKKOS_INLINE_FUNCTION Scalar operator()(const int &cellOrdinal, const int &fieldOrdinal, const int &pointOrdinal, const int &dim) const
     {
       if (!transform_.isValid())
       {
         // null transform is understood as the identity
-        return vectorData_(fieldOrdinal,pointOrdinal,dim);
+        return basisValues_(fieldOrdinal,pointOrdinal,dim);
       }
       else if (transform_.isDiagonal())
       {
-        return transform_(cellOrdinal,pointOrdinal,dim,dim) * vectorData_(fieldOrdinal,pointOrdinal,dim);
+        return transform_(cellOrdinal,pointOrdinal,dim,dim) * basisValues_(fieldOrdinal,pointOrdinal,dim);
       }
       else
       {
         Scalar value = 0.0;
         for (int d2=0; d2<transform_.extent_int(2); d2++)
         {
-          value += transform_(cellOrdinal,pointOrdinal,dim,d2) * vectorData_(fieldOrdinal,pointOrdinal,d2);
+          value += transform_(cellOrdinal,pointOrdinal,dim,d2) * basisValues_(fieldOrdinal,pointOrdinal,d2);
         }
         return value;
       }
     }
     
+    //! Returns the specified entry in the (scalar) transform.  (Only valid for scalar-valued BasisValues; see the four-argument transformWeight() for the vector-valued case.)
+    KOKKOS_INLINE_FUNCTION Scalar transformWeight(const int &cellOrdinal, const int &pointOrdinal) const
+    {
+      if (!transform_.isValid())
+      {
+        // null transform is understood as identity
+        return 1.0;
+      }
+      else
+      {
+        return transform_(cellOrdinal,pointOrdinal);
+      }
+    }
+    
     //! Returns the specified entry in the transform matrix.
     KOKKOS_INLINE_FUNCTION Scalar transformWeight(const int &cellOrdinal, const int &pointOrdinal, const int &dim1, const int &dim2) const
     {
@@ -196,14 +242,14 @@ namespace Intrepid2 {
     //! Returns the reference-space vector data.
     const VectorData<Scalar,DeviceType> & vectorData() const
     {
-      return vectorData_;
+      return basisValues_.vectorData();
     }
     
-    //! Returns the rank of the container, which is 4.
+    //! Returns the rank of the container, which is 3 for scalar values, and 4 for vector values.
     KOKKOS_INLINE_FUNCTION
-    constexpr unsigned rank() const
+    unsigned rank() const
     {
-      return 4; // shape is (C,F,P,D)
+      return basisValues_.rank() + 1; // transformation adds a cell dimension
     }
     
     //! Returns the extent in the specified dimension as an int.
@@ -216,10 +262,9 @@ namespace Intrepid2 {
       else if (r == 3) return spaceDim();
       else if (r  > 3) return 1;
       
-      INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::invalid_argument, "Unsupported rank");
       return -1; // unreachable return; here to avoid compiler warnings.
     }
   };
 }
 
-#endif /* Intrepid2_TransformedVectorData_h */
+#endif /* Intrepid2_TransformedBasisValues_h */
diff --git a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
index cd362e2e4b68..2e94566365d9 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_Types.hpp
@@ -139,6 +139,8 @@ namespace Intrepid2 {
     static constexpr ordinal_type MaxDerivative        = 10;
     /// Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to be formed by 7 line basis components.
     static constexpr ordinal_type MaxTensorComponents  = 7;
+    /// Maximum number of components that a VectorData object will store -- 66 corresponds to OPERATOR_D10 on an H^1 hexahedral basis.  For now, we limit to 7, matching MaxTensorComponents.
+    static constexpr ordinal_type MaxVectorComponents  = 7;
 
     // we do not want to use hard-wired epsilon, threshold and tolerence. 
     // static constexpr double Epsilon   = 1.0e-16; 
diff --git a/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp b/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp
index 5544c20cee6d..2e3913538794 100644
--- a/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp
+++ b/packages/intrepid2/src/Shared/Intrepid2_VectorData.hpp
@@ -65,17 +65,16 @@ namespace Intrepid2 {
   class VectorData
   {
   public:
-    using VectorArray = Kokkos::Array< TensorData<Scalar,DeviceType>, Parameters::MaxTensorComponents>; // for axis-aligned case, these correspond entry-wise to the axis with which the vector values align
+    using VectorArray = Kokkos::Array< TensorData<Scalar,DeviceType>, Parameters::MaxVectorComponents >; // for axis-aligned case, these correspond entry-wise to the axis with which the vector values align
     using FamilyVectorArray = Kokkos::Array< VectorArray, Parameters::MaxTensorComponents>;
 
     FamilyVectorArray vectorComponents_; // outer: family ordinal; inner: component/spatial dimension ordinal
     bool axialComponents_; // if true, each entry in vectorComponents_ is an axial component vector; for 3D: (f1,0,0); (0,f2,0); (0,0,f3).  The 0s are represented by trivial/invalid TensorData objects.  In this case, numComponents_ == numFamilies_.
      
     int totalDimension_;
-    Kokkos::Array<int,Parameters::MaxTensorComponents> dimToComponent_;
-    Kokkos::Array<int,Parameters::MaxTensorComponents> dimToComponentDim_;
-    
-    Kokkos::Array<int,Parameters::MaxTensorComponents> numDimsForComponent_;
+    Kokkos::Array<int, Parameters::MaxVectorComponents> dimToComponent_;
+    Kokkos::Array<int, Parameters::MaxVectorComponents> dimToComponentDim_;
+    Kokkos::Array<int, Parameters::MaxVectorComponents> numDimsForComponent_;
     
     Kokkos::Array<int,Parameters::MaxTensorComponents> familyFieldUpperBound_; // one higher than the end of family indicated
     
@@ -128,6 +127,7 @@ namespace Intrepid2 {
         INTREPID2_TEST_FOR_EXCEPTION(!validEntryFoundForFamily, std::invalid_argument, "Each family must have at least one valid TensorData entry");
       }
 
+      // do a pass through components to determine total component dim (totalDimension_) and size lookups appropriately
       int currentDim = 0;
       for (unsigned j=0; j<numComponents_; j++)
       {
@@ -141,11 +141,6 @@ namespace Intrepid2 {
             {
               validEntryFoundForComponent = true;
               numDimsForComponent = vectorComponents_[i][j].extent_int(2); // (F,P,D) container or (F,P) container
-              for (int dim=0; dim<numDimsForComponent; dim++)
-              {
-                dimToComponent_[currentDim+dim]    = j;
-                dimToComponentDim_[currentDim+dim] = dim;
-              }
             }
             else
             {
@@ -157,16 +152,26 @@ namespace Intrepid2 {
         {
           // assume that the component takes up exactly one space dim
           numDimsForComponent = 1;
-          dimToComponent_[currentDim]    = j;
-          dimToComponentDim_[currentDim] = 0;
         }
         
         numDimsForComponent_[j] = numDimsForComponent;
         
         currentDim += numDimsForComponent;
       }
-      numPoints_      = numPoints;
       totalDimension_ = currentDim;
+      
+      currentDim = 0;
+      for (unsigned j=0; j<numComponents_; j++)
+      {
+        int numDimsForComponent = numDimsForComponent_[j];
+        for (int dim=0; dim<numDimsForComponent; dim++)
+        {
+          dimToComponent_[currentDim+dim]    = j;
+          dimToComponentDim_[currentDim+dim] = dim;
+        }
+        currentDim += numDimsForComponent;
+      }
+      numPoints_      = numPoints;
     }
   public:
     /**
@@ -182,7 +187,7 @@ namespace Intrepid2 {
     numComponents_(numComponents)
     {
       static_assert(numFamilies <= Parameters::MaxTensorComponents,   "numFamilies must be less than Parameters::MaxTensorComponents");
-      static_assert(numComponents <= Parameters::MaxTensorComponents, "numComponents must be less than Parameters::MaxTensorComponents");
+      static_assert(numComponents <= Parameters::MaxVectorComponents, "numComponents must be less than Parameters::MaxVectorComponents");
       for (unsigned i=0; i<numFamilies; i++)
       {
         for (unsigned j=0; j<numComponents; j++)
@@ -209,8 +214,8 @@ namespace Intrepid2 {
         INTREPID2_TEST_FOR_EXCEPTION(vectorComponents[i].size() != numComponents_, std::invalid_argument, "each family must have the same number of components");
       }
       
-      INTREPID2_TEST_FOR_EXCEPTION(numFamilies_ > Parameters::MaxTensorComponents,   std::invalid_argument, "numFamilies must be less than Parameters::MaxTensorComponents");
-      INTREPID2_TEST_FOR_EXCEPTION(numComponents_ > Parameters::MaxTensorComponents, std::invalid_argument, "numComponents must be less than Parameters::MaxTensorComponents");
+      INTREPID2_TEST_FOR_EXCEPTION(numFamilies_ > Parameters::MaxTensorComponents,   std::invalid_argument, "numFamilies must be at most Parameters::MaxTensorComponents");
+      INTREPID2_TEST_FOR_EXCEPTION(numComponents_ > Parameters::MaxVectorComponents,   std::invalid_argument, "numComponents must be at most Parameters::MaxVectorComponents");
       for (unsigned i=0; i<numFamilies_; i++)
       {
         for (unsigned j=0; j<numComponents_; j++)
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/BasisEquivalenceTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/BasisEquivalenceTests.cpp
index 3dfed45c8934..461aef36b6b3 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/BasisEquivalenceTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/BasisEquivalenceTests.cpp
@@ -468,7 +468,11 @@ namespace
     using CGBasis = HierarchicalBasisFamily<DefaultTestDeviceType>::HGRAD_LINE;
     using DGBasis = DGHierarchicalBasisFamily<DefaultTestDeviceType>::HGRAD_LINE;
     
-    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
+    // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis.  We can support more by either increasing
+    //       Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of
+    //       components.  (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to
+    //       avoid using UVM.)
+    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
     
     // these tolerances are selected such that we have a little leeway for architectural differences
     // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions)
@@ -488,7 +492,11 @@ namespace
     using CGBasis = HierarchicalBasisFamily<DefaultTestDeviceType>::HGRAD_QUAD;
     using DGBasis = DGHierarchicalBasisFamily<DefaultTestDeviceType>::HGRAD_QUAD;
     
-    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
+    // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis.  We can support more by either increasing
+    //       Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of
+    //       components.  (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to
+    //       avoid using UVM.)
+    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
     
     // these tolerances are selected such that we have a little leeway for architectural differences
     // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions)
@@ -508,7 +516,11 @@ namespace
     using HierarchicalBasis = HierarchicalBasisFamily<DefaultTestDeviceType>::HGRAD_QUAD;
     using NodalBasis        = NodalBasisFamily<DefaultTestDeviceType>::HGRAD_QUAD;
     
-    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
+    // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis.  We can support more by either increasing
+    //       Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of
+    //       components.  (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to
+    //       avoid using UVM.)
+    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
     
     // these tolerances are selected such that we have a little leeway for architectural differences
     // (It is true, though, that we incur a fair amount of floating point error for higher order bases in higher dimensions)
@@ -533,7 +545,11 @@ namespace
     const double relTol=1e-11; // 2e-12 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences
     const double absTol=1e-11; // 9e-13 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences
     
-    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
+    // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis.  We can support more by either increasing
+    //       Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of
+    //       components.  (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to
+    //       avoid using UVM.)
+    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
     for (int polyOrder=1; polyOrder<3; polyOrder++)
     {
       CGBasis cgBasis(polyOrder);
@@ -552,7 +568,11 @@ namespace
     const double relTol=1e-12; // 2e-13 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences
     const double absTol=1e-13; // 2e-14 is sharp on development setup for polyOrder=2; relaxing for potential architectural differences
     
-    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
+    // NOTE: for the moment, OPERATOR_Dn for n > 2 not supported by DerivedBasis.  We can support more by either increasing
+    //       Parameters::MaxVectorComponents (which is 7 right now), or by changing VectorData to allow a dynamic number of
+    //       components.  (We were doing the latter using Kokkos::vector, but have switched to a Kokkos::Array instead to
+    //       avoid using UVM.)
+    std::vector<EOperator> opsToTest {OPERATOR_GRAD, OPERATOR_D1, OPERATOR_D2}; //, OPERATOR_D3, OPERATOR_D4, OPERATOR_D5};
     for (int polyOrder=1; polyOrder<3; polyOrder++)
     {
       HierarchicalBasis hierarchicalBasis(polyOrder);
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp
index aba9c289a1c7..34cc645c4292 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/BasisValuesTests.cpp
@@ -58,7 +58,7 @@
 #include "Intrepid2_ProjectedGeometry.hpp"
 #include "Intrepid2_ProjectedGeometryExamples.hpp"
 #include "Intrepid2_ScalarView.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_Types.hpp"
 #include "Intrepid2_TestUtils.hpp"
 
@@ -249,7 +249,6 @@ namespace
     using DeviceType = Intrepid2::DefaultTestDeviceType;
     using Basis = HierarchicalBasisFamily<DeviceType>::HGRAD_QUAD;
     
-    // for now, the BasisValues path only supports the standard exact-sequence operators
     std::vector<EOperator> opsToTest {OPERATOR_VALUE, OPERATOR_GRAD};
     
     const double relTol=1e-13;
@@ -285,7 +284,6 @@ namespace
     using DeviceType = Intrepid2::DefaultTestDeviceType;
     using Basis = HierarchicalBasisFamily<DeviceType>::HGRAD_HEX;
     
-    // for now, the BasisValues path only supports the standard exact-sequence operators
     std::vector<EOperator> opsToTest {OPERATOR_VALUE, OPERATOR_GRAD};
     
     const double relTol=1e-13;
@@ -447,7 +445,6 @@ namespace
     using DeviceType = Intrepid2::DefaultTestDeviceType;
     using Basis = HierarchicalBasisFamily<DeviceType>::HVOL_QUAD;
     
-    // for now, the BasisValues path only supports the standard exact-sequence operators
     std::vector<EOperator> opsToTest {OPERATOR_VALUE};
     
     const double relTol=1e-13;
@@ -465,7 +462,6 @@ namespace
     using DeviceType = Intrepid2::DefaultTestDeviceType;
     using Basis = HierarchicalBasisFamily<DeviceType>::HVOL_HEX;
     
-    // for now, the BasisValues path only supports the standard exact-sequence operators
     std::vector<EOperator> opsToTest {OPERATOR_VALUE};
     
     const double relTol=1e-13;
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt b/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt
index 17758a980275..77ac7eb27b1b 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/CMakeLists.txt
@@ -2,7 +2,9 @@
 
 
 #FILE(GLOB TEST_SOURCES "*.cpp" ${ETI_SOURCES} )
-FILE(GLOB TEST_SOURCES "*.cpp" )
+FILE(GLOB TEST_SOURCES "*.cpp")
+
+INCLUDE_DIRECTORIES("../../assembly-examples")
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   Intrepid2_Tests
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
index 1305831c79a9..dfe4d42bff2a 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/DataTests.cpp
@@ -297,6 +297,58 @@ namespace
     testFloatingEquality3(AB_actual, AB_expected, relTol, absTol, out, success);
   }
 
+  TEUCHOS_UNIT_TEST( Data, InPlaceProduct_Matrix )
+  {
+    double relTol = 1e-13;
+    double absTol = 1e-13;
+    
+    // Use two Data objects A and B, each with logical shape (10,3,2,2) -- (C,P,D,D), say.
+    // with A having variation types of CONSTANT, CONSTANT, and BLOCK_DIAGONAL (with the final two dimensions being purely diagonal)
+    // and B having variation types of CONSTANT for all entries
+    // Result should have variation types of CONSTANT, CONSTANT, and BLOCK_DIAGONAL, with the diagonal matrix entries weighted by the value from B.
+    // (This test is modeled on Jacobians for a uniform geometry being weighted by inverse determinants, as happens when computing inputs to getHDIVtransformVALUE().
+    using DeviceType = DefaultTestDeviceType;
+    using Scalar = double;
+    
+    const int rank        = 4;
+    const int cellCount   = 10;
+    const int pointCount  = 3;
+    const int spaceDim    = 2;
+    
+    const double bValue = M_PI;
+    
+    auto AView = getView<Scalar,DeviceType>("A", spaceDim); // diagonal: one entry per dimension
+    auto ABView = getView<Scalar,DeviceType>("A .* B", spaceDim); // should be same shape as A
+    
+    auto AViewHost  = Kokkos::create_mirror(AView);
+    auto ABViewHost = Kokkos::create_mirror(ABView);
+    for (int d=0; d<spaceDim; d++)
+    {
+      AViewHost(d)  = d + 1.;
+      ABViewHost(d) = AViewHost(d) * bValue;
+    }
+    Kokkos::deep_copy( AView,  AViewHost);
+    Kokkos::deep_copy(ABView, ABViewHost);
+    
+    Kokkos::Array<int,rank> extents {cellCount, pointCount, spaceDim, spaceDim};
+    Kokkos::Array<DataVariationType,rank> A_variation {CONSTANT, CONSTANT, BLOCK_PLUS_DIAGONAL, BLOCK_PLUS_DIAGONAL};
+    
+    Data<Scalar,DeviceType> A(AView,extents,A_variation);
+    Data<Scalar,DeviceType> B(bValue,extents);
+    
+    // expected variation for A+B:
+    Kokkos::Array<DataVariationType,4> AB_variation  = A_variation;
+    // expected Data object for A+B:
+    Data<Scalar,DeviceType> AB_expected(ABView,extents,AB_variation);
+    
+    auto AB_actual = Data<Scalar,DeviceType>::allocateInPlaceCombinationResult(A, B);
+    
+    AB_actual.storeInPlaceProduct(A, B);
+    
+    // test AB_actual equals AB_expected.  (This will iterate over the logical extents.)
+    testFloatingEquality4(AB_actual, AB_expected, relTol, absTol, out, success);
+  }
+
 // #pragma mark Data: MatVec
 /** \brief Data provides matrix-vector multiplication support.  This method checks correctness of the computed mat-vec for a particular case involving a 2x2 matrix and a 2x1 vector.
 */
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp
index 485087257fba..4303ab4417d4 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/HostCopyTests.cpp
@@ -53,7 +53,7 @@
 #include "Intrepid2_DefaultCubatureFactory.hpp"
 #include "Intrepid2_TensorData.hpp"
 #include "Intrepid2_TensorPoints.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_VectorData.hpp"
 #include "Intrepid2_ScalarView.hpp"
 #include "Intrepid2_Types.hpp"
@@ -190,7 +190,7 @@ namespace
     }
   }
 
-  TEUCHOS_UNIT_TEST(HostCopy, TransformedVectorData)
+  TEUCHOS_UNIT_TEST(HostCopy, TransformedBasisValues)
   {
     using DeviceType = DefaultTestDeviceType;
     using Scalar = double;
@@ -218,10 +218,9 @@ namespace
     const int blockPlusDiagonalLastNonDiagonal = -1; // only diagonal
     Data<Scalar,DeviceType> transform(scalingView,rank,extents,variationTypes,blockPlusDiagonalLastNonDiagonal);
     
-    TransformedVectorData<Scalar,DeviceType> transformedVectorData(transform,vectorData);
+    TransformedBasisValues<Scalar,DeviceType> transformedVectorData(transform,vectorData);
     
-    using HostExecSpace = Kokkos::HostSpace::execution_space;
-    TransformedVectorData<Scalar,HostExecSpace> transformedVectorDataHost(transformedVectorData);
+    TransformedBasisValues<Scalar,Kokkos::HostSpace> transformedVectorDataHost(transformedVectorData);
 
     TEST_EQUALITY(transformedVectorData.rank(), transformedVectorDataHost.rank());
     TEST_EQUALITY(transformedVectorData.extent_int(0), transformedVectorDataHost.extent_int(0));
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp
index a076d361f884..2c54ae18c636 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/StructuredIntegrationTests.cpp
@@ -70,20 +70,236 @@
 #include "Intrepid2_Data.hpp"
 #include "Intrepid2_TensorData.hpp"
 #include "Intrepid2_TensorPoints.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_VectorData.hpp"
 
 #include "Intrepid2_ScalarView.hpp"
 
+#include "GRADGRADStandardAssembly.hpp"
+#include "GRADGRADStructuredAssembly.hpp"
+#include "H1StandardAssembly.hpp"
+#include "H1StructuredAssembly.hpp"
+#include "HDIVStandardAssembly.hpp"
+#include "HDIVStructuredAssembly.hpp"
+#include "HCURLStandardAssembly.hpp"
+#include "HCURLStructuredAssembly.hpp"
+#include "HVOLStandardAssembly.hpp"
+#include "HVOLStructuredAssembly.hpp"
+
 namespace
 {
   using namespace Intrepid2;
 
+  enum FormulationChoice
+  {
+    Poisson, // (grad, grad)
+    Hgrad,   // (grad, grad) + (value, value)
+    Hdiv,    // (div, div)   + (value, value)
+    Hcurl,   // (curl, curl) + (value, value)
+    L2       // (value, value)
+  };
+
+  enum AlgorithmChoice
+  {
+    Standard,
+    AffineNonTensor,
+    NonAffineTensor,
+    AffineTensor,
+    DiagonalJacobian,
+    Uniform
+  };
+
+  enum BasisFamilyChoice
+  {
+    Nodal,
+    Hierarchical,
+    Serendipity
+  };
+
+  // tags to allow us to use templated Teuchos tests
+  class PoissonFormulation {
+  public:
+    static const FormulationChoice formulation = Poisson;
+  };
+  class HgradFormulation {
+  public:
+    static const FormulationChoice formulation = Hgrad;
+  };
+  class HdivFormulation {
+  public:
+    static const FormulationChoice formulation = Hdiv;
+  };
+  class HcurlFormulation {
+  public:
+    static const FormulationChoice formulation = Hcurl;
+  };
+  class L2Formulation {
+  public:
+    static const FormulationChoice formulation = L2;
+  };
+// StandardAlgorithm is not actually used in our test templates: Standard is the baseline we compare against.
+// We therefore comment it out here; the unused static member "algorithm" below generates compilation errors on
+// some platforms (with warnings as errors turned on).
+//  class StandardAlgorithm
+//  {
+//  public:
+//    static const AlgorithmChoice algorithm = Standard;
+//  };
+  class AffineNonTensorAlgorithm
+  {
+  public:
+    static const AlgorithmChoice algorithm = AffineNonTensor;
+  };
+  class NonAffineTensorAlgorithm
+  {
+  public:
+    static const AlgorithmChoice algorithm = NonAffineTensor;
+  };
+  class AffineTensorAlgorithm
+  {
+  public:
+    static const AlgorithmChoice algorithm = AffineTensor;
+  };
+  class UniformAlgorithm
+  {
+  public:
+    static const AlgorithmChoice algorithm = Uniform;
+  };
+// DiagonalJacobianAlgorithm is not yet used in our test templates: getMesh() does not support DiagonalJacobian yet.
+// (We hope soon to add support for orthogonal extrusions in CellGeometry, which would give us those
+//  diagonal Jacobians in a natural way.)
+// We therefore comment out the class out here; the unused static member "algorithm" below generates compilation errors on
+// some platforms (with warnings as errors turned on).
+//  class DiagonalJacobianAlgorithm // note that DiagonalJacobian is not yet supported by getMesh()
+//  {
+//  public:
+//    static const AlgorithmChoice algorithm = DiagonalJacobian;
+//  };
+  class D1
+  {
+  public:
+    static const int spaceDim = 1;
+  };
+  class D2
+  {
+  public:
+    static const int spaceDim = 2;
+  };
+  class D3
+  {
+  public:
+    static const int spaceDim = 3;
+  };
+  class P1
+  {
+  public:
+    static const int polyOrder = 1;
+  };
+  class P2
+  {
+  public:
+    static const int polyOrder = 2;
+  };
+  class P3
+  {
+  public:
+    static const int polyOrder = 3;
+  };
+  class P4
+  {
+  public:
+    static const int polyOrder = 4;
+  };
+
+  using namespace Intrepid2;
+
+  template< typename PointScalar, int spaceDim, typename DeviceType >
+  inline
+  CellGeometry<PointScalar, spaceDim, DeviceType> getMesh(AlgorithmChoice algorithmChoice, const Kokkos::Array<int,spaceDim> &gridCellCounts)
+  {
+    Kokkos::Array<PointScalar,spaceDim> domainExtents;
+    for (int d=0; d<spaceDim; d++)
+    {
+      domainExtents[d]  = 1.0;
+    }
+    auto uniformTensorGeometry = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(domainExtents, gridCellCounts);
+    
+    switch (algorithmChoice)
+    {
+      case Standard:
+      case NonAffineTensor:
+      {
+        // Standard and non-affine tensor use the same geometry; the difference is how this is used in assembly
+        const bool copyAffineness = false;
+        auto genericGeometry = getNodalCellGeometry(uniformTensorGeometry, copyAffineness);
+        return genericGeometry;
+      }
+      case Uniform:
+        return uniformTensorGeometry;
+      case AffineNonTensor:
+      case AffineTensor:
+      {
+        const bool copyAffineness = true;
+        auto affineNonTensorGeometry = getNodalCellGeometry(uniformTensorGeometry, copyAffineness);
+        return affineNonTensorGeometry;
+      }
+      case DiagonalJacobian:
+      {
+        INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "DiagonalJacobian case not yet implemented");
+      }
+    }
+    return uniformTensorGeometry; // this line should be unreachable; included to avoid compiler warnings from nvcc
+  }
+
+  template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+  Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationChoice formulation,
+                                          Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+  {
+    switch (formulation)
+    {
+      case Poisson:
+        return performStandardQuadratureGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hgrad:
+        return performStandardQuadratureH1<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hdiv:
+        return performStandardQuadratureHDIV<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hcurl:
+        return performStandardQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case L2:
+        return performStandardQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      default:
+        INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation");
+    }
+  }
+
+  template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+  Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(FormulationChoice formulation,
+                                            Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                            double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+  {
+    switch (formulation)
+    {
+      case Poisson:
+        return performStructuredQuadratureGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hgrad:
+        return performStructuredQuadratureH1<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hdiv:
+        return performStructuredQuadratureHDIV<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case Hcurl:
+        return performStructuredQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      case L2:
+        return performStructuredQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+      default:
+        INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported formulation");
+    }
+  }
+
 //! version of integrate that performs a standard integration for affine meshes; does not take advantage of the tensor product structure at all
 //! this version can be used to verify correctness of other versions
 template<class Scalar, typename DeviceType>
-void integrate_baseline(Data<Scalar,DeviceType> integrals, const TransformedVectorData<Scalar,DeviceType> vectorDataLeft,
-                        const TensorData<Scalar,DeviceType> cellMeasures, const TransformedVectorData<Scalar,DeviceType> vectorDataRight)
+void integrate_baseline(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
+                        const TensorData<Scalar,DeviceType> cellMeasures, const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
 {
   const int spaceDim       = vectorDataLeft.spaceDim();
   
@@ -135,120 +351,11 @@ void integrate_baseline(Data<Scalar,DeviceType> integrals, const TransformedVect
       integralView(fieldOrdinalLeft,fieldOrdinalRight) = integral;
     }
   });
-//  const int numFieldsLeft  = vectorDataLeft.numFields();
-//  const int numFieldsRight = vectorDataRight.numFields();
-//  int approximateFlopCount = (spaceDim*2 + 2) * numPoints * numFieldsLeft * numFieldsRight * cellDataExtent;
-//  printView(integralView, std::cout, "stiffness in " + std::to_string(spaceDim) + "D");
-//  std::cout << "\n\nApproximate flop count (baseline): " << approximateFlopCount << std::endl;
-}
-
-//! version that uses the classic Intrepid2 paths
-template<class Scalar, class PointScalar, int spaceDim, typename DeviceType>
-ScalarView<Scalar,DeviceType> performStandardQuadratureHypercube(int meshWidth, int polyOrder, int worksetSize)
-{
-  using ExecutionSpace = typename DeviceType::execution_space;
-  using CellTools = Intrepid2::CellTools<DeviceType>;
-  using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<ExecutionSpace>;
-  
-  using namespace std;
-  // dimensions of the returned view are (C,F,F)
-  auto fs = Intrepid2::FUNCTION_SPACE_HGRAD;
-
-  shards::CellTopology lineTopo = shards::getCellTopologyData< shards::Line<> >();
-  shards::CellTopology cellTopo;
-  if      (spaceDim == 1) cellTopo = shards::getCellTopologyData< shards::Line<>          >();
-  else if (spaceDim == 2) cellTopo = shards::getCellTopologyData< shards::Quadrilateral<> >();
-  else if (spaceDim == 3) cellTopo = shards::getCellTopologyData< shards::Hexahedron<>    >();
-  
-  auto basis = Intrepid2::getBasis< Intrepid2::NodalBasisFamily<DeviceType> >(cellTopo, fs, polyOrder);
-  
-  int numFields = basis->getCardinality();
-  int numHypercubes = 1;
-  for (int d=0; d<spaceDim; d++)
-  {
-    numHypercubes *= meshWidth;
-  }
-  int numCells = numHypercubes;
-  
-  if (worksetSize > numCells) worksetSize = numCells;
-  
-  // local stiffness matrix:
-  ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
-  
-  auto cubature = Intrepid2::DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
-  int numPoints = cubature->getNumPoints();
-  ScalarView<PointScalar,DeviceType> cubaturePoints("cubature points",numPoints,spaceDim);
-  ScalarView<double,DeviceType> cubatureWeights("cubature weights", numPoints);
-  
-  cubature->getCubature(cubaturePoints, cubatureWeights);
-  
-  // Allocate some intermediate containers
-  ScalarView<Scalar,DeviceType> basisValues    ("basis values", numFields, numPoints );
-  ScalarView<Scalar,DeviceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
-
-  ScalarView<Scalar,DeviceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
-  ScalarView<Scalar,DeviceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);
-  
-  basis->getValues(basisValues,     cubaturePoints, Intrepid2::OPERATOR_VALUE );
-  basis->getValues(basisGradValues, cubaturePoints, Intrepid2::OPERATOR_GRAD  );
-  
-  CellGeometry<PointScalar,spaceDim,DeviceType> cellNodes = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(1.0, meshWidth);
-  
-  const int numNodesPerCell = cellNodes.numNodesPerCell();
-  ScalarView<PointScalar,DeviceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
-  using ExecutionSpace = typename DeviceType::execution_space;
-  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{numCells,numNodesPerCell});
-  Kokkos::parallel_for("fill expanded cell nodes", policy,
-  KOKKOS_LAMBDA (const int &cellOrdinal, const int &nodeOrdinal)
-  {
-    for (int d=0; d<spaceDim; d++)
-    {
-      expandedCellNodes(cellOrdinal,nodeOrdinal,d) = cellNodes(cellOrdinal,nodeOrdinal,d);
-    }
-  });
-  
-  // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell
-
-  ScalarView<Scalar,DeviceType> cellMeasures("cell measures", worksetSize, numPoints);
-  ScalarView<Scalar,DeviceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
-  ScalarView<Scalar,DeviceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
-  ScalarView<Scalar,DeviceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
-
-  int cellOffset = 0;
-  while (cellOffset < numCells)
-  {
-    int startCell         = cellOffset;
-    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
-    
-    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
-    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
-    
-    // note that the following will not work if numCellsInWorkset != worksetSize
-    // (we would need to take an appropriate subview of jacobian, etc. containers)
-    INTREPID2_TEST_FOR_EXCEPTION(numCellsInWorkset != worksetSize, std::invalid_argument, "workset size must evenly divide the number of cells!");
-    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo);
-    CellTools::setJacobianInv(jacobianInverse, jacobian);
-    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
-    
-    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
-    FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
-    FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
-    
-//    printView(transformedGradValues, std::cout, "transformedGradValues");
-//    printView(cellMeasures, std::cout, "cellMeasures");
-    
-    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
-    
-    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
-    
-    cellOffset += worksetSize;
-  }
-  return cellStiffness;
 }
 
   template<class Scalar, typename DeviceType>
-  void testIntegrateMatchesBaseline(const TransformedVectorData<Scalar,DeviceType> vectorDataLeft,
-                                    const TensorData<Scalar,DeviceType> cellMeasures, const TransformedVectorData<Scalar,DeviceType> vectorDataRight,
+  void testIntegrateMatchesBaseline(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
+                                    const TensorData<Scalar,DeviceType> cellMeasures, const TransformedBasisValues<Scalar,DeviceType> vectorDataRight,
                                     Teuchos::FancyOStream &out, bool &success)
   {
     const double relTol = 1e-12;
@@ -264,15 +371,6 @@ ScalarView<Scalar,DeviceType> performStandardQuadratureHypercube(int meshWidth,
     
     const int integralsBaselineViewRank = integralsBaseline.getUnderlyingViewRank();
     
-//    std::cout << "integralsBaselineView.extent_int(0) = " << integralsBaselineView.extent_int(0) << std::endl;
-//    std::cout << "integralsBaselineView.extent_int(1) = " << integralsBaselineView.extent_int(1) << std::endl;
-//    std::cout << "integralsBaselineView.extent_int(2) = " << integralsBaselineView.extent_int(2) << std::endl;
-//
-//    std::cout << "integralsBaselineView.rank() = " << integralsBaselineView.rank() << std::endl;
-//
-//    std::cout << "integralsBaselineView(0,0)  = " << integralsBaselineView(0,0) << std::endl;
-//    std::cout << "integralsIntegrateView(0,0) = " << integralsIntegrateView(0,0) << std::endl;
-    
     printFunctor3( integralsBaseline, out, "integralsBaseline");
     printFunctor3(integralsIntegrate, out, "integralsIntegrate");
 //    printView(integralsBaselineView, out);
@@ -293,341 +391,199 @@ ScalarView<Scalar,DeviceType> performStandardQuadratureHypercube(int meshWidth,
     }
   }
 
-  template<class Scalar, class PointScalar, int spaceDim>
-  void testQuadratureHypercube(bool useAffinePath, int meshWidth, int polyOrder, int worksetSize, Teuchos::FancyOStream &out, bool &success)
+  template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+  void testQuadratureHypercube(int meshWidth, int polyOrder, int worksetSize,
+                               const FormulationChoice &formulation, const AlgorithmChoice &algorithm,
+                               const double &relTol, const double &absTol,
+                               Teuchos::FancyOStream &out, bool &success)
   {
-    const double relTol = 1e-12;
-    const double absTol = 1e-12;
-    
     using namespace std;
-    using DeviceType = DefaultTestDeviceType;
-    
-    using IntegrationTools   = Intrepid2::IntegrationTools<DeviceType>;
-    // dimensions of the returned view are (C,F,F)
-    auto fs = Intrepid2::FUNCTION_SPACE_HGRAD;
-    
-    auto lineBasis = Intrepid2::getLineBasis< Intrepid2::NodalBasisFamily<DeviceType> >(fs, polyOrder);
     
-    int numFields_1D = lineBasis->getCardinality();
-    
-    int numFields = 1;
-    int numHypercubes = 1;
+    Kokkos::Array<int,spaceDim> gridCellCounts;
     for (int d=0; d<spaceDim; d++)
     {
-      numHypercubes *= meshWidth;
-      numFields     *= numFields_1D;
+      gridCellCounts[d] = meshWidth;
     }
-    int numCells = numHypercubes;
-    
-    if (worksetSize > numCells) worksetSize = numCells;
-    
-    // local stiffness matrix:
-    ScalarView<Scalar,DeviceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
-    
-    shards::CellTopology lineTopo = shards::getCellTopologyData< shards::Line<> >();
-    shards::CellTopology cellTopo;
-    if      (spaceDim == 1) cellTopo = shards::getCellTopologyData< shards::Line<>          >();
-    else if (spaceDim == 2) cellTopo = shards::getCellTopologyData< shards::Quadrilateral<> >();
-    else if (spaceDim == 3) cellTopo = shards::getCellTopologyData< shards::Hexahedron<>    >();
-    
-    auto lineCubature = Intrepid2::DefaultCubatureFactory::create<DeviceType>(lineTopo,polyOrder*2);
-    int numPoints_1D = lineCubature->getNumPoints();
-    ScalarView<PointScalar,DeviceType> lineCubaturePoints("line cubature points",numPoints_1D,1);
-    ScalarView<double,DeviceType> lineCubatureWeights("line cubature weights", numPoints_1D);
-    
-    lineCubature->getCubature(lineCubaturePoints, lineCubatureWeights);
-    
-    // Allocate some intermediate containers
-    ScalarView<Scalar,DeviceType> lineBasisValues    ("line basis values",      numFields_1D, numPoints_1D   );
-    ScalarView<Scalar,DeviceType> lineBasisGradValues("line basis grad values", numFields_1D, numPoints_1D, 1);
-    
-    // for now, we use 1D values to build up the 2D or 3D gradients
-    // eventually, TensorBasis should offer a getValues() variant that returns tensor basis data
-    lineBasis->getValues(lineBasisValues,     lineCubaturePoints, Intrepid2::OPERATOR_VALUE );
-    lineBasis->getValues(lineBasisGradValues, lineCubaturePoints, Intrepid2::OPERATOR_GRAD  );
     
-    // drop the trivial space dimension in line gradient values:
-    Kokkos::resize(lineBasisGradValues, numFields_1D, numPoints_1D);
-      
-    Kokkos::Array<TensorData<Scalar,DeviceType>, spaceDim> vectorComponents;
-    
-    for (int d=0; d<spaceDim; d++)
-    {
-      Kokkos::Array<Data<Scalar,DeviceType>, spaceDim> gradComponent_d;
-      for (int d2=0; d2<spaceDim; d2++)
-      {
-        if (d2 == d) gradComponent_d[d2] = Data<Scalar,DeviceType>(lineBasisGradValues);
-        else         gradComponent_d[d2] = Data<Scalar,DeviceType>(lineBasisValues);
-      }
-      vectorComponents[d] = TensorData<Scalar,DeviceType>(gradComponent_d);
-    }
-    VectorData<Scalar,DeviceType> gradientValues(vectorComponents, false); // false: not axis-aligned
-    
-    CellGeometry<PointScalar,spaceDim,DeviceType> cellNodes = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(1.0, meshWidth);
-    
-    if (useAffinePath)
-    {
-      out << "Testing path with affine, grid-aligned CellGeometry.\n";
-    }
-    else
-    {
-      // make a "generic" copy of cellNodes, one that uses the (C,N), (N,D) node specification.  This will not know that the geometry is affine, grid-aligned, or uniform.
-      const bool copyAffineness = false; // want to go through the non-affine geometry path
-      CellGeometry<PointScalar,spaceDim,DeviceType> nodalCellGeometry = getNodalCellGeometry(cellNodes, copyAffineness);
-      
-      cellNodes = nodalCellGeometry;
-      out << "Testing non-affine path.\n";
-    }
-    
-    auto cubature = Intrepid2::DefaultCubatureFactory::create<DeviceType>(cellTopo,polyOrder*2);
-    auto tensorCubatureWeights = cubature->allocateCubatureWeights();
-    auto tensorCubaturePoints  = cubature->allocateCubaturePoints();
-    
-    cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
-    
-    // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell
-    
-    int pointsPerCell = 1;
-    for (int d=0; d<spaceDim; d++)
-    {
-      pointsPerCell *= numPoints_1D;
-    }
+    auto geometry = getMesh<PointScalar, spaceDim, DeviceType>(algorithm, gridCellCounts);
+    double flopCountIntegration = 0, flopCountJacobian = 0;
+    auto standardIntegrals = performStandardQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
     
-    Data<PointScalar,DeviceType> jacobian = cellNodes.allocateJacobianData(tensorCubaturePoints);
-    Data<PointScalar,DeviceType> jacobianDet = CellTools<DeviceType>::allocateJacobianDet(jacobian);
-    Data<PointScalar,DeviceType> jacobianInv = CellTools<DeviceType>::allocateJacobianInv(jacobian);
+    auto structuredIntegrals = performStructuredQuadrature<Scalar, BasisFamily>(formulation, geometry, polyOrder, worksetSize, flopCountIntegration, flopCountJacobian);
     
-    auto refData = cellNodes.getJacobianRefData(tensorCubaturePoints);
-    cellNodes.setJacobian(jacobian, tensorCubaturePoints, refData);
-    CellTools<DeviceType>::setJacobianDet(jacobianDet, jacobian);
-    CellTools<DeviceType>::setJacobianInv(jacobianInv, jacobian);
-    
-    // lazily-evaluated transformed gradient values:
-    using FunctionSpaceToolsDT = ::Intrepid2::FunctionSpaceTools<DeviceType>; // TODO: once FunctionSpaceTools has proper DeviceType support, change the earlier FunctionSpaceTools typedef, and use it here…
-    auto transformedGradientValues = FunctionSpaceToolsDT::getHGRADtransformGRAD(jacobianInv, gradientValues);
-    auto standardIntegrals = performStandardQuadratureHypercube<Scalar,PointScalar,spaceDim,DeviceType>(meshWidth, polyOrder, worksetSize);
-    
-    TensorData<PointScalar,DeviceType> cellMeasures = cellNodes.allocateCellMeasure(jacobianDet, tensorCubatureWeights);
-    if (!cellNodes.affine())
-    {
-      // if cellNodes is not (known to be) affine, then cellMeasures should not have a separate first component (indicating the cell dimension is separated, thanks to point-invariant cell Jacobian determinant)
-      TEST_EQUALITY(cellMeasures.separateFirstComponent(), false);
-    }
-    cellNodes.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
-    
-    auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValues, cellMeasures, transformedGradientValues);
-    
-    IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues);
-    
-    auto integralDataBaseline = IntegrationTools::allocateIntegralData(transformedGradientValues, cellMeasures, transformedGradientValues);
-    
-    integrate_baseline(integralDataBaseline, transformedGradientValues, cellMeasures, transformedGradientValues);
-    
-    out << "Comparing new integration path path with baseline integration…\n";
-    testIntegrateMatchesBaseline(transformedGradientValues, cellMeasures, transformedGradientValues, out, success);
-    
-    out << "Comparing baseline to standard Intrepid2 integration…\n";
-    testFloatingEquality3(standardIntegrals, integralDataBaseline, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral - baseline");
-    
-    out << "Comparing new integration path with standard Intrepid2 integration…\n";
-    testFloatingEquality3(standardIntegrals, integralData, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral");
+    out << "Comparing standard Intrepid2 integration to new integration path…\n";
+    testFloatingEquality3(standardIntegrals, structuredIntegrals, relTol, absTol, out, success, "standard Intrepid2 integral", "reduced data integral - baseline");
   }
 
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p1_AffinePath
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p1_AffinePath )
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(StructuredIntegration, QuadratureUniformMesh, FormulationTag, AlgorithmTag, DimTag, PolyOrderTag)
   {
     using DataScalar  = double;
     using PointScalar = double;
     const int meshWidth = 1;
-    const int polyOrder = 1;
-    const int spaceDim = 1;
+    const int spaceDim = DimTag::spaceDim;
+    const int polyOrder = PolyOrderTag::polyOrder;
     const int worksetSize = meshWidth;
-    const bool affinePath = true;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
 
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p1_GeneralPath
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p1_GeneralPath )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 4;
-    const int polyOrder = 1;
-    const int spaceDim = 1;
-    const int worksetSize = meshWidth;
-    const bool affinePath = false;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p4_GeneralPath
-TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p4_GeneralPath )
-{
-  using DataScalar  = double;
-  using PointScalar = double;
-  const int meshWidth = 4;
-  const int polyOrder = 4;
-  const int spaceDim = 1;
-  const int worksetSize = meshWidth;
-  const bool affinePath = false;
-  
-  testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-}
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_1D_p2
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_1D_p2 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 1;
-    const int polyOrder = 2;
-    const int spaceDim = 1;
-    const int worksetSize = meshWidth;
-    const bool affinePath = true;
+    using DeviceType = DefaultTestDeviceType;
+    using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
     
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p1
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p1 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 3;
-    const int polyOrder = 1;
-    const int spaceDim = 2;
-    const int worksetSize = meshWidth * meshWidth;
-    const bool affinePath = true;
+    const AlgorithmChoice algorithm = AlgorithmTag::algorithm;
+    const FormulationChoice formulation = FormulationTag::formulation;
     
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p1_GeneralPath
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p1_GeneralPath )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 2;
-    const int polyOrder = 1;
-    const int spaceDim = 2;
-    const int worksetSize = meshWidth * meshWidth;
-    const bool affinePath = false;
+    double relTol = 1e-12;
+    double absTol = 1e-12;
     
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
+    testQuadratureHypercube<DataScalar, BasisFamily, PointScalar, spaceDim, DeviceType>(meshWidth, polyOrder, worksetSize, formulation, algorithm,
+                                                                                        relTol, absTol, out, success);
   }
 
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p2_GeneralPath
-TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p2_GeneralPath )
-{
-  using DataScalar  = double;
-  using PointScalar = double;
-  const int meshWidth = 2;
-  const int polyOrder = 2;
-  const int spaceDim = 2;
-  const int worksetSize = meshWidth * meshWidth;
-  const bool affinePath = false;
-  
-  testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-}
+  // comparisons are to Standard algorithm, so we don't instantiate with Standard:
+  // 1D, p=1 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P1)
+  // 1D, p=2 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P2)
+  // 1D, p=4 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D1, P4)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D1, P4)
 
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_2D_p2
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_2D_p2 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 3;
-    const int polyOrder = 2;
-    const int spaceDim = 2;
-    const int worksetSize = meshWidth * meshWidth;
-    const bool affinePath = true;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
+  // 2D, p=1 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D2, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D2, P1)
+  // 2D, p=2 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D2, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D2, P2)
 
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p1
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p1 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 2;
-    const int polyOrder = 1;
-    const int spaceDim = 3;
-    const int worksetSize = meshWidth * meshWidth * meshWidth;
-    const bool affinePath = true;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p1_GeneralPath
-TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p1_GeneralPath )
-{
-  using DataScalar  = double;
-  using PointScalar = double;
-  const int meshWidth = 2;
-  const int polyOrder = 1;
-  const int spaceDim = 3;
-  const int worksetSize = meshWidth * meshWidth * meshWidth;
-  const bool affinePath = false;
-  
-  testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-}
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p2
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p2 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 2;
-    const int polyOrder = 2;
-    const int spaceDim = 3;
-    const int worksetSize = meshWidth * meshWidth * meshWidth;
-    const bool affinePath = true;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p2_GeneralPath
-TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p2_GeneralPath )
-{
-  using DataScalar  = double;
-  using PointScalar = double;
-  const int meshWidth = 1;
-  const int polyOrder = 2;
-  const int spaceDim = 3;
-  const int worksetSize = meshWidth * meshWidth * meshWidth;
-  const bool affinePath = false;
-  
-  testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-}
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p3
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p3 )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 2;
-    const int polyOrder = 3;
-    const int spaceDim = 3;
-    const int worksetSize = meshWidth * meshWidth * meshWidth;
-    const bool affinePath = true;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
-
-// #pragma mark StructuredIntegration: QuadratureUniformMesh_3D_p3_GeneralPath
-  TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureUniformMesh_3D_p3_GeneralPath )
-  {
-    using DataScalar  = double;
-    using PointScalar = double;
-    const int meshWidth = 1;
-    const int polyOrder = 3;
-    const int spaceDim = 3;
-    const int worksetSize = meshWidth * meshWidth * meshWidth;
-    const bool affinePath = false;
-    
-    testQuadratureHypercube<DataScalar,PointScalar,spaceDim>(affinePath,meshWidth, polyOrder, worksetSize, out, success);
-  }
+  // 3D, p=1 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P1)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P1)
+  // 3D, p=2 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P2)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P2)
+  // 3D, p=3 tests:
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, PoissonFormulation, UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HgradFormulation,   UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HdivFormulation,    UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, HcurlFormulation,   UniformAlgorithm,         D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineTensorAlgorithm,    D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      NonAffineTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      AffineNonTensorAlgorithm, D3, P3)
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT(StructuredIntegration, QuadratureUniformMesh, L2Formulation,      UniformAlgorithm,         D3, P3)
 
 // #pragma mark StructuredIntegration: QuadratureSynthetic_AxisAlignedPath_Case1
 TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Case1 )
@@ -661,7 +617,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Ca
   
   VectorData<DataScalar,DeviceType> unitVectorData(vectorComponents);
   
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,unitVectorData);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,unitVectorData);
   
   Data<DataScalar,DeviceType> constantCellMeasuresCellComponent(1.0, Kokkos::Array<int,1>{numCells});
   Data<DataScalar,DeviceType> constantCellMeasuresPointComponent(1.0, Kokkos::Array<int,1>{numComponentPoints});
@@ -712,7 +668,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case1
   
   VectorData<DataScalar,DeviceType> unitVectorData(vectorComponents);
   
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,unitVectorData);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,unitVectorData);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -775,7 +731,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case2
   
   VectorData<DataScalar,DeviceType> vectorData(vectorComponents);
   
-  TransformedVectorData<DataScalar,DeviceType> transformedVectorData(explicitIdentityMatrix,vectorData);
+  TransformedBasisValues<DataScalar,DeviceType> transformedVectorData(explicitIdentityMatrix,vectorData);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -858,7 +814,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case3
   
   VectorData<DataScalar,DeviceType> vectorData(vectorComponents);
   
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,vectorData);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,vectorData);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -929,7 +885,7 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case4
   TEST_EQUALITY(numFieldsPerFamily, vectorData.numFieldsInFamily(0));
   TEST_EQUALITY(numFieldsPerFamily, vectorData.numFieldsInFamily(1));
   
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,vectorData);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorData(explicitIdentityMatrix,vectorData);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1011,8 +967,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case5
   TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
   TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1));
   
-  TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+  TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1096,8 +1052,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_AxisAlignedPath_Ca
    VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
    TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
    
-   TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-   TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+   TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+   TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
    
    Data<DataScalar,DeviceType>  constantCellMeasuresCellComponent(1.0, Kokkos::Array<int,1>{numCells});
    Data<DataScalar,DeviceType> constantCellMeasuresPointComponent(1.0, Kokkos::Array<int,1>{numComponentPoints});
@@ -1155,44 +1111,26 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case6_
   fieldComponentExtents[0] = numFields3;
   Data<DataScalar,DeviceType> fieldComponentData3(fieldComponentDataView3,fieldComponentExtents,fieldComponentVariationTypes);
   
-   TensorData<DataScalar,DeviceType>  tensorDataLeft(std::vector< Data<DataScalar,DeviceType> >{fieldComponentData1,fieldComponentData2,fieldComponentData3});
-   TensorData<DataScalar,DeviceType> tensorDataRight(std::vector< Data<DataScalar,DeviceType> >{fieldComponentData1,fieldComponentData3,fieldComponentData2});
+  TensorData<DataScalar,DeviceType>  tensorDataLeft(std::vector< Data<DataScalar,DeviceType> >{fieldComponentData1,fieldComponentData2,fieldComponentData3});
+  TensorData<DataScalar,DeviceType> tensorDataRight(std::vector< Data<DataScalar,DeviceType> >{fieldComponentData1,fieldComponentData3,fieldComponentData2});
    
-   auto identityMatrixView = getFixedRankView<DataScalar>("identity matrix", spaceDim, spaceDim);
-   auto identityMatrixViewHost = getHostCopy(identityMatrixView);
-   
-   for (int d1=0; d1<spaceDim; d1++)
-   {
-     for (int d2=0; d2<spaceDim; d2++)
-     {
-       identityMatrixViewHost(d1,d2) = (d1 == d2) ? 1.0 : 0.0;
-     }
-   }
-   Kokkos::deep_copy(identityMatrixView, identityMatrixViewHost);
-   
-   const int numPoints = numComponentPoints * numComponentPoints;
-   Kokkos::Array<int,4> transformationExtents {numCells, numPoints, spaceDim, spaceDim};
-   Kokkos::Array<DataVariationType,4> transformationVariationType {CONSTANT, CONSTANT, GENERAL, GENERAL};
-   
-   Data<DataScalar,DeviceType> explicitIdentityMatrix(identityMatrixView, transformationExtents, transformationVariationType);
+  auto identityMatrixView = getFixedRankView<DataScalar>("identity matrix", spaceDim, spaceDim);
+  auto identityMatrixViewHost = getHostCopy(identityMatrixView);
    
-//   const int numFamilies = 3;
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > firstFamilyLeft  {tensorDataLeft,TensorData<DataScalar,DeviceType>(),TensorData<DataScalar,DeviceType>()};
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > secondFamilyLeft {TensorData<DataScalar,DeviceType>(),tensorDataLeft,TensorData<DataScalar,DeviceType>()};
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > thirdFamilyLeft  {TensorData<DataScalar,DeviceType>(),TensorData<DataScalar,DeviceType>(),tensorDataLeft};
-//   Kokkos::Array< Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim>, numFamilies> vectorComponentsLeft {firstFamilyLeft, secondFamilyLeft, thirdFamilyLeft};
-//
-//   VectorData<DataScalar,DeviceType> vectorDataLeft(vectorComponentsLeft);
-//   TEST_EQUALITY(numFieldsPerFamilyLeft, vectorDataLeft.numFieldsInFamily(0));
-//   TEST_EQUALITY(numFieldsPerFamilyLeft, vectorDataLeft.numFieldsInFamily(1));
-//
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > firstFamilyRight  {tensorDataRight,TensorData<DataScalar,DeviceType>(),TensorData<DataScalar,DeviceType>()};
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > secondFamilyRight {TensorData<DataScalar,DeviceType>(),tensorDataRight,TensorData<DataScalar,DeviceType>()};
-//   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > thirdFamilyRight  {TensorData<DataScalar,DeviceType>(),TensorData<DataScalar,DeviceType>(),tensorDataRight};
-//   Kokkos::Array< Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim>, numFamilies> vectorComponentsRight {firstFamilyRight, secondFamilyRight, thirdFamilyRight};
-//  VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
-//  TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
-//  TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1));
+  for (int d1=0; d1<spaceDim; d1++)
+  {
+    for (int d2=0; d2<spaceDim; d2++)
+    {
+      identityMatrixViewHost(d1,d2) = (d1 == d2) ? 1.0 : 0.0;
+    }
+  }
+  Kokkos::deep_copy(identityMatrixView, identityMatrixViewHost);
+  
+  const int numPoints = numComponentPoints * numComponentPoints;
+  Kokkos::Array<int,4> transformationExtents {numCells, numPoints, spaceDim, spaceDim};
+  Kokkos::Array<DataVariationType,4> transformationVariationType {CONSTANT, CONSTANT, GENERAL, GENERAL};
+  
+  Data<DataScalar,DeviceType> explicitIdentityMatrix(identityMatrixView, transformationExtents, transformationVariationType);
   
   const int numFamilies = 1;
   Kokkos::Array<TensorData<DataScalar,DeviceType>, spaceDim > firstFamilyLeft  {tensorDataLeft,TensorData<DataScalar,DeviceType>(),TensorData<DataScalar,DeviceType>()};
@@ -1207,8 +1145,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case6_
    VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
    TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
    
-   TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-   TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+   TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+   TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
    
    Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
    TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1294,8 +1232,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case7_
    VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
    TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
    
-   TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-   TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+   TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+   TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
    
    Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
    TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1385,8 +1323,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case8_
    TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
    TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(1));
   
-   TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-   TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+   TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+   TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
    
    Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
    TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1473,8 +1411,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case9_
   VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
   TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
   
-  TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+  TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
   
   Data<DataScalar,DeviceType> constantCellMeasuresData(1.0, Kokkos::Array<int,2>{numCells,numPoints});
   TensorData<DataScalar,DeviceType> constantCellMeasures(constantCellMeasuresData);
@@ -1556,8 +1494,8 @@ TEUCHOS_UNIT_TEST( StructuredIntegration, QuadratureSynthetic_GeneralPath_Case10
   VectorData<DataScalar,DeviceType> vectorDataRight(vectorComponentsRight);
   TEST_EQUALITY(numFieldsPerFamilyRight, vectorDataRight.numFieldsInFamily(0));
   
-  TransformedVectorData<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
-  TransformedVectorData<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
+  TransformedBasisValues<DataScalar,DeviceType>  transformedUnitVectorDataLeft(explicitIdentityMatrix,vectorDataLeft);
+  TransformedBasisValues<DataScalar,DeviceType> transformedUnitVectorDataRight(explicitIdentityMatrix,vectorDataRight);
   
   auto cellMeasures = getFixedRankView<DataScalar>("cellMeasures", numCells, numPoints);
   
diff --git a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
similarity index 94%
rename from packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp
rename to packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
index 6f956cf3d2b9..3e9f8729c081 100644
--- a/packages/intrepid2/unit-test/MonolithicExecutable/TransformedVectorDataTests.cpp
+++ b/packages/intrepid2/unit-test/MonolithicExecutable/TransformedBasisValuesTests.cpp
@@ -42,7 +42,7 @@
 
 
 /** \file   TransformedVectorDataTests.cpp
-    \brief  Tests against TransformedVectorData.
+    \brief  Tests against TransformedBasisValues.
     \author Created by Nate Roberts
 */
 
@@ -65,7 +65,7 @@
 #include "Intrepid2_FunctionSpaceTools.hpp"
 #include "Intrepid2_TensorData.hpp"
 #include "Intrepid2_TensorPoints.hpp"
-#include "Intrepid2_TransformedVectorData.hpp"
+#include "Intrepid2_TransformedBasisValues.hpp"
 #include "Intrepid2_VectorData.hpp"
 
 #include "Intrepid2_ScalarView.hpp"
@@ -136,7 +136,8 @@ namespace
       }
       vectorComponents[d] = TensorData<Scalar,DeviceType>(gradComponent_d);
     }
-    VectorData<Scalar,DeviceType> gradientValues(vectorComponents, false); // false: not axis-aligned
+    VectorData<Scalar,DeviceType> gradientVectorData(vectorComponents, false); // false: not axis-aligned
+    BasisValues<Scalar, DeviceType> gradientValues(gradientVectorData);
     
     CellGeometry<PointScalar,spaceDim,DeviceType> cellNodes = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(1.0, meshWidth);
     
@@ -215,7 +216,7 @@ namespace
     testFloatingEquality4(transformedGradValues, transformedGradientData, relTol, absTol, out, success);
   }
 
-  TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_1D_p1 )
+  TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_1D_p1 )
   {
     const int spaceDim = 1;
     const int polyOrder = 1;
@@ -223,7 +224,7 @@ namespace
     testVectorTransformation<spaceDim>(polyOrder, meshWidth, out, success);
   }
    
- TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_1D_p2 )
+ TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_1D_p2 )
  {
    const int spaceDim = 1;
    const int polyOrder = 2;
@@ -231,7 +232,7 @@ namespace
    testVectorTransformation<spaceDim>(polyOrder, meshWidth, out, success);
  }
   
- TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_2D_p1 )
+ TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_2D_p1 )
  {
    const int spaceDim = 2;
    const int polyOrder = 1;
@@ -239,7 +240,7 @@ namespace
    testVectorTransformation<spaceDim>(polyOrder, meshWidth, out, success);
  }
   
- TEUCHOS_UNIT_TEST( TransformedVectorData, TransformedVector_2D_p2 )
+ TEUCHOS_UNIT_TEST( TransformedBasisValues, TransformedVector_2D_p2 )
  {
    const int spaceDim = 2;
    const int polyOrder = 2;
diff --git a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp
index aa57b4e2536c..052a8b50da3c 100644
--- a/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp
+++ b/packages/intrepid2/unit-test/performance/DataCombination/DataCombinationPerformance.cpp
@@ -300,14 +300,6 @@ int main( int argc, char* argv[] )
     {
       for (CaseChoice caseChoice2 : caseChoices)
       {
-//        {
-//          // DEBUGGING:
-//          if ((caseChoice1 != General) && (caseChoice2 == General))
-//          {
-//            cout << "Set breakpoint here.\n";
-//          }
-//        }
-        
         // since constant takes so little time (and measurement is therefore noisy), we do a bunch of measurements and use their average
         const bool bothConstant   = (caseChoice1 == Constant) && (caseChoice2 == Constant);
         const int numMeasurements = bothConstant ? 1000 : 1;
diff --git a/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt b/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt
index 1b9659237466..86db43427c93 100644
--- a/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt
+++ b/packages/intrepid2/unit-test/performance/StructuredIntegration/CMakeLists.txt
@@ -2,12 +2,14 @@ SET(SOURCES "")
 
 FILE(GLOB SOURCES *.cpp)
 
+INCLUDE_DIRECTORIES("../../../assembly-examples")
+
 SET(LIBRARIES intrepid2)
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   StructuredIntegrationPerformance
   SOURCES ${SOURCES}
-  ARGS 
+  ARGS
   NUM_MPI_PROCS 1
   ADD_DIR_TO_NAME
   )
diff --git a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
index 80e37693833e..f3d0657dc7e1 100644
--- a/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
+++ b/packages/intrepid2/unit-test/performance/StructuredIntegration/StructuredIntegrationPerformance.cpp
@@ -58,6 +58,27 @@
 #include "Intrepid2_DefaultCubatureFactory.hpp"
 #include "Intrepid2_FunctionSpaceTools.hpp"
 #include "Intrepid2_IntegrationTools.hpp"
+#include "Intrepid2_TestUtils.hpp"
+
+#include "GRADGRADStandardAssembly.hpp"
+#include "GRADGRADStructuredAssembly.hpp"
+#include "H1StandardAssembly.hpp"
+#include "H1StructuredAssembly.hpp"
+#include "HDIVStandardAssembly.hpp"
+#include "HDIVStructuredAssembly.hpp"
+#include "HCURLStandardAssembly.hpp"
+#include "HCURLStructuredAssembly.hpp"
+#include "HVOLStandardAssembly.hpp"
+#include "HVOLStructuredAssembly.hpp"
+
+enum FormulationChoice
+{
+  Poisson, // (grad, grad)
+  Hgrad,   // (grad, grad) + (value, value)
+  Hdiv,    // (div, div)   + (value, value)
+  Hcurl,   // (curl, curl) + (value, value)
+  L2       // (value, value)
+};
 
 enum AlgorithmChoice
 {
@@ -69,6 +90,13 @@ enum AlgorithmChoice
   Uniform
 };
 
+enum BasisFamilyChoice
+{
+  Nodal,
+  Hierarchical,
+  Serendipity
+};
+
 std::string to_string(AlgorithmChoice choice)
 {
   switch (choice) {
@@ -83,20 +111,31 @@ std::string to_string(AlgorithmChoice choice)
   }
 }
 
+std::string to_string(FormulationChoice choice)
+{
+  switch (choice) {
+    case Poisson: return "Poisson";
+    case Hgrad:   return "Hgrad";
+    case Hdiv:    return "Hdiv";
+    case Hcurl:   return "Hcurl";
+    case L2:      return "L2";
+    
+    default:      return "Unknown FormulationChoice";
+  }
+}
+
 using namespace Intrepid2;
 
-template< typename PointScalar, int spaceDim, typename ExecutionSpace >
+template< typename PointScalar, int spaceDim, typename DeviceType >
 inline
-CellGeometry<PointScalar, spaceDim, ExecutionSpace> getMesh(AlgorithmChoice algorithmChoice, const int &meshWidth)
+CellGeometry<PointScalar, spaceDim, DeviceType> getMesh(AlgorithmChoice algorithmChoice, const Kokkos::Array<int,spaceDim> &gridCellCounts)
 {
   Kokkos::Array<PointScalar,spaceDim> domainExtents;
-  Kokkos::Array<int,spaceDim> gridCellCounts;
   for (int d=0; d<spaceDim; d++)
   {
-    domainExtents[d]  = 0.0;
-    gridCellCounts[d] = meshWidth;
+    domainExtents[d]  = 1.0;
   }
-  auto uniformTensorGeometry = uniformCartesianMesh<PointScalar,spaceDim,ExecutionSpace>(domainExtents, gridCellCounts);
+  auto uniformTensorGeometry = uniformCartesianMesh<PointScalar,spaceDim,DeviceType>(domainExtents, gridCellCounts);
   
   switch (algorithmChoice)
   {
@@ -125,352 +164,138 @@ CellGeometry<PointScalar, spaceDim, ExecutionSpace> getMesh(AlgorithmChoice algo
   return uniformTensorGeometry; // this line should be unreachable; included to avoid compiler warnings from nvcc
 }
 
-double flopsPerJacobian(const int &spaceDim, const int &numPoints, const int &numGeometryNodes)
-{
-  // implementation looks like:
-//  for (ordinal_type i=0;i<dim;++i)
-//  for (ordinal_type j=0;j<dim;++j) {
-//    _jacobian(cell, point, i, j) = 0;
-//    for (ordinal_type bf=0;bf<cardinality;++bf)
-//      _jacobian(cell, point, i, j) += _worksetCells(cell+_startCell, bf, i) * _basisGrads(bf, point, j); // 2 flops: one multiply, one add
-//  }
-  return 2.0 * spaceDim * spaceDim * numPoints * numGeometryNodes;
-}
-
-double flopsPerJacobianDet(const int &spaceDim, const int &numPoints)
+template< typename PointScalar, int spaceDim, typename DeviceType >
+inline
+CellGeometry<PointScalar, spaceDim, DeviceType> getMesh(AlgorithmChoice algorithmChoice, const int &meshWidth)
 {
-  //implementation in RealSpaceTools:
-  /*value_type r_val = 0.0;
-  switch (dim) {
-  case 3:
-    r_val = ( inMat(0,0) * inMat(1,1) * inMat(2,2) + // 3 flops: 2 mults, 1 add
-              inMat(1,0) * inMat(2,1) * inMat(0,2) + // 3 flops: 2 mults, 1 add
-              inMat(2,0) * inMat(0,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract
-              inMat(2,0) * inMat(1,1) * inMat(0,2) - // 3 flops: 2 mults, 1 subtract
-              inMat(0,0) * inMat(2,1) * inMat(1,2) - // 3 flops: 2 mults, 1 subtract
-              inMat(1,0) * inMat(0,1) * inMat(2,2) ); // 2 flops: 2 mults
-    break;
-  case 2:
-    r_val = ( inMat(0,0) * inMat(1,1) -
-              inMat(0,1) * inMat(1,0) );
-    break;
-  case 1:
-    r_val = ( inMat(0,0) );
-    break;
-  }
-  return r_val;*/
-  int r_val;
-  switch (spaceDim) {
-    case 3: r_val = 17.0 * numPoints; break;
-    case 2: r_val = 3.0 * numPoints; break;
-    case 1: r_val = 0.0; break;
-    default: INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim");
+  Kokkos::Array<int,spaceDim> gridCellCounts;
+  for (int d=0; d<spaceDim; d++)
+  {
+    gridCellCounts[d] = meshWidth;
   }
-  return r_val;
+  return getMesh<PointScalar, spaceDim, DeviceType>(algorithmChoice, gridCellCounts);
 }
 
-double flopsPerJacobianInverse(const int &spaceDim, const int &numPoints)
+//! Returns an Array of roughly isotropic grid dimensions for which (C,F,F) stifness matrix will have at most maxStiffnessEntryCount entries, and at most maxElements total cells.
+template<int spaceDim>
+Kokkos::Array<int,spaceDim>
+getMeshWidths(int basisCardinality, int maxStiffnessEntryCount, int maxElements)
 {
-  // implementation looks like:
-  // const value_type val = RealSpaceTools<>::Serial::det(mat);
-  double totalFlops = flopsPerJacobianDet(spaceDim, numPoints);
+  Kokkos::Array<int,spaceDim> meshWidths;
+  const int entriesPerElement = basisCardinality * basisCardinality;
+  const int maxElementCount   = std::min(maxStiffnessEntryCount / entriesPerElement, maxElements);
   
-  if (spaceDim == 3)
+  // initialize meshWidths:
+  for (int d=0; d<spaceDim; d++)
   {
-  //  val0 =   mat(1,1)*mat(2,2) - mat(2,1)*mat(1,2); // 3 flops: 2 mults, 1 subtraction
-  //  val1 = - mat(1,0)*mat(2,2) + mat(2,0)*mat(1,2); // 4 flops: 2 mults, 1 negation, 1 add
-  //  val2 =   mat(1,0)*mat(2,1) - mat(2,0)*mat(1,1); // 3 flops: 2 mults, 1 subtraction
-  //
-  //  inv(0,0) = val0/val; // 1 flop
-  //  inv(1,0) = val1/val; // 1 flop
-  //  inv(2,0) = val2/val; // 1 flop
-  //
-  //  val0 =   mat(2,1)*mat(0,2) - mat(0,1)*mat(2,2); // 3
-  //  val1 =   mat(0,0)*mat(2,2) - mat(2,0)*mat(0,2); // 3
-  //  val2 = - mat(0,0)*mat(2,1) + mat(2,0)*mat(0,1); // 4
-  //
-  //  inv(0,1) = val0/val; // 1
-  //  inv(1,1) = val1/val; // 1
-  //  inv(2,1) = val2/val; // 1
-  //
-  //  val0 =   mat(0,1)*mat(1,2) - mat(1,1)*mat(0,2); // 3
-  //  val1 = - mat(0,0)*mat(1,2) + mat(1,0)*mat(0,2); // 4
-  //  val2 =   mat(0,0)*mat(1,1) - mat(1,0)*mat(0,1); // 3
-  //
-  //  inv(0,2) = val0/val; // 1
-  //  inv(1,2) = val1/val; // 1
-  //  inv(2,2) = val2/val; // 1
-    totalFlops += 36.0 * numPoints;
+    meshWidths[d] = 1;
   }
-  else
+  
+  // double in each dimension until doing so would make the number of elements would exceed maxElementCount:
+  int numElements = 1;
+  int d = 0;
+  while (numElements * 2 <= maxElementCount)
   {
-    INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unhandled spaceDim");
+    meshWidths[d] *= 2;
+    d = (d + 1) % spaceDim;
+    numElements *= 2;
   }
-  
-  return totalFlops;
+  return meshWidths;
 }
 
-//! version that uses the classic, generic Intrepid2 paths
-template<class Scalar, class PointScalar, int spaceDim, typename ExecSpaceType>
-ScalarView<Scalar,ExecSpaceType> performStandardQuadratureHypercubeGRADGRAD(CellGeometry<PointScalar, spaceDim, ExecSpaceType> &geometry,
-                                                                            const int &polyOrder, int worksetSize,
-                                                                            double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStandardQuadrature(FormulationChoice formulation,
+                                        Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                        double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
 {
-  int numVertices = 1;
-  for (int d=0; d<spaceDim; d++)
+  switch (formulation)
   {
-    numVertices *= 2;
+    case Poisson:
+      return performStandardQuadratureGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hgrad:
+      return performStandardQuadratureH1<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hdiv:
+      return performStandardQuadratureHDIV<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hcurl:
+      return performStandardQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case L2:
+      return performStandardQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    default:
+      return Intrepid2::ScalarView<Scalar,DeviceType>();
   }
-  
-  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
-  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
-  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
-  initialSetupTimer->start();
-  
-  using CellTools = Intrepid2::CellTools<Kokkos::DefaultExecutionSpace>;
-  using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<Kokkos::DefaultExecutionSpace>;
-  
-  using namespace std;
-  // dimensions of the returned view are (C,F,F)
-  auto fs = Intrepid2::FUNCTION_SPACE_HGRAD;
-
-  shards::CellTopology cellTopo = geometry.cellTopology();
-  
-  auto basis = Intrepid2::getBasis< Intrepid2::NodalBasisFamily<Kokkos::DefaultExecutionSpace> >(cellTopo, fs, polyOrder);
-  
-  int numFields = basis->getCardinality();
-  int numCells = geometry.numCells();
-  
-  if (worksetSize > numCells) worksetSize = numCells;
-  
-  // local stiffness matrices:
-  ScalarView<Scalar,ExecSpaceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
-  
-  using Kokkos::DefaultExecutionSpace;
-  auto cubature = Intrepid2::DefaultCubatureFactory::create<ExecSpaceType>(cellTopo,polyOrder*2);
-  int numPoints = cubature->getNumPoints();
-  ScalarView<PointScalar,ExecSpaceType> cubaturePoints("cubature points",numPoints,spaceDim);
-  ScalarView<double,ExecSpaceType> cubatureWeights("cubature weights", numPoints);
-  
-  cubature->getCubature(cubaturePoints, cubatureWeights);
-  
-  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
-  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
-  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
-  
-  // Allocate some intermediate containers
-  ScalarView<Scalar,ExecSpaceType> basisValues    ("basis values", numFields, numPoints );
-  ScalarView<Scalar,ExecSpaceType> basisGradValues("basis grad values", numFields, numPoints, spaceDim);
-
-  ScalarView<Scalar,ExecSpaceType> transformedGradValues("transformed grad values", worksetSize, numFields, numPoints, spaceDim);
-  ScalarView<Scalar,ExecSpaceType> transformedWeightedGradValues("transformed weighted grad values", worksetSize, numFields, numPoints, spaceDim);
-  
-  basis->getValues(basisValues,     cubaturePoints, Intrepid2::OPERATOR_VALUE );
-  basis->getValues(basisGradValues, cubaturePoints, Intrepid2::OPERATOR_GRAD  );
-  
-  const int numNodesPerCell = geometry.numNodesPerCell();
-  ScalarView<PointScalar,ExecSpaceType> expandedCellNodes("expanded cell nodes",numCells,numNodesPerCell,spaceDim);
-  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpaceType>(0,numCells),
-  KOKKOS_LAMBDA (const int &cellOrdinal) {
-    for (int nodeOrdinal=0; nodeOrdinal<numNodesPerCell; nodeOrdinal++)
-    {
-      for (int d=0; d<spaceDim; d++)
-      {
-        expandedCellNodes(cellOrdinal,nodeOrdinal,d) = geometry(cellOrdinal,nodeOrdinal,d);
-      }
-    }
-  });
-  
-  ScalarView<Scalar,ExecSpaceType> cellMeasures("cell measures", worksetSize, numPoints);
-  ScalarView<Scalar,ExecSpaceType> jacobianDeterminant("jacobian determinant", worksetSize, numPoints);
-  ScalarView<Scalar,ExecSpaceType> jacobian("jacobian", worksetSize, numPoints, spaceDim, spaceDim);
-  ScalarView<Scalar,ExecSpaceType> jacobianInverse("jacobian inverse", worksetSize, numPoints, spaceDim, spaceDim);
+}
 
-  initialSetupTimer->stop();
-  
-  transformIntegrateFlopCount  = 0;
-  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
-  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
-  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
-  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
-  
-  int cellOffset = 0;
-  while (cellOffset < numCells)
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim, typename DeviceType>
+Intrepid2::ScalarView<Scalar,DeviceType> performStructuredQuadrature(FormulationChoice formulation,
+                                          Intrepid2::CellGeometry<PointScalar, spaceDim, DeviceType> &geometry, const int &polyOrder, const int &worksetSize,
+                                          double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+{
+  switch (formulation)
   {
-    int startCell         = cellOffset;
-    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
-    
-    std::pair<int,int> cellRange = {startCell, startCell+numCellsInWorkset};
-    auto cellWorkset = Kokkos::subview(expandedCellNodes, cellRange, Kokkos::ALL(), Kokkos::ALL());
-    
-    if (numCellsInWorkset != worksetSize)
-    {
-      Kokkos::resize(jacobian,                      numCellsInWorkset, numPoints, spaceDim, spaceDim);
-      Kokkos::resize(jacobianInverse,               numCellsInWorkset, numPoints, spaceDim, spaceDim);
-      Kokkos::resize(jacobianDeterminant,           numCellsInWorkset, numPoints);
-      Kokkos::resize(cellMeasures,                  numCellsInWorkset, numPoints);
-      Kokkos::resize(transformedGradValues,         numCellsInWorkset, numFields, numPoints, spaceDim);
-      Kokkos::resize(transformedWeightedGradValues, numCellsInWorkset, numFields, numPoints, spaceDim);
-    }
-    jacobianAndCellMeasureTimer->start();
-    CellTools::setJacobian(jacobian, cubaturePoints, cellWorkset, cellTopo); // accounted for outside loop, as numCells * flopsPerJacobianPerCell.
-    CellTools::setJacobianInv(jacobianInverse, jacobian);
-    CellTools::setJacobianDet(jacobianDeterminant, jacobian);
-    
-    FunctionSpaceTools::computeCellMeasure(cellMeasures, jacobianDeterminant, cubatureWeights);
-    ExecSpaceType().fence();
-    jacobianAndCellMeasureTimer->stop();
-    
-    // because structured integration performs transformations within integrate(), to get a fairer comparison here we include the transformation calls.
-    fstIntegrateCall->start();
-    FunctionSpaceTools::HGRADtransformGRAD(transformedGradValues, jacobianInverse, basisGradValues);
-    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim) * (spaceDim - 1) * 2.0; // 2: one multiply, one add per (P,D) entry in the contraction.
-    FunctionSpaceTools::multiplyMeasure(transformedWeightedGradValues, cellMeasures, transformedGradValues);
-    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numPoints) * double(spaceDim); // multiply each entry of transformedGradValues: one flop for each.
-        
-    auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
-    
-    FunctionSpaceTools::integrate(cellStiffnessSubview, transformedGradValues, transformedWeightedGradValues);
-    ExecSpaceType().fence();
-    fstIntegrateCall->stop();
-    
-    transformIntegrateFlopCount += double(numCellsInWorkset) * double(numFields) * double(numFields) * double(numPoints * spaceDim * 2 - 1); // 2: one multiply, one add per (P,D) entry in the contraction.
-    
-    cellOffset += worksetSize;
+    case Poisson:
+      return performStructuredQuadratureGRADGRAD<Scalar,BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hgrad:
+      return performStructuredQuadratureH1<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hdiv:
+      return performStructuredQuadratureHDIV<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case Hcurl:
+      return performStructuredQuadratureHCURL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    case L2:
+      return performStructuredQuadratureHVOL<Scalar, BasisFamily>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+    default:
+      return Intrepid2::ScalarView<Scalar,DeviceType>();
   }
-//  std::cout << "standard integration, approximateFlopCount: " << approximateFlopCount << std::endl;
-  return cellStiffness;
 }
 
-//! returns an estimated count of the floating point operations performed.
-template<class Scalar, class PointScalar, int spaceDim, typename ExecSpaceType>
-void performStructuredQuadratureHypercubeGRADGRAD(CellGeometry<PointScalar, spaceDim, ExecSpaceType> &geometry, const int &polyOrder, const int &worksetSize,
-                                                  double &transformIntegrateFlopCount, double &jacobianCellMeasureFlopCount)
+template<class Scalar, class BasisFamily, class PointScalar, int spaceDim>
+typename BasisFamily::BasisPtr getBasisForFormulation(FormulationChoice formulation, shards::CellTopology &cellTopo, const int polyOrder)
 {
-  int numVertices = 1;
-  for (int d=0; d<spaceDim; d++)
+  Intrepid2::EFunctionSpace fs;
+  switch (formulation)
   {
-    numVertices *= 2;
+    case Poisson: fs = FUNCTION_SPACE_HGRAD; break;
+    case Hgrad:   fs = FUNCTION_SPACE_HGRAD; break;
+    case Hdiv:    fs = FUNCTION_SPACE_HDIV;  break;
+    case Hcurl:   fs = FUNCTION_SPACE_HCURL; break;
+    case L2:      fs = FUNCTION_SPACE_HVOL;  break;
   }
   
-  auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
-  initialSetupTimer->start();
-  using namespace std;
-  using FunctionSpaceTools = Intrepid2::FunctionSpaceTools<ExecSpaceType>;
-  using IntegrationTools   = Intrepid2::IntegrationTools<ExecSpaceType>;
-  // dimensions of the returned view are (C,F,F)
-  auto fs = Intrepid2::FUNCTION_SPACE_HGRAD;
-  
-  shards::CellTopology cellTopo = geometry.cellTopology();
-  
-  auto basis = Intrepid2::getBasis< Intrepid2::DerivedNodalBasisFamily<Kokkos::DefaultExecutionSpace> >(cellTopo, fs, polyOrder);
-  
-  int numFields = basis->getCardinality();
-  int numCells = geometry.numCells();
-    
-  // local stiffness matrix:
-  ScalarView<Scalar,ExecSpaceType> cellStiffness("cell stiffness matrices",numCells,numFields,numFields);
-  
-  auto cubature = Intrepid2::DefaultCubatureFactory::create<ExecSpaceType>(cellTopo,polyOrder*2);
-  auto tensorCubatureWeights = cubature->allocateCubatureWeights();
-  TensorPoints<PointScalar,ExecSpaceType> tensorCubaturePoints  = cubature->allocateCubaturePoints();
-  
-  cubature->getCubature(tensorCubaturePoints, tensorCubatureWeights);
-  
-  EOperator op = OPERATOR_GRAD;
-  BasisValues<Scalar,ExecSpaceType> gradientValues = basis->allocateBasisValues(tensorCubaturePoints, op);
-  basis->getValues(gradientValues, tensorCubaturePoints, op);
-  
-  // goal here is to do a weighted Poisson; i.e. (f grad u, grad v) on each cell
-    
-  int cellOffset = 0;
-  
-  auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
-  auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
-  
-  Data<PointScalar,ExecSpaceType> jacobian = geometry.allocateJacobianData(tensorCubaturePoints, 0, worksetSize);
-  Data<PointScalar,ExecSpaceType> jacobianDet = CellTools<ExecSpaceType>::allocateJacobianDet(jacobian);
-  Data<PointScalar,ExecSpaceType> jacobianInv = CellTools<ExecSpaceType>::allocateJacobianInv(jacobian);
-  TensorData<PointScalar,ExecSpaceType> cellMeasures = geometry.allocateCellMeasure(jacobianDet, tensorCubatureWeights);
-  
-  // lazily-evaluated transformed gradient values (temporary to allow integralData allocation)
-  auto transformedGradientValuesTemp = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues.vectorData());
-  auto integralData = IntegrationTools::allocateIntegralData(transformedGradientValuesTemp, cellMeasures, transformedGradientValuesTemp);
-  
-  const int numPoints = jacobian.getDataExtent(1); // data extent will be 1 for affine, numPoints for other cases
-  
-  // TODO: make the below determination accurate for diagonal/block-diagonal cases… (right now, will overcount)
-  const double flopsPerJacobianPerCell    = flopsPerJacobian(spaceDim, numPoints, numVertices);
-  const double flopsPerJacobianDetPerCell = flopsPerJacobianDet(spaceDim, numPoints);
-  const double flopsPerJacobianInvPerCell = flopsPerJacobianInverse(spaceDim, numPoints);
-  
-  transformIntegrateFlopCount = 0;
-  jacobianCellMeasureFlopCount  = numCells * flopsPerJacobianPerCell;    // jacobian itself
-  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianInvPerCell; // inverse
-  jacobianCellMeasureFlopCount += numCells * flopsPerJacobianDetPerCell; // determinant
-  jacobianCellMeasureFlopCount += numCells * numPoints; // cell measure: (C,P) gets weighted with cubature weights of shape (P)
-  
-  auto refData = geometry.getJacobianRefData(tensorCubaturePoints);
+  auto basis = getBasis< BasisFamily >(cellTopo, fs, polyOrder);
+  return basis;
+}
+
+template<class Scalar, class PointScalar, class DeviceType, int spaceDim>
+BasisPtr<DeviceType,Scalar,Scalar> getHypercubeBasisForFormulation(FormulationChoice formulation, BasisFamilyChoice basisFamilyChoice, const int polyOrder)
+{
+  shards::CellTopology cellTopo;
+  switch (spaceDim)
+  {
+    case 1: cellTopo = shards::CellTopology(shards::getCellTopologyData<shards::Line<2> >());          break;
+    case 2: cellTopo = shards::CellTopology(shards::getCellTopologyData<shards::Quadrilateral<4> >()); break;
+    case 3: cellTopo = shards::CellTopology(shards::getCellTopologyData<shards::Hexahedron<8> >());    break;
+    default:
+      INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported spaceDim");
+  }
   
-  initialSetupTimer->stop();
-  while (cellOffset < numCells)
+  switch (basisFamilyChoice)
   {
-    int startCell         = cellOffset;
-    int numCellsInWorkset = (cellOffset + worksetSize - 1 < numCells) ? worksetSize : numCells - startCell;
-    int endCell           = numCellsInWorkset + startCell;
-    
-    jacobianAndCellMeasureTimer->start();
-    if (numCellsInWorkset != worksetSize)
-    {
-      const int CELL_DIM = 0; // first dimension corresponds to cell
-      jacobian.setExtent(    CELL_DIM, numCellsInWorkset);
-      jacobianDet.setExtent( CELL_DIM, numCellsInWorkset);
-      jacobianInv.setExtent( CELL_DIM, numCellsInWorkset);
-      integralData.setExtent(CELL_DIM, numCellsInWorkset);
-      
-      // cellMeasures is a TensorData object with separateFirstComponent_ = true; the below sets the cell dimension…
-      cellMeasures.setFirstComponentExtentInDimension0(numCellsInWorkset);
-    }
-    
-    geometry.setJacobian(jacobian, tensorCubaturePoints, refData, startCell, endCell);
-    CellTools<ExecSpaceType>::setJacobianDet(jacobianDet, jacobian);
-    CellTools<ExecSpaceType>::setJacobianInv(jacobianInv, jacobian);
-    
-    // lazily-evaluated transformed gradient values:
-    auto transformedGradientValues = FunctionSpaceTools::getHGRADtransformGRAD(jacobianInv, gradientValues.vectorData());
-    
-    geometry.computeCellMeasure(cellMeasures, jacobianDet, tensorCubatureWeights);
-    ExecSpaceType().fence();
-    jacobianAndCellMeasureTimer->stop();
-    
-    bool sumInto = false;
-    double approximateFlopCountIntegrateWorkset = 0;
-    fstIntegrateCall->start();
-    IntegrationTools::integrate(integralData, transformedGradientValues, cellMeasures, transformedGradientValues, sumInto, &approximateFlopCountIntegrateWorkset);
-    ExecSpaceType().fence();
-    fstIntegrateCall->stop();
-    
-    // copy into cellStiffness container.  (Alternately, do something like allocateIntegralData, but outside this loop, and take a subview to construct the workset integralData.)
-    if (integralData.getUnderlyingViewRank() == 3)
+    case Nodal:
     {
-      std::pair<int,int> cellRange = {startCell, endCell};
-      auto cellStiffnessSubview = Kokkos::subview(cellStiffness, cellRange, Kokkos::ALL(), Kokkos::ALL());
-      Kokkos::deep_copy(cellStiffnessSubview, integralData.getUnderlyingView3());
+      using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+      return getBasisForFormulation<Scalar, BasisFamily, Scalar, spaceDim>(formulation, cellTopo, polyOrder);
     }
-    else // underlying view rank is 2; copy to each cell in destination stiffness matrix
+      break;
+    case Hierarchical:
     {
-      auto integralView2 = integralData.getUnderlyingView2();
-      auto policy = Kokkos::MDRangePolicy<ExecSpaceType,Kokkos::Rank<3>>({0,0,0},{numCellsInWorkset,numFields,numFields});
-      Kokkos::parallel_for("copy uniform data to expanded container", policy,
-                       KOKKOS_LAMBDA (const int &cellOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
-        cellStiffness(startCell + cellOrdinal, leftFieldOrdinal, rightFieldOrdinal) = integralView2(leftFieldOrdinal,rightFieldOrdinal);
-      });
+      using BasisFamily = HierarchicalBasisFamily<DeviceType>;
+      return getBasisForFormulation<Scalar, BasisFamily, Scalar, spaceDim>(formulation, cellTopo, polyOrder);
     }
-    
-    transformIntegrateFlopCount  += approximateFlopCountIntegrateWorkset;
-    
-    cellOffset += worksetSize;
+      break;
+    case Serendipity:
+      INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented");
   }
+  
+  return Teuchos::null;
 }
 
 int main( int argc, char* argv[] )
@@ -485,9 +310,13 @@ int main( int argc, char* argv[] )
   using std::vector;
   
   {
-    // For now, we focus on 3D Poisson test.
-    // The non-affine tensor case is one that, like the Standard case, does *not* make any algorithmic assumptions about the geometry.
-    // This should be a good proxy for Poisson with unstructured material data on a curvilinear hexahedral mesh.
+    // Here, we support various 3D "formulations": Poisson, H^1, H(div), H(curl), and L^2 norms.  (Poisson is the default.)
+    // The geometry we use is an axis-aligned, uniform hypercube mesh, but in the non-affine tensor case, the CellGeometry object does
+    // not express the axis-aligned or uniform structure, and the computations are as they would be in a more general hypercube mesh.
+    // Similarly, the Affine case only assumes an affine mesh; it need not be uniform or axis-aligned.
+   
+    string timingsFilePath;
+    
     const int spaceDim = 3;
     
     enum Mode
@@ -501,10 +330,13 @@ int main( int argc, char* argv[] )
     Mode mode;
     
     vector<AlgorithmChoice> allAlgorithmChoices {Standard, NonAffineTensor, AffineTensor, Uniform};
+    vector<FormulationChoice> allFormulationChoices {Poisson, Hgrad, Hdiv, Hcurl, L2};
     
     Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options
     
     string algorithmChoiceString = "All"; // alternatives: Standard, NonAffineTensor, AffineTensor, Uniform
+    string formulationChoiceString = "Poisson";
+    string basisFamilyChoiceString = "Nodal";
     
     int polyOrderFixed = -1;
     int polyOrderMin = 1;
@@ -512,11 +344,22 @@ int main( int argc, char* argv[] )
     
     string modeChoiceString = "Test"; // alternatives: Calibration, BestSerial, BestCuda
     
+    bool saveTimingsToFile = false;
+    string outputDir = ".";
+    
     cmdp.setOption("algorithm", &algorithmChoiceString, "Options: All, Standard, NonAffineTensor, AffineTensor, Uniform");
+    cmdp.setOption("formulation", &formulationChoiceString, "Options: Poisson, Hgrad, Hdiv, Hcurl, L2");
     cmdp.setOption("polyOrder", &polyOrderFixed, "Single polynomial degree to run at");
     cmdp.setOption("minPolyOrder", &polyOrderMin, "Starting polynomial degree to run at");
     cmdp.setOption("maxPolyOrder", &polyOrderMax, "Maximum polynomial degree to run at");
     cmdp.setOption("mode", &modeChoiceString);
+    cmdp.setOption("basisFamily", &basisFamilyChoiceString, "Options: Nodal, Hierarchical, Serendipity");
+    cmdp.setOption("saveTimings", "dontSaveTimings", &saveTimingsToFile, "Save timings to a file in outputDir.");
+    cmdp.setOption("outputDir", &outputDir, "Directory for saving timings file");
+    
+    Teuchos::RCP<std::ofstream> timingsFileStream;
+    
+    bool success = true;
     
     if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
     {
@@ -525,6 +368,48 @@ int main( int argc, char* argv[] )
   #endif
       return -1;
     }
+    
+    if (saveTimingsToFile)
+    {
+      std::ostringstream fileNameStream;
+      fileNameStream << outputDir << "/";
+      fileNameStream << "timings_" << algorithmChoiceString << "_";
+      fileNameStream << formulationChoiceString << "_";
+      if (polyOrderFixed != -1)
+      {
+        fileNameStream << "p" << polyOrderFixed;
+      }
+      else
+      {
+        fileNameStream << "p" << polyOrderMin << "_to_p" << polyOrderMax << "_";
+      }
+      fileNameStream << modeChoiceString << "_";
+      fileNameStream << basisFamilyChoiceString;
+      fileNameStream << ".dat";
+      
+      timingsFilePath = fileNameStream.str();
+      
+      timingsFileStream = Teuchos::rcp( new std::ofstream(timingsFilePath, std::ios::out) );
+      
+      *timingsFileStream << "Algorithm\t";
+      *timingsFileStream << "p\t";
+      *timingsFileStream << "Element Count\t";
+      *timingsFileStream << "Workset Size\t";
+      *timingsFileStream << "mode\t";
+      *timingsFileStream << "Basis Family\t";
+      *timingsFileStream << "Core Integration Timing\t";
+      *timingsFileStream << "Core Integration Flops\t";
+      *timingsFileStream << "Core Integration Throughput\t";
+      *timingsFileStream << "Jac. Timing\t";
+      *timingsFileStream << "Jac. Flops\t";
+      *timingsFileStream << "Jac. Throughput\t";
+      *timingsFileStream << "Initialization Timing\t";
+      *timingsFileStream << "Other Timing\t";
+      *timingsFileStream << "Total Time\t";
+      *timingsFileStream << "Total Flops\t";
+      *timingsFileStream << "Total Throughput";
+      *timingsFileStream << std::endl;
+    }
 
     vector<AlgorithmChoice> algorithmChoices;
     if (algorithmChoiceString == "All")
@@ -556,6 +441,62 @@ int main( int argc, char* argv[] )
       return -1;
     }
     
+    vector<FormulationChoice> formulationChoices;
+    if (formulationChoiceString == "All")
+    {
+      formulationChoices = allFormulationChoices;
+    }
+    else if (formulationChoiceString == "Poisson")
+    {
+      formulationChoices = vector<FormulationChoice>{Poisson};
+    }
+    else if (formulationChoiceString == "Hgrad")
+    {
+      formulationChoices = vector<FormulationChoice>{Hgrad};
+    }
+    else if (formulationChoiceString == "Hdiv")
+    {
+      formulationChoices = vector<FormulationChoice>{Hdiv};
+    }
+    else if (formulationChoiceString == "Hcurl")
+    {
+      formulationChoices = vector<FormulationChoice>{Hcurl};
+    }
+    else if (formulationChoiceString == "L2")
+    {
+      formulationChoices = vector<FormulationChoice>{L2};
+    }
+    else
+    {
+      cout << "Unrecognized formulation choice: " << formulationChoiceString << endl;
+#ifdef HAVE_MPI
+      MPI_Finalize();
+#endif
+      return -1;
+    }
+    
+    vector<BasisFamilyChoice> basisFamilyChoices;
+    if (basisFamilyChoiceString == "Nodal")
+    {
+      basisFamilyChoices = vector<BasisFamilyChoice>{Nodal};
+    }
+    else if (basisFamilyChoiceString == "Hierarchical")
+    {
+      basisFamilyChoices = vector<BasisFamilyChoice>{Hierarchical};
+    }
+    else if (basisFamilyChoiceString == "Serendipity")
+    {
+      basisFamilyChoices = vector<BasisFamilyChoice>{Serendipity};
+    }
+    else
+    {
+      cout << "Unrecognized basis family choice: " << basisFamilyChoiceString << endl;
+#ifdef HAVE_MPI
+      MPI_Finalize();
+#endif
+      return -1;
+    }
+    
     if (polyOrderFixed > 0)
     {
       polyOrderMin = polyOrderFixed;
@@ -593,6 +534,7 @@ int main( int argc, char* argv[] )
     
     using Scalar = double;
     using ExecutionSpace = Kokkos::DefaultExecutionSpace;
+    using DeviceType = Kokkos::DefaultExecutionSpace::device_type;
     
     using std::vector;
     using std::map;
@@ -605,27 +547,45 @@ int main( int argc, char* argv[] )
     
     using WorksetForAlgorithmChoice = map<AlgorithmChoice, int>;
     
-    vector< tuple<int,int,WorksetForAlgorithmChoice> > polyOrderMeshWidthWorksetTestCases;
+    vector< tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice> > polyOrderGridDimsWorksetTestCases;
+    
+    const int maxStiffnessGB = 2;
+    const int maxEntryCount = maxStiffnessGB * 1024 * (1024 * 1024 / sizeof(Scalar));
+    
+    const int maxCellCount = 32768;
+    
+    map<int,int> cellCountForPolyOrder;
     
-    const int meshWidth = 16;
-    vector<int> worksetSizes {1,2,4,8,16,32,64,128,256,512,1024,2048,4096};
+    map<int, Kokkos::Array<int,spaceDim> > gridCellCountsForPolyOrder;
+    for (int p=polyOrderMin; p<=polyOrderMax; p++)
+    {
+      int maxBasisCardinality = 1;
+      for (auto formulationChoice : formulationChoices)
+      {
+        for (auto basisFamilyChoice : basisFamilyChoices)
+        {
+          auto basis = getHypercubeBasisForFormulation<Scalar, Scalar, DeviceType, spaceDim>(formulationChoice, basisFamilyChoice, p);
+          maxBasisCardinality = std::max(basis->getCardinality(), maxBasisCardinality);
+        }
+      }
+      gridCellCountsForPolyOrder[p] = getMeshWidths<spaceDim>(maxBasisCardinality, maxEntryCount, maxCellCount);
+      
+      int cellCount = 1;
+      for (int d=0; d<spaceDim; d++)
+      {
+        cellCount *= gridCellCountsForPolyOrder[p][d];
+      }
+      
+      cellCountForPolyOrder[p] = cellCount;
+    }
     
-    // due to memory constraints, restrict the workset size for higher orders
-    map<int,int> maxWorksetSizeForPolyOrder;
-    maxWorksetSizeForPolyOrder[1] = 4096;
-    maxWorksetSizeForPolyOrder[2] = 4096;
-    maxWorksetSizeForPolyOrder[3] = 4096;
-    maxWorksetSizeForPolyOrder[4] = 4096;
-    maxWorksetSizeForPolyOrder[5] = 4096;
-    maxWorksetSizeForPolyOrder[6] = 2048;
-    maxWorksetSizeForPolyOrder[7] = 1024;
-    maxWorksetSizeForPolyOrder[8] = 512;
+    vector<int> worksetSizes {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768};
     
     map<int,int> minWorksetSizeForPolyOrder;
-    minWorksetSizeForPolyOrder[1] = 1;
-    minWorksetSizeForPolyOrder[2] = 1;
-    minWorksetSizeForPolyOrder[3] = 1;
-    minWorksetSizeForPolyOrder[4] = 1;
+    minWorksetSizeForPolyOrder[1] = 256;
+    minWorksetSizeForPolyOrder[2] = 64;
+    minWorksetSizeForPolyOrder[3] = 16;
+    minWorksetSizeForPolyOrder[4] = 4;
     minWorksetSizeForPolyOrder[5] = 1;
     minWorksetSizeForPolyOrder[6] = 1;
     minWorksetSizeForPolyOrder[7] = 1;
@@ -638,7 +598,7 @@ int main( int argc, char* argv[] )
         {
           for (int worksetSize : worksetSizes)
           {
-            if (worksetSize > maxWorksetSizeForPolyOrder[polyOrder])
+            if (worksetSize > cellCountForPolyOrder[polyOrder])
             {
               continue;
             }
@@ -652,13 +612,16 @@ int main( int argc, char* argv[] )
             {
               worksetForAlgorithmChoice[algorithmChoice] = worksetSize;
             }
-            polyOrderMeshWidthWorksetTestCases.push_back(tuple<int,int,WorksetForAlgorithmChoice>{polyOrder,meshWidth,worksetForAlgorithmChoice} );
+            auto gridDims = gridCellCountsForPolyOrder[polyOrder];
+            polyOrderGridDimsWorksetTestCases.push_back(tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} );
           }
         }
       }
       break;
       case Test:
       {
+        // DEBUGGING -- for ease of debugging, a single test case.
+//        vector< tuple<int,int,int> > testCases { tuple<int,int,int> {1,1,1} };
         // for test run, use the same modestly-sized tuples for each AlgorithmChoice
         // (note that meshWidth varies here)
         vector< tuple<int,int,int> > testCases { tuple<int,int,int> {1,8,512},
@@ -684,45 +647,189 @@ int main( int argc, char* argv[] )
             worksetForAlgorithmChoice[algorithmChoice] = std::get<2>(testCase);
           }
           worksetForAlgorithmChoice[Uniform] = numCells;
-          polyOrderMeshWidthWorksetTestCases.push_back(tuple<int,int,WorksetForAlgorithmChoice>{polyOrder,meshWidth,worksetForAlgorithmChoice} );
+          Kokkos::Array<int,spaceDim> gridDims;
+          for (int d=0; d<spaceDim; d++)
+          {
+            gridDims[d] = meshWidth;
+          }
+          polyOrderGridDimsWorksetTestCases.push_back(tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} );
         }
       }
       break;
       case BestSerial:
       {
-        // manually calibrated workset sizes on iMac Pro (2.3 GHz Xeon W, 18-core, running in serial)
-        // (these were calibrated without much tuning for the affine tensor case; if/when that happens, will want to recalibrate.)
+        if (formulationChoices.size() != 1)
+        {
+          std::cout << "BestSerial mode is not supported when running multiple formulations.\n";
+          exit(-1);
+        }
         
-        map<int,int> standardWorksetForPolyOrder;
-        standardWorksetForPolyOrder[1] = 64;
-        standardWorksetForPolyOrder[2] = 64;
-        standardWorksetForPolyOrder[3] = 128;
-        standardWorksetForPolyOrder[4] = 64;
-        standardWorksetForPolyOrder[5] = 16;
-        standardWorksetForPolyOrder[6] = 2;
-        standardWorksetForPolyOrder[7] = 2;
-        standardWorksetForPolyOrder[8] = 1;
+        auto formulationChoice = formulationChoices[0];
         
-        // Non-Affine Tensor
-        map<int,int> nonAffineTensorWorksetForPolyOrder;
-        nonAffineTensorWorksetForPolyOrder[1] = 256;
-        nonAffineTensorWorksetForPolyOrder[2] = 256;
-        nonAffineTensorWorksetForPolyOrder[3] = 128;
-        nonAffineTensorWorksetForPolyOrder[4] = 64;
-        nonAffineTensorWorksetForPolyOrder[5] = 16;
-        nonAffineTensorWorksetForPolyOrder[6] = 8;
-        nonAffineTensorWorksetForPolyOrder[7] = 2;
-        nonAffineTensorWorksetForPolyOrder[8] = 2;
+        // manually calibrated workset sizes on Mac Pro (2.5 GHz Xeon W, 28-core, running in serial)
         
+        map<int,int> standardWorksetForPolyOrder;
+        map<int,int> nonAffineTensorWorksetForPolyOrder;
         map<int,int> affineTensorWorksetForPolyOrder;
-        affineTensorWorksetForPolyOrder[1] = 256;
-        affineTensorWorksetForPolyOrder[2] = 128;
-        affineTensorWorksetForPolyOrder[3] = 16;
-        affineTensorWorksetForPolyOrder[4] = 4;
-        affineTensorWorksetForPolyOrder[5] = 2;
-        affineTensorWorksetForPolyOrder[6] = 1;
-        affineTensorWorksetForPolyOrder[7] = 1;
-        affineTensorWorksetForPolyOrder[8] = 1;
+        
+        switch(formulationChoice)
+        {
+          case Poisson:
+          {
+            // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 8192;
+            standardWorksetForPolyOrder[2] = 4096;
+            standardWorksetForPolyOrder[3] =   64;
+            standardWorksetForPolyOrder[4] =   16;
+            standardWorksetForPolyOrder[5] =   16;
+            standardWorksetForPolyOrder[6] =    1;
+            standardWorksetForPolyOrder[7] =    1;
+            standardWorksetForPolyOrder[8] =    1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 2048;
+            nonAffineTensorWorksetForPolyOrder[2] =  256;
+            nonAffineTensorWorksetForPolyOrder[3] =  128;
+            nonAffineTensorWorksetForPolyOrder[4] =   16;
+            nonAffineTensorWorksetForPolyOrder[5] =    2;
+            nonAffineTensorWorksetForPolyOrder[6] =    1;
+            nonAffineTensorWorksetForPolyOrder[7] =    1;
+            nonAffineTensorWorksetForPolyOrder[8] =    1;
+            
+            affineTensorWorksetForPolyOrder[1] = 4096;
+            affineTensorWorksetForPolyOrder[2] =   64;
+            affineTensorWorksetForPolyOrder[3] =   32;
+            affineTensorWorksetForPolyOrder[4] =    4;
+            affineTensorWorksetForPolyOrder[5] =    2;
+            affineTensorWorksetForPolyOrder[6] =    1;
+            affineTensorWorksetForPolyOrder[7] =    1;
+            affineTensorWorksetForPolyOrder[8] =    1;
+          }
+            break;
+          case Hgrad:
+          {
+            // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 32768;
+            standardWorksetForPolyOrder[2] = 16384;
+            standardWorksetForPolyOrder[3] =   512;
+            standardWorksetForPolyOrder[4] =   512;
+            standardWorksetForPolyOrder[5] =   512;
+            standardWorksetForPolyOrder[6] =     2;
+            standardWorksetForPolyOrder[7] =     1;
+            standardWorksetForPolyOrder[8] =     1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 4096;
+            nonAffineTensorWorksetForPolyOrder[2] =  512;
+            nonAffineTensorWorksetForPolyOrder[3] =  128;
+            nonAffineTensorWorksetForPolyOrder[4] =   32;
+            nonAffineTensorWorksetForPolyOrder[5] =   16;
+            nonAffineTensorWorksetForPolyOrder[6] =    1;
+            nonAffineTensorWorksetForPolyOrder[7] =    1;
+            nonAffineTensorWorksetForPolyOrder[8] =    1;
+            
+            affineTensorWorksetForPolyOrder[1] = 8192;
+            affineTensorWorksetForPolyOrder[2] =  512;
+            affineTensorWorksetForPolyOrder[3] =  128;
+            affineTensorWorksetForPolyOrder[4] =   64;
+            affineTensorWorksetForPolyOrder[5] =   16;
+            affineTensorWorksetForPolyOrder[6] =    1;
+            affineTensorWorksetForPolyOrder[7] =    1;
+            affineTensorWorksetForPolyOrder[8] =    1;
+          }
+            break;
+          case Hdiv:
+          {
+            // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8
+            standardWorksetForPolyOrder[1] = 256;
+            standardWorksetForPolyOrder[2] =  64;
+            standardWorksetForPolyOrder[3] =  64;
+            standardWorksetForPolyOrder[4] =  16;
+            standardWorksetForPolyOrder[5] =   4;
+            standardWorksetForPolyOrder[6] =   1;
+            standardWorksetForPolyOrder[7] =   1;
+            standardWorksetForPolyOrder[8] =   1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 4096;
+            nonAffineTensorWorksetForPolyOrder[2] =  256;
+            nonAffineTensorWorksetForPolyOrder[3] =   64;
+            nonAffineTensorWorksetForPolyOrder[4] =   16;
+            nonAffineTensorWorksetForPolyOrder[5] =    4;
+            nonAffineTensorWorksetForPolyOrder[6] =    1;
+            nonAffineTensorWorksetForPolyOrder[7] =    1;
+            nonAffineTensorWorksetForPolyOrder[8] =    1;
+            
+            affineTensorWorksetForPolyOrder[1] = 8192;
+            affineTensorWorksetForPolyOrder[2] =  512;
+            affineTensorWorksetForPolyOrder[3] =   64;
+            affineTensorWorksetForPolyOrder[4] =   16;
+            affineTensorWorksetForPolyOrder[5] =    8;
+            affineTensorWorksetForPolyOrder[6] =    1;
+            affineTensorWorksetForPolyOrder[7] =    1;
+            affineTensorWorksetForPolyOrder[8] =    1;
+          }
+            break;
+          case Hcurl:
+          {
+            // best for Hcurl - these are for meshes that range from 32768 for p=1 to 64 for p=8
+            standardWorksetForPolyOrder[1] = 1024;
+            standardWorksetForPolyOrder[2] =  512;
+            standardWorksetForPolyOrder[3] =  256;
+            standardWorksetForPolyOrder[4] =    4;
+            standardWorksetForPolyOrder[5] =    1;
+            standardWorksetForPolyOrder[6] =    1;
+            standardWorksetForPolyOrder[7] =    1;
+            standardWorksetForPolyOrder[8] =    1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 512;
+            nonAffineTensorWorksetForPolyOrder[2] =  64;
+            nonAffineTensorWorksetForPolyOrder[3] =  16;
+            nonAffineTensorWorksetForPolyOrder[4] =   4;
+            nonAffineTensorWorksetForPolyOrder[5] =   1;
+            nonAffineTensorWorksetForPolyOrder[6] =   1;
+            nonAffineTensorWorksetForPolyOrder[7] =   1;
+            nonAffineTensorWorksetForPolyOrder[8] =   1;
+            
+            affineTensorWorksetForPolyOrder[1] = 1024;
+            affineTensorWorksetForPolyOrder[2] =  128;
+            affineTensorWorksetForPolyOrder[3] =   16;
+            affineTensorWorksetForPolyOrder[4] =    4;
+            affineTensorWorksetForPolyOrder[5] =    1;
+            affineTensorWorksetForPolyOrder[6] =    1;
+            affineTensorWorksetForPolyOrder[7] =    1;
+            affineTensorWorksetForPolyOrder[8] =    1;
+          }
+            break;
+          case L2:
+          {
+            // best for L^2 - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 1024;
+            standardWorksetForPolyOrder[2] =  256;
+            standardWorksetForPolyOrder[3] =   64;
+            standardWorksetForPolyOrder[4] =   16;
+            standardWorksetForPolyOrder[5] =   16;
+            standardWorksetForPolyOrder[6] =   16;
+            standardWorksetForPolyOrder[7] =    1;
+            standardWorksetForPolyOrder[8] =    1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 16384;
+            nonAffineTensorWorksetForPolyOrder[2] =   512;
+            nonAffineTensorWorksetForPolyOrder[3] =   256;
+            nonAffineTensorWorksetForPolyOrder[4] =    64;
+            nonAffineTensorWorksetForPolyOrder[5] =    16;
+            nonAffineTensorWorksetForPolyOrder[6] =     8;
+            nonAffineTensorWorksetForPolyOrder[7] =     2;
+            nonAffineTensorWorksetForPolyOrder[8] =     1;
+            
+            affineTensorWorksetForPolyOrder[1] = 32768;
+            affineTensorWorksetForPolyOrder[2] =  1024;
+            affineTensorWorksetForPolyOrder[3] =   256;
+            affineTensorWorksetForPolyOrder[4] =   128;
+            affineTensorWorksetForPolyOrder[5] =    16;
+            affineTensorWorksetForPolyOrder[6] =     8;
+            affineTensorWorksetForPolyOrder[7] =     1;
+            affineTensorWorksetForPolyOrder[8] =     1;
+          }
+            break;
+        }
         
         // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size:
         // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest.
@@ -734,76 +841,207 @@ int main( int argc, char* argv[] )
           standardWorksetForPolyOrder[polyOrder]        = worksetSize;
         }
         
-        int numCells = 1;
-        for (int d=0; d<spaceDim; d++)
-        {
-          numCells *= meshWidth;
-        }
-        
         for (int polyOrder=polyOrderMin; polyOrder<=polyOrderMax; polyOrder++)
         {
           WorksetForAlgorithmChoice worksetForAlgorithmChoice;
           worksetForAlgorithmChoice[Standard]        = standardWorksetForPolyOrder       [polyOrder];
           worksetForAlgorithmChoice[NonAffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder];
-          worksetForAlgorithmChoice[AffineTensor]    = nonAffineTensorWorksetForPolyOrder[polyOrder];
-          worksetForAlgorithmChoice[Uniform]         = numCells;
+          worksetForAlgorithmChoice[AffineTensor]    = affineTensorWorksetForPolyOrder[polyOrder];
+          worksetForAlgorithmChoice[Uniform]         = cellCountForPolyOrder[polyOrder];
+          
+          const auto & gridDims = gridCellCountsForPolyOrder[polyOrder];
           
-          polyOrderMeshWidthWorksetTestCases.push_back(tuple<int,int,WorksetForAlgorithmChoice>{polyOrder,meshWidth,worksetForAlgorithmChoice} );
+          polyOrderGridDimsWorksetTestCases.push_back(tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} );
         }
       }
         break;
       
       case BestOpenMP_16:
       {
-        // manually calibrated workset sizes on iMac Pro (2.3 GHz Xeon W, 18-core, running with OpenMP, OMP_NUM_THREADS=16)
+        if (formulationChoices.size() != 1)
+        {
+          std::cout << "BestOpenMP_16 mode is not supported when running multiple formulations.\n";
+          exit(-1);
+        }
+        
+        auto formulationChoice = formulationChoices[0];
+        
+        // manually calibrated workset sizes on Mac Pro (2.5 GHz Xeon W, 28-core, running with OpenMP, OMP_NUM_THREADS=16)
         // Calibration for sum factorization cases was run while usePointCacheForRank3Tensor = true.
-        // (these were calibrated without much tuning for the affine tensor case; if/when that happens, will want to recalibrate.)
         
         map<int,int> standardWorksetForPolyOrder;
-        standardWorksetForPolyOrder[1] = 1024;
-        standardWorksetForPolyOrder[2] =  128;
-        standardWorksetForPolyOrder[3] =  128;
-        standardWorksetForPolyOrder[4] =  256;
-        standardWorksetForPolyOrder[5] =    8;
-        standardWorksetForPolyOrder[6] =    2;
-        standardWorksetForPolyOrder[7] =    2;
-        standardWorksetForPolyOrder[8] =    1;
-        
-        // Non-Affine Tensor
         map<int,int> nonAffineTensorWorksetForPolyOrder;
-        nonAffineTensorWorksetForPolyOrder[1] = 512;
-        nonAffineTensorWorksetForPolyOrder[2] = 128;
-        nonAffineTensorWorksetForPolyOrder[3] = 256;
-        nonAffineTensorWorksetForPolyOrder[4] = 128;
-        nonAffineTensorWorksetForPolyOrder[5] =  32;
-        nonAffineTensorWorksetForPolyOrder[6] =  16;
-        nonAffineTensorWorksetForPolyOrder[7] =  16;
-        nonAffineTensorWorksetForPolyOrder[8] =  16;
-        
         map<int,int> affineTensorWorksetForPolyOrder;
-        affineTensorWorksetForPolyOrder[1] =  4096;
-        affineTensorWorksetForPolyOrder[2] =   256;
-        affineTensorWorksetForPolyOrder[3] =   512;
-        affineTensorWorksetForPolyOrder[4] =   128;
-        affineTensorWorksetForPolyOrder[5] =    64;
-        affineTensorWorksetForPolyOrder[6] =    32;
-        affineTensorWorksetForPolyOrder[7] =    16;
-        affineTensorWorksetForPolyOrder[8] =    64;
         
-        // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size:
-        // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest.
-        int worksetSize = 1;
-        for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++)
+        switch(formulationChoice)
         {
-          nonAffineTensorWorksetForPolyOrder[polyOrder] = worksetSize;
-          affineTensorWorksetForPolyOrder[polyOrder]    = worksetSize;
-          standardWorksetForPolyOrder[polyOrder]        = worksetSize;
+          case Poisson:
+          {
+            // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 4096;
+            standardWorksetForPolyOrder[2] = 2048;
+            standardWorksetForPolyOrder[3] = 2048;
+            standardWorksetForPolyOrder[4] = 2048;
+            standardWorksetForPolyOrder[5] = 2048;
+            standardWorksetForPolyOrder[6] = 2048;
+            standardWorksetForPolyOrder[7] =    4;
+            standardWorksetForPolyOrder[8] =    2;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 2048;
+            nonAffineTensorWorksetForPolyOrder[2] =  512;
+            nonAffineTensorWorksetForPolyOrder[3] =  256;
+            nonAffineTensorWorksetForPolyOrder[4] =  128;
+            nonAffineTensorWorksetForPolyOrder[5] =   64;
+            nonAffineTensorWorksetForPolyOrder[6] =   32;
+            nonAffineTensorWorksetForPolyOrder[7] =   16;
+            nonAffineTensorWorksetForPolyOrder[8] =   16;
+            
+            affineTensorWorksetForPolyOrder[1] = 8192;
+            affineTensorWorksetForPolyOrder[2] = 4096;
+            affineTensorWorksetForPolyOrder[3] = 1024;
+            affineTensorWorksetForPolyOrder[4] =  256;
+            affineTensorWorksetForPolyOrder[5] =   64;
+            affineTensorWorksetForPolyOrder[6] =   32;
+            affineTensorWorksetForPolyOrder[7] =   16;
+            affineTensorWorksetForPolyOrder[8] =   16;
+          }
+            break;
+          case Hgrad:
+          {
+            // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 16384;
+            standardWorksetForPolyOrder[2] =  8192;
+            standardWorksetForPolyOrder[3] =  8192;
+            standardWorksetForPolyOrder[4] =  2048;
+            standardWorksetForPolyOrder[5] =   512;
+            standardWorksetForPolyOrder[6] =   512;
+            standardWorksetForPolyOrder[7] =   512;
+            standardWorksetForPolyOrder[8] =     1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 16384;
+            nonAffineTensorWorksetForPolyOrder[2] =  8192;
+            nonAffineTensorWorksetForPolyOrder[3] =   256;
+            nonAffineTensorWorksetForPolyOrder[4] =   256;
+            nonAffineTensorWorksetForPolyOrder[5] =    64;
+            nonAffineTensorWorksetForPolyOrder[6] =    32;
+            nonAffineTensorWorksetForPolyOrder[7] =    16;
+            nonAffineTensorWorksetForPolyOrder[8] =    16;
+            
+            affineTensorWorksetForPolyOrder[1] =  8192;
+            affineTensorWorksetForPolyOrder[2] =  4096;
+            affineTensorWorksetForPolyOrder[3] =  1024;
+            affineTensorWorksetForPolyOrder[4] =   256;
+            affineTensorWorksetForPolyOrder[5] =    64;
+            affineTensorWorksetForPolyOrder[6] =    32;
+            affineTensorWorksetForPolyOrder[7] =    16;
+            affineTensorWorksetForPolyOrder[8] =    16;
+          }
+            break;
+          case Hdiv:
+          {
+            // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8
+            standardWorksetForPolyOrder[1] = 32768;
+            standardWorksetForPolyOrder[2] = 32768;
+            standardWorksetForPolyOrder[3] =   512;
+            standardWorksetForPolyOrder[4] =   256;
+            standardWorksetForPolyOrder[5] =    64;
+            standardWorksetForPolyOrder[6] =     2;
+            standardWorksetForPolyOrder[7] =     2;
+            standardWorksetForPolyOrder[8] =     1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 32768;
+            nonAffineTensorWorksetForPolyOrder[2] = 16384;
+            nonAffineTensorWorksetForPolyOrder[3] =  8192;
+            nonAffineTensorWorksetForPolyOrder[4] =    64;
+            nonAffineTensorWorksetForPolyOrder[5] =    16;
+            nonAffineTensorWorksetForPolyOrder[6] =    16;
+            nonAffineTensorWorksetForPolyOrder[7] =    16;
+            nonAffineTensorWorksetForPolyOrder[8] =    16;
+            
+            affineTensorWorksetForPolyOrder[1] = 16384;
+            affineTensorWorksetForPolyOrder[2] =  4096;
+            affineTensorWorksetForPolyOrder[3] =   256;
+            affineTensorWorksetForPolyOrder[4] =   128;
+            affineTensorWorksetForPolyOrder[5] =    64;
+            affineTensorWorksetForPolyOrder[6] =    16;
+            affineTensorWorksetForPolyOrder[7] =    16;
+            affineTensorWorksetForPolyOrder[8] =    16;
+          }
+            break;
+          case Hcurl:
+          {
+            // best for Hcurl - these are for meshes that range from 32768 for p=1 to 64 for p=8
+            standardWorksetForPolyOrder[1] = 4096;
+            standardWorksetForPolyOrder[2] =  128;
+            standardWorksetForPolyOrder[3] =  128;
+            standardWorksetForPolyOrder[4] =   32;
+            standardWorksetForPolyOrder[5] =    4;
+            standardWorksetForPolyOrder[6] =    1;
+            standardWorksetForPolyOrder[7] =    1;
+            standardWorksetForPolyOrder[8] =    1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 16384;
+            nonAffineTensorWorksetForPolyOrder[2] =   512;
+            nonAffineTensorWorksetForPolyOrder[3] =   128;
+            nonAffineTensorWorksetForPolyOrder[4] =    64;
+            nonAffineTensorWorksetForPolyOrder[5] =    32;
+            nonAffineTensorWorksetForPolyOrder[6] =    16;
+            nonAffineTensorWorksetForPolyOrder[7] =    16;
+            nonAffineTensorWorksetForPolyOrder[8] =    16;
+            
+            affineTensorWorksetForPolyOrder[1] = 32768;
+            affineTensorWorksetForPolyOrder[2] =  4096;
+            affineTensorWorksetForPolyOrder[3] =   128;
+            affineTensorWorksetForPolyOrder[4] =    64;
+            affineTensorWorksetForPolyOrder[5] =    16;
+            affineTensorWorksetForPolyOrder[6] =    16;
+            affineTensorWorksetForPolyOrder[7] =    16;
+            affineTensorWorksetForPolyOrder[8] =    16;
+          }
+            break;
+          case L2:
+          {
+            // best for L^2 - these are for meshes that range from 32768 for p=1 to 256 for p=8
+            standardWorksetForPolyOrder[1] = 8192;
+            standardWorksetForPolyOrder[2] =  512;
+            standardWorksetForPolyOrder[3] =   32;
+            standardWorksetForPolyOrder[4] =   32;
+            standardWorksetForPolyOrder[5] =   32;
+            standardWorksetForPolyOrder[6] =    1;
+            standardWorksetForPolyOrder[7] =    1;
+            standardWorksetForPolyOrder[8] =    1;
+            
+            nonAffineTensorWorksetForPolyOrder[1] = 16384;
+            nonAffineTensorWorksetForPolyOrder[2] =  4096;
+            nonAffineTensorWorksetForPolyOrder[3] =  1024;
+            nonAffineTensorWorksetForPolyOrder[4] =   256;
+            nonAffineTensorWorksetForPolyOrder[5] =    64;
+            nonAffineTensorWorksetForPolyOrder[6] =    32;
+            nonAffineTensorWorksetForPolyOrder[7] =    16;
+            nonAffineTensorWorksetForPolyOrder[8] =    16;
+            
+            affineTensorWorksetForPolyOrder[1] = 32768;
+            affineTensorWorksetForPolyOrder[2] =  4096;
+            affineTensorWorksetForPolyOrder[3] =  1024;
+            affineTensorWorksetForPolyOrder[4] =   256;
+            affineTensorWorksetForPolyOrder[5] =   128;
+            affineTensorWorksetForPolyOrder[6] =    32;
+            affineTensorWorksetForPolyOrder[7] =    16;
+            affineTensorWorksetForPolyOrder[8] =    16;
+          }
+            break;
         }
         
-        int numCells = 1;
-        for (int d=0; d<spaceDim; d++)
+        // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size:
+        // Standard: 1 is best for polyOrder 8, so it'll be the best for the rest.
+        // NonAffineTensor, AffineTensor: we seem to bottom out at the number of OpenMP threads: here, 16.
+        int standardWorksetSize = 1;
+        int tensorWorksetSize = 16;
+        for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++)
         {
-          numCells *= meshWidth;
+          nonAffineTensorWorksetForPolyOrder[polyOrder] = tensorWorksetSize;
+          affineTensorWorksetForPolyOrder[polyOrder]    = tensorWorksetSize;
+          standardWorksetForPolyOrder[polyOrder]        = standardWorksetSize;
         }
         
         for (int polyOrder=polyOrderMin; polyOrder<=polyOrderMax; polyOrder++)
@@ -811,56 +1049,196 @@ int main( int argc, char* argv[] )
           WorksetForAlgorithmChoice worksetForAlgorithmChoice;
           worksetForAlgorithmChoice[Standard]        = standardWorksetForPolyOrder       [polyOrder];
           worksetForAlgorithmChoice[NonAffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder];
-          worksetForAlgorithmChoice[AffineTensor]    = nonAffineTensorWorksetForPolyOrder[polyOrder];
-          worksetForAlgorithmChoice[Uniform]         = numCells;
+          worksetForAlgorithmChoice[AffineTensor]    = affineTensorWorksetForPolyOrder[polyOrder];
+          worksetForAlgorithmChoice[Uniform]         = cellCountForPolyOrder[polyOrder];
+          const auto & gridDims = gridCellCountsForPolyOrder[polyOrder];
           
-          polyOrderMeshWidthWorksetTestCases.push_back(tuple<int,int,WorksetForAlgorithmChoice>{polyOrder,meshWidth,worksetForAlgorithmChoice} );
+          polyOrderGridDimsWorksetTestCases.push_back(tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} );
         }
       }
         break;
       case BestCuda:
       {
         {
+          if (formulationChoices.size() != 1)
+          {
+            std::cout << "BestCuda mode is not supported when running multiple formulations.\n";
+            exit(-1);
+          }
+          
+          auto formulationChoice = formulationChoices[0];
+          
           // STANDARD
-          // manually calibrated workset size on P100 (ride) - best for Standard
-          // these are for 4096-element meshes
+          // manually calibrated workset size on P100 (weaver)
           map<int,int> standardWorksetForPolyOrder;
-          standardWorksetForPolyOrder[1] = 4096;
-          standardWorksetForPolyOrder[2] = 1024;
-          standardWorksetForPolyOrder[3] = 128;
-          standardWorksetForPolyOrder[4] = 64;
-          standardWorksetForPolyOrder[5] = 8;
-          standardWorksetForPolyOrder[6] = 4;
-          standardWorksetForPolyOrder[7] = 2;
-          standardWorksetForPolyOrder[8] = 1;
-          
-          // Non-Affine Tensor
-          // For CUDA, 4096 is the best choice for the PointValueCache algorithm for any polyOrder from 1 to 5.
-          // This likely means we're not exposing enough parallelism within the cell.
           map<int,int> nonAffineTensorWorksetForPolyOrder;
-          nonAffineTensorWorksetForPolyOrder[1] = 4096;
-          nonAffineTensorWorksetForPolyOrder[2] = 4096;
-          nonAffineTensorWorksetForPolyOrder[3] = 4096;
-          nonAffineTensorWorksetForPolyOrder[4] = 4096;
-          nonAffineTensorWorksetForPolyOrder[5] = 4096;
-          nonAffineTensorWorksetForPolyOrder[6] = 2048;
-          nonAffineTensorWorksetForPolyOrder[7] = 512;
-          nonAffineTensorWorksetForPolyOrder[8] = 512;
+          map<int,int> affineTensorWorksetForPolyOrder;
           
-          // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size:
-          int nonAffineWorksetSize = 256; // divide by 2 for each polyOrder beyond 8
-          int standardWorksetSize  = 1;  // 1 is best, we think, for polyOrder 8, so it'll be the best for the rest.
-          for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++)
+          switch(formulationChoice)
           {
-            nonAffineTensorWorksetForPolyOrder[polyOrder] = nonAffineWorksetSize;
-            nonAffineWorksetSize = (nonAffineWorksetSize > 1) ? nonAffineWorksetSize / 2 : 1;
-            standardWorksetForPolyOrder[polyOrder] = standardWorksetSize;
+            case Poisson:
+            {
+              // best for Poisson - these are for meshes that range from 32768 for p=1 to 256 for p=8
+              standardWorksetForPolyOrder[1] = 16384;
+              standardWorksetForPolyOrder[2] =   512;
+              standardWorksetForPolyOrder[3] =   128;
+              standardWorksetForPolyOrder[4] =     8;
+              standardWorksetForPolyOrder[5] =     4;
+              standardWorksetForPolyOrder[6] =     1;
+              standardWorksetForPolyOrder[7] =     1;
+              standardWorksetForPolyOrder[8] =     1;
+              
+              nonAffineTensorWorksetForPolyOrder[1] = 32768;
+              nonAffineTensorWorksetForPolyOrder[2] = 32768;
+              nonAffineTensorWorksetForPolyOrder[3] = 16384;
+              nonAffineTensorWorksetForPolyOrder[4] =  8192;
+              nonAffineTensorWorksetForPolyOrder[5] =  4096;
+              nonAffineTensorWorksetForPolyOrder[6] =  2048;
+              nonAffineTensorWorksetForPolyOrder[7] =   256;
+              nonAffineTensorWorksetForPolyOrder[8] =   256;
+              
+              affineTensorWorksetForPolyOrder[1] = 8192;
+              affineTensorWorksetForPolyOrder[2] = 8192;
+              affineTensorWorksetForPolyOrder[3] = 8192;
+              affineTensorWorksetForPolyOrder[4] = 8192;
+              affineTensorWorksetForPolyOrder[5] = 4096;
+              affineTensorWorksetForPolyOrder[6] = 2048;
+              affineTensorWorksetForPolyOrder[7] =  256;
+              affineTensorWorksetForPolyOrder[8] =  128;
+            }
+              break;
+            case Hgrad:
+            {
+              // best for Hgrad - these are for meshes that range from 32768 for p=1 to 256 for p=8
+              standardWorksetForPolyOrder[1] = 32768;
+              standardWorksetForPolyOrder[2] =   512;
+              standardWorksetForPolyOrder[3] =   128;
+              standardWorksetForPolyOrder[4] =    16;
+              standardWorksetForPolyOrder[5] =     4;
+              standardWorksetForPolyOrder[6] =     1;
+              standardWorksetForPolyOrder[7] =     1;
+              standardWorksetForPolyOrder[8] =     1;
+              
+              nonAffineTensorWorksetForPolyOrder[1] = 32768;
+              nonAffineTensorWorksetForPolyOrder[2] = 32768;
+              nonAffineTensorWorksetForPolyOrder[3] = 16384;
+              nonAffineTensorWorksetForPolyOrder[4] =  8192;
+              nonAffineTensorWorksetForPolyOrder[5] =  4096;
+              nonAffineTensorWorksetForPolyOrder[6] =  2048;
+              nonAffineTensorWorksetForPolyOrder[7] =   256;
+              nonAffineTensorWorksetForPolyOrder[8] =   256;
+              
+              affineTensorWorksetForPolyOrder[1] = 32768;
+              affineTensorWorksetForPolyOrder[2] = 32768;
+              affineTensorWorksetForPolyOrder[3] =  8192;
+              affineTensorWorksetForPolyOrder[4] =  8192;
+              affineTensorWorksetForPolyOrder[5] =  4096;
+              affineTensorWorksetForPolyOrder[6] =  2048;
+              affineTensorWorksetForPolyOrder[7] =   256;
+              affineTensorWorksetForPolyOrder[8] =   256;
+            }
+              break;
+            case Hdiv:
+            {
+              // best for Hdiv - these are for meshes that range from 32768 for p=1 to 64 for p=8
+              standardWorksetForPolyOrder[1] = 32768;
+              standardWorksetForPolyOrder[2] =   512;
+              standardWorksetForPolyOrder[3] =    32;
+              standardWorksetForPolyOrder[4] =     4;
+              standardWorksetForPolyOrder[5] =     1;
+              standardWorksetForPolyOrder[6] =     1;
+              standardWorksetForPolyOrder[7] =     1;
+              standardWorksetForPolyOrder[8] =     1;
+              
+              nonAffineTensorWorksetForPolyOrder[1] = 32768;
+              nonAffineTensorWorksetForPolyOrder[2] = 32768;
+              nonAffineTensorWorksetForPolyOrder[3] = 16384;
+              nonAffineTensorWorksetForPolyOrder[4] =  4096;
+              nonAffineTensorWorksetForPolyOrder[5] =  1024;
+              nonAffineTensorWorksetForPolyOrder[6] =   256;
+              nonAffineTensorWorksetForPolyOrder[7] =   128;
+              nonAffineTensorWorksetForPolyOrder[8] =    64;
+              
+              affineTensorWorksetForPolyOrder[1] = 32768;
+              affineTensorWorksetForPolyOrder[2] = 32768;
+              affineTensorWorksetForPolyOrder[3] = 16384;
+              affineTensorWorksetForPolyOrder[4] =  4096;
+              affineTensorWorksetForPolyOrder[5] =  1024;
+              affineTensorWorksetForPolyOrder[6] =   256;
+              affineTensorWorksetForPolyOrder[7] =   128;
+              affineTensorWorksetForPolyOrder[8] =    64;
+            }
+              break;
+            case Hcurl:
+            {
+              standardWorksetForPolyOrder[1] = 1024;
+              standardWorksetForPolyOrder[2] =  128;
+              standardWorksetForPolyOrder[3] =   16;
+              standardWorksetForPolyOrder[4] =    4;
+              standardWorksetForPolyOrder[5] =    1;
+              standardWorksetForPolyOrder[6] =    1;
+              standardWorksetForPolyOrder[7] =    1;
+              standardWorksetForPolyOrder[8] =    1;
+              
+              nonAffineTensorWorksetForPolyOrder[1] = 32768;
+              nonAffineTensorWorksetForPolyOrder[2] = 32768;
+              nonAffineTensorWorksetForPolyOrder[3] =  8192;
+              nonAffineTensorWorksetForPolyOrder[4] =  2048;
+              nonAffineTensorWorksetForPolyOrder[5] =   512;
+              nonAffineTensorWorksetForPolyOrder[6] =   256;
+              nonAffineTensorWorksetForPolyOrder[7] =   128;
+              nonAffineTensorWorksetForPolyOrder[8] =    64;
+              
+              affineTensorWorksetForPolyOrder[1] = 32768;
+              affineTensorWorksetForPolyOrder[2] = 32768;
+              affineTensorWorksetForPolyOrder[3] =  8192;
+              affineTensorWorksetForPolyOrder[4] =  2048;
+              affineTensorWorksetForPolyOrder[5] =   512;
+              affineTensorWorksetForPolyOrder[6] =   256;
+              affineTensorWorksetForPolyOrder[7] =   128;
+              affineTensorWorksetForPolyOrder[8] =    64;
+            }
+              break;
+            case L2:
+            {
+              standardWorksetForPolyOrder[1] = 32768;
+              standardWorksetForPolyOrder[2] =  1024;
+              standardWorksetForPolyOrder[3] =   128;
+              standardWorksetForPolyOrder[4] =    16;
+              standardWorksetForPolyOrder[5] =     4;
+              standardWorksetForPolyOrder[6] =     1;
+              standardWorksetForPolyOrder[7] =     1;
+              standardWorksetForPolyOrder[8] =     1;
+              
+              nonAffineTensorWorksetForPolyOrder[1] = 32768;
+              nonAffineTensorWorksetForPolyOrder[2] = 32768;
+              nonAffineTensorWorksetForPolyOrder[3] = 16384;
+              nonAffineTensorWorksetForPolyOrder[4] =  8192;
+              nonAffineTensorWorksetForPolyOrder[5] =  4096;
+              nonAffineTensorWorksetForPolyOrder[6] =  2048;
+              nonAffineTensorWorksetForPolyOrder[7] =   256;
+              nonAffineTensorWorksetForPolyOrder[8] =   128;
+              
+              affineTensorWorksetForPolyOrder[1] = 8192;
+              affineTensorWorksetForPolyOrder[2] = 8192;
+              affineTensorWorksetForPolyOrder[3] = 8192;
+              affineTensorWorksetForPolyOrder[4] = 8192;
+              affineTensorWorksetForPolyOrder[5] = 4096;
+              affineTensorWorksetForPolyOrder[6] = 2048;
+              affineTensorWorksetForPolyOrder[7] =  256;
+              affineTensorWorksetForPolyOrder[8] =  128;
+            }
+              break;
           }
           
-          int numCells = 1;
-          for (int d=0; d<spaceDim; d++)
+          // for the cases that we have not tried yet (polyOrder > 8), we try to choose sensible guesses for workset size:
+          int standardWorksetSize  = 1;  // 1 is best for polyOrder 8, so it'll be the best for the rest.
+          // for the rest under CUDA, we observe that in most cases, the optimal workset size for non-affine tensor is the cell count.  For affine, it's lower by a factor of 2 or 4 in most cases.
+          for (int polyOrder=9; polyOrder <= polyOrderMax; polyOrder++)
           {
-            numCells *= meshWidth;
+            nonAffineTensorWorksetForPolyOrder[polyOrder] = cellCountForPolyOrder[polyOrder];
+            affineTensorWorksetForPolyOrder[polyOrder]    = cellCountForPolyOrder[polyOrder] / 2;
+            standardWorksetForPolyOrder[polyOrder] = standardWorksetSize;
           }
           
           for (int polyOrder=polyOrderMin; polyOrder<=polyOrderMax; polyOrder++)
@@ -868,10 +1246,12 @@ int main( int argc, char* argv[] )
             WorksetForAlgorithmChoice worksetForAlgorithmChoice;
             worksetForAlgorithmChoice[Standard]        = standardWorksetForPolyOrder       [polyOrder];
             worksetForAlgorithmChoice[NonAffineTensor] = nonAffineTensorWorksetForPolyOrder[polyOrder];
-            worksetForAlgorithmChoice[AffineTensor]    = nonAffineTensorWorksetForPolyOrder[polyOrder];
-            worksetForAlgorithmChoice[Uniform]         = numCells;
+            worksetForAlgorithmChoice[AffineTensor]    = affineTensorWorksetForPolyOrder[polyOrder];
+            worksetForAlgorithmChoice[Uniform]         = cellCountForPolyOrder[polyOrder];
+            
+            const auto & gridDims = gridCellCountsForPolyOrder[polyOrder];
             
-            polyOrderMeshWidthWorksetTestCases.push_back(tuple<int,int,WorksetForAlgorithmChoice>{polyOrder,meshWidth,worksetForAlgorithmChoice} );
+            polyOrderGridDimsWorksetTestCases.push_back(tuple<int,Kokkos::Array<int,spaceDim>,WorksetForAlgorithmChoice>{polyOrder,gridDims,worksetForAlgorithmChoice} );
           }
         }
         break;
@@ -883,139 +1263,292 @@ int main( int argc, char* argv[] )
     
     cout << std::setprecision(2) << std::scientific;
     
-    map< AlgorithmChoice, map<int, pair<double,int> > > maxAlgorithmThroughputForPolyOrder; // values are (throughput in GFlops/sec, worksetSize)
+    map< AlgorithmChoice, map<int, pair<double,int> > > maxAlgorithmThroughputForPolyOrderCore;  // values are (throughput in GFlops/sec, worksetSize)
+    map< AlgorithmChoice, map<int, pair<double,int> > > maxAlgorithmThroughputForPolyOrderTotal; // values are (throughput in GFlops/sec, worksetSize)
     
     const int charWidth = 15;
     
-    for (auto & testCase : polyOrderMeshWidthWorksetTestCases)
+    for (auto basisFamilyChoice : basisFamilyChoices)
     {
-      int polyOrder       = std::get<0>(testCase);
-      int meshWidth       = std::get<1>(testCase);
-      auto worksetSizeMap = std::get<2>(testCase);
-      std::cout << "\n\n";
-      std::cout << "Running with polyOrder = " << polyOrder << ", meshWidth = " << meshWidth << std::endl;
-      for (auto algorithmChoice : algorithmChoices)
+      for (auto formulation : formulationChoices)
       {
-        int worksetSize = worksetSizeMap[algorithmChoice];
-        auto geometry = getMesh<Scalar, spaceDim, ExecutionSpace>(algorithmChoice, meshWidth);
-        
-        // timers recorded in performStructuredQuadratureHypercubeGRADGRAD, performStandardQuadratureHypercubeGRADGRAD
-        auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
-        auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
-        auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
-
-        jacobianAndCellMeasureTimer->reset();
-        fstIntegrateCall->reset();
-        initialSetupTimer->reset();
-        
-        double elapsedTimeSeconds = 0;
-        double jacobianCellMeasureFlopCount = 0;
-        double transformIntegrateFlopCount = 0;
-        
-        if (algorithmChoice == Standard)
-        {
-          // each cell needs on the order of polyOrder^N quadrature points, each of which has a Jacobian of size N * N.
-          auto timer = Teuchos::TimeMonitor::getNewTimer("Standard Integration");
-          timer->start();
-          performStandardQuadratureHypercubeGRADGRAD<Scalar,Scalar,spaceDim,ExecutionSpace>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
-          timer->stop();
-          elapsedTimeSeconds = timer->totalElapsedTime();
-          
-          cout << "Standard, workset size:          " << setw(charWidth) << worksetSize << endl;
-          
-          timer->reset();
-        }
-        else if (algorithmChoice == AffineTensor)
-        {
-          auto timer = Teuchos::TimeMonitor::getNewTimer("Affine tensor Integration");
-          timer->start();
-          performStructuredQuadratureHypercubeGRADGRAD<Scalar>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
-          timer->stop();
-          
-          elapsedTimeSeconds = timer->totalElapsedTime();
-          
-          cout << "Affine Tensor, workset size:     " << setw(charWidth) << worksetSize << endl;
-                    
-          timer->reset();
-        }
-        else if (algorithmChoice == NonAffineTensor)
+        std::cout << "\n\n***** Formulation: " << to_string(formulation) << " *******\n";
+        for (auto & testCase : polyOrderGridDimsWorksetTestCases)
         {
-          auto timer = Teuchos::TimeMonitor::getNewTimer("Non-affine tensor Integration");
-          timer->start();
-          performStructuredQuadratureHypercubeGRADGRAD<Scalar>(geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
-          timer->stop();
-          
-          elapsedTimeSeconds = timer->totalElapsedTime();
-          
-          cout << "Non-Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl;
-          
-          timer->reset();
-        }
-        else if (algorithmChoice == Uniform)
-        {
-          // for uniform, override worksetSize: no loss in taking maximal worksetSize
-          int numCells = 1;
+          int polyOrder       = std::get<0>(testCase);
+          auto gridDims       = std::get<1>(testCase);
+          auto worksetSizeMap = std::get<2>(testCase);
+          std::cout << "\n\n";
+          std::cout << "Running with polyOrder = " << polyOrder << ", mesh dims = ";
           for (int d=0; d<spaceDim; d++)
           {
-            numCells *= meshWidth;
+            std::cout << gridDims[d];
+            if (d < spaceDim - 1) std::cout << " x ";
+            else                  std::cout << std::endl;
           }
-          auto timer = Teuchos::TimeMonitor::getNewTimer("Uniform Integration");
-          timer->start();
-          performStructuredQuadratureHypercubeGRADGRAD<Scalar>(geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
-          timer->stop();
-          
-          elapsedTimeSeconds = timer->totalElapsedTime();
           
-          cout << "Uniform, workset size:           " << setw(charWidth) << worksetSize << endl;
+          std::map<AlgorithmChoice, Intrepid2::ScalarView<Scalar,DeviceType> > assembledMatrices;
+          for (auto algorithmChoice : algorithmChoices)
+          {
+            int worksetSize = worksetSizeMap[algorithmChoice];
+            if (mode == Calibration)
+            {
+              // if this workset size is bigger than the optimal for p-1, skip it -- it's highly
+              // unlikely that for a larger p, the optimal workset size will be *larger*.
+              const auto & bestThroughputs = maxAlgorithmThroughputForPolyOrderCore[algorithmChoice];
+              if (bestThroughputs.find(polyOrder-1) != bestThroughputs.end() )
+              {
+                int bestWorksetSize = bestThroughputs.find(polyOrder-1)->second.second;
+                if (bestWorksetSize < worksetSize)
+                {
+                  continue;
+                }
+              }
+            }
+            auto geometry = getMesh<Scalar, spaceDim, ExecutionSpace>(algorithmChoice, gridDims);
+            
+            // timers recorded in performStructuredQuadratureGRADGRAD, performStandardQuadratureGRADGRAD
+            auto jacobianAndCellMeasureTimer = Teuchos::TimeMonitor::getNewTimer("Jacobians");
+            auto fstIntegrateCall = Teuchos::TimeMonitor::getNewTimer("transform + integrate()");
+            auto initialSetupTimer = Teuchos::TimeMonitor::getNewTimer("Initial Setup");
+
+            jacobianAndCellMeasureTimer->reset();
+            fstIntegrateCall->reset();
+            initialSetupTimer->reset();
+            
+            double elapsedTimeSeconds = 0;
+            double jacobianCellMeasureFlopCount = 0;
+            double transformIntegrateFlopCount = 0;
+            
+            Intrepid2::ScalarView<Scalar,DeviceType> assembledMatrix;
+            if (algorithmChoice == Standard)
+            {
+              // each cell needs on the order of polyOrder^N quadrature points, each of which has a Jacobian of size N * N.
+              auto timer = Teuchos::TimeMonitor::getNewTimer("Standard Integration");
+              timer->start();
+              switch (basisFamilyChoice)
+              {
+                case Nodal:
+                {
+                  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Hierarchical:
+                {
+                  using BasisFamily = HierarchicalBasisFamily<DeviceType>;
+                  assembledMatrix = performStandardQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Serendipity:
+                  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented");
+              }
+              timer->stop();
+              elapsedTimeSeconds = timer->totalElapsedTime();
+              
+              cout << "Standard, workset size:          " << setw(charWidth) << worksetSize << endl;
+              
+              timer->reset();
+            }
+            else if (algorithmChoice == AffineTensor)
+            {
+              auto timer = Teuchos::TimeMonitor::getNewTimer("Affine tensor Integration");
+              timer->start();
+              switch (basisFamilyChoice)
+              {
+                case Nodal:
+                {
+                  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Hierarchical:
+                {
+                  using BasisFamily = HierarchicalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Serendipity:
+                  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented");
+              }
+              timer->stop();
+              
+              elapsedTimeSeconds = timer->totalElapsedTime();
+              
+              cout << "Affine Tensor, workset size:     " << setw(charWidth) << worksetSize << endl;
+                        
+              timer->reset();
+            }
+            else if (algorithmChoice == NonAffineTensor)
+            {
+              auto timer = Teuchos::TimeMonitor::getNewTimer("Non-affine tensor Integration");
+              timer->start();
+              switch (basisFamilyChoice)
+              {
+                case Nodal:
+                {
+                  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Hierarchical:
+                {
+                  using BasisFamily = HierarchicalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, worksetSize, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Serendipity:
+                  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented");
+              }
+              timer->stop();
+              
+              elapsedTimeSeconds = timer->totalElapsedTime();
+              
+              cout << "Non-Affine Tensor, workset size: " << setw(charWidth) << worksetSize << endl;
+              
+              timer->reset();
+            }
+            else if (algorithmChoice == Uniform)
+            {
+              // for uniform, override worksetSize: no loss in taking maximal worksetSize
+              int numCells = 1;
+              for (int d=0; d<spaceDim; d++)
+              {
+                numCells *= gridDims[d];
+              }
+              auto timer = Teuchos::TimeMonitor::getNewTimer("Uniform Integration");
+              timer->start();
+              switch (basisFamilyChoice)
+              {
+                case Nodal:
+                {
+                  using BasisFamily = DerivedNodalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Hierarchical:
+                {
+                  using BasisFamily = HierarchicalBasisFamily<DeviceType>;
+                  assembledMatrix = performStructuredQuadrature<Scalar,BasisFamily>(formulation, geometry, polyOrder, numCells, transformIntegrateFlopCount, jacobianCellMeasureFlopCount);
+                }
+                  break;
+                case Serendipity:
+                  INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "basis family choice not yet implemented");
+              }
+              timer->stop();
+              
+              elapsedTimeSeconds = timer->totalElapsedTime();
+              
+              cout << "Uniform, workset size:           " << setw(charWidth) << worksetSize << endl;
+              
+              timer->reset();
+            }
+            
+            assembledMatrices[algorithmChoice] = assembledMatrix;
+            
+            const double approximateFlopCountTotal = transformIntegrateFlopCount + jacobianCellMeasureFlopCount;
+            const double overallThroughputInGFlops = approximateFlopCountTotal / elapsedTimeSeconds / 1.0e9;
+            
+            const double previousMaxThroughput = maxAlgorithmThroughputForPolyOrderTotal[algorithmChoice][polyOrder].first;
+            if (overallThroughputInGFlops > previousMaxThroughput)
+            {
+              maxAlgorithmThroughputForPolyOrderTotal[algorithmChoice][polyOrder] = make_pair(overallThroughputInGFlops,worksetSize);
+            }
+            
+            // timing details
+            double integrateCallTime       = fstIntegrateCall->totalElapsedTime();
+            double integrateCallPercentage = integrateCallTime / elapsedTimeSeconds * 100.0;
+            double jacobianTime            = jacobianAndCellMeasureTimer->totalElapsedTime();
+            double jacobianPercentage      = jacobianTime / elapsedTimeSeconds * 100.0;
+            double initialSetupTime        = initialSetupTimer->totalElapsedTime();
+            double initialSetupPercentage  = initialSetupTime / elapsedTimeSeconds * 100.0;
+            double remainingTime           = elapsedTimeSeconds - (integrateCallTime + jacobianTime + initialSetupTime);
+            double remainingPercentage     = remainingTime / elapsedTimeSeconds * 100.0;
+            
+            const double transformIntegrateThroughputInGFlops = transformIntegrateFlopCount  / integrateCallTime / 1.0e9;
+            const double jacobiansThroughputInGFlops          = jacobianCellMeasureFlopCount / jacobianTime      / 1.0e9;
+            
+            const double previousMaxThroughputCore = maxAlgorithmThroughputForPolyOrderCore[algorithmChoice][polyOrder].first;
+            if (transformIntegrateThroughputInGFlops > previousMaxThroughputCore)
+            {
+              maxAlgorithmThroughputForPolyOrderCore[algorithmChoice][polyOrder] = make_pair(transformIntegrateThroughputInGFlops,worksetSize);
+            }
+            
+            cout << "Time (core integration)      " << setw(charWidth) << std::scientific << integrateCallTime << " seconds (" << std::fixed << integrateCallPercentage << "%)." << endl;
+            cout << "flop estimate (core):        " << setw(charWidth) << std::scientific << transformIntegrateFlopCount << endl;
+            cout << "estimated throughput (core): " << setw(charWidth) << std::scientific << transformIntegrateThroughputInGFlops << " GFlops" << endl;
+            cout << "************************************************" << endl;
+            cout << std::fixed;
+            cout << "Time (Jacobians)                 " << setw(charWidth) << std::scientific << jacobianTime      << " seconds (" << std::fixed << jacobianPercentage      << "%)." << endl;
+            cout << "flop estimate (Jacobians):       " << setw(charWidth) << std::scientific << jacobianCellMeasureFlopCount << endl;
+            cout << "estimated throughput (Jac.):     " << setw(charWidth) << std::scientific << jacobiansThroughputInGFlops << " GFlops" << endl;
+            cout << "Time (initial setup)             " << setw(charWidth) << std::scientific << initialSetupTime  << " seconds (" << std::fixed << initialSetupPercentage  << "%)." << endl;
+            cout << "Time (other)                     " << setw(charWidth) << std::scientific << remainingTime     << " seconds (" << std::fixed << remainingPercentage     << "%)." << endl;
+            cout << "Time (total):                    " << setw(charWidth) << std::scientific << elapsedTimeSeconds   << " seconds.\n";
+            cout << "flop estimate (total):           " << setw(charWidth) << std::scientific << approximateFlopCountTotal << endl;
+            cout << "estimated throughput (total):    " << setw(charWidth) << std::scientific << overallThroughputInGFlops << " GFlops" << endl;
+            
+            cout << endl;
+            
+            if (saveTimingsToFile)
+            {
+              *timingsFileStream << std::scientific;
+              
+              *timingsFileStream << to_string(algorithmChoice) << "\t";
+              *timingsFileStream << polyOrder << "\t";
+              *timingsFileStream << geometry.numCells() << "\t";
+              *timingsFileStream << worksetSize << "\t";
+              *timingsFileStream << modeChoiceString << "\t";
+              *timingsFileStream << basisFamilyChoiceString << "\t";
+              *timingsFileStream << integrateCallTime << "\t";
+              *timingsFileStream << transformIntegrateFlopCount << "\t";
+              *timingsFileStream << transformIntegrateThroughputInGFlops << "\t";
+              *timingsFileStream << jacobianTime << "\t";
+              *timingsFileStream << jacobianCellMeasureFlopCount << "\t";
+              *timingsFileStream << jacobiansThroughputInGFlops << "\t";
+              *timingsFileStream << initialSetupTime << "\t";
+              *timingsFileStream << remainingTime << "\t";
+              *timingsFileStream << elapsedTimeSeconds << "\t";
+              *timingsFileStream << approximateFlopCountTotal << "\t";
+              *timingsFileStream << overallThroughputInGFlops;
+              *timingsFileStream << std::endl;
+            }
+          }
           
-          timer->reset();
-        }
-        
-        const double approximateFlopCountTotal = transformIntegrateFlopCount + jacobianCellMeasureFlopCount;
-        const double overallThroughputInGFlops = approximateFlopCountTotal / elapsedTimeSeconds / 1.0e9;
-        
-        const double previousMaxThroughput = maxAlgorithmThroughputForPolyOrder[algorithmChoice][polyOrder].first;
-        if (overallThroughputInGFlops > previousMaxThroughput)
-        {
-          maxAlgorithmThroughputForPolyOrder[algorithmChoice][polyOrder] = make_pair(overallThroughputInGFlops,worksetSize);
+          if (assembledMatrices.size() > 1)
+          {
+            // if we have multiple, then let's compare values to make sure they agree.
+            Teuchos::basic_FancyOStream<char> out(Teuchos::rcp(&std::cout,false));
+            const double relTol = 1e-10; // pretty loose tolerances are required, especially for higher-order hierarchical comparisons to Standard
+            const double absTol = 1e-8;
+            auto firstAlgorithm = assembledMatrices.begin()->first;
+            auto firstMatrix = assembledMatrices.begin()->second;
+            std::string algorithmName1 = to_string(firstAlgorithm);
+            
+            for (const auto &entry : assembledMatrices)
+            {
+              auto secondAlgorithm = entry.first;
+              auto secondMatrix    = entry.second;
+              std::string algorithmName2 = to_string(secondAlgorithm);
+              testViewFloatingEquality(firstMatrix, secondMatrix, relTol, absTol, out, success, algorithmName1, algorithmName2);
+//              printFunctor3(firstMatrix, std::cout, algorithmName1);
+//              printFunctor3(secondMatrix, std::cout, algorithmName2);
+            }
+          }
         }
-        
-        // timing details
-        double integrateCallTime       = fstIntegrateCall->totalElapsedTime();
-        double integrateCallPercentage = integrateCallTime / elapsedTimeSeconds * 100.0;
-        double jacobianTime            = jacobianAndCellMeasureTimer->totalElapsedTime();
-        double jacobianPercentage      = jacobianTime / elapsedTimeSeconds * 100.0;
-        double initialSetupTime        = initialSetupTimer->totalElapsedTime();
-        double initialSetupPercentage  = initialSetupTime / elapsedTimeSeconds * 100.0;
-        double remainingTime           = elapsedTimeSeconds - (integrateCallTime + jacobianTime + initialSetupTime);
-        double remainingPercentage     = remainingTime / elapsedTimeSeconds * 100.0;
-        
-        const double transformIntegrateThroughputInGFlops = transformIntegrateFlopCount  / integrateCallTime / 1.0e9;
-        const double jacobiansThroughputInGFlops          = jacobianCellMeasureFlopCount / jacobianTime      / 1.0e9;
-        cout << "Time (core integration)          " << setw(charWidth) << std::scientific << integrateCallTime << " seconds (" << std::fixed << integrateCallPercentage << "%)." << endl;
-        cout << "flop estimate (core):            " << setw(charWidth) << std::scientific << transformIntegrateFlopCount << endl;
-        cout << "estimated throughput (core):     " << setw(charWidth) << std::scientific << transformIntegrateThroughputInGFlops << " GFlops" << endl;
-        cout << std::fixed;
-        cout << "Time (Jacobians)                 " << setw(charWidth) << std::scientific << jacobianTime      << " seconds (" << std::fixed << jacobianPercentage      << "%)." << endl;
-        cout << "flop estimate (Jacobians):       " << setw(charWidth) << std::scientific << jacobianCellMeasureFlopCount << endl;
-        cout << "estimated throughput (Jac.):     " << setw(charWidth) << std::scientific << jacobiansThroughputInGFlops << " GFlops" << endl;
-        cout << "Time (initial setup)             " << setw(charWidth) << std::scientific << initialSetupTime  << " seconds (" << std::fixed << initialSetupPercentage  << "%)." << endl;
-        cout << "Time (other)                     " << setw(charWidth) << std::scientific << remainingTime     << " seconds (" << std::fixed << remainingPercentage     << "%)." << endl;
-        cout << "Time (total):                    " << setw(charWidth) << std::scientific << elapsedTimeSeconds   << " seconds.\n";
-        cout << "flop estimate (total):           " << setw(charWidth) << std::scientific << approximateFlopCountTotal << endl;
-        cout << "estimated throughput (total):    " << setw(charWidth) << std::scientific << overallThroughputInGFlops << " GFlops" << endl;
-        
-        cout << endl;
       }
-    }
+    } // basisFamilyChoices
     
     if (mode == Calibration)
     {
+      cout << "Best workset sizes (as determined by 'core integration' throughput, which includes basis transforms, but not setup and/or Jacobian computations):\n";
       for (auto & algorithmChoice : algorithmChoices)
       {
+        if (algorithmChoice == Uniform) continue; // workset size is not meaningful for uniform (workset is always effectively one cell, or all cells, depending on how you choose to frame it).
+        
         cout << "Best workset sizes for " << to_string(algorithmChoice) << ":" << endl;
         
-        for (auto & maxThroughputEntry : maxAlgorithmThroughputForPolyOrder[algorithmChoice])
+        for (auto & maxThroughputEntry : maxAlgorithmThroughputForPolyOrderCore[algorithmChoice])
         {
           int polyOrder   = maxThroughputEntry.first;
           int worksetSize = maxThroughputEntry.second.second;
@@ -1024,7 +1557,14 @@ int main( int argc, char* argv[] )
         }
       }
     }
+    if (success)
+    {
+      return 0;
+    }
+    else
+    {
+      std::cout << "ERROR: Assembled matrices did *NOT* match across algorithms.\n";
+      return -1;
+    }
   }
-  
-  return 0;
 }
diff --git a/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp b/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp
index 18e4fb0010e9..2d328cb44de0 100644
--- a/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_normImpl.hpp
@@ -151,18 +151,37 @@ lclNormImpl (const RV& normsOut,
   }
   else { // lclNumRows != 0
     if (constantStride) {
-      if (whichNorm == NORM_INF) {
-        KokkosBlas::nrminf (normsOut, X);
-      }
-      else if (whichNorm == NORM_ONE) {
-        KokkosBlas::nrm1 (normsOut, X);
-      }
-      else if (whichNorm == NORM_TWO) {
-        KokkosBlas::nrm2_squared (normsOut, X);
+      if (X.extent(1) == 1) {
+        auto normsOut_0d = Kokkos::subview (normsOut, 0);
+        auto X_1d = Kokkos::subview (X, Kokkos::ALL(), 0);
+        if (whichNorm == NORM_INF) {
+          KokkosBlas::nrminf (normsOut_0d, X_1d);
+        }
+        else if (whichNorm == NORM_ONE) {
+          KokkosBlas::nrm1 (normsOut_0d, X_1d);
+        }
+        else if (whichNorm == NORM_TWO) {
+          KokkosBlas::nrm2_squared (normsOut_0d, X_1d);
+        }
+        else {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::logic_error, "Should never get here!");
+        }
       }
       else {
-        TEUCHOS_TEST_FOR_EXCEPTION
-          (true, std::logic_error, "Should never get here!");
+        if (whichNorm == NORM_INF) {
+          KokkosBlas::nrminf (normsOut, X);
+        }
+        else if (whichNorm == NORM_ONE) {
+          KokkosBlas::nrm1 (normsOut, X);
+        }
+        else if (whichNorm == NORM_TWO) {
+          KokkosBlas::nrm2_squared (normsOut, X);
+        }
+        else {
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (true, std::logic_error, "Should never get here!");
+        }
       }
     }
     else { // not constant stride