diff --git a/README.md b/README.md
index b816ed51..beccfedb 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD-
       for functional testing if you don't have native support for any given instruction set.
 
 ### Backward-compatibility notices:
+* Version 2.16.03 moved the position of the log-file name to the last column in the CSV output of `utils/bin/yask_log_to_csv.pl`.
 * Version 2.15.04 required a call to `yc_grid::set_dynamic_step_alloc(true)` to allow changing the
 allocation in the step (time) dimension for grid variables created at YASK compile-time.
 * Version 2.15.02 required all "misc" indices to be yask-compiler-time constants.
diff --git a/docs/YASK-intro.pdf b/docs/YASK-intro.pdf
index a4c56e86..98c14207 100644
Binary files a/docs/YASK-intro.pdf and b/docs/YASK-intro.pdf differ
diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp
index 5e477661..b0c62fc7 100644
--- a/include/yc_node_api.hpp
+++ b/include/yc_node_api.hpp
@@ -178,20 +178,22 @@ namespace yask {
             LHS.
 
             An optional domain condition may be provided to define the sub-domain
-            to which this equation applies. See new_first_domain_index()
-            for more information and an example.
-            Conditions are always evaluated with respect to the overall
+            to which this equation applies. 
+            Domain conditions are always evaluated with respect to the overall
             problem domain, i.e., independent of any specific
             MPI domain decomposition that might occur at run-time.
-            If a condition is not provided, the equation applies to the
+            If a domain condition is not provided, the equation applies to the
             entire problem domain.
             A domain condition can be added to an equation after its creation
             via yc_equation_node.set_cond().
+            See yc_equation_node.set_cond() for more information and an example.
 
-            A step-index condition is similar to a domain condition, but
-            applies to the current step (usually time).
-            A step-index condition can be added to an equation after its creation
+            A step condition is similar to a domain condition, but
+            enables or disables the entire equation based on the current step (usually time)
+            and/or other values.
+            A step condition can only be added to an equation after its creation
             via yc_equation_node.set_step_cond().
+            See yc_equation_node.set_step_cond() for more information and an example.
 
             @returns Pointer to new \ref yc_equation_node object.
         */
@@ -200,7 +202,7 @@ namespace yask {
                           /**< [in] Grid-point before EQUALS operator. */,
                           yc_number_node_ptr rhs
                           /**< [in] Expression after EQUALS operator. */,
-                          yc_bool_node_ptr cond = nullptr
+                          yc_bool_node_ptr sub_domain_cond = nullptr
                           /**< [in] Optional expression defining sub-domain
                              where `lhs EQUALS rhs` is valid. */ );
 
@@ -291,35 +293,7 @@ namespace yask {
            domain in `dim` dimension.
            The `dim` argument is created via new_domain_index().
 
-           Typical C++ usage:
-
-           \code{.cpp}
-           auto x = node_fac.new_domain_index("x");
-
-           // Create boolean expression for the
-           // boundary sub-domain "x < first_x + 10".
-           auto first_x = node_fac.new_first_domain_index(x);
-           auto left_bc_cond = node_fac.new_less_than_node(x, first_x + 10);
-
-           // Create a new equation that is valid in this range.
-           auto left_bc_eq =
-             node_fac.new_equation_node(grid_pt_expr, left_bc_expr, left_bc_cond);
-           \endcode
-
-           Specification of the "interior" part of a 2-D domain could be
-           represented by an expression similar to
-           `x >= new_first_domain_index(x) + 20 &&
-           x <= new_last_domain_index(x) - 20 &&
-           y >= new_first_domain_index(y) + 20 &&
-           y <= new_last_domain_index(y) - 20`.
-
-           @note The entire domain in dimension "x" would be represented by
-           `x >= new_first_domain_index(x) && x <= new_last_domain_index(x)`, but
-           that is the default condition so does not need to be specified.
-
-           @note Be sure to use an expression like "x < first_x + 10"
-           instead of merely "x < 10" to avoid the assumption that
-           the first index is always zero (0).
+           See yc_equation_node.set_cond() for more information and an example.
 
            @returns Pointer to new \ref yc_index_node object.
         */
@@ -333,6 +307,8 @@ namespace yask {
            domain in `dim` dimension.
            The `dim` argument is created via new_domain_index().
 
+           See yc_equation_node.set_cond() for more information and an example.
+
            @returns Pointer to new \ref yc_index_node object.
         */
         virtual yc_number_node_ptr
@@ -466,15 +442,80 @@ namespace yask {
             `nullptr` if not defined. */
         virtual yc_bool_node_ptr get_cond() =0;
 
-        /// Set the condition describing the sub-domain.
-        /** See yc_node_factory::new_equation_node(). */
-        virtual void set_cond(yc_bool_node_ptr cond
-                              /**< [in] Boolean expression describing the sub-domain
+        /// Set the condition describing the sub-domain for this equation.
+        /**
+           See yc_node_factory::new_equation_node() for an overall description
+           of conditions.
+        
+           Typical C++ usage to create a sub-domain condition:
+
+           \code{.cpp}
+           auto x = node_fac.new_domain_index("x");
+
+           // Create boolean expression for a 10-point wide left boundary area.
+           auto first_x = node_fac.new_first_domain_index(x);
+           auto left_bc_cond = x < first_x + 10;
+
+           // Indicate that an expression is valid only in this area.
+           // (Assumes left_bc_expr was already defined.)
+           left_bc_expr.set_cond(left_bc_cond);
+           \endcode
+
+           Specification of the "interior" part of a 2-D domain could be
+           represented by an expression like
+           `(x >= node_fac.new_first_domain_index(x) + 20) && (x <= node_fac.new_last_domain_index(x) - 20) && (y >= node_fac.new_first_domain_index(y) + 20) && (y <= node_fac.new_last_domain_index(y) - 20)`.
+
+           @warning For performance, sub-domain expressions are only
+           evaluated once when yk_solution::prepare_solution() is called,
+           and the results are analyzed and cached internally.  Thus,
+           sub-domain expressions should not include a step index or a
+           reference to any other varible that might change during or
+           between time-steps. See set_step_cond() for the mechanism to
+           enable equations based on variables that can change between
+           time-steps.
+
+           @note The entire domain in dimension "x" would be represented by
+           `(x >= node_fac.new_first_domain_index(x)) && (x <= node_fac.new_last_domain_index(x))`, but
+           that is the default condition so does not need to be specified.
+
+           @note Be sure to use an expression like `x < first_x + 10`
+           instead of merely `x < 10` to avoid the assumption that
+           the first index is always zero (0). More importantly, use
+           an expression like `x > last_x - 10` instead of hard-coding
+           the last index.
+        */
+        virtual void set_cond(yc_bool_node_ptr sub_domain_cond
+                              /**< [in] Boolean expression describing
+                                 where in the sub-domain this expression is valid
                                  or `nullptr` to remove the condition. */ ) =0;
 
-        /// Set the condition describing the valid step indices.
+        /// Set the condition describing when the equation is valid.
+        /**
+           See yc_node_factory::new_equation_node() for an overall description
+           of conditions.
+        
+           Typical C++ usage to create a step condition:
+
+           \code{.cpp}
+           auto t = node_fac.new_step_index("t");
+
+           // Create boolean expression that is true every third step.
+           auto my_step_cond = (t % 3 == 0);
+
+           // Indicate that an expression is valid only when step_cond is true.
+           // (Assumes my_expr was already defined.)
+           my_expr.set_step_cond(my_step_cond);
+           \endcode
+
+           Step conditions may also refer to elements in grid variables including
+           scalars (1-D) and arrays (2-D). For non-scalar variables, indices
+           used in a step condition _cannot_ include domain variables like `x` or `y`, but 
+           constants are allowed. In this way, equations can be enabled or
+           disabled programmatically by setting elements in the tested variables.
+        */
         virtual void set_step_cond(yc_bool_node_ptr step_cond
-                                   /**< [in] Boolean expression describing a valid step
+                                   /**< [in] Boolean expression describing 
+                                      when the expression is valid
                                       or `nullptr` to remove the condition. */ ) =0;
 
         /// Create a deep copy of AST starting with this node.
diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
index cd1378a9..5040b96c 100644
--- a/src/common/common_utils.cpp
+++ b/src/common/common_utils.cpp
@@ -46,7 +46,7 @@ namespace yask {
     // for numbers above 9 (at least up to 99).
 
     // Format: "major.minor.patch".
-    const string version = "2.16.02";
+    const string version = "2.16.03";
 
     string yask_get_version_string() {
         return version;
diff --git a/src/compiler/lib/Cpp.cpp b/src/compiler/lib/Cpp.cpp
index 4a967a98..a16ccf93 100644
--- a/src/compiler/lib/Cpp.cpp
+++ b/src/compiler/lib/Cpp.cpp
@@ -34,6 +34,10 @@ namespace yask {
     // Format a real, preserving precision.
     string CppPrintHelper::formatReal(double v) {
 
+        // Int representation equivalent?
+        if (double(int(v)) == v)
+            return to_string(int(v));
+        
         // IEEE double gives 15-17 significant decimal digits of precision per
         // https://en.wikipedia.org/wiki/Double-precision_floating-point_format.
         // Some precision might be lost if/when cast to a float, but that's ok.
diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp
index 8a067d9b..c4c84425 100644
--- a/src/compiler/lib/Eqs.cpp
+++ b/src/compiler/lib/Eqs.cpp
@@ -39,10 +39,6 @@ namespace yask {
     // and its input grids and points.
     class PointVisitor : public ExprVisitor {
 
-        // A set of all points to ensure pointers to each
-        // unique point have same value.
-        set<GridPoint> _all_pts;
-
         // A type to hold a mapping of equations to a set of grids in each.
         typedef unordered_set<Grid*> GridSet;
         typedef unordered_map<EqualsExpr*, Grid*> GridMap;
@@ -58,79 +54,105 @@ namespace yask {
 
         PointMap _lhs_pts; // outputs of eqs.
         PointSetMap _rhs_pts; // inputs of eqs.
+        PointSetMap _cond_pts;  // sub-domain expr inputs.
+        PointSetMap _step_cond_pts;  // step-cond expr inputs.
+        PointSetMap _all_pts;  // all points in each eq (union of above).
 
-        EqualsExpr* _eq=0;   // Current equation.
+        // Vars for indexing data.
+        EqualsExpr* _eq = 0;   // Current equation.
+        enum State { _in_lhs, _in_rhs, _in_cond, _in_step_cond } _state;
 
     public:
 
         // Ctor.
-        // 'pts' contains offsets from each point to create.
         PointVisitor() {}
         virtual ~PointVisitor() {}
 
+        // Get access to grids per eq.
         GridMap& getOutputGrids() { return _lhs_grids; }
         GridSetMap& getInputGrids() { return _rhs_grids; }
+
+        // Get access to pts per eq.
+        // Contains unique ptrs to pts, but pts may not
+        // be unique.
         PointMap& getOutputPts() { return _lhs_pts; }
         PointSetMap& getInputPts() { return _rhs_pts; }
-        int getNumEqs() const { return (int)_lhs_pts.size(); }
+        PointSetMap& getCondPts() { return _cond_pts; }
+        PointSetMap& getStepCondPts() { return _step_cond_pts; }
+        PointSetMap& getAllPts() { return _all_pts; }
 
-        // Determine whether 2 sets have any common points.
-        virtual bool do_sets_intersect(const GridSet& a,
-                                       const GridSet& b) {
-            for (auto ai : a) {
-                if (b.count(ai) > 0)
-                    return true;
-            }
-            return false;
-        }
-        virtual bool do_sets_intersect(const PointSet& a,
-                                       const PointSet& b) {
-            for (auto ai : a) {
-                if (b.count(ai) > 0)
-                    return true;
-            }
-            return false;
-        }
+        int getNumEqs() const { return (int)_lhs_pts.size(); }
 
         // Callback at an equality.
-        // Handles LHS grid pt explicitly, then visits RHS.
+        // Visits all parts that might have grid points.
         virtual string visit(EqualsExpr* ee) {
 
             // Set this equation as current one.
             _eq = ee;
 
-            // Make sure map entries exist for this eq.
+            // Make sure all map entries exist for this eq.
             _lhs_grids[_eq];
             _rhs_grids[_eq];
             _lhs_pts[_eq];
             _rhs_pts[_eq];
-
-            // Store LHS point.
-            auto* lhs = ee->getLhs().get();
-            _lhs_pts[_eq] = lhs;
-
-            // Add grid.
-            auto* g = lhs->getGrid();
-            _lhs_grids[_eq] = g;
-
+            _cond_pts[_eq];
+            _step_cond_pts[_eq];
+            _all_pts[_eq];
+
+            // visit LHS.
+            auto& lhs = ee->getLhs();
+            _state = _in_lhs;
+            lhs->accept(this);
+            
             // visit RHS.
             NumExprPtr rhs = ee->getRhs();
+            _state = _in_rhs;
             rhs->accept(this);
 
-            // Don't visit LHS because we've already saved it.
+            // visit conds.
+            auto& cp = ee->getCond();
+            if (cp) {
+                _state = _in_cond;
+                cp->accept(this);
+            }
+            auto& scp = ee->getStepCond();
+            if (scp) {
+                _state = _in_step_cond;
+                scp->accept(this);
+            }
             return "";
         }
 
-        // Callback at a grid point on the RHS.
+        // Callback at a grid point.
         virtual string visit(GridPoint* gp) {
             assert(_eq);
+            auto* g = gp->getGrid();
+            _all_pts[_eq].insert(gp);
 
-            // Store RHS point.
-            _rhs_pts[_eq].insert(gp);
+            // Save pt and/or grid based on state.
+            switch (_state) {
 
-            // Add grid.
-            auto* g = gp->getGrid();
-            _rhs_grids[_eq].insert(g);
+            case _in_lhs:
+                _lhs_pts[_eq] = gp;
+                _lhs_grids[_eq] = g;
+                break;
+
+            case _in_rhs:
+                _rhs_pts[_eq].insert(gp);
+                _rhs_grids[_eq].insert(g);
+                break;
+
+            case _in_cond:
+                _cond_pts[_eq].insert(gp);
+                break;
+
+            case _in_step_cond:
+                _step_cond_pts[_eq].insert(gp);
+                break;
+
+            default:
+                assert(0 && "illegal state");
+            }
             return "";
         }
     };
@@ -155,6 +177,8 @@ namespace yask {
         auto& inGrids = pt_vis.getInputGrids();
         auto& outPts = pt_vis.getOutputPts();
         auto& inPts = pt_vis.getInputPts();
+        //auto& condPts = pt_vis.getCondPts();
+        //auto& stepCondPts = pt_vis.getStepCondPts();
 
         // 1. Check each eq internally.
         os << "\nProcessing " << getNum() << " stencil equation(s)...\n";
@@ -317,10 +341,10 @@ namespace yask {
                         argn->getIntVal(); // throws exception if not an integer.
                     }
                 }
-            }
+            } // input pts.
 
             // TODO: check to make sure cond1 depends only on domain indices.
-            // TODO: check to make sure stcond1 depends only on step index.
+            // TODO: check to make sure stcond1 does not depend on domain indices.
         } // for all eqs.
 
         // 2. Check each pair of eqs.
@@ -520,6 +544,8 @@ namespace yask {
         SetVecVisitor(const Dimensions& dims) :
             _dims(dims) { 
             _visitEqualsLhs = true;
+            _visitGridPointArgs = true;
+            _visitConds = true;
         }
 
         // Check each grid point in expr.
@@ -529,7 +555,7 @@ namespace yask {
             // Never vectorize scalars.
             if (grid->get_num_dims() == 0) {
                 gp->setVecType(GridPoint::VEC_NONE);
-                return "";
+                return "";      // Also, no args to visit.
             }
 
             // Amount of vectorization allowed primarily depends on number
@@ -550,6 +576,8 @@ namespace yask {
             assert(fdoffsets <= grid_nfd);
 
             // All folded dims are vectorizable?
+            // NB: this will always be the case when there is
+            // no folding in the soln.
             if (fdoffsets == soln_nfd)
                 gp->setVecType(GridPoint::VEC_FULL); // all good.
 
@@ -560,7 +588,9 @@ namespace yask {
             // Uses no folded dims, so scalar only.
             else
                 gp->setVecType(GridPoint::VEC_NONE);
-            return "";
+
+            // Also check args of this grid point.
+            return ExprVisitor::visit(gp);
         }
     };
 
@@ -662,13 +692,8 @@ namespace yask {
         // Analyze each eq.
         for (auto& eq : getAll()) {
 
-            // Get sets of points for this eq.
-            auto* outPt1 = pv.getOutputPts().at(eq.get());
-            auto& inPts1 = pv.getInputPts().at(eq.get());
-
-            // Union of all input and output points for 'eq'.
-            auto allPts1 = inPts1;
-            allPts1.insert(outPt1);
+            // Get all grid points touched by this eq.
+            auto& allPts1 = pv.getAllPts().at(eq.get());
 
             // Update stats of each grid accessed in 'eq'.
             for (auto ap : allPts1) {
diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp
index 57fd895f..aa685a61 100644
--- a/src/compiler/lib/Expr.cpp
+++ b/src/compiler/lib/Expr.cpp
@@ -518,16 +518,6 @@ namespace yask {
         return expr;
     }
 
-    // Unused operators.
-    EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, const NumExprPtr rhs) {
-        THROW_YASK_EXCEPTION("Error: operator '==' is not allowed on a grid point;"
-                             " use 'EQUALS' to assert equality between a grid point (LHS)"
-                             " and an equation (RHS)");
-    }
-    EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, double rhs) {
-        return gpp EQUALS_OPER constNum(rhs);
-    }
-    
     // Define the value of a grid point.
     // Add this equation to the list of eqs for this stencil.
     EqualsExprPtr operator EQUALS_OPER(GridPointPtr gpp, const NumExprPtr rhs) {
@@ -759,7 +749,7 @@ namespace yask {
     }
     string GridPoint::getGridPtr() const {
         string gname = _grid->getName();
-        string expr = "static_cast<_context_type::" + gname + "_type*>(_context->" + gname;
+        string expr = "static_cast<_context_type::" + gname + "_type*>(_context_data->" + gname;
         if (_grid->isScratch())
             expr += "_list[region_thread_idx].get()";
         expr += ")";
diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp
index 61e6b292..e31bcab7 100644
--- a/src/compiler/lib/Expr.hpp
+++ b/src/compiler/lib/Expr.hpp
@@ -68,6 +68,8 @@ namespace yask {
     class NumExpr;
     typedef shared_ptr<NumExpr> NumExprPtr;
     typedef vector<NumExprPtr> NumExprPtrVec;
+    class GridPoint;
+    typedef shared_ptr<GridPoint> GridPointPtr;
     class IndexExpr;
     typedef shared_ptr<IndexExpr> IndexExprPtr;
     typedef vector<IndexExprPtr> IndexExprPtrVec;
@@ -79,8 +81,6 @@ namespace yask {
     // More forward-decls.
     class ExprVisitor;
     class Grid;
-    class GridPoint;
-    typedef shared_ptr<GridPoint> GridPointPtr;
     class StencilSolution;
     struct Dimensions;
 
@@ -1129,26 +1129,35 @@ namespace yask {
 #define IS_EQUIV_TO EQUALS_OPER
 #define IS_EQUIVALENT_TO EQUALS_OPER
 
-    // Catch use of the old '==' operator.
-#define BAD_OPER1 ==
-    EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, const NumExprPtr rhs);
-    EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, double rhs);
-
-    // Binary numerical-to-boolean operators.
-    // Must provide explicit IndexExprPtr operands to keep compiler from
-    // using built-in pointer comparison.
+    // Binary numerical-to-boolean operators.  Must provide more explicit
+    // ptr-type operands than used with math operators to keep compiler from
+    // using built-in pointer comparison. This means we need all
+    // permutations of {NumExpr,IndexExpr,GridPoint}Ptr.  Const values must
+    // be on RHS of operator, e.g., 'x > 5' is ok, but '5 < x' is not.
 #define BOOL_OPER(oper, type, implType)                                          \
     inline BoolExprPtr operator oper(const NumExprPtr lhs, const NumExprPtr rhs) { \
         return make_shared<type>(lhs, rhs); } \
-    inline BoolExprPtr operator oper(const IndexExprPtr lhs, const NumExprPtr rhs) { \
-        return make_shared<type>(lhs, rhs); } \
     inline BoolExprPtr operator oper(const NumExprPtr lhs, const IndexExprPtr rhs) { \
         return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const NumExprPtr lhs, const GridPointPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const IndexExprPtr lhs, const NumExprPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
     inline BoolExprPtr operator oper(const IndexExprPtr lhs, const IndexExprPtr rhs) { \
         return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const IndexExprPtr lhs, const GridPointPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const GridPointPtr lhs, const NumExprPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const GridPointPtr lhs, const IndexExprPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
+    inline BoolExprPtr operator oper(const GridPointPtr lhs, const GridPointPtr rhs) { \
+        return make_shared<type>(lhs, rhs); } \
     inline BoolExprPtr operator oper(const NumExprPtr lhs, double rhs) { \
         return make_shared<type>(lhs, constNum(rhs)); }                 \
     inline BoolExprPtr operator oper(const IndexExprPtr lhs, double rhs) { \
+        return make_shared<type>(lhs, constNum(rhs)); }                 \
+    inline BoolExprPtr operator oper(const GridPointPtr lhs, double rhs) { \
         return make_shared<type>(lhs, constNum(rhs)); }
 
     BOOL_OPER(==, IsEqualExpr, yc_equals_node)
@@ -1196,15 +1205,9 @@ namespace yask {
     // - A variable within the global or current namespace where it will be used.
     // - A local variable in the 'value' method; in this case, the value
     //   of the local var must be evaluated and inserted in the expr.
-    // Example code:
-    //   GridValue v;
-    //   SET_VALUE_FROM_EXPR(v =, "_context->temp * " << 0.2);
-    //   SET_VALUE_FROM_EXPR(v +=, "_context->coeff[" << r << "]");
-    // This example would generate the following partial expression (when r=9):
-    //   (_context->temp * 2.00000000000000000e-01) + (_context->coeff[9])
 #define SET_VALUE_FROM_EXPR(lhs, rhs) do {      \
         ostringstream oss;                      \
-        oss << setprecision(17) << scientific;  \
+        oss << setprecision(15) << scientific;  \
         oss << "(" << rhs << ")";               \
         lhs  make_shared<CodeExpr>(oss.str());  \
     } while(0)
diff --git a/src/compiler/lib/Print.cpp b/src/compiler/lib/Print.cpp
index 79549d05..ce7f63cd 100644
--- a/src/compiler/lib/Print.cpp
+++ b/src/compiler/lib/Print.cpp
@@ -441,11 +441,11 @@ namespace yask {
         // Comment about condition.
         // Null ptr => no condition.
         if (ee->getCond()) {
-            string cond = ee->getCond()->accept(this);
+            string cond = ee->getCond()->makeStr();
             _os << " IF (" << cond << ")";
         }
         if (ee->getStepCond()) {
-            string cond = ee->getStepCond()->accept(this);
+            string cond = ee->getStepCond()->makeStr();
             _os << " IF_STEP (" << cond << ")";
         }
         _os << ".\n";
diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp
index 736c0c0a..e07d3979 100644
--- a/src/compiler/lib/Soln.hpp
+++ b/src/compiler/lib/Soln.hpp
@@ -91,7 +91,8 @@ namespace yask {
         StencilSolution(const string& name) :
             _name(name) {
             yask_output_factory ofac;
-            set_debug_output(ofac.new_stdout_output());
+            auto so = ofac.new_stdout_output();
+            set_debug_output(so);
         }
         virtual ~StencilSolution() {}
 
diff --git a/src/compiler/lib/Visitor.hpp b/src/compiler/lib/Visitor.hpp
index 3b65b215..fb3cd706 100644
--- a/src/compiler/lib/Visitor.hpp
+++ b/src/compiler/lib/Visitor.hpp
@@ -38,6 +38,8 @@ namespace yask {
     class ExprVisitor {
     protected:
         bool _visitEqualsLhs = false; // whether to visit LHS of EQUALS.
+        bool _visitGridPointArgs = false;   // whether to visit exprs in grid point args.
+        bool _visitConds = false;           // whether to visit conditional exprs.
 
     public:
         virtual ~ExprVisitor() { }
@@ -46,7 +48,16 @@ namespace yask {
         virtual string visit(ConstExpr* ce) { return ""; }
         virtual string visit(CodeExpr* ce) { return ""; }
         virtual string visit(IndexExpr* ie) { return ""; }
-        virtual string visit(GridPoint* gp) { return ""; } // does NOT visit arg exprs by default.
+
+        // Visit grid-point args only if flag is set.
+        virtual string visit(GridPoint* gp) {
+            string res;
+            if (_visitGridPointArgs) {
+                for (auto& arg : gp->getArgs())
+                    res = arg->accept(this);
+            }
+            return res; 
+        }
 
         // By default, a unary visitor visits its operand.
         virtual string visit(UnaryNumExpr* ue) {
@@ -88,14 +99,22 @@ namespace yask {
                 res = ep->accept(this);
             return res;
         }
-        virtual string visit(EqualsExpr* ee) {
 
-            // Visit LHS if flag is set.
+        // Visit RHS of equals and LHS and conditions per flags.
+        virtual string visit(EqualsExpr* ee) {
             if (_visitEqualsLhs)
                 ee->getLhs()->accept(this);
+            if (_visitConds) {
+                auto& cp = ee->getCond();
+                if (cp)
+                    cp->accept(this);
+                auto& scp = ee->getStepCond();
+                if (scp)
+                    scp->accept(this);
+            }
+
+            // Always visit RHS.
             return ee->getRhs()->accept(this);
-
-            // Conditions are not visited.
         }
     };
 
diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp
index ea5c1f8f..9d2d0e81 100644
--- a/src/compiler/lib/YaskKernel.cpp
+++ b/src/compiler/lib/YaskKernel.cpp
@@ -336,7 +336,7 @@ namespace yask {
             ctorCode += " " + grid + "_dim_names = {" +
                 gdims.makeDimStr(", ", "\"", "\"") + "};\n";
             string initCode = " " + grid + "_ptr = std::make_shared<" + typeDef +
-                ">(_dims, \"" + grid + "\", " + grid + "_dim_names, &_opts, &_ostr);\n"
+                ">(*this, \"" + grid + "\", " + grid + "_dim_names);\n"
                 " assert(" + grid + "_ptr);\n";
 
             // Grid vars.
@@ -448,7 +448,7 @@ namespace yask {
                 if (!firstGrid)
                     newGridCode += " else";
                 newGridCode += " if (dims == " + grid + "_dim_names) gp = std::make_shared<" +
-                    typeDef + ">(_dims, name, dims, &_opts, &_ostr);\n";
+                    typeDef + ">(*this, name, dims);\n";
             }
 
         } // grids.
@@ -503,7 +503,7 @@ namespace yask {
                 "\n class " << egsName << " : public StencilBundleBase {\n"
                 " protected:\n"
                 " typedef " << _context_base << " _context_type;\n"
-                " _context_type* _context = 0;\n"
+                " _context_type* _context_data = 0;\n"
                 " public:\n";
 
             // Stats for this eqBundle.
@@ -518,7 +518,7 @@ namespace yask {
             {
                 os << " " << egsName << "(" << _context_base << "* context) :\n"
                     " StencilBundleBase(context),\n"
-                    " _context(context) {\n"
+                    " _context_data(context) {\n"
                     " _name = \"" << egName << "\";\n"
                     " _scalar_fp_ops = " << stats.getNumOps() << ";\n"
                     " _scalar_points_read = " << stats.getNumReads() << ";\n"
@@ -529,9 +529,9 @@ namespace yask {
                 os << "\n // The following grid(s) are read by " << egsName << endl;
                 for (auto gp : eq->getInputGrids()) {
                     if (gp->isScratch())
-                        os << "  inputScratchVecs.push_back(&_context->" << gp->getName() << "_list);\n";
+                        os << "  inputScratchVecs.push_back(&_context_data->" << gp->getName() << "_list);\n";
                     else
-                        os << "  inputGridPtrs.push_back(_context->" << gp->getName() << "_ptr);\n";
+                        os << "  inputGridPtrs.push_back(_context_data->" << gp->getName() << "_ptr);\n";
                 }
                 os << "\n // The following grid(s) are written by " << egsName;
                 if (eq->step_expr)
@@ -539,9 +539,9 @@ namespace yask {
                 os << ".\n";
                 for (auto gp : eq->getOutputGrids()) {
                     if (gp->isScratch())
-                        os << "  outputScratchVecs.push_back(&_context->" << gp->getName() << "_list);\n";
+                        os << "  outputScratchVecs.push_back(&_context_data->" << gp->getName() << "_list);\n";
                     else
-                        os << "  outputGridPtrs.push_back(_context->" << gp->getName() << "_ptr);\n";
+                        os << "  outputGridPtrs.push_back(_context_data->" << gp->getName() << "_ptr);\n";
                 }
                 os << " } // Ctor." << endl;
             }
@@ -557,7 +557,9 @@ namespace yask {
                     os << " return " << eq->cond->makeStr() << ";\n";
                 else
                     os << " return true; // full domain.\n";
-                os << " }\n"
+                os << " }\n";
+
+                os << "\n // Return whether there is a sub-domain expression.\n"
                     " virtual bool is_sub_domain_expr() const {\n"
                     "  return " << (eq->cond ? "true" : "false") <<
                     ";\n }\n";
@@ -573,16 +575,40 @@ namespace yask {
 
             // Step condition.
             {
-                os << endl << " // Determine whether " << egsName << " is valid at the step " <<
-                    _dims._stencilDims.makeDimStr() << ".\n"
-                    " // Return true if indices are within the valid sub-domain or false otherwise.\n"
+                os << endl << " // Determine whether " << egsName <<
+                    " is valid at the step input_step_index.\n" <<
+                    " // Return true if valid or false otherwise.\n"
                     " virtual bool is_in_valid_step(idx_t input_step_index) const final {\n";
-                if (eq->step_cond)
+                if (eq->step_cond) {
                     os << " idx_t " << _dims._stepDim << " = input_step_index;\n"
-                        " return " << eq->step_cond->makeStr() << ";\n";
+                        "\n // " << eq->step_cond->makeStr() << "\n";
+                    
+                    // C++ scalar print assistant.
+                    CounterVisitor cv;
+                    eq->step_cond->accept(&cv);
+                    CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n");
+
+                    // Generate the code.
+                    PrintVisitorTopDown pcv(os, *sp);
+                    string expr = eq->step_cond->accept(&pcv);
+                    os << " return " << expr << ";\n";
+                }
                 else
                     os << " return true; // any step.\n";
                 os << " }\n";
+
+                os << "\n // Return whether there is a step-condition expression.\n"
+                    " virtual bool is_step_cond_expr() const {\n"
+                    "  return " << (eq->step_cond ? "true" : "false") <<
+                    ";\n }\n";
+
+                os << "\n // Return human-readable description of step condition.\n"
+                    " virtual std::string get_step_cond_description() const {\n";
+                if (eq->step_cond)
+                    os << " return \"" << eq->step_cond->makeStr() << "\";\n";
+                else
+                    os << " return \"true\"; // any step.\n";
+                os << " }\n";
             }
 
             // LHS step index.
@@ -620,8 +646,8 @@ namespace yask {
 
                 // C++ scalar print assistant.
                 CounterVisitor cv;
-                    eq->visitEqs(&cv);
-                    CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n");
+                eq->visitEqs(&cv);
+                CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n");
 
                 // Generate the code.
                 PrintVisitorBottomUp pcv(os, *sp);
@@ -823,8 +849,8 @@ namespace yask {
             if (bp->isScratch())
                 continue;
             string bpName = bp->getName();
-            os << "  auto " << bpName << " = std::make_shared<BundlePack>(\"" <<
-                bpName << "\", this);\n";
+            os << "  auto " << bpName << " = std::make_shared<BundlePack>(this, \"" <<
+                bpName << "\");\n";
             for (auto& eg : bp->getBundles()) {
                 if (eg->isScratch())
                     continue;
diff --git a/src/kernel/Makefile b/src/kernel/Makefile
index 0ea0f25a..d3e07e43 100644
--- a/src/kernel/Makefile
+++ b/src/kernel/Makefile
@@ -417,10 +417,16 @@ ifneq ($(filter -O0,$(YK_CXXOPT)),)
  check		?=	1
 endif
 
-# Turn on checking macro.
+# Turn on debug macros.
 ifeq ($(check),1)
  MACROS		+=	CHECK
 endif
+ifeq ($(trace),1)
+ MACROS		+=	TRACE
+endif
+ifeq ($(trace_mem),1)
+ MACROS		+=	TRACE_MEM
+endif
 
 # Set MACROS based on individual makefile vars.
 MACROS		+=	PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2)
@@ -1008,8 +1014,8 @@ help:
 	echo " "; \
 	echo "Example debug builds of kernel cmd-line tool:"; \
 	echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 # No OpenMP or MPI, internal checking"; \
-	echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=test_2d mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 EXTRA_MACROS='TRACE'	 # TRACE is a useful debug setting!"; \
-	echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' check=1 EXTRA_MACROS='TRACE TRACE_MEM TRACE_INTRINSICS'"; \
+	echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=test_2d mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 trace=1 # trace is a useful debug setting!"; \
+	echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' check=1 trace=1 trace_mem=1"; \
 	echo " "; \
 	echo "Example builds with test runs:"; \
 	echo " $(MAKE) -j all # Normal full API and stencil tests"; \
diff --git a/src/kernel/lib/alloc.cpp b/src/kernel/lib/alloc.cpp
index 9b5d87ae..833bad3e 100644
--- a/src/kernel/lib/alloc.cpp
+++ b/src/kernel/lib/alloc.cpp
@@ -55,7 +55,7 @@ namespace yask {
                                      const map <int, size_t>& ngrids,
                                      map <int, shared_ptr<char>>& data_buf,
                                      const std::string& type) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Loop through each mem type.
         for (const auto& i : nbytes) {
@@ -119,7 +119,7 @@ namespace yask {
 
     // Allocate memory for grids that do not already have storage.
     void StencilContext::allocGridData() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Allocate I/O grids before read-only grids.
         GridPtrs sortedGridPtrs;
@@ -231,7 +231,7 @@ namespace yask {
     // Determine the size and shape of all MPI buffers.
     // Create buffers and allocate them.
     void StencilContext::allocMpiData() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Remove any old MPI data.
         env->global_barrier();
@@ -245,11 +245,11 @@ namespace yask {
 
         map<int, int> num_exchanges; // send/recv => count.
         map<int, idx_t> num_elems; // send/recv => count.
-        auto me = _env->my_rank;
+        auto me = env->my_rank;
 
         // Need to determine the size and shape of all MPI buffers.
         // Loop thru all neighbors of this rank.
-        _mpiInfo->visitNeighbors
+        mpiInfo->visitNeighbors
             ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) {
                 if (neigh_rank == MPI_PROC_NULL)
                     return; // from lambda fn.
@@ -269,7 +269,7 @@ namespace yask {
                     maxdist = NUM_STENCIL_DIMS - 1;
 
                 // Manhattan dist. of current neighbor.
-                int mandist = _mpiInfo->man_dists.at(neigh_idx);
+                int mandist = mpiInfo->man_dists.at(neigh_idx);
 
                 // Check distance.
                 if (mandist > maxdist) {
@@ -282,8 +282,8 @@ namespace yask {
                 // Both my rank and neighbor rank must have *all* domain sizes
                 // of vector multiples.
                 bool vec_ok = allow_vec_exchange &&
-                    _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] &&
-                    _mpiInfo->has_all_vlen_mults[neigh_idx];
+                    mpiInfo->has_all_vlen_mults[mpiInfo->my_neighbor_index] &&
+                    mpiInfo->has_all_vlen_mults[neigh_idx];
 
                 // Determine size of MPI buffers between neigh_rank and my
                 // rank for each grid and create those that are needed.  It
@@ -306,7 +306,7 @@ namespace yask {
                     IdxTuple my_halo_sizes, neigh_halo_sizes;
                     IdxTuple first_inner_idx, last_inner_idx;
                     IdxTuple first_outer_idx, last_outer_idx;
-                    for (auto& dim : _dims->_domain_dims.getDims()) {
+                    for (auto& dim : domain_dims.getDims()) {
                         auto& dname = dim.getName();
 
                         // Only consider domain dims that are used in this grid.
@@ -324,9 +324,9 @@ namespace yask {
                             idx_t lidx = gp->get_last_rank_domain_index(dname);
                             first_inner_idx.addDimBack(dname, fidx);
                             last_inner_idx.addDimBack(dname, lidx);
-                            if (_opts->is_first_rank(dname))
+                            if (opts->is_first_rank(dname))
                                 fidx -= lhalo; // extend into left halo.
-                            if (_opts->is_last_rank(dname))
+                            if (opts->is_last_rank(dname))
                                 lidx += rhalo; // extend into right halo.
                             first_outer_idx.addDimBack(dname, fidx);
                             last_outer_idx.addDimBack(dname, lidx);
@@ -414,7 +414,7 @@ namespace yask {
                     // to be so.
                     // TODO: add a heuristic to avoid increasing by a large factor.
                     if (grid_vec_ok) {
-                        for (auto& dim : _dims->_domain_dims.getDims()) {
+                        for (auto& dim : domain_dims.getDims()) {
                             auto& dname = dim.getName();
                             if (gp->is_dim_used(dname)) {
                                 auto vlen = gp->_get_vec_len(dname);
@@ -448,7 +448,7 @@ namespace yask {
                         IdxTuple copy_end = gp->get_allocs(); // one past last!
 
                         // Adjust along domain dims in this grid.
-                        for (auto& dim : _dims->_domain_dims.getDims()) {
+                        for (auto& dim : domain_dims.getDims()) {
                             auto& dname = dim.getName();
                             if (gp->is_dim_used(dname)) {
 
@@ -480,7 +480,7 @@ namespace yask {
                                         // Adjust LHS of interior.
                                         idx_t ext_end = ROUND_UP(first_inner_idx[dname] +
                                                                  max(min_ext, neigh_halo_sizes[dname]),
-                                                                 _dims->_fold_pts[dname]);
+                                                                 dims->_fold_pts[dname]);
                                         mpi_interior.bb_begin[dname] =
                                             max(mpi_interior.bb_begin[dname], ext_end);
                                     }
@@ -495,7 +495,7 @@ namespace yask {
                                         // Adjust RHS of interior.
                                         idx_t ext_begin = ROUND_DOWN(last_inner_idx[dname] + 1 -
                                                                      max(min_ext, neigh_halo_sizes[dname]),
-                                                                     _dims->_fold_pts[dname]);
+                                                                     dims->_fold_pts[dname]);
                                         mpi_interior.bb_end[dname] =
                                             min(mpi_interior.bb_end[dname], ext_begin);
                                     }
@@ -538,7 +538,7 @@ namespace yask {
                             idx_t dsize = 1;
 
                             // domain dim?
-                            if (_dims->_domain_dims.lookup(dname)) {
+                            if (domain_dims.lookup(dname)) {
                                 dsize = copy_end[dname] - copy_begin[dname];
 
                                 // Check whether alignment and size are multiple of vlen.
@@ -600,7 +600,7 @@ namespace yask {
                         IdxTuple copy_last = copy_end.subElements(1);
 
                         // Make MPI data entry for this grid.
-                        auto gbp = mpiData.emplace(gname, _mpiInfo);
+                        auto gbp = mpiData.emplace(gname, state->_mpiInfo);
                         auto& gbi = gbp.first; // iterator from pair returned by emplace().
                         auto& gbv = gbi->second; // value from iterator.
                         auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets);
@@ -793,7 +793,7 @@ namespace yask {
     // Allocate memory for scratch grids based on number of threads and
     // block sizes.
     void StencilContext::allocScratchData() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Remove any old scratch data.
         freeScratchData();
@@ -815,7 +815,7 @@ namespace yask {
         // Find the max mini-block size across all packs.
         // They can be different across packs when pack-specific
         // auto-tuning has been used.
-        IdxTuple mblksize(_dims->_domain_dims);
+        IdxTuple mblksize(domain_dims);
         for (auto& sp : stPacks) {
             auto& psettings = sp->getActiveSettings();
             DOMAIN_VAR_LOOP(i, j) {
@@ -851,7 +851,7 @@ namespace yask {
                     int numa_pref = gp->get_numa_preferred();
 
                     // Loop through each domain dim.
-                    for (auto& dim : _dims->_domain_dims.getDims()) {
+                    for (auto& dim : domain_dims.getDims()) {
                         auto& dname = dim.getName();
 
                         if (gp->is_dim_used(dname)) {
@@ -866,8 +866,8 @@ namespace yask {
 
                             // Pads.
                             // Set via both 'extra' and 'min'; larger result will be used.
-                            gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]);
-                            gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]);
+                            gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]);
+                            gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]);
                         }
                     } // dims.
 
diff --git a/src/kernel/lib/auto_tuner.cpp b/src/kernel/lib/auto_tuner.cpp
index 1dfbd4ed..452873d8 100644
--- a/src/kernel/lib/auto_tuner.cpp
+++ b/src/kernel/lib/auto_tuner.cpp
@@ -31,11 +31,24 @@ using namespace std;
 
 namespace yask {
 
+    // Ctor.
+    AutoTuner::AutoTuner(StencilContext* context,
+                         KernelSettings* settings,
+                         const std::string& name) :
+        ContextLinker(context),
+        _settings(settings),
+        _name("auto-tuner") {
+        assert(settings);
+        if (name.length())
+            _name += "(" + name + ")";
+    }
+    
     // Eval auto-tuner for given number of steps.
     void StencilContext::eval_auto_tuner(idx_t num_steps) {
+        STATE_VARS(this);
         _at.steps_done += num_steps;
 
-        if (_use_pack_tuners) {
+        if (state->_use_pack_tuners) {
             for (auto& sp : stPacks)
                 sp->getAT().eval();
         }
@@ -52,8 +65,9 @@ namespace yask {
 
     // Determine if any auto tuners are running.
     bool StencilContext::is_auto_tuner_enabled() const {
+        STATE_VARS(this);
         bool done = true;
-        if (_use_pack_tuners) {
+        if (state->_use_pack_tuners) {
             for (auto& sp : stPacks)
                 if (!sp->getAT().is_done())
                     done = false;
@@ -65,7 +79,7 @@ namespace yask {
     // Apply auto-tuning immediately, i.e., not as part of normal processing.
     // Will alter data in grids.
     void StencilContext::run_auto_tuner_now(bool verbose) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         if (!rank_bb.bb_valid)
             THROW_YASK_EXCEPTION("Error: run_auto_tuner_now() called without calling prepare_solution() first");
 
@@ -90,7 +104,7 @@ namespace yask {
 
         // Determine number of steps to run.
         // If wave-fronts are enabled, run a max number of these steps.
-        idx_t step_dir = _dims->_step_dir; // +/- 1.
+        idx_t step_dir = dims->_step_dir; // +/- 1.
         idx_t step_t = min(max(wf_steps, idx_t(1)), +AutoTuner::max_step_t) * step_dir;
 
         // Run time-steps until AT converges.
@@ -106,7 +120,7 @@ namespace yask {
 
         // Wait for all ranks to finish.
         os << "Waiting for auto-tuner to converge on all ranks...\n";
-        _env->global_barrier();
+        env->global_barrier();
 
         // reenable normal operation.
 #ifndef NO_HALO_EXCHANGE
@@ -118,7 +132,7 @@ namespace yask {
         at_timer.stop();
         os << "Auto-tuner done after " << steps_done << " step(s) in " <<
             at_timer.get_elapsed_secs() << " secs.\n";
-        if (_use_pack_tuners) {
+        if (state->_use_pack_tuners) {
             for (auto& sp : stPacks)
                 sp->getAT().print_settings(os);
         } else
@@ -142,9 +156,8 @@ namespace yask {
     
     // Reset the auto-tuner.
     void AutoTuner::clear(bool mark_done, bool verbose) {
+        STATE_VARS(this);
 
-        // Output.
-        ostream& os = _context->get_ostr();
 #ifdef TRACE
         this->verbose = true;
 #else
@@ -176,15 +189,14 @@ namespace yask {
         steps_done = 0;
 
         // Set min blocks to number of region threads.
-        min_blks = _context->set_region_threads();
+        min_blks = set_region_threads();
 
         // Adjust starting block if needed.
-        auto& opts = _context->get_settings();
         for (auto dim : center_block.getDims()) {
             auto& dname = dim.getName();
             auto& dval = dim.getVal();
 
-            if (dname == opts->_dims->_step_dim) {
+            if (dname == step_dim) {
                 block_steps = opts->_block_sizes[dname];
                 center_block[dname] = block_steps;
             } else {
@@ -194,15 +206,15 @@ namespace yask {
             }
         }
         if (!done) {
-            TRACE_MSG2(_name << ": starting block-size: "  <<
+            TRACE_MSG(_name << ": starting block-size: "  <<
                        center_block.makeDimValStr(" * "));
-            TRACE_MSG2(_name << ": starting search radius: " << radius);
+            TRACE_MSG(_name << ": starting search radius: " << radius);
         }
     } // clear.
 
     // Evaluate the previous run and take next auto-tuner step.
     void AutoTuner::eval() {
-        CONTEXT_VARS(_context);
+        STATE_VARS(this);
 
         // Get elapsed time and reset.
         double etime = timer.get_elapsed_secs();
@@ -248,10 +260,7 @@ namespace yask {
             csteps << " steps(s) in " << ctime <<
             " secs (" << rate <<
             " steps/sec) with block-size " <<
-            _settings->_block_sizes.makeDimValStr(" * ");
-        if (_context->tb_steps > 0)
-            os << ", " << _context->tb_steps << " TB step(s)";
-        os << endl;
+            _settings->_block_sizes.makeDimValStr(" * ") << endl;
         csteps = 0;
         ctime = 0.;
 
@@ -335,7 +344,7 @@ namespace yask {
                     bsize[dname] = sz;
 
                 } // domain dims.
-                TRACE_MSG2(_name << ": checking block-size "  <<
+                TRACE_MSG(_name << ": checking block-size "  <<
                           bsize.makeDimValStr(" * "));
 
                 // Too small?
@@ -346,7 +355,6 @@ namespace yask {
 
                 // Too few?
                 else if (ok) {
-                    auto& opts = _context->get_settings();
                     idx_t nblks = get_num_domain_points(opts->_region_sizes) /
                         get_num_domain_points(bsize);
                     if (nblks < min_blks) {
@@ -393,10 +401,10 @@ namespace yask {
                         os << _name << ": done" << endl;
                         return;
                     }
-                    TRACE_MSG2(_name << ": new search radius=" << radius);
+                    TRACE_MSG(_name << ": new search radius=" << radius);
                 }
                 else {
-                    TRACE_MSG2(_name << ": continuing search from block " <<
+                    TRACE_MSG(_name << ": continuing search from block " <<
                                center_block.makeDimValStr(" * "));
                 }
             } // beyond next neighbor of center.
@@ -406,13 +414,13 @@ namespace yask {
         // Assumption is that block size in one pack doesn't affect
         // perf in another pack.
         apply();
-        TRACE_MSG2(_name << ": next block-size "  <<
+        TRACE_MSG(_name << ": next block-size "  <<
                   _settings->_block_sizes.makeDimValStr(" * "));
     } // eval.
 
     // Apply auto-tuner settings to prepare for a run.
     void AutoTuner::apply() {
-        CONTEXT_VARS(_context);
+        STATE_VARS(this);
 
         // Restore step-dim value for block.
         _settings->_block_sizes[step_posn] = block_steps;
@@ -426,19 +434,23 @@ namespace yask {
         _settings->_mini_block_group_sizes.setValsSame(0);
         _settings->_block_group_sizes.setValsSame(0);
 
+        // Save debug output and set to null.
+        auto saved_op = get_debug_output();
+        set_debug_output(nullop);
+
         // Make sure everything is resized based on block size.
-        auto saved_op = cp->get_debug_output();
-        cp->set_debug_output(nullop);
-        _settings->adjustSettings(cp->get_env());
+        _settings->adjustSettings();
 
         // Update temporal blocking info.
-        cp->update_tb_info();
+        _context->update_tb_info();
 
-        // Reallocate scratch data based on new block size.
+        // Reallocate scratch data based on new mini-block size.
         // TODO: only do this when blocks have increased or
         // decreased by a certain percentage.
         _context->allocScratchData();
-        cp->set_debug_output(saved_op);
+
+        // Restore debug output.
+        set_debug_output(saved_op);
     }
 
 } // namespace yask.
diff --git a/src/kernel/lib/auto_tuner.hpp b/src/kernel/lib/auto_tuner.hpp
index b49218b9..9712b29f 100644
--- a/src/kernel/lib/auto_tuner.hpp
+++ b/src/kernel/lib/auto_tuner.hpp
@@ -28,10 +28,16 @@ IN THE SOFTWARE.
 namespace yask {
 
     // Auto-tuner for setting block size.
-    class AutoTuner {
+    class AutoTuner :
+        public ContextLinker {
+
     protected:
-        StencilContext* _context = 0;
-        KernelSettings* _settings = 0; // settings to change.
+
+        // Settings to change. May be pointer to solution settings
+        // or local settings for a pack.
+        KernelSettings* _settings = 0;
+
+        // Name of tuner.
         std::string _name;
 
         // Null stream to throw away debug info.
@@ -75,15 +81,10 @@ namespace yask {
     public:
         static constexpr idx_t max_step_t = 4;
 
-        AutoTuner(StencilContext* ctx,
+        AutoTuner(StencilContext* context,
                   KernelSettings* settings,
-                  const std::string& name = "") :
-            _context(ctx),
-            _settings(settings) {
-            _name = "auto-tuner";
-            if (name.length())
-                _name += "(" + name + ")";
-        }
+                  const std::string& name = "");
+        virtual ~AutoTuner() {}
 
         // Start & stop this timer to track elapsed time.
         YaskTimer timer;
diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp
index 90eb756d..078f296d 100644
--- a/src/kernel/lib/context.cpp
+++ b/src/kernel/lib/context.cpp
@@ -31,24 +31,12 @@ using namespace std;
 
 namespace yask {
 
-    // Set debug output to cout if my_rank == msg_rank
-    // or a null stream otherwise.
-    ostream& StencilContext::set_ostr() {
-        yask_output_factory yof;
-        if (_env->my_rank == _opts->msg_rank)
-            set_debug_output(yof.new_stdout_output());
-        else
-            set_debug_output(yof.new_null_output());
-        assert(_ostr);
-        return *_ostr;
-    }
-
     ///// Top-level methods for evaluating reference and optimized stencils.
 
     // Eval stencil bundle(s) over grid(s) using reference scalar code.
     void StencilContext::run_ref(idx_t first_step_index,
                                  idx_t last_step_index) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         run_time.start();
 
         // Determine step dir from order of first/last.
@@ -69,10 +57,10 @@ namespace yask {
         // Begin & end tuples.
         // Based on rank bounding box, not extended
         // BB because we don't use wave-fronts in the ref code.
-        IdxTuple begin(_dims->_stencil_dims);
+        IdxTuple begin(stencil_dims);
         begin.setVals(rank_bb.bb_begin, false);
         begin[step_dim] = begin_t;
-        IdxTuple end(_dims->_stencil_dims);
+        IdxTuple end(stencil_dims);
         end.setVals(rank_bb.bb_end, false);
         end[step_dim] = end_t;
 
@@ -81,16 +69,16 @@ namespace yask {
 
         // Force sub-sizes to whole rank size so that scratch
         // grids will be large enough. Turn off any temporal blocking.
-        _opts->_region_sizes.setValsSame(0);
-        _opts->_block_sizes.setValsSame(0);
-        _opts->_mini_block_sizes.setValsSame(0);
-        _opts->_sub_block_sizes.setValsSame(0);
-        _opts->adjustSettings(get_env());
+        opts->_region_sizes.setValsSame(0);
+        opts->_block_sizes.setValsSame(0);
+        opts->_mini_block_sizes.setValsSame(0);
+        opts->_sub_block_sizes.setValsSame(0);
+        opts->adjustSettings();
         update_grid_info();
 
         // Copy these settings to packs and realloc scratch grids.
         for (auto& sp : stPacks)
-            sp->getLocalSettings() = *_opts;
+            sp->getLocalSettings() = *opts;
         allocScratchData();
 
         // Use only one set of scratch grids.
@@ -98,7 +86,7 @@ namespace yask {
 
         // Indices to loop through.
         // Init from begin & end tuples.
-        ScanIndices rank_idxs(*_dims, false, &rank_domain_offsets);
+        ScanIndices rank_idxs(*dims, false, &rank_domain_offsets);
         rank_idxs.begin = begin;
         rank_idxs.end = end;
 
@@ -201,7 +189,7 @@ namespace yask {
     void StencilContext::run_solution(idx_t first_step_index,
                                       idx_t last_step_index)
     {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         run_time.start();
 
         // Determine step dir from order of first/last.
@@ -219,14 +207,14 @@ namespace yask {
         // Begin, end, step tuples.
         // Based on overall bounding box, which includes
         // any needed extensions for wave-fronts.
-        IdxTuple begin(_dims->_stencil_dims);
+        IdxTuple begin(stencil_dims);
         begin.setVals(ext_bb.bb_begin, false);
         begin[step_posn] = begin_t;
-        IdxTuple end(_dims->_stencil_dims);
+        IdxTuple end(stencil_dims);
         end.setVals(ext_bb.bb_end, false);
         end[step_posn] = end_t;
-        IdxTuple step(_dims->_stencil_dims);
-        step.setVals(_opts->_region_sizes, false); // step by region sizes.
+        IdxTuple step(stencil_dims);
+        step.setVals(opts->_region_sizes, false); // step by region sizes.
         step[step_posn] = step_t;
 
         TRACE_MSG("run_solution: [" <<
@@ -241,8 +229,7 @@ namespace yask {
         }
 
 #ifdef MODEL_CACHE
-        ostream& os = get_ostr();
-        if (context.my_rank != context.msg_rank)
+        if (env.my_rank != env.msg_rank)
             cache_model.disable();
         if (cache_model.isEnabled())
             os << "Modeling cache...\n";
@@ -303,7 +290,7 @@ namespace yask {
         // calculations are made.
 
         // Indices needed for the 'rank' loops.
-        ScanIndices rank_idxs(*_dims, true, &rank_domain_offsets);
+        ScanIndices rank_idxs(*dims, true, &rank_domain_offsets);
         rank_idxs.begin = begin;
         rank_idxs.end = end;
         rank_idxs.step = step;
@@ -573,7 +560,7 @@ namespace yask {
     // eval only the one pointed to.
     void StencilContext::calc_region(BundlePackPtr& sel_bp,
                                      const ScanIndices& rank_idxs) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         TRACE_MSG("calc_region: region [" <<
                   rank_idxs.start.makeValStr(nsdims) << " ... " <<
                   rank_idxs.stop.makeValStr(nsdims) << ") within rank [" <<
@@ -587,7 +574,7 @@ namespace yask {
             int_time.start();
 
         // Init region begin & end from rank start & stop indices.
-        ScanIndices region_idxs(*_dims, true, &rank_domain_offsets);
+        ScanIndices region_idxs(*dims, true, &rank_domain_offsets);
         region_idxs.initFromOuter(rank_idxs);
 
         // Time range.
@@ -698,7 +685,7 @@ namespace yask {
                 BundlePackPtr bp;
 
                 // Steps within a region are based on rank block sizes.
-                auto& settings = *_opts;
+                auto& settings = *opts;
                 region_idxs.step = settings._block_sizes;
                 region_idxs.step[step_posn] = step_t;
 
@@ -780,7 +767,7 @@ namespace yask {
                                     const ScanIndices& rank_idxs,
                                     const ScanIndices& region_idxs) {
 
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         auto* bp = sel_bp.get();
         int region_thread_idx = omp_get_thread_num();
         TRACE_MSG("calc_block: phase " << phase << ", block [" <<
@@ -830,7 +817,7 @@ namespace yask {
 #endif
         
         // Init block begin & end from region start & stop indices.
-        ScanIndices block_idxs(*_dims, true, 0);
+        ScanIndices block_idxs(*dims, true, 0);
         block_idxs.initFromOuter(region_idxs);
 
         // Time range.
@@ -901,7 +888,7 @@ namespace yask {
             block_idxs.stop[step_posn] = end_t;
 
             // Steps within a block are based on rank mini-block sizes.
-            auto& settings = *_opts;
+            auto& settings = *opts;
             block_idxs.step = settings._mini_block_sizes;
             block_idxs.step[step_posn] = step_dir;
 
@@ -982,7 +969,7 @@ namespace yask {
                                          const ScanIndices& base_block_idxs,
                                          const ScanIndices& adj_block_idxs) {
 
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         TRACE_MSG("calc_mini_block: phase " << phase <<
                   ", shape " << shape <<
                   ", mini-block [" <<
@@ -1003,7 +990,7 @@ namespace yask {
         }
 
         // Init mini-block begin & end from blk start & stop indices.
-        ScanIndices mini_block_idxs(*_dims, true, 0);
+        ScanIndices mini_block_idxs(*dims, true, 0);
         mini_block_idxs.initFromOuter(adj_block_idxs);
 
         // Time range.
@@ -1100,7 +1087,7 @@ namespace yask {
                     // Update offsets of scratch grids based on the current
                     // mini-block location.
                     if (scratchVecs.size())
-                        cp->update_scratch_grid_info(region_thread_idx, mini_block_idxs.begin);
+                        update_scratch_grid_info(region_thread_idx, mini_block_idxs.begin);
 
                     // Call calc_mini_block() for each non-scratch bundle.
                     for (auto* sb : *bp)
@@ -1132,7 +1119,7 @@ namespace yask {
                                       idx_t shift_num,
                                       BundlePackPtr& bp,
                                       ScanIndices& idxs) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // For wavefront adjustments, see conceptual diagram in
         // run_solution().  At each pack and time-step, the parallelogram
@@ -1322,7 +1309,7 @@ namespace yask {
                                           idx_t nshapes, idx_t shape,
                                           const BridgeMask& bridge_mask,
                                           ScanIndices& idxs) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         auto npacks = stPacks.size();
         bool ok = true;
         
@@ -1479,7 +1466,7 @@ namespace yask {
     // the grids' local offsets.
     void StencilContext::update_scratch_grid_info(int thread_idx,
                                                   const Indices& idxs) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Loop thru vecs of scratch grids.
         for (auto* sv : scratchVecs) {
@@ -1525,7 +1512,7 @@ namespace yask {
     // Compare grids in contexts.
     // Return number of mis-compares.
     idx_t StencilContext::compareData(const StencilContext& ref) const {
-        CONTEXT_VARS_CONST(this);
+        STATE_VARS_CONST(this);
 
         os << "Comparing grid(s) in '" << name << "' to '" << ref.name << "'..." << endl;
         if (gridPtrs.size() != ref.gridPtrs.size()) {
@@ -1544,10 +1531,10 @@ namespace yask {
     // Call MPI_Test() on all unfinished requests to promote MPI progress.
     // TODO: replace with more direct and less intrusive techniques.
     void StencilContext::poke_halo_exchange() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
 #ifdef USE_MPI
-        if (!enable_halo_exchange || _env->num_ranks < 2)
+        if (!enable_halo_exchange || env->num_ranks < 2)
             return;
 
         test_time.start();
@@ -1601,11 +1588,11 @@ namespace yask {
 
     // Exchange dirty halo data for all grids and all steps.
     void StencilContext::exchange_halos() {
-
+        
 #ifdef USE_MPI
-        if (!enable_halo_exchange || _env->num_ranks < 2)
+        STATE_VARS(this);
+        if (!enable_halo_exchange || env->num_ranks < 2)
             return;
-        CONTEXT_VARS(this);
 
         halo_time.start();
         double wait_delta = 0.;
@@ -1737,7 +1724,7 @@ namespace yask {
                                     auto& r = grid_recv_reqs[ni];
                                     MPI_Irecv(buf, nbytes, MPI_BYTE,
                                               neighbor_rank, int(gi),
-                                              _env->comm, &r);
+                                              env->comm, &r);
                                     num_recv_reqs++;
                                 }
                             }
@@ -1788,7 +1775,7 @@ namespace yask {
                                                                    last, lastStepsToSwap[gp]);
                                 else
                                     nelems = gp->get_elements_in_slice(buf, first, last);
-                                idx_t nbytes = nelems * cp->get_element_bytes();
+                                idx_t nbytes = nelems * get_element_bytes();
 
                                 if (using_shm) {
                                     TRACE_MSG("exchange_halos:    no send req due to shm");
@@ -1801,7 +1788,7 @@ namespace yask {
                                     TRACE_MSG("exchange_halos:    sending " << makeByteStr(nbytes));
                                     auto& r = grid_send_reqs[ni];
                                     MPI_Isend(buf, nbytes, MPI_BYTE,
-                                              neighbor_rank, int(gi), _env->comm, &r);
+                                              neighbor_rank, int(gi), env->comm, &r);
                                     num_send_reqs++;
                                 }
                             }
@@ -1926,6 +1913,7 @@ namespace yask {
     // TODO: track sub-domain of grid that is dirty.
     void StencilContext::mark_grids_dirty(const BundlePackPtr& sel_bp,
                                           idx_t start, idx_t stop) {
+        STATE_VARS(this);
         idx_t step = (start > stop) ? -1 : 1;
         map<YkGridPtr, set<idx_t>> grids_done;
 
diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp
index c2b7bc76..95256822 100644
--- a/src/kernel/lib/context.hpp
+++ b/src/kernel/lib/context.hpp
@@ -149,43 +149,18 @@ namespace yask {
     // This is a pure-virtual class that must be implemented
     // for a specific problem.
     class StencilContext :
+        public KernelStateBase,
         public virtual yk_solution {
 
     protected:
 
-        // Output stream for messages.
-        std::ostream* _ostr = 0;
-        yask_output_ptr _debug;
-
-        // Env.
-        KernelEnvPtr _env;
-
-        // Command-line and env parameters.
-        KernelSettingsPtr _opts;
-
-        // Problem dims.
-        DimsPtr _dims;
-
-        // MPI info.
-        MPIInfoPtr _mpiInfo;
-
         // Auto-tuner for global settings.
         AutoTuner _at;
-        bool _use_pack_tuners = false;
 
         // Bytes between each buffer to help avoid aliasing
         // in the HW.
         static constexpr size_t _data_buf_pad = YASK_PAD_BYTES;
 
-        // Check whether dim is appropriate type.
-        virtual void checkDimType(const std::string& dim,
-                                  const std::string& fn_name,
-                                  bool step_ok,
-                                  bool domain_ok,
-                                  bool misc_ok) const {
-            _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok);
-        }
-
         // Alloc given bytes on each NUMA node.
         virtual void _alloc_data(const std::map <int, size_t>& nbytes,
                                  const std::map <int, size_t>& ngrids,
@@ -217,14 +192,6 @@ namespace yask {
         bool do_mpi_right = true;        // right exterior in given dim.
         idx_t mpi_exterior_dim = -1;      // which domain dim in left/right.
 
-        // Position of inner domain dim in stencil-dims tuple.
-        // Misc dims will follow this if/when using interleaving.
-        int _inner_posn = -1;   // -1 => not set.
-
-        // Position of outer domain dim in stencil-dims tuple.
-        // For 1D stencils, _outer_posn == _inner_posn.
-        int _outer_posn = -1;   // -1 => not set.
-
         // Is overlap currently enabled?
         inline bool is_overlap_active() const {
             bool active = !do_mpi_interior || !do_mpi_left || !do_mpi_right;
@@ -316,8 +283,8 @@ namespace yask {
         std::map<std::string, MPIData> mpiData;
 
         // Constructor.
-        StencilContext(KernelEnvPtr env,
-                       KernelSettingsPtr settings);
+        StencilContext(KernelEnvPtr& env,
+                       KernelSettingsPtr& settings);
 
         // Destructor.
         virtual ~StencilContext() {
@@ -327,42 +294,17 @@ namespace yask {
                 get_stats();
         }
 
-        // Set debug output to cout if my_rank == msg_rank
-        // or a null stream otherwise.
-        std::ostream& set_ostr();
-
-        // Get the messsage output stream.
-        std::ostream& get_ostr() const {
-            assert(_ostr);
-            return *_ostr;
+        // Modify settings in shared state and auto-tuner.
+        void set_settings(KernelSettingsPtr opts) {
+            _state->_opts = opts;
+            _at.set_settings(opts.get());
         }
 
         // Reset elapsed times to zero.
         void clear_timers();
 
-        // Access to settings.
-        KernelSettingsPtr& get_settings() {
-            assert(_opts);
-            return _opts;
-        }
-        const KernelSettingsPtr& get_settings() const {
-            assert(_opts);
-            return _opts;
-        }
-        void set_settings(KernelSettingsPtr opts) {
-            _opts = opts;
-            _at.set_settings(_opts.get());
-        }
-
         // Misc accessors.
-        KernelEnvPtr& get_env() { return _env; }
-        const KernelEnvPtr& get_env() const { return _env; }
-        DimsPtr& get_dims() { return _dims; }
-        const DimsPtr& get_dims() const { return _dims; }
-        MPIInfoPtr& get_mpi_info() { return _mpiInfo; }
-        const MPIInfoPtr& get_mpi_info() const { return _mpiInfo; }
         AutoTuner& getAT() { return _at; }
-        bool use_pack_tuners() const { return _use_pack_tuners; }
 
         // Add a new grid to the containers.
         virtual void addGrid(YkGridPtr gp, bool is_output);
@@ -468,92 +410,6 @@ namespace yask {
         // Return number of mis-compares.
         virtual idx_t compareData(const StencilContext& ref) const;
 
-        // Set number of threads w/o using thread-divisor.
-        // Return number of threads.
-        // Do nothing and return 0 if not properly initialized.
-        int set_max_threads() {
-
-            // Get max number of threads.
-            int mt = std::max(_opts->max_threads, 1);
-
-            // Reset number of OMP threads to max allowed.
-            omp_set_num_threads(mt);
-            return mt;
-        }
-
-        // Get total number of computation threads to use.
-        int get_num_comp_threads(int& region_threads, int& blk_threads) const {
-
-            // Max threads / divisor.
-            int mt = std::max(_opts->max_threads, 1);
-            int td = std::max(_opts->thread_divisor, 1);
-            int at = mt / td;
-            at = std::max(at, 1);
-
-            // Blk threads per region thread.
-            int bt = std::max(_opts->num_block_threads, 1);
-            bt = std::min(bt, at); // Cannot be > 'at'.
-            blk_threads = bt;
-
-            // Region threads.
-            int rt = at / bt;
-            rt = std::max(rt, 1);
-            region_threads = rt;
-
-            // Total number of block threads.
-            return bt * rt;
-        }
-        
-        // Set number of threads to use for something other than a region.
-        // Return number of threads.
-        // Do nothing and return 0 if not properly initialized.
-        int set_all_threads() {
-            int rt, bt;
-            int at = get_num_comp_threads(rt, bt);
-            omp_set_num_threads(at);
-            return at;
-        }
-
-        // Set number of threads to use for a region.
-        // Enable nested OMP if there are >1 block threads,
-        // disable otherwise.
-        // Return number of threads.
-        // Do nothing and return 0 if not properly initialized.
-        int set_region_threads() {
-            int rt, bt;
-            int at = get_num_comp_threads(rt, bt);
-
-            // Limit outer nesting to allow num_block_threads per nested
-            // block loop.
-            yask_num_threads[0] = rt;
-
-            if (bt > 1) {
-                omp_set_nested(1);
-                omp_set_max_active_levels(2);
-                yask_num_threads[1] = bt;
-            }
-            else {
-                omp_set_nested(0);
-                omp_set_max_active_levels(1);
-                yask_num_threads[1] = 0;
-            }
-
-            omp_set_num_threads(rt);
-            return rt;
-        }
-
-        // Set number of threads for a block.
-        // Return number of threads.
-        // Do nothing and return 0 if not properly initialized.
-        int set_block_threads() {
-            int rt, bt;
-            int at = get_num_comp_threads(rt, bt);
-
-            if (omp_get_max_active_levels() > 1)
-                omp_set_num_threads(bt);
-            return bt;
-        }
-
         // Reference stencil calculations.
         void run_ref(idx_t first_step_index,
                      idx_t last_step_index);
@@ -630,20 +486,14 @@ namespace yask {
                                   const GridDimNames& dims,
                                   const GridDimSizes* sizes);
 
-        // Get output object.
-        virtual yask_output_ptr get_debug_output() const {
-            return _debug;
-        }
-
         // APIs.
         // See yask_kernel_api.hpp.
-        virtual void set_debug_output(yask_output_ptr debug) {
-            _debug = debug;     // to share ownership of referent.
-            _ostr = &debug->get_ostream();
-        }
         virtual const std::string& get_name() const {
             return name;
         }
+        virtual void set_debug_output(yask_output_ptr debug) {
+            KernelStateBase::set_debug_output(debug);
+        }
         virtual int get_element_bytes() const {
             return REAL_BYTES;
         }
@@ -691,22 +541,26 @@ namespace yask {
         }
 
         virtual std::string get_step_dim_name() const {
-            return _dims->_step_dim;
+            STATE_VARS_CONST(this);
+            return dims->_step_dim;
         }
         virtual int get_num_domain_dims() const {
-            return _dims->_domain_dims.getNumDims();
+            STATE_VARS_CONST(this);
+            return dims->_domain_dims.getNumDims();
         }
         virtual std::vector<std::string> get_domain_dim_names() const {
-            std::vector<std::string> dims;
-            for (auto& dim : _dims->_domain_dims.getDims())
-                dims.push_back(dim.getName());
-            return dims;
+            STATE_VARS_CONST(this);
+            std::vector<std::string> ddims;
+            for (auto& dim : dims->_domain_dims.getDims())
+                ddims.push_back(dim.getName());
+            return ddims;
         }
         virtual std::vector<std::string> get_misc_dim_names() const {
-            std::vector<std::string> dims;
-            for (auto& dim : _dims->_misc_dims.getDims())
-                dims.push_back(dim.getName());
-            return dims;
+            STATE_VARS_CONST(this);
+            std::vector<std::string> mdims;
+            for (auto& dim : dims->_misc_dims.getDims())
+                mdims.push_back(dim.getName());
+            return mdims;
         }
 
         virtual idx_t get_first_rank_domain_index(const std::string& dim) const;
@@ -735,16 +589,18 @@ namespace yask {
         virtual idx_t get_rank_index(const std::string& dim) const;
         virtual std::string apply_command_line_options(const std::string& args);
         virtual bool set_default_numa_preferred(int numa_node) {
+            STATE_VARS(this);
 #ifdef USE_NUMA
-            _opts->_numa_pref = numa_node;
+            opts->_numa_pref = numa_node;
             return true;
 #else
-            _opts->_numa_pref = yask_numa_none;
+            opts->_numa_pref = yask_numa_none;
             return numa_node == yask_numa_none;
 #endif
         }
         virtual int get_default_numa_preferred() const {
-            return _opts->_numa_pref;
+            STATE_VARS_CONST(this);
+            return opts->_numa_pref;
         }
 
         // Auto-tuner methods.
@@ -757,27 +613,4 @@ namespace yask {
 
     }; // StencilContext.
 
-    // Macro to get commonly-needed vars for stencil calcs efficiently.
-    // *_posn vars are positions in stencil_dims.
-#define CONTEXT_VARS0(ctx_p, pfx)                                       \
-    pfx auto* cp = ctx_p;                                               \
-    auto& os = cp->get_ostr();                                          \
-    pfx auto* opts = cp->get_settings().get();                          \
-    pfx auto* mpiInfo = cp->get_mpi_info().get();                       \
-    pfx auto* dims = cp->get_dims().get();                              \
-    pfx auto* env = cp->get_env().get();                                \
-    const auto& step_dim = dims->_step_dim;                             \
-    const auto& domain_dims = dims->_domain_dims;                       \
-    constexpr int nddims = NUM_DOMAIN_DIMS;                             \
-    assert(nddims == domain_dims.size());                               \
-    const auto& stencil_dims = dims->_stencil_dims;                     \
-    constexpr int nsdims = NUM_STENCIL_DIMS;                            \
-    assert(nsdims == stencil_dims.size());                              \
-    constexpr int step_posn = 0;                                        \
-    assert(step_posn == +Indices::step_posn);                           \
-    constexpr int outer_posn = 1;                                       \
-    const int inner_posn = cp->_inner_posn
-#define CONTEXT_VARS(ctx_p) CONTEXT_VARS0(ctx_p,)
-#define CONTEXT_VARS_CONST(ctx_p) CONTEXT_VARS0(ctx_p, const)
-
 } // yask namespace.
diff --git a/src/kernel/lib/generic_grids.cpp b/src/kernel/lib/generic_grids.cpp
index 03c5a757..9a263ec2 100644
--- a/src/kernel/lib/generic_grids.cpp
+++ b/src/kernel/lib/generic_grids.cpp
@@ -29,16 +29,16 @@ using namespace std;
 namespace yask {
 
     // Ctor. No allocation is done. See notes on default_alloc().
-    GenericGridBase::GenericGridBase(string name,
+    GenericGridBase::GenericGridBase(KernelStateBase& state,
+                                     const string& name,
                                      Layout& layout_base,
-                                     const GridDimNames& dimNames,
-                                     KernelSettingsPtr* settings,
-                                     ostream** ostr) :
-        _name(name), _layout_base(&layout_base), _opts(settings), _ostr(ostr) {
+                                     const GridDimNames& dimNames) :
+        KernelStateBase(state),
+        _name(name),
+        _layout_base(&layout_base) {
         for (auto& dn : dimNames)
-            _dims.addDimBack(dn, 1);
+            _grid_dims.addDimBack(dn, 1);
         _sync_layout_with_dims();
-
     }
 
     // Perform default allocation.
@@ -46,7 +46,7 @@ namespace yask {
     // programmer should call get_num_elems() or get_num_bytes() and
     // then provide allocated memory via set_storage().
     void GenericGridBase::default_alloc() {
-        auto& os = get_ostr();
+        STATE_VARS(this);
 
         // Release any old data if last owner.
         release_storage();
@@ -74,11 +74,11 @@ namespace yask {
     // Make some descriptive info.
     string GenericGridBase::make_info_string(const string& elem_name) const {
         stringstream oss;
-        if (_dims.getNumDims() == 0)
+        if (_grid_dims.getNumDims() == 0)
             oss << "scalar";
         else
-            oss << _dims.getNumDims() << "-D grid (" <<
-                _dims.makeDimValStr(" * ") << ")";
+            oss << _grid_dims.getNumDims() << "-D grid (" <<
+                _grid_dims.makeDimValStr(" * ") << ")";
         oss << " '" << _name << "'";
         if (_elems)
             oss << " with data at " << _elems << " containing ";
@@ -147,7 +147,7 @@ namespace yask {
             return get_num_elems();
 
         // Dims & sizes same?
-        if (_dims != p->_dims)
+        if (_grid_dims != p->_grid_dims)
             return get_num_elems();
 
         // Count abs diffs > epsilon.
diff --git a/src/kernel/lib/generic_grids.hpp b/src/kernel/lib/generic_grids.hpp
index 30e3905b..010d7e03 100644
--- a/src/kernel/lib/generic_grids.hpp
+++ b/src/kernel/lib/generic_grids.hpp
@@ -33,7 +33,8 @@ namespace yask {
 
     // A base class for a generic n-D grid.
     // This class does not define a type or memory layout.
-    class GenericGridBase {
+    class GenericGridBase :
+        public KernelStateBase {
 
     protected:
         std::string _name;      // name for grid.
@@ -52,36 +53,42 @@ namespace yask {
 
         // Note that both _dims and *_layout_base hold dimensions unless this
         // is a scalar. For a scalar, _dims is empty and _layout_base = 0.
-        IdxTuple _dims;         // names and lengths of grid dimensions.
+        IdxTuple _grid_dims;         // names and lengths of grid dimensions.
         Layout* _layout_base = 0; // memory layout.
 
-        // Command-line and env parameters.
-        KernelSettingsPtr* _opts;
-
-        // Output stream for messages.
-        // Pointer-to-pointer to let it follow a parent's pointer.
-        std::ostream** _ostr = 0;
-
         void _sync_dims_with_layout() {
             Indices idxs(_layout_base->get_sizes());
-            idxs.setTupleVals(_dims);
+            idxs.setTupleVals(_grid_dims);
         }
         void _sync_layout_with_dims() {
-            Indices idxs(_dims);
+            Indices idxs(_grid_dims);
             _layout_base->set_sizes(idxs);
         }
 
     public:
 
         // Ctor. No allocation is done. See notes on default_alloc().
-        GenericGridBase(std::string name,
+        GenericGridBase(KernelStateBase& state,
+                        const std::string& name,
                         Layout& layout_base,
-                        const GridDimNames& dimNames,
-                        KernelSettingsPtr* settings,
-                        std::ostream** ostr);
+                        const GridDimNames& dimNames);
 
         virtual ~GenericGridBase() { }
 
+        // Get state info.
+        KernelStatePtr& get_state() {
+            assert(_state);
+            return _state;
+        }
+        const KernelStatePtr& get_state() const {
+            assert(_state);
+            return _state;
+        }
+        std::ostream& get_ostr() const {
+            STATE_VARS(this);
+            return os;
+        }
+
         // Perform default allocation.
         // For other options,
         // programmer should call get_num_elems() or get_num_bytes() and
@@ -94,8 +101,9 @@ namespace yask {
 
         // NUMA accessors.
         virtual int get_numa_pref() const {
+            STATE_VARS_CONST(this);
             return (_numa_pref != _numa_unset) ?
-                _numa_pref : (*_opts)->_numa_pref;
+                _numa_pref : opts->_numa_pref;
         }
         virtual bool set_numa_pref(int numa_node) {
 #ifdef USE_NUMA
@@ -107,19 +115,12 @@ namespace yask {
 #endif
         }
 
-        // Access dims.
-        const IdxTuple& get_dims() const { return _dims; }
-
-        // Get the messsage output stream.
-        virtual std::ostream& get_ostr() const {
-            assert(_ostr);
-            assert(*_ostr);
-            return **_ostr;
-        }
+        // Access dims of this grid.
+        const IdxTuple& get_dims() const { return _grid_dims; }
 
         // Get number of elements.
         virtual idx_t get_num_elems() const {
-            return _dims.product();
+            return _grid_dims.product();
         }
 
         // Get size of one element.
@@ -130,25 +131,25 @@ namespace yask {
 
         // Get number of dimensions.
         virtual int get_num_dims() const {
-            return _dims.getNumDims();
+            return _grid_dims.getNumDims();
         }
 
         // Get the nth dim name.
         virtual const std::string& get_dim_name(int n) const {
-            return _dims.getDimName(n);
+            return _grid_dims.getDimName(n);
         }
 
         // Is dim used?
         virtual bool is_dim_used(const std::string& dim) const {
-            return _dims.lookup(dim) != 0;
+            return _grid_dims.lookup(dim) != 0;
         }
 
         // Access nth dim size.
         idx_t get_dim_size(int n) const {
-            return _dims.getVal(n);
+            return _grid_dims.getVal(n);
         }
         void set_dim_size(int n, idx_t size) {
-            _dims.setVal(n, size);
+            _grid_dims.setVal(n, size);
             _sync_layout_with_dims();
         }
 
@@ -157,15 +158,15 @@ namespace yask {
             return _layout_base->get_sizes();
         }
         void set_dim_sizes(const Indices& sizes) {
-            for (int i = 0; i < _dims.size(); i++)
-                _dims.setVal(i, sizes[i]);
+            for (int i = 0; i < _grid_dims.size(); i++)
+                _grid_dims.setVal(i, sizes[i]);
             _sync_layout_with_dims();
         }
 
         // Return 'true' if dimensions are same names
         // and sizes, 'false' otherwise.
         inline bool are_dims_and_sizes_same(const GenericGridBase& src) {
-            return _dims == src._dims;
+            return _grid_dims == src._grid_dims;
         }
 
         // Print some descriptive info.
@@ -174,7 +175,7 @@ namespace yask {
         // Get linear index.
         virtual idx_t get_index(const Indices& idxs, bool check=true) const =0;
         virtual idx_t get_index(const IdxTuple& pt, bool check=true) const {
-            assert(_dims.areDimsSame(pt));
+            assert(_grid_dims.areDimsSame(pt));
             Indices idxs(pt);
             return get_index(idxs, check);
         }
@@ -213,12 +214,11 @@ namespace yask {
     public:
 
         // Ctor. No allocation is done. See notes on default_alloc().
-        GenericGridTemplate(std::string name,
+        GenericGridTemplate(KernelStateBase& state,
+                            const std::string& name,
                             Layout& layout_base,
-                            const GridDimNames& dimNames,
-                            KernelSettingsPtr* settings,
-                            std::ostream** ostr) :
-            GenericGridBase(name, layout_base, dimNames, settings, ostr) { }
+                            const GridDimNames& dimNames) :
+            GenericGridBase(state, name, layout_base, dimNames) { }
 
         // Dealloc _base when last pointer to it is destructed.
         virtual ~GenericGridTemplate() {
@@ -282,11 +282,10 @@ namespace yask {
     public:
 
         // Construct an unallocated grid.
-        GenericGrid(std::string name,
-                    const GridDimNames& dimNames,
-                    KernelSettingsPtr* settings,
-                    std::ostream** ostr) :
-            GenericGridTemplate<T>(name, _layout, dimNames, settings, ostr) {
+        GenericGrid(KernelStateBase& state,
+                    std::string name,
+                    const GridDimNames& dimNames) :
+            GenericGridTemplate<T>(state, name, _layout, dimNames) {
             assert(int(dimNames.size()) == _layout.get_num_sizes());
         }
 
@@ -306,10 +305,10 @@ namespace yask {
         virtual idx_t get_index(const Indices& idxs, bool check=true) const final {
 #ifdef CHECK
             if (check) {
-                for (int i = 0; i < this->_dims.size(); i++) {
+                for (int i = 0; i < this->_grid_dims.size(); i++) {
                     idx_t j = idxs[i];
                     assert(j >= 0);
-                    assert(j < this->_dims.getVal(i));
+                    assert(j < this->_grid_dims.getVal(i));
                 }
             }
 #endif
@@ -347,8 +346,9 @@ namespace yask {
     public:
 
         // Construct an unallocated scalar.
-        GenericScalar(std::string name) :
-            GenericGrid<T, Layout_0d>(name, _dimNames) { }
+        GenericScalar(KernelStateBase& state,
+                      std::string name) :
+            GenericGrid<T, Layout_0d>(state, name, _dimNames) { }
     };
 
 } // namespace yask.
diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp
index 1b677d0b..7bf9c1eb 100644
--- a/src/kernel/lib/grid_apis.cpp
+++ b/src/kernel/lib/grid_apis.cpp
@@ -37,7 +37,8 @@ namespace yask {
     // APIs to get info from vars.
 #define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \
     idx_t YkGridBase::api_name(const string& dim) const {               \
-        checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok);      \
+        STATE_VARS(this);                                               \
+        dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \
         int posn = get_dim_posn(dim, true, #api_name);                  \
         idx_t mbit = 1LL << posn;                                       \
         if (prep_req && _rank_offsets[posn] < 0)                        \
@@ -47,6 +48,7 @@ namespace yask {
         return rtn;                                                     \
     }                                                                   \
     idx_t YkGridBase::api_name(int posn) const {                        \
+        STATE_VARS(this);                                               \
         idx_t mbit = 1LL << posn;                                       \
         auto rtn = expr;                                                \
         return rtn;                                                     \
@@ -63,8 +65,8 @@ namespace yask {
     GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false)
     GET_GRID_API(get_first_rank_domain_index, _rank_offsets[posn], false, true, false, true)
     GET_GRID_API(get_last_rank_domain_index, _rank_offsets[posn] + _domains[posn] - 1, false, true, false, true)
-    GET_GRID_API(get_first_rank_halo_index, _rank_offsets[posn] - _left_halos[posn], false, false, true, true)
-    GET_GRID_API(get_last_rank_halo_index, _rank_offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true)
+    GET_GRID_API(get_first_rank_halo_index, _rank_offsets[posn] - _left_halos[posn], false, true, false, true)
+    GET_GRID_API(get_last_rank_halo_index, _rank_offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, true, false, true)
     GET_GRID_API(get_first_rank_alloc_index, _rank_offsets[posn] + _local_offsets[posn] - _actl_left_pads[posn], false, true, false, true)
     GET_GRID_API(get_last_rank_alloc_index, _rank_offsets[posn] + _local_offsets[posn] + _domains[posn] + _actl_right_pads[posn] - 1, false, true, false, true)
     GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false)
@@ -84,14 +86,16 @@ namespace yask {
 #define COMMA ,
 #define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok)       \
     void YkGridBase::api_name(const string& dim, idx_t n) {             \
-        TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'."           \
+        STATE_VARS(this);                                               \
+        TRACE_MSG("grid '" << get_name() << "'."                        \
                    #api_name "('" << dim << "', " << n << ")");         \
-        checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok);      \
+        dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \
         int posn = get_dim_posn(dim, true, #api_name);                  \
         idx_t mbit = 1LL << posn;                                       \
         expr;                                                           \
     }                                                                   \
     void YkGridBase::api_name(int posn, idx_t n) {                      \
+        STATE_VARS(this);                                               \
         idx_t mbit = 1LL << posn;                                       \
         int dim = posn;                                                 \
         expr;                                                           \
@@ -160,6 +164,7 @@ namespace yask {
     }
 
     void YkGridBase::share_storage(yk_grid_ptr source) {
+        STATE_VARS(this);
         auto sp = dynamic_pointer_cast<YkGridBase>(source);
         assert(sp);
 
@@ -193,7 +198,7 @@ namespace yask {
             }
 
             // Not a domain dim?
-            bool is_domain = _dims->_domain_dims.lookup(dname) != 0;
+            bool is_domain = domain_dims.lookup(dname) != 0;
             if (!is_domain) {
                 auto tas = get_alloc_size(dname);
                 auto sas = sp->get_alloc_size(dname);
@@ -239,7 +244,7 @@ namespace yask {
         // Copy pad sizes.
         for (int i = 0; i < get_num_dims(); i++) {
             auto dname = get_dim_name(i);
-            bool is_domain = _dims->_domain_dims.lookup(dname) != 0;
+            bool is_domain = domain_dims.lookup(dname) != 0;
             if (is_domain) {
                 _actl_left_pads[i] = sp->_actl_left_pads[i];
                 _actl_right_pads[i] = sp->_actl_right_pads[i];
diff --git a/src/kernel/lib/indices.hpp b/src/kernel/lib/indices.hpp
new file mode 100644
index 00000000..dabb51bc
--- /dev/null
+++ b/src/kernel/lib/indices.hpp
@@ -0,0 +1,404 @@
+/*****************************************************************************
+
+YASK: Yet Another Stencil Kernel
+Copyright (c) 2014-2019, Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+* The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+
+*****************************************************************************/
+
+#pragma once
+
+namespace yask {
+
+    typedef Tuple<idx_t> IdxTuple;
+    typedef std::vector<idx_t> GridIndices;
+    typedef std::vector<idx_t> GridDimSizes;
+    typedef std::vector<std::string> GridDimNames;
+
+    // A class to hold up to a given number of sizes or indices efficiently.
+    // Similar to a Tuple, but less overhead and doesn't keep names.
+    // Make sure this stays non-virtual.
+    // TODO: make this a template with _ndims as a parameter.
+    // TODO: ultimately, combine with Tuple w/o loss of efficiency.
+    class Indices {
+
+    public:
+
+        // Max number of indices that can be held.
+        // Note use of "+max_idxs" in code below to avoid compiler
+        // trying to take a reference to it, resulting in a undefined
+        // symbol (sometimes).
+        static constexpr int max_idxs = MAX_DIMS;
+
+        // Step dim is always in [0] of an Indices type (if it is used).
+        static constexpr int step_posn = 0;
+
+    protected:
+        idx_t _idxs[max_idxs];
+        int _ndims;
+
+    public:
+        // Ctors.
+        Indices() : _ndims(0) { }
+        Indices(int ndims) : _ndims(ndims) { } // NB: _idxs remain uninit!
+        Indices(const IdxTuple& src) {
+            setFromTuple(src);
+        }
+        Indices(const GridIndices& src) {
+            setFromVec(src);
+        }
+        Indices(const std::initializer_list<idx_t>& src) {
+            setFromInitList(src);
+        }
+        Indices(const idx_t src[], int ndims) {
+            setFromArray(src, ndims);
+        }
+        Indices(idx_t src, int ndims) {
+            setFromConst(src, ndims);
+        }
+
+        // Default copy ctor, copy operator should be okay.
+
+        // Access size.
+        inline int getNumDims() const {
+            return _ndims;
+        }
+        inline void setNumDims(int n) {
+            _ndims = n;
+        }
+
+        // Access indices.
+        inline idx_t& operator[](int i) {
+            assert(i >= 0);
+            assert(i < _ndims);
+            return _idxs[i];
+        }
+        inline const idx_t& operator[](int i) const {
+            assert(i >= 0);
+            assert(i < _ndims);
+            return _idxs[i];
+        }
+
+        // Write to an IdxTuple.
+        // The 'tgt' must have the same number of dims.
+        void setTupleVals(IdxTuple& tgt) const {
+            assert(tgt.size() == _ndims);
+            for (int i = 0; i < _ndims; i++)
+                if (i < tgt.size())
+                    tgt.setVal(i, _idxs[i]);
+        }
+
+        // Read from an IdxTuple.
+        void setFromTuple(const IdxTuple& src) {
+            assert(src.size() <= +max_idxs);
+            int n = int(src.size());
+            for (int i = 0; i < n; i++)
+                _idxs[i] = src.getVal(i);
+            _ndims = n;
+        }
+
+        // Other inits.
+        void setFromVec(const GridIndices& src) {
+            assert(src.size() <= +max_idxs);
+            int n = int(src.size());
+            for (int i = 0; i < n; i++)
+                _idxs[i] = src[i];
+            _ndims = n;
+        }
+
+        // default n => don't change _ndims.
+        void setFromArray(const idx_t src[], int n = -1) {
+            if (n < 0)
+                n = _ndims;
+            assert(n <= +max_idxs);
+            for (int i = 0; i < n; i++)
+                _idxs[i] = src[i];
+            _ndims = n;
+        }
+        void setFromInitList(const std::initializer_list<idx_t>& src) {
+            assert(src.size() <= +max_idxs);
+            int i = 0;
+            for (auto idx : src)
+                _idxs[i++] = idx;
+            _ndims = i;
+        }
+
+        // default n => don't change _ndims.
+        void setFromConst(idx_t val, int n = -1) {
+            if (n < 0)
+                n = _ndims;
+            assert(n <= +max_idxs);
+            for (int i = 0; i < n; i++)
+                _idxs[i] = val;
+            _ndims = n;
+        }
+
+        // Some comparisons.
+        // These assume all the indices are valid or
+        // initialized to the same value.
+        bool operator==(const Indices& rhs) const {
+            if (_ndims != rhs._ndims)
+                return false;
+            for (int i = 0; i < _ndims; i++)
+                if (_idxs[i] != rhs._idxs[i])
+                    return false;
+            return true;
+        }
+        bool operator!=(const Indices& rhs) const {
+            return !operator==(rhs);
+        }
+        bool operator<(const Indices& rhs) const {
+            if (_ndims < rhs._ndims)
+                return true;
+            else if (_ndims > rhs._ndims)
+                return false;
+            for (int i = 0; i < _ndims; i++)
+                if (_idxs[i] < rhs._idxs[i])
+                    return true;
+                else if (_idxs[i] > rhs._idxs[i])
+                    return false;
+            return false;       // equal, so not less than.
+        }
+        bool operator>(const Indices& rhs) const {
+            if (_ndims > rhs._ndims)
+                return true;
+            else if (_ndims < rhs._ndims)
+                return false;
+            for (int i = 0; i < _ndims; i++)
+                if (_idxs[i] > rhs._idxs[i])
+                    return true;
+                else if (_idxs[i] < rhs._idxs[i])
+                    return false;
+            return false;       // equal, so not greater than.
+        }
+
+        // Generic element-wise operator.
+        // Returns a new object.
+        inline Indices combineElements(std::function<void (idx_t& lhs, idx_t rhs)> func,
+                                       const Indices& other) const {
+            Indices res(*this);
+
+#if EXACT_INDICES
+            // Use just the used elements.
+            for (int i = 0; i < _ndims; i++)
+#else
+            // Use all to allow unroll and avoid jumps.
+#pragma unroll
+            for (int i = 0; i < max_idxs; i++)
+#endif
+                func(res._idxs[i], other._idxs[i]);
+            return res;
+        }
+
+        // Some element-wise operators.
+        // These all return a new set of Indices rather
+        // than modifying this object.
+        inline Indices addElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; },
+                                   other);
+        }
+        inline Indices subElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; },
+                                   other);
+        }
+        inline Indices mulElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; },
+                                   other);
+        }
+        inline Indices divElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; },
+                                   other);
+        }
+        inline Indices minElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); },
+                                   other);
+        }
+        inline Indices maxElements(const Indices& other) const {
+            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); },
+                                   other);
+        }
+
+        // Generic element-wise operator with RHS const.
+        // Returns a new object.
+        inline Indices mapElements(std::function<void (idx_t& lhs, idx_t rhs)> func,
+                                   idx_t crhs) const {
+            Indices res(*this);
+
+#if EXACT_INDICES
+            // Use just the used elements.
+            for (int i = 0; i < _ndims; i++)
+#else
+            // Use all to allow unroll and avoid jumps.
+#pragma unroll
+            for (int i = 0; i < max_idxs; i++)
+#endif
+                func(res._idxs[i], crhs);
+            return res;
+        }
+
+        // Operate on all elements.
+        Indices addConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; },
+                               crhs);
+        }
+        Indices subConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; },
+                               crhs);
+        }
+        Indices mulConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; },
+                               crhs);
+        }
+        Indices divConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; },
+                               crhs);
+        }
+        Indices minConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); },
+                               crhs);
+        }
+        Indices maxConst(idx_t crhs) const {
+            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); },
+                               crhs);
+        }
+
+        // Reduce over all elements.
+        idx_t sum() const {
+            idx_t res = 0;
+            for (int i = 0; i < _ndims; i++)
+                res += _idxs[i];
+            return res;
+        }
+        idx_t product() const {
+            idx_t res = 1;
+            for (int i = 0; i < _ndims; i++)
+                res *= _idxs[i];
+            return res;
+        }
+
+        // Make string like "x=4, y=8".
+        std::string makeDimValStr(const GridDimNames& names,
+                                  std::string separator=", ",
+                                  std::string infix="=",
+                                  std::string prefix="",
+                                  std::string suffix="") const {
+            assert((int)names.size() == _ndims);
+
+            // Make a Tuple from names.
+            IdxTuple tmp;
+            for (int i = 0; i < int(names.size()); i++)
+                tmp.addDimBack(names[i], _idxs[i]);
+            return tmp.makeDimValStr(separator, infix, prefix, suffix);
+        }
+
+        // Make string like "4, 3, 2".
+        std::string makeValStr(int nvals,
+                               std::string separator=", ",
+                               std::string prefix="",
+                               std::string suffix="") const {
+            assert(nvals == _ndims);
+
+            // Make a Tuple w/o useful names.
+            IdxTuple tmp;
+            for (int i = 0; i < nvals; i++)
+                tmp.addDimBack(std::string(1, '0' + char(i)), _idxs[i]);
+            return tmp.makeValStr(separator, prefix, suffix);
+        }
+    };
+
+    // Define OMP reductions on Indices.
+#pragma omp declare reduction(min_idxs : Indices : \
+                              omp_out = omp_out.minElements(omp_in) )   \
+    initializer (omp_priv = omp_orig)
+#pragma omp declare reduction(max_idxs : Indices : \
+                              omp_out = omp_out.maxElements(omp_in) )   \
+    initializer (omp_priv = omp_orig)
+
+    // Layout algorithms using Indices.
+#include "yask_layouts.hpp"
+
+    // Forward defns.
+    struct Dims;
+
+    // A group of Indices needed for generated loops.
+    // See the help message from gen_loops.pl for the
+    // documentation of the indices.
+    // Make sure this stays non-virtual.
+    struct ScanIndices {
+        int ndims = 0;
+
+        // Input values; not modified.
+        Indices begin, end;     // first and end (beyond last) range of each index.
+        Indices step;           // step value within range.
+        Indices align;          // alignment of steps after first one.
+        Indices align_ofs;      // adjustment for alignment (see below).
+        Indices group_size;     // proximity grouping within range.
+
+        // Alignment: when possible, start positions after the first
+        // in each dim will be aligned such that ((start - align_ofs) % align) == 0.
+
+        // Output values; set once for entire range.
+        Indices num_indices;    // number of indices in each dim.
+        idx_t   linear_indices = 0; // total indices over all dims (product of num_indices).
+
+        // Output values; set for each index by loop code.
+        Indices start, stop;    // first and last+1 for this sub-range.
+        Indices index;          // 0-based unique index for each sub-range in each dim.
+        idx_t   linear_index = 0;   // 0-based index over all dims.
+
+        // Example w/3 sub-ranges in overall range:
+        // begin                                         end
+        //   |--------------------------------------------|
+        //   |------------------|------------------|------|
+        // start               stop                            (index = 0)
+        //                    start               stop         (index = 1)
+        //                                       start   stop  (index = 2)
+
+        // Ctor.
+        ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs);
+
+        // Init from outer-loop indices.
+        // Start..stop from point in outer loop become begin..end
+        // for this loop.
+        //
+        // Example:
+        // begin              (outer)                    end
+        //   |--------------------------------------------|
+        //   |------------------|------------------|------|
+        // start      |        stop
+        //            V
+        // begin    (this)     end
+        //   |------------------|
+        // start               stop  (may be sub-dividied later)
+        void initFromOuter(const ScanIndices& outer) {
+
+            // Begin & end set from start & stop of outer loop.
+            begin = start = outer.start;
+            end = stop = outer.stop;
+
+            // Pass some values through.
+            align = outer.align;
+            align_ofs = outer.align_ofs;
+
+            // Leave others alone.
+        }
+    };
+
+} // yask namespace.
diff --git a/src/kernel/lib/new_grid.cpp b/src/kernel/lib/new_grid.cpp
index 559554bf..3adadc54 100644
--- a/src/kernel/lib/new_grid.cpp
+++ b/src/kernel/lib/new_grid.cpp
@@ -30,75 +30,76 @@ namespace yask {
 
     // Make a new grid.
     YkGridPtr StencilContext::newGrid(const std::string& name,
-                                      const GridDimNames& dims,
+                                      const GridDimNames& gdims,
                                       const GridDimSizes* sizes) {
+        STATE_VARS(this);
 
         // Check parameters.
         bool got_sizes = sizes != NULL;
         if (got_sizes) {
-            if (dims.size() != sizes->size()) {
+            if (gdims.size() != sizes->size()) {
                 FORMAT_AND_THROW_YASK_EXCEPTION("Error: attempt to create grid '" << name << "' with " <<
-                                                dims.size() << " dimension names but " << sizes->size() <<
+                                                gdims.size() << " dimension names but " << sizes->size() <<
                                                 " dimension sizes");
             }
         }
 
         // First, try to make a grid that matches the layout in
         // the stencil.
-        YkGridPtr gp = newStencilGrid(name, dims);
+        YkGridPtr gp = newStencilGrid(name, gdims);
 
         // No match.
         if (!gp) {
 
             // Tuple of dims.
             IdxTuple dtup;
-            for (auto& d : dims)
+            for (auto& d : gdims)
                 dtup.addDimBack(d, 0);
 
 #if ALLOW_NEW_GRIDS
             // Allow new grid types.
             
             // Check dims.
-            int ndims = dims.size();
-            int step_posn = -1;      // -1 => not used.
+            int ndims = gdims.size();
+            bool step_used = false;
             set<string> seenDims;
             for (int i = 0; i < ndims; i++) {
+                auto& gdim = gdims[i];
 
                 // Already used?
-                if (seenDims.count(dims[i])) {
+                if (seenDims.count(gdim)) {
                     THROW_YASK_EXCEPTION("Error: cannot create grid '" + name +
-                                         "' because dimension '" + dims[i] +
+                                         "' because dimension '" + gdim +
                                          "' is used more than once");
                 }
 
                 // Step dim?
-                if (dims[i] == _dims->_step_dim) {
-                    step_posn = i;
-                    if (i > 0) {
+                if (gdim == step_dim) {
+                    step_used = true;
+                    if (i != step_posn) {
                         THROW_YASK_EXCEPTION("Error: cannot create grid '" + name +
-                                             "' because step dimension '" + dims[i] +
-                                             "' must be first dimension");
+                                             "' because step dimension '" + gdim +
+                                             "' is not first dimension");
                     }
                 }
 
                 // Domain dim?
-                else if (_dims->_domain_dims.lookup(dims[i])) {
+                else if (domain_dims.lookup(gdim)) {
                 }
 
                 // Known misc dim?
-                else if (_dims->_misc_dims.lookup(dims[i])) {
+                else if (misc_dims.lookup(gdim)) {
                 }
 
                 // New misc dim?
                 else {
-                    _dims->_misc_dims.addDimBack(dims[i], 0);
+                    misc_dims.addDimBack(gdim, 0);
                 }
             }
-            bool do_wrap = step_posn >= 0;
 
             // Scalar?
             if (ndims == 0)
-                gp = make_shared<YkElemGrid<Layout_0d, false>>(_dims, name, dims, &_opts, &_ostr);
+                gp = make_shared<YkElemGrid<Layout_0d, false>>(*this, name, gdims);
 
             // Include auto-gen code for all other cases.
 #include "yask_grid_code.hpp"
@@ -123,20 +124,20 @@ namespace yask {
         // Add to context.
         addGrid(gp, false);     // mark as non-output grid.
 
-        // Set sizes as provided or via solution settings.
+        // Set sizes as provided.
         if (got_sizes) {
-            int ndims = dims.size();
+            int ndims = gdims.size();
             for (int i = 0; i < ndims; i++) {
-                auto& dname = dims[i];
+                auto& gdim = gdims[i];
 
                 // Domain size.
                 gp->_set_domain_size(i, sizes->at(i));
 
                 // Pads.
                 // Set via both 'extra' and 'min'; larger result will be used.
-                if (_dims->_domain_dims.lookup(dname)) {
-                    gp->set_extra_pad_size(i, _opts->_extra_pad_sizes[dname]);
-                    gp->set_min_pad_size(i, _opts->_min_pad_sizes[dname]);
+                if (domain_dims.lookup(gdim)) {
+                    gp->set_extra_pad_size(i, opts->_extra_pad_sizes[gdim]);
+                    gp->set_min_pad_size(i, opts->_min_pad_sizes[gdim]);
                 }
 
                 // Offsets.
@@ -145,9 +146,11 @@ namespace yask {
             }
         }
 
+        // Set sizes based on solution settings.
         else
             update_grid_info();
 
+        // Mark as created via API.
         gp->set_new_grid(true);
         return gp;
     }
diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp
index b3c7437c..7b3f350e 100644
--- a/src/kernel/lib/realv_grids.cpp
+++ b/src/kernel/lib/realv_grids.cpp
@@ -31,13 +31,14 @@ using namespace std;
 namespace yask {
 
     // Ctor.
-    YkGridBase::YkGridBase(GenericGridBase* ggb,
-                           const GridDimNames& dimNames,
-                           DimsPtr dims) :
-    _ggb(ggb), _dims(dims) {
-
+    // Important: '*ggb' is NOT yet constructed.
+    YkGridBase::YkGridBase(KernelStateBase& stateb,
+                           GenericGridBase* ggb,
+                           const GridDimNames& dimNames) :
+        KernelStateBase(stateb),
+        _ggb(ggb) {
+        STATE_VARS(this);
         assert(ggb);
-        assert(dims.get());
 
         // Init indices.
         int n = int(dimNames.size());
@@ -62,9 +63,9 @@ namespace yask {
         for (int i = 0; i < dimNames.size(); i++) {
             idx_t mbit = 1LL << i;
             auto& dname = dimNames[i];
-            if (dname == _dims->_step_dim)
+            if (dname == step_dim)
                 _step_dim_mask |= mbit;
-            else if (_dims->_domain_dims.lookup(dname))
+            else if (domain_dims.lookup(dname))
                 _domain_dim_mask |= mbit;
             else
                 _misc_dim_mask |= mbit;
@@ -127,6 +128,7 @@ namespace yask {
     // Does not include user-specified min padding or
     // final rounding for left pad.
     Indices YkGridBase::getReqdPad(const Indices& halos, const Indices& wf_exts) const {
+        STATE_VARS(this);
 
         // Start with halos plus WF exts.
         Indices mp = halos.addElements(wf_exts);
@@ -144,7 +146,7 @@ namespace yask {
         for (int i = 0; i < get_num_dims(); i++) {
             if (mp[i] >= 1) {
                 auto& dname = get_dim_name(i);
-                auto* p = _dims->_fold_pts.lookup(dname);
+                auto* p = dims->_fold_pts.lookup(dname);
                 if (p) {
                     assert (*p >= 1);
                     mp[i] += *p - 1;
@@ -158,6 +160,7 @@ namespace yask {
     // Modifies _pads and _allocs.
     // Fails if mem different and already alloc'd.
     void YkGridBase::resize() {
+        STATE_VARS(this);
 
         // Original size.
         auto p = get_raw_storage_buffer();
@@ -216,7 +219,7 @@ namespace yask {
 
                 // Make inner dim an odd number of vecs.
                 // This reportedly helps avoid some uarch aliasing.
-                if (!p && get_dim_name(i) == _dims->_inner_dim &&
+                if (!p && get_dim_name(i) == inner_dim &&
                     (new_allocs[i] / _vec_lens[i]) % 2 == 0) {
                     new_right_pads[i] += _vec_lens[i];
                     new_allocs[i] += _vec_lens[i];
@@ -269,7 +272,7 @@ namespace yask {
         if (old_allocs != new_allocs || old_dirty != new_dirty) {
             Indices first_allocs = _rank_offsets.subElements(_actl_left_pads);
             Indices end_allocs = first_allocs.addElements(_allocs);
-            TRACE_MSG0(get_ostr(), "grid '" << get_name() << "' resized from " <<
+            TRACE_MSG("grid '" << get_name() << "' resized from " <<
                        makeIndexString(old_allocs, " * ") << " to " <<
                        makeIndexString(new_allocs, " * ") << " at [" <<
                        makeIndexString(first_allocs) << " ... " << 
@@ -288,18 +291,19 @@ namespace yask {
                                   bool step_ok,
                                   bool domain_ok,
                                   bool misc_ok) const {
+        STATE_VARS(this);
         if (!is_dim_used(dim))
             THROW_YASK_EXCEPTION("Error in " + fn_name + "(): dimension '" +
                                  dim + "' not found in " + make_info_string());
-        _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok);
+        dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok);
     }
 
     // Check for equality.
     // Return number of mismatches greater than epsilon.
     idx_t YkGridBase::compare(const YkGridBase* ref,
                               real_t epsilon,
-                              int maxPrint,
-                              std::ostream& os) const {
+                              int maxPrint) const {
+        STATE_VARS(this);
         if (!ref) {
             os << "** mismatch: no reference grid.\n";
             return get_num_storage_elements();
@@ -316,7 +320,7 @@ namespace yask {
         // same values in extra-padding area.
         // TODO: check layout.
         idx_t errs = _ggb->count_diffs(ref->_ggb, epsilon);
-        TRACE_MSG0(get_ostr(), "count_diffs() returned " << errs);
+        TRACE_MSG("count_diffs() returned " << errs);
         if (!errs)
             return 0;
 
@@ -345,7 +349,7 @@ namespace yask {
                     // Don't compare points outside the domain.
                     // TODO: check points in halo.
                     auto& dname = pt.getDimName(i);
-                    if (_dims->_domain_dims.lookup(dname)) {
+                    if (domain_dims.lookup(dname)) {
                         auto first_ok = get_first_rank_domain_index(dname);
                         auto last_ok = get_last_rank_domain_index(dname);
                         if (opt[i] < first_ok || opt[i] > last_ok)
@@ -374,7 +378,7 @@ namespace yask {
                 }
                 return true;    // keep visiting.
             });
-        TRACE_MSG0(get_ostr(), "detailed compare returned " << errs);
+        TRACE_MSG("detailed compare returned " << errs);
         return errs;
     }
 
@@ -492,6 +496,7 @@ namespace yask {
                                           const real_vec_t& val,
                                           int line,
                                           bool newline) const {
+        STATE_VARS(this);
 
         // Convert to elem indices.
         Indices eidxs = idxs.mulElements(_vec_lens);
@@ -503,7 +508,7 @@ namespace yask {
         eidxs.setTupleVals(idxs2);      // set vals from eidxs.
 
         // Visit every point in fold.
-        IdxTuple folds = _dims->_fold_pts;
+        IdxTuple folds = dims->_fold_pts;
         folds.visitAllPoints([&](const IdxTuple& fofs,
                                  size_t idx) {
 
diff --git a/src/kernel/lib/realv_grids.hpp b/src/kernel/lib/realv_grids.hpp
index 2a3a99a7..11fcd957 100644
--- a/src/kernel/lib/realv_grids.hpp
+++ b/src/kernel/lib/realv_grids.hpp
@@ -34,6 +34,7 @@ namespace yask {
     // Base class implementing all yk_grids. Can be used for grids
     // that contain either individual elements or vectors.
     class YkGridBase :
+        public KernelStateBase,
         public virtual yk_grid {
 
         // Rank and local offsets in domain dim:
@@ -61,9 +62,6 @@ namespace yask {
         // actual data, message stream.
         GenericGridBase* _ggb = 0;
 
-        // Problem dimensions. (NOT grid dims.)
-        DimsPtr _dims;
-
         // The following masks have one bit for each dim in the grid.
         idx_t _step_dim_mask = 0;
         idx_t _domain_dim_mask = 0;
@@ -83,7 +81,7 @@ namespace yask {
         Indices _allocs;    // actual grid alloc in reals | same.
 
         // Sizes in vectors for sizes that are always vec lens (to avoid division).
-        // Each entry _vec_lens may be same as _dims->_fold_pts or one, depending
+        // Each entry _vec_lens may be same as dims->_fold_pts or one, depending
         // on whether grid is fully vectorized.
         Indices _vec_lens;  // num reals in each elem | one.
         Indices _vec_left_pads; // same as _actl_left_pads.
@@ -178,9 +176,9 @@ namespace yask {
                                  const Indices& last_indices) const;
 
     public:
-        YkGridBase(GenericGridBase* ggb,
-                   const GridDimNames& dimNames,
-                   DimsPtr dims);
+        YkGridBase(KernelStateBase& state,
+                   GenericGridBase* ggb,
+                   const GridDimNames& dimNames);
         virtual ~YkGridBase() { }
 
         // Halo-exchange flag accessors.
@@ -295,8 +293,7 @@ namespace yask {
         // Return number of mismatches greater than epsilon.
         virtual idx_t compare(const YkGridBase* ref,
                               real_t epsilon = EPSILON,
-                              int maxPrint = 20,
-                              std::ostream& os = std::cerr) const;
+                              int maxPrint = 20) const;
 
         // Make sure indices are in range.
         // Optionally fix them to be in range and return in 'fixed_indices'.
@@ -610,13 +607,11 @@ namespace yask {
         }
 
     public:
-        YkElemGrid(DimsPtr dims,
+        YkElemGrid(KernelStateBase& state,
                    std::string name,
-                   const GridDimNames& dimNames,
-                   KernelSettingsPtr* settings,
-                   std::ostream** ostr) :
-            YkGridBase(&_data, dimNames, dims),
-            _data(name, dimNames, settings, ostr) {
+                   const GridDimNames& dimNames) :
+            YkGridBase(state, &_data, dimNames),
+            _data(state, name, dimNames) {
             _has_step_dim = _use_step_idx;
             resize();
         }
@@ -730,14 +725,13 @@ namespace yask {
         }
 
     public:
-        YkVecGrid(DimsPtr dims,
+        YkVecGrid(KernelStateBase& stateb,
                   const std::string& name,
-                  const GridDimNames& dimNames,
-                  KernelSettingsPtr* settings,
-                  std::ostream** ostr) :
-            YkGridBase(&_data, dimNames, dims),
-            _data(name, dimNames, settings, ostr),
+                  const GridDimNames& dimNames) :
+            YkGridBase(stateb, &_data, dimNames),
+            _data(stateb, name, dimNames),
             _vec_fold_posns(idx_t(0), int(dimNames.size())) {
+            STATE_VARS(this);
             _has_step_dim = _use_step_idx;
 
             // Template vec lengths.
@@ -805,9 +799,10 @@ namespace yask {
         virtual const real_t* getElemPtr(const Indices& idxs,
                                          idx_t alloc_step_idx,
                                          bool checkBounds=true) const final {
+            STATE_VARS_CONST(this);
 
 #ifdef TRACE_MEM
-            _data.get_ostr() << get_name() << "." << "YkVecGrid::getElemPtr(" <<
+            ostr << get_name() << "." << "YkVecGrid::getElemPtr(" <<
                 idxs.makeValStr(get_num_dims()) << ")";
 #endif
             // Use template vec lengths instead of run-time values for
@@ -865,13 +860,13 @@ namespace yask {
             }
 
             // Get 1D element index into vector.
-            auto i = _dims.get()->getElemIndexInVec(fold_ofs);
+            auto i = dims->getElemIndexInVec(fold_ofs);
 
 #ifdef DEBUG_LAYOUT
             // Compare to more explicit offset extraction.
             IdxTuple eofs = get_allocs(); // get dims for this grid.
             elem_ofs.setTupleVals(eofs);  // set vals from elem_ofs.
-            auto i2 = _dims->getElemIndexInVec(eofs);
+            auto i2 = dims->getElemIndexInVec(eofs);
             assert(i == i2);
 #endif
 
@@ -1029,6 +1024,7 @@ namespace yask {
                                         idx_t first_alloc_step_idx,
                                         const Indices& last_indices,
                                         idx_t last_alloc_step_idx) {
+            STATE_VARS(this);
             if (!is_storage_allocated())
                 return 0;
             Indices firstv, lastv;
@@ -1037,7 +1033,7 @@ namespace yask {
 
             // Find range.
             IdxTuple numVecsTuple = get_slice_range(firstv, lastv);
-            TRACE_MSG0(get_ostr(), "set_vecs_in_slice: setting " <<
+            TRACE_MSG("set_vecs_in_slice: setting " <<
                        numVecsTuple.makeDimValStr(" * ") << " vecs at [" <<
                        makeIndexString(firstv) << " ... " <<
                        makeIndexString(lastv) << "]");
@@ -1083,22 +1079,17 @@ namespace yask {
                                         idx_t first_alloc_step_idx,
                                         const Indices& last_indices,
                                         idx_t last_alloc_step_idx) const {
-            if (!is_storage_allocated()) {
-                yask_exception e;
-                std::stringstream err;
-                err << "Error: call to 'get_vecs_in_slice' with no data allocated for grid '" <<
-                    get_name() << "'.\n";
-                e.add_message(err.str());
-                throw e;
-                //exit_yask(1);
-            }
+            STATE_VARS(this);
+            if (!is_storage_allocated())
+                FORMAT_AND_THROW_YASK_EXCEPTION("Error: call to 'get_vecs_in_slice' with no data allocated for grid '" <<
+                                                get_name());
             Indices firstv, lastv;
             checkIndices(first_indices, "get_vecs_in_slice", true, true, &firstv);
             checkIndices(last_indices, "get_vecs_in_slice", true, true, &lastv);
 
             // Find range.
             IdxTuple numVecsTuple = get_slice_range(firstv, lastv);
-            TRACE_MSG0(get_ostr(), "get_vecs_in_slice: getting " <<
+            TRACE_MSG("get_vecs_in_slice: getting " <<
                        numVecsTuple.makeDimValStr(" * ") << " vecs at " <<
                        makeIndexString(firstv) << " ... " <<
                        makeIndexString(lastv));
diff --git a/src/kernel/lib/settings.cpp b/src/kernel/lib/settings.cpp
index fd56c46f..379e957b 100644
--- a/src/kernel/lib/settings.cpp
+++ b/src/kernel/lib/settings.cpp
@@ -28,6 +28,20 @@ using namespace std;
 
 namespace yask {
 
+    // Set debug output to cout if my_rank == msg_rank
+    // or a null stream otherwise.
+    ostream& KernelStateBase::set_ostr() {
+        assert(_state);
+        assert(_state->_env);
+        assert(_state->_opts);
+        yask_output_factory yof;
+        if (_state->_env->my_rank == _state->_opts->msg_rank)
+            set_debug_output(yof.new_stdout_output());
+        else
+            set_debug_output(yof.new_null_output());
+        return get_debug_output()->get_ostream();
+    }
+
     // Check whether dim is of allowed type.
     void Dims::checkDimType(const std::string& dim,
                             const std::string& fn_name,
@@ -66,8 +80,7 @@ namespace yask {
         return new_env(MPI_COMM_NULL);
     }
 
-    ///// KernelEnv functions:
-
+    // KernelEnv global lock objects.
     omp_lock_t KernelEnv::_debug_lock;
     bool KernelEnv::_debug_lock_init_done = false;
     
@@ -221,6 +234,52 @@ namespace yask {
         return bufs[i].bufs[bd];
     }
 
+    // Settings ctor.
+    KernelSettings::KernelSettings(DimsPtr dims, KernelEnvPtr env) :
+        _dims(dims), max_threads(env->max_threads) {
+        auto& step_dim = dims->_step_dim;
+
+        // Use both step and domain dims for all size tuples.
+        _rank_sizes = dims->_stencil_dims;
+        _rank_sizes.setValsSame(def_rank);             // size of rank.
+        _rank_sizes.setVal(step_dim, 0);        // not used.
+
+        _region_sizes = dims->_stencil_dims;
+        _region_sizes.setValsSame(0);          // 0 => default settings.
+
+        _block_group_sizes = dims->_stencil_dims;
+        _block_group_sizes.setValsSame(0); // 0 => min size.
+
+        _block_sizes = dims->_stencil_dims;
+        _block_sizes.setValsSame(def_block); // size of block.
+        _block_sizes.setVal(step_dim, 0); // 0 => default.
+
+        _mini_block_group_sizes = dims->_stencil_dims;
+        _mini_block_group_sizes.setValsSame(0); // 0 => min size.
+
+        _mini_block_sizes = dims->_stencil_dims;
+        _mini_block_sizes.setValsSame(0);            // 0 => default settings.
+
+        _sub_block_group_sizes = dims->_stencil_dims;
+        _sub_block_group_sizes.setValsSame(0); // 0 => min size.
+
+        _sub_block_sizes = dims->_stencil_dims;
+        _sub_block_sizes.setValsSame(0);            // 0 => default settings.
+
+        _min_pad_sizes = dims->_stencil_dims;
+        _min_pad_sizes.setValsSame(0);
+
+        _extra_pad_sizes = dims->_stencil_dims;
+        _extra_pad_sizes.setValsSame(0);
+
+        // Use only domain dims for MPI tuples.
+        _num_ranks = dims->_domain_dims;
+        _num_ranks.setValsSame(1);
+
+        _rank_indices = dims->_domain_dims;
+        _rank_indices.setValsSame(0);
+    }
+
     // Add options to set one domain var to a cmd-line parser.
     void KernelSettings::_add_domain_option(CommandLineParser& parser,
                                             const std::string& prefix,
@@ -515,7 +574,7 @@ namespace yask {
     // Make sure all user-provided settings are valid and finish setting up some
     // other vars before allocating memory.
     // Called from prepare_solution(), during auto-tuning, etc.
-    void KernelSettings::adjustSettings(std::ostream& os, KernelEnvPtr env) {
+    void KernelSettings::adjustSettings(std::ostream& os) {
 
         auto& step_dim = _dims->_step_dim;
         auto& inner_dim = _dims->_inner_dim;
@@ -711,4 +770,44 @@ namespace yask {
 #endif
     }
 
+    // Ctor.
+    KernelStateBase::KernelStateBase(KernelEnvPtr& kenv,
+                                     KernelSettingsPtr& ksettings)
+    {
+        // Create state. All other objects that need to share
+        // this state should use a shared ptr to it.
+        _state = make_shared<KernelState>();
+
+        // Share passed ptrs.
+        _state->_env = kenv;
+        _state->_opts = ksettings;
+        _state->_dims = ksettings->_dims;
+
+        // Set _state->_debug per settings.
+        set_ostr();
+
+        // Create MPI Info object.
+        _state->_mpiInfo = make_shared<MPIInfo>(ksettings->_dims);
+
+        // Set vars after above inits.
+        STATE_VARS(this);
+
+        // Find index posns in stencil dims.
+        DOMAIN_VAR_LOOP(i, j) {
+            auto& dname = stencil_dims.getDimName(i);
+            if (state->_outer_posn < 0)
+                state->_outer_posn = i;
+            if (dname == dims->_inner_dim)
+                state->_inner_posn = i;
+        }
+        assert(outer_posn == state->_outer_posn);
+    }
+
+    // ContextLinker ctor.
+    ContextLinker::ContextLinker(StencilContext* context) :
+        KernelStateBase(context->get_state()),
+        _context(context) {
+        assert(context);
+    }
+
 } // namespace yask.
diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp
index 6113a980..c979c0e3 100644
--- a/src/kernel/lib/settings.hpp
+++ b/src/kernel/lib/settings.hpp
@@ -27,315 +27,8 @@ IN THE SOFTWARE.
 
 namespace yask {
 
-    typedef Tuple<idx_t> IdxTuple;
-    typedef std::vector<idx_t> GridIndices;
-    typedef std::vector<idx_t> GridDimSizes;
-    typedef std::vector<std::string> GridDimNames;
-
-    // A class to hold up to a given number of sizes or indices efficiently.
-    // Similar to a Tuple, but less overhead and doesn't keep names.
-    // Make sure this stays non-virtual.
-    // TODO: make this a template with _ndims as a parameter.
-    // TODO: ultimately, combine with Tuple w/o loss of efficiency.
-    class Indices {
-
-    public:
-
-        // Max number of indices that can be held.
-        // Note use of "+max_idxs" in code below to avoid compiler
-        // trying to take a reference to it, resulting in a undefined
-        // symbol (sometimes).
-        static constexpr int max_idxs = MAX_DIMS;
-
-        // Step dim is always in [0] of an Indices type (if it is used).
-        static constexpr int step_posn = 0;
-
-    protected:
-        idx_t _idxs[max_idxs];
-        int _ndims;
-
-    public:
-        // Ctors.
-        Indices() : _ndims(0) { }
-        Indices(int ndims) : _ndims(ndims) { } // NB: _idxs remain uninit!
-        Indices(const IdxTuple& src) {
-            setFromTuple(src);
-        }
-        Indices(const GridIndices& src) {
-            setFromVec(src);
-        }
-        Indices(const std::initializer_list<idx_t>& src) {
-            setFromInitList(src);
-        }
-        Indices(const idx_t src[], int ndims) {
-            setFromArray(src, ndims);
-        }
-        Indices(idx_t src, int ndims) {
-            setFromConst(src, ndims);
-        }
-
-        // Default copy ctor, copy operator should be okay.
-
-        // Access size.
-        inline int getNumDims() const {
-            return _ndims;
-        }
-        inline void setNumDims(int n) {
-            _ndims = n;
-        }
-
-        // Access indices.
-        inline idx_t& operator[](int i) {
-            assert(i >= 0);
-            assert(i < _ndims);
-            return _idxs[i];
-        }
-        inline const idx_t& operator[](int i) const {
-            assert(i >= 0);
-            assert(i < _ndims);
-            return _idxs[i];
-        }
-
-        // Write to an IdxTuple.
-        // The 'tgt' must have the same number of dims.
-        void setTupleVals(IdxTuple& tgt) const {
-            assert(tgt.size() == _ndims);
-            for (int i = 0; i < _ndims; i++)
-                if (i < tgt.size())
-                    tgt.setVal(i, _idxs[i]);
-        }
-
-        // Read from an IdxTuple.
-        void setFromTuple(const IdxTuple& src) {
-            assert(src.size() <= +max_idxs);
-            int n = int(src.size());
-            for (int i = 0; i < n; i++)
-                _idxs[i] = src.getVal(i);
-            _ndims = n;
-        }
-
-        // Other inits.
-        void setFromVec(const GridIndices& src) {
-            assert(src.size() <= +max_idxs);
-            int n = int(src.size());
-            for (int i = 0; i < n; i++)
-                _idxs[i] = src[i];
-            _ndims = n;
-        }
-
-        // default n => don't change _ndims.
-        void setFromArray(const idx_t src[], int n = -1) {
-            if (n < 0)
-                n = _ndims;
-            assert(n <= +max_idxs);
-            for (int i = 0; i < n; i++)
-                _idxs[i] = src[i];
-            _ndims = n;
-        }
-        void setFromInitList(const std::initializer_list<idx_t>& src) {
-            assert(src.size() <= +max_idxs);
-            int i = 0;
-            for (auto idx : src)
-                _idxs[i++] = idx;
-            _ndims = i;
-        }
-
-        // default n => don't change _ndims.
-        void setFromConst(idx_t val, int n = -1) {
-            if (n < 0)
-                n = _ndims;
-            assert(n <= +max_idxs);
-            for (int i = 0; i < n; i++)
-                _idxs[i] = val;
-            _ndims = n;
-        }
-
-        // Some comparisons.
-        // These assume all the indices are valid or
-        // initialized to the same value.
-        bool operator==(const Indices& rhs) const {
-            if (_ndims != rhs._ndims)
-                return false;
-            for (int i = 0; i < _ndims; i++)
-                if (_idxs[i] != rhs._idxs[i])
-                    return false;
-            return true;
-        }
-        bool operator!=(const Indices& rhs) const {
-            return !operator==(rhs);
-        }
-        bool operator<(const Indices& rhs) const {
-            if (_ndims < rhs._ndims)
-                return true;
-            else if (_ndims > rhs._ndims)
-                return false;
-            for (int i = 0; i < _ndims; i++)
-                if (_idxs[i] < rhs._idxs[i])
-                    return true;
-                else if (_idxs[i] > rhs._idxs[i])
-                    return false;
-            return false;       // equal, so not less than.
-        }
-        bool operator>(const Indices& rhs) const {
-            if (_ndims > rhs._ndims)
-                return true;
-            else if (_ndims < rhs._ndims)
-                return false;
-            for (int i = 0; i < _ndims; i++)
-                if (_idxs[i] > rhs._idxs[i])
-                    return true;
-                else if (_idxs[i] < rhs._idxs[i])
-                    return false;
-            return false;       // equal, so not greater than.
-        }
-
-        // Generic element-wise operator.
-        // Returns a new object.
-        inline Indices combineElements(std::function<void (idx_t& lhs, idx_t rhs)> func,
-                                       const Indices& other) const {
-            Indices res(*this);
-
-#if EXACT_INDICES
-            // Use just the used elements.
-            for (int i = 0; i < _ndims; i++)
-#else
-            // Use all to allow unroll and avoid jumps.
-#pragma unroll
-            for (int i = 0; i < max_idxs; i++)
-#endif
-                func(res._idxs[i], other._idxs[i]);
-            return res;
-        }
-
-        // Some element-wise operators.
-        // These all return a new set of Indices rather
-        // than modifying this object.
-        inline Indices addElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; },
-                                   other);
-        }
-        inline Indices subElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; },
-                                   other);
-        }
-        inline Indices mulElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; },
-                                   other);
-        }
-        inline Indices divElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; },
-                                   other);
-        }
-        inline Indices minElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); },
-                                   other);
-        }
-        inline Indices maxElements(const Indices& other) const {
-            return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); },
-                                   other);
-        }
-
-        // Generic element-wise operator with RHS const.
-        // Returns a new object.
-        inline Indices mapElements(std::function<void (idx_t& lhs, idx_t rhs)> func,
-                                   idx_t crhs) const {
-            Indices res(*this);
-
-#if EXACT_INDICES
-            // Use just the used elements.
-            for (int i = 0; i < _ndims; i++)
-#else
-            // Use all to allow unroll and avoid jumps.
-#pragma unroll
-            for (int i = 0; i < max_idxs; i++)
-#endif
-                func(res._idxs[i], crhs);
-            return res;
-        }
-
-        // Operate on all elements.
-        Indices addConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; },
-                               crhs);
-        }
-        Indices subConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; },
-                               crhs);
-        }
-        Indices mulConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; },
-                               crhs);
-        }
-        Indices divConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; },
-                               crhs);
-        }
-        Indices minConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); },
-                               crhs);
-        }
-        Indices maxConst(idx_t crhs) const {
-            return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); },
-                               crhs);
-        }
-
-        // Reduce over all elements.
-        idx_t sum() const {
-            idx_t res = 0;
-            for (int i = 0; i < _ndims; i++)
-                res += _idxs[i];
-            return res;
-        }
-        idx_t product() const {
-            idx_t res = 1;
-            for (int i = 0; i < _ndims; i++)
-                res *= _idxs[i];
-            return res;
-        }
-
-        // Make string like "x=4, y=8".
-        std::string makeDimValStr(const GridDimNames& names,
-                                  std::string separator=", ",
-                                  std::string infix="=",
-                                  std::string prefix="",
-                                  std::string suffix="") const {
-            assert((int)names.size() == _ndims);
-
-            // Make a Tuple from names.
-            IdxTuple tmp;
-            for (int i = 0; i < int(names.size()); i++)
-                tmp.addDimBack(names[i], _idxs[i]);
-            return tmp.makeDimValStr(separator, infix, prefix, suffix);
-        }
-
-        // Make string like "4, 3, 2".
-        std::string makeValStr(int nvals,
-                               std::string separator=", ",
-                               std::string prefix="",
-                               std::string suffix="") const {
-            assert(nvals == _ndims);
-
-            // Make a Tuple w/o useful names.
-            IdxTuple tmp;
-            for (int i = 0; i < nvals; i++)
-                tmp.addDimBack(std::string(1, '0' + char(i)), _idxs[i]);
-            return tmp.makeValStr(separator, prefix, suffix);
-        }
-    };
-
-    // Define OMP reductions on Indices.
-#pragma omp declare reduction(min_idxs : Indices : \
-                              omp_out = omp_out.minElements(omp_in) )   \
-    initializer (omp_priv = omp_orig)
-#pragma omp declare reduction(max_idxs : Indices : \
-                              omp_out = omp_out.maxElements(omp_in) )   \
-    initializer (omp_priv = omp_orig)
-
-    // Layout algorithms using Indices.
-#include "yask_layouts.hpp"
-
     // Forward defns.
-    struct StencilContext;
+    class StencilContext;
     class YkGridBase;
 
     // Some derivations from grid types.
@@ -480,96 +173,6 @@ namespace yask {
     };
     typedef std::shared_ptr<Dims> DimsPtr;
 
-    // A group of Indices needed for generated loops.
-    // See the help message from gen_loops.pl for the
-    // documentation of the indices.
-    // Make sure this stays non-virtual.
-    struct ScanIndices {
-        int ndims = 0;
-
-        // Input values; not modified.
-        Indices begin, end;     // first and end (beyond last) range of each index.
-        Indices step;           // step value within range.
-        Indices align;          // alignment of steps after first one.
-        Indices align_ofs;      // adjustment for alignment (see below).
-        Indices group_size;     // proximity grouping within range.
-
-        // Alignment: when possible, start positions after the first
-        // in each dim will be aligned such that ((start - align_ofs) % align) == 0.
-
-        // Output values; set once for entire range.
-        Indices num_indices;    // number of indices in each dim.
-        idx_t   linear_indices = 0; // total indices over all dims (product of num_indices).
-
-        // Output values; set for each index by loop code.
-        Indices start, stop;    // first and last+1 for this sub-range.
-        Indices index;          // 0-based unique index for each sub-range in each dim.
-        idx_t   linear_index = 0;   // 0-based index over all dims.
-
-        // Example w/3 sub-ranges in overall range:
-        // begin                                         end
-        //   |--------------------------------------------|
-        //   |------------------|------------------|------|
-        // start               stop                            (index = 0)
-        //                    start               stop         (index = 1)
-        //                                       start   stop  (index = 2)
-
-        // Default init.
-        ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) :
-            ndims(NUM_STENCIL_DIMS),
-            begin(idx_t(0), ndims),
-            end(idx_t(0), ndims),
-            step(idx_t(1), ndims),
-            align(idx_t(1), ndims),
-            align_ofs(idx_t(0), ndims),
-            group_size(idx_t(1), ndims),
-            num_indices(idx_t(1), ndims),
-            start(idx_t(0), ndims),
-            stop(idx_t(0), ndims),
-            index(idx_t(0), ndims) {
-
-            // i: index for stencil dims, j: index for domain dims.
-            DOMAIN_VAR_LOOP(i, j) {
-
-                // Set alignment to vector lengths.
-                if (use_vec_align)
-                    align[i] = fold_pts[j];
-
-                // Set alignment offset.
-                if (ofs) {
-                    assert(ofs->getNumDims() == ndims - 1);
-                    align_ofs[i] = ofs->getVal(j);
-                }
-            }
-        }
-
-        // Init from outer-loop indices.
-        // Start..stop from point in outer loop become begin..end
-        // for this loop.
-        //
-        // Example:
-        // begin              (outer)                    end
-        //   |--------------------------------------------|
-        //   |------------------|------------------|------|
-        // start      |        stop
-        //            V
-        // begin    (this)     end
-        //   |------------------|
-        // start               stop  (may be sub-dividied later)
-        void initFromOuter(const ScanIndices& outer) {
-
-            // Begin & end set from start & stop of outer loop.
-            begin = start = outer.start;
-            end = stop = outer.stop;
-
-            // Pass some values through.
-            align = outer.align;
-            align_ofs = outer.align_ofs;
-
-            // Leave others alone.
-        }
-    };
-
     // MPI neighbor info.
     class MPIInfo {
 
@@ -900,52 +503,8 @@ namespace yask {
         int _numa_pref = NUMA_PREF;
         int _numa_pref_max = 128; // GiB to alloc before using PMEM.
 
-        // Ctor.
-        // TODO: move code to settings.cpp.
-        KernelSettings(DimsPtr dims, KernelEnvPtr env) :
-            _dims(dims), max_threads(env->max_threads) {
-            auto& step_dim = dims->_step_dim;
-
-            // Use both step and domain dims for all size tuples.
-            _rank_sizes = dims->_stencil_dims;
-            _rank_sizes.setValsSame(def_rank);             // size of rank.
-            _rank_sizes.setVal(step_dim, 0);        // not used.
-
-            _region_sizes = dims->_stencil_dims;
-            _region_sizes.setValsSame(0);          // 0 => default settings.
-
-            _block_group_sizes = dims->_stencil_dims;
-            _block_group_sizes.setValsSame(0); // 0 => min size.
-
-            _block_sizes = dims->_stencil_dims;
-            _block_sizes.setValsSame(def_block); // size of block.
-            _block_sizes.setVal(step_dim, 0); // 0 => default.
-
-            _mini_block_group_sizes = dims->_stencil_dims;
-            _mini_block_group_sizes.setValsSame(0); // 0 => min size.
-
-            _mini_block_sizes = dims->_stencil_dims;
-            _mini_block_sizes.setValsSame(0);            // 0 => default settings.
-
-            _sub_block_group_sizes = dims->_stencil_dims;
-            _sub_block_group_sizes.setValsSame(0); // 0 => min size.
-
-            _sub_block_sizes = dims->_stencil_dims;
-            _sub_block_sizes.setValsSame(0);            // 0 => default settings.
-
-            _min_pad_sizes = dims->_stencil_dims;
-            _min_pad_sizes.setValsSame(0);
-
-            _extra_pad_sizes = dims->_stencil_dims;
-            _extra_pad_sizes.setValsSame(0);
-
-            // Use only domain dims for MPI tuples.
-            _num_ranks = dims->_domain_dims;
-            _num_ranks.setValsSame(1);
-
-            _rank_indices = dims->_domain_dims;
-            _rank_indices.setValsSame(0);
-        }
+        // Ctor/dtor.
+        KernelSettings(DimsPtr dims, KernelEnvPtr env);
         virtual ~KernelSettings() { }
 
     protected:
@@ -976,9 +535,9 @@ namespace yask {
         // values as needed.
         // Called from prepare_solution(), so it doesn't normally need to be called from user code.
         // Prints informational info to 'os'.
-        virtual void adjustSettings(std::ostream& os, KernelEnvPtr env);
-        virtual void adjustSettings(KernelEnvPtr env) {
-            adjustSettings(nullop->get_ostream(), env);
+        virtual void adjustSettings(std::ostream& os);
+        virtual void adjustSettings() {
+            adjustSettings(nullop->get_ostream());
         }
 
         // Determine if this is the first or last rank in given dim.
@@ -991,4 +550,222 @@ namespace yask {
     };
     typedef std::shared_ptr<KernelSettings> KernelSettingsPtr;
 
+    // A collection of solution meta-data whose ownership is shared between
+    // various objects.
+    struct KernelState {
+        virtual ~KernelState() { }
+
+        // Output stream for messages.
+        yask_output_ptr _debug;
+
+        // Env.
+        KernelEnvPtr _env;
+
+        // Command-line and env parameters.
+        KernelSettingsPtr _opts;
+        bool _use_pack_tuners = false;
+
+        // Problem dims.
+        DimsPtr _dims;
+
+        // Position of inner domain dim in stencil-dims tuple.
+        // Misc dims will follow this if/when using interleaving.
+        // TODO: move to Dims.
+        int _inner_posn = -1;   // -1 => not set.
+
+        // Position of outer domain dim in stencil-dims tuple.
+        // For 1D stencils, _outer_posn == _inner_posn.
+        // TODO: move to Dims.
+        int _outer_posn = -1;   // -1 => not set.
+
+        // MPI info.
+        MPIInfoPtr _mpiInfo;
+    };
+    typedef std::shared_ptr<KernelState> KernelStatePtr;
+
+    // Macro to define and set commonly-needed state vars efficiently.
+    // 'parent_p' is pointer to object containing 'KernelStatePtr _state'.
+    // '*_posn' vars are positions in stencil_dims.
+#define STATE_VARS0(parent_p, pfx)                                      \
+    pfx auto* pp = parent_p;                                            \
+    assert(pp);                                                         \
+    pfx auto* state = pp->_state.get();                                 \
+    assert(state);                                                      \
+    assert(state->_debug.get());                                        \
+    auto& os = state->_debug.get()->get_ostream();                      \
+    pfx auto* env = state->_env.get();                                  \
+    assert(env);                                                        \
+    pfx auto* opts = state->_opts.get();                                \
+    assert(opts);                                                       \
+    pfx auto* dims = state->_dims.get();                                \
+    assert(dims);                                                       \
+    pfx auto* mpiInfo = state->_mpiInfo.get();                          \
+    assert(mpiInfo);                                                    \
+    const auto& step_dim = dims->_step_dim;                             \
+    const auto& inner_dim = dims->_inner_dim;                           \
+    const auto& domain_dims = dims->_domain_dims;                       \
+    constexpr int nddims = NUM_DOMAIN_DIMS;                             \
+    assert(nddims == domain_dims.size());                               \
+    const auto& stencil_dims = dims->_stencil_dims;                     \
+    constexpr int nsdims = NUM_STENCIL_DIMS;                            \
+    assert(nsdims == stencil_dims.size());                              \
+    auto& misc_dims = dims->_misc_dims;                                 \
+    constexpr int step_posn = 0;                                        \
+    assert(step_posn == +Indices::step_posn);                           \
+    constexpr int outer_posn = 1;                                       \
+    const int inner_posn = state->_inner_posn
+#define STATE_VARS(parent_p) STATE_VARS0(parent_p,)
+#define STATE_VARS_CONST(parent_p) STATE_VARS0(parent_p, const)
+
+    // A base class containing a shared pointer to a kernel state.
+    // Used to ensure that the shared state object stays allocated when
+    // at least one of its owners exists.
+    class KernelStateBase {
+    protected:
+
+        // Common state. This is a separate object to allow
+        // multiple objects to keep it alive via shared ptrs.
+        KernelStatePtr _state;
+
+    public:
+        KernelStateBase(KernelStatePtr& state) :
+            _state(state) {}
+        KernelStateBase(KernelEnvPtr& env,
+                        KernelSettingsPtr& settings);
+        virtual ~KernelStateBase() {}
+
+        // Access to state.
+        KernelStatePtr& get_state() {
+            assert(_state);
+            return _state;
+        }
+        const KernelStatePtr& get_state() const {
+            assert(_state);
+            return _state;
+        }
+        KernelSettingsPtr& get_settings() { return _state->_opts; }
+        const KernelSettingsPtr& get_settings() const { return _state->_opts; }
+        KernelEnvPtr& get_env() { return _state->_env; }
+        const KernelEnvPtr& get_env() const { return _state->_env; }
+        DimsPtr& get_dims() { return _state->_dims; }
+        const DimsPtr& get_dims() const { return _state->_dims; }
+        MPIInfoPtr& get_mpi_info() { return _state->_mpiInfo; }
+        const MPIInfoPtr& get_mpi_info() const { return _state->_mpiInfo; }
+        bool use_pack_tuners() const { return _state->_use_pack_tuners; }
+        virtual yask_output_ptr get_debug_output() const { return _state->_debug; }
+        virtual void set_debug_output(yask_output_ptr debug) { _state->_debug = debug; }
+
+        // Set debug output to cout if my_rank == msg_rank
+        // or a null stream otherwise.
+        std::ostream& set_ostr();
+        std::ostream& get_ostr() const {
+            STATE_VARS(this);
+            return os;
+        }
+
+        // Set number of threads w/o using thread-divisor.
+        // Return number of threads.
+        // Do nothing and return 0 if not properly initialized.
+        int set_max_threads() {
+            STATE_VARS(this);
+
+            // Get max number of threads.
+            int mt = std::max(opts->max_threads, 1);
+
+            // Reset number of OMP threads to max allowed.
+            omp_set_num_threads(mt);
+            return mt;
+        }
+
+        // Get total number of computation threads to use.
+        int get_num_comp_threads(int& region_threads, int& blk_threads) const {
+            STATE_VARS(this);
+
+            // Max threads / divisor.
+            int mt = std::max(opts->max_threads, 1);
+            int td = std::max(opts->thread_divisor, 1);
+            int at = mt / td;
+            at = std::max(at, 1);
+
+            // Blk threads per region thread.
+            int bt = std::max(opts->num_block_threads, 1);
+            bt = std::min(bt, at); // Cannot be > 'at'.
+            blk_threads = bt;
+
+            // Region threads.
+            int rt = at / bt;
+            rt = std::max(rt, 1);
+            region_threads = rt;
+
+            // Total number of block threads.
+            return bt * rt;
+        }
+        
+        // Set number of threads to use for something other than a region.
+        // Return number of threads.
+        // Do nothing and return 0 if not properly initialized.
+        int set_all_threads() {
+            int rt, bt;
+            int at = get_num_comp_threads(rt, bt);
+            omp_set_num_threads(at);
+            return at;
+        }
+
+        // Set number of threads to use for a region.
+        // Enable nested OMP if there are >1 block threads,
+        // disable otherwise.
+        // Return number of threads.
+        // Do nothing and return 0 if not properly initialized.
+        int set_region_threads() {
+            int rt, bt;
+            int at = get_num_comp_threads(rt, bt);
+
+            // Limit outer nesting to allow num_block_threads per nested
+            // block loop.
+            yask_num_threads[0] = rt;
+
+            if (bt > 1) {
+                omp_set_nested(1);
+                omp_set_max_active_levels(2);
+                yask_num_threads[1] = bt;
+            }
+            else {
+                omp_set_nested(0);
+                omp_set_max_active_levels(1);
+                yask_num_threads[1] = 0;
+            }
+
+            omp_set_num_threads(rt);
+            return rt;
+        }
+
+        // Set number of threads for a block.
+        // Return number of threads.
+        // Do nothing and return 0 if not properly initialized.
+        int set_block_threads() {
+            int rt, bt;
+            int at = get_num_comp_threads(rt, bt);
+
+            if (omp_get_max_active_levels() > 1)
+                omp_set_num_threads(bt);
+            return bt;
+        }
+
+    };
+
+    // An object that is created from a context, shares ownership of the
+    // state, and keeps a pointer back to the context. However, it does not
+    // share ownership of the context itself. That would create an ownership
+    // loop that would not allow the context to be deleted.
+    class ContextLinker :
+        public KernelStateBase {
+
+    protected:
+        StencilContext* _context;
+
+    public:
+        ContextLinker(StencilContext* context);
+        virtual ~ContextLinker() { }
+    };
+
 } // yask namespace.
diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp
index cc6a69ce..ab69aef5 100644
--- a/src/kernel/lib/setup.cpp
+++ b/src/kernel/lib/setup.cpp
@@ -31,37 +31,42 @@ using namespace std;
 
 namespace yask {
 
-    // Constructor.
-    StencilContext::StencilContext(KernelEnvPtr kenv,
-                                   KernelSettingsPtr ksettings) :
-        _ostr(&std::cout),
-        _env(kenv),
-        _opts(ksettings),
-        _dims(ksettings->_dims),
-        _at(this, _opts.get())
-    {
-        // Set debug output object.
-        yask_output_factory yof;
-        set_debug_output(yof.new_stdout_output());
-
-        // Create MPI Info object.
-        _mpiInfo = std::make_shared<MPIInfo>(ksettings->_dims);
-
-        // Set output to msg-rank per settings.
-        set_ostr();
+    // ScanIndices ctor.
+    ScanIndices::ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) :
+        ndims(NUM_STENCIL_DIMS),
+        begin(idx_t(0), ndims),
+        end(idx_t(0), ndims),
+        step(idx_t(1), ndims),
+        align(idx_t(1), ndims),
+        align_ofs(idx_t(0), ndims),
+        group_size(idx_t(1), ndims),
+        num_indices(idx_t(1), ndims),
+        start(idx_t(0), ndims),
+        stop(idx_t(0), ndims),
+        index(idx_t(0), ndims) {
+
+        // i: index for stencil dims, j: index for domain dims.
+        DOMAIN_VAR_LOOP(i, j) {
 
-        // Set standard context vars after above init code.
-        CONTEXT_VARS(this);
+            // Set alignment to vector lengths.
+            if (use_vec_align)
+                align[i] = fold_pts[j];
 
-        // Find index posns in stencil dims.
-        DOMAIN_VAR_LOOP(i, j) {
-            auto& dname = stencil_dims.getDimName(i);
-            if (_outer_posn < 0)
-                _outer_posn = i;
-            if (dname == dims->_inner_dim)
-                _inner_posn = i;
+            // Set alignment offset.
+            if (ofs) {
+                assert(ofs->getNumDims() == ndims - 1);
+                align_ofs[i] = ofs->getVal(j);
+            }
         }
-        assert(outer_posn == _outer_posn);
+    }
+
+    // Context ctor.
+    StencilContext::StencilContext(KernelEnvPtr& kenv,
+                                   KernelSettingsPtr& ksettings) :
+        KernelStateBase(kenv, ksettings),
+        _at(this, ksettings.get())
+    {
+        STATE_VARS(this);
 
         // Init various tuples to make sure they have the correct dims.
         rank_domain_offsets = domain_dims;
@@ -83,7 +88,7 @@ namespace yask {
     // if not using MPI to properly init these vars.  Called from
     // prepare_solution(), so it doesn't normally need to be called from user code.
     void StencilContext::setupRank() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         auto me = env->my_rank;
         int num_neighbors = 0;
@@ -129,9 +134,9 @@ namespace yask {
         // Exchange coord and size info between all ranks.
         for (int rn = 0; rn < env->num_ranks; rn++) {
             MPI_Bcast(&coords[rn][0], nddims, MPI_INTEGER8,
-                      rn, _env->comm);
+                      rn, env->comm);
             MPI_Bcast(&rsizes[rn][0], nddims, MPI_INTEGER8,
-                      rn, _env->comm);
+                      rn, env->comm);
         }
         // Now, the tables are filled in for all ranks.
 
@@ -205,7 +210,7 @@ namespace yask {
                             auto mysz = rsizes[me][dj];
                             auto rnsz = rsizes[rn][dj];
                             if (mysz != rnsz) {
-                                auto& dnamej = _opts->_rank_indices.getDimName(dj);
+                                auto& dnamej = opts->_rank_indices.getDimName(dj);
                                 FORMAT_AND_THROW_YASK_EXCEPTION
                                     ("Error: rank " << rn << " and " << me <<
                                      " are both at rank-index " << coords[me][di] <<
@@ -236,11 +241,11 @@ namespace yask {
                 assert(roffsets.max() <= 2);
 
                 // Convert the offsets into a 1D index.
-                auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets);
-                TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() <<
+                auto rn_ofs = mpiInfo->getNeighborIndex(roffsets);
+                TRACE_MSG("neighborhood size = " << mpiInfo->neighborhood_sizes.makeDimValStr() <<
                           " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() <<
                           " => " << rn_ofs);
-                assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size);
+                assert(idx_t(rn_ofs) < mpiInfo->neighborhood_size);
 
                 // Save rank of this neighbor into the MPI info object.
                 mpiInfo->my_neighbors.at(rn_ofs) = rn;
@@ -283,7 +288,7 @@ namespace yask {
                     auto rnsz = rsizes[rn][j];
                     auto vlen = fold_pts[j];
                     if (rnsz % vlen != 0) {
-                        auto& dname = _opts->_rank_indices.getDimName(j);
+                        auto& dname = opts->_rank_indices.getDimName(j);
                         TRACE_MSG("cannot use vector halo exchange with rank " << rn <<
                                   " because its size in '" << dname << "' is " << rnsz);
                         vlen_mults = false;
@@ -291,7 +296,7 @@ namespace yask {
                 }
 
                 // Save vec-mult flag.
-                _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults;
+                mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults;
 
             } // self or immediate neighbor in any direction.
 
@@ -312,7 +317,7 @@ namespace yask {
     // Set wave-front settings.
     // This should be called anytime a setting or rank offset is changed.
     void StencilContext::update_grid_info() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // If we haven't finished constructing the context, it's too early
         // to do this.
@@ -331,18 +336,18 @@ namespace yask {
                 continue;
 
             // Loop through each domain dim.
-            for (auto& dim : _dims->_domain_dims.getDims()) {
+            for (auto& dim : domain_dims.getDims()) {
                 auto& dname = dim.getName();
 
                 if (gp->is_dim_used(dname)) {
 
                     // Rank domains.
-                    gp->_set_domain_size(dname, _opts->_rank_sizes[dname]);
+                    gp->_set_domain_size(dname, opts->_rank_sizes[dname]);
 
                     // Pads.
                     // Set via both 'extra' and 'min'; larger result will be used.
-                    gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]);
-                    gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]);
+                    gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]);
+                    gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]);
 
                     // Offsets.
                     gp->_set_rank_offset(dname, rank_domain_offsets[dname]);
@@ -358,9 +363,9 @@ namespace yask {
         // Calculate wave-front shifts.
         // See the wavefront diagram in run_solution() for description
         // of angles and extensions.
-        idx_t tb_steps = _opts->_block_sizes[step_dim]; // use requested size; actual may be less.
+        idx_t tb_steps = opts->_block_sizes[step_dim]; // use requested size; actual may be less.
         assert(tb_steps >= 0);
-        wf_steps = _opts->_region_sizes[step_dim];
+        wf_steps = opts->_region_sizes[step_dim];
         wf_steps = max(wf_steps, tb_steps); // round up WF steps if less than TB steps.
         assert(wf_steps >= 0);
         num_wf_shifts = 0;
@@ -377,18 +382,18 @@ namespace yask {
         assert(num_wf_shifts >= 0);
 
         // Determine whether separate tuners can be used.
-        _use_pack_tuners = (tb_steps == 0) && (stPacks.size() > 1);
+        state->_use_pack_tuners = (tb_steps == 0) && (stPacks.size() > 1);
 
         // Calculate angles and related settings.
-        for (auto& dim : _dims->_domain_dims.getDims()) {
+        for (auto& dim : domain_dims.getDims()) {
             auto& dname = dim.getName();
-            auto rnsize = _opts->_region_sizes[dname];
-            auto rksize = _opts->_rank_sizes[dname];
-            auto nranks = _opts->_num_ranks[dname];
+            auto rnsize = opts->_region_sizes[dname];
+            auto rksize = opts->_rank_sizes[dname];
+            auto nranks = opts->_num_ranks[dname];
 
             // Req'd shift in this dim based on max halos.
             // TODO: use different angle for L & R side of each pack.
-            idx_t angle = ROUND_UP(max_halos[dname], _dims->_fold_pts[dname]);
+            idx_t angle = ROUND_UP(max_halos[dname], dims->_fold_pts[dname]);
             
             // Determine the spatial skewing angles for WF tiling.  We
             // only need non-zero angles if the region size is less than the
@@ -409,7 +414,7 @@ namespace yask {
             // Is domain size at least as large as halo + wf_ext in direction
             // when there are multiple ranks?
             auto min_size = max_halos[dname] + shifts;
-            if (_opts->_num_ranks[dname] > 1 && rksize < min_size) {
+            if (opts->_num_ranks[dname] > 1 && rksize < min_size) {
                 FORMAT_AND_THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" <<
                                                 dname << "' dim is less than minimum size of " << min_size <<
                                                 ", which is based on stencil halos and temporal wave-front sizes");
@@ -417,11 +422,11 @@ namespace yask {
 
             // If there is another rank to the left, set wave-front
             // extension on the left.
-            left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts;
+            left_wf_exts[dname] = opts->is_first_rank(dname) ? 0 : shifts;
 
             // If there is another rank to the right, set wave-front
             // extension on the right.
-            right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts;
+            right_wf_exts[dname] = opts->is_last_rank(dname) ? 0 : shifts;
         }
 
         // Now that wave-front settings are known, we can push this info
@@ -436,7 +441,7 @@ namespace yask {
                 continue;
 
             // Loop through each domain dim.
-            for (auto& dim : _dims->_domain_dims.getDims()) {
+            for (auto& dim : domain_dims.getDims()) {
                 auto& dname = dim.getName();
                 if (gp->is_dim_used(dname)) {
 
@@ -459,7 +464,7 @@ namespace yask {
     // considering temporal conditions; this assumes worst-case, which is
     // all packs always done.
     void StencilContext::update_tb_info() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Get requested size.
         tb_steps = opts->_block_sizes[step_dim];
@@ -485,21 +490,21 @@ namespace yask {
 
             // Loop through each domain dim.
             DOMAIN_VAR_LOOP(i, j) {
-                auto& dim = _dims->_domain_dims.getDim(j);
+                auto& dim = domain_dims.getDim(j);
                 auto& dname = dim.getName();
-                auto rnsize = _opts->_region_sizes[i];
+                auto rnsize = opts->_region_sizes[i];
 
                 // There must be only one block size when using TB, so get
                 // sizes from context settings instead of packs.
-                assert(_use_pack_tuners == false);
-                auto blksize = _opts->_block_sizes[i];
-                auto mblksize = _opts->_mini_block_sizes[i];
+                assert(state->_use_pack_tuners == false);
+                auto blksize = opts->_block_sizes[i];
+                auto mblksize = opts->_mini_block_sizes[i];
 
                 // Req'd shift in this dim based on max halos.
                 // Can't use separate L & R shift because of possible data reuse in grids.
                 // Can't use separate shifts for each pack for same reason.
                 // TODO: make round-up optional.
-                auto fpts = _dims->_fold_pts[j];
+                auto fpts = dims->_fold_pts[j];
                 idx_t angle = ROUND_UP(max_halos[j], fpts);
             
                 // Determine the spatial skewing angles for MB.
@@ -591,7 +596,7 @@ namespace yask {
         // TODO: use actual number of shifts dynamically instead of this
         // max.
         DOMAIN_VAR_LOOP(i, j) {
-            auto blk_sz = _opts->_block_sizes[i];
+            auto blk_sz = opts->_block_sizes[i];
             auto tb_angle = tb_angles[j];
             tb_widths[j] = blk_sz;
             tb_tops[j] = blk_sz;
@@ -601,7 +606,7 @@ namespace yask {
             if (num_tb_shifts > 0 && tb_angle > 0) {
                 
                 // See equations above for block size.
-                auto fpts = _dims->_fold_pts[j];
+                auto fpts = dims->_fold_pts[j];
                 idx_t min_top_sz = fpts;
                 idx_t sa = num_tb_shifts * tb_angle;
                 idx_t min_blk_width = min_top_sz + 2 * sa;
@@ -619,7 +624,7 @@ namespace yask {
     // Init all grids & params by calling initFn.
     void StencilContext::initValues(function<void (YkGridPtr gp,
                                                    real_t seed)> realInitFn) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         real_t seed = 0.1;
         os << "Initializing grids...\n" << flush;
@@ -637,7 +642,7 @@ namespace yask {
     // Set the bounding-box for each stencil-bundle and whole domain.
     void StencilContext::find_bounding_boxes()
     {
-        ostream& os = get_ostr();
+        STATE_VARS(this);
         os << "Constructing bounding boxes for " <<
             stBundles.size() << " stencil-bundles(s)...\n" << flush;
         YaskTimer bbtimer;
@@ -645,7 +650,7 @@ namespace yask {
 
         // Rank BB is based only on rank offsets and rank domain sizes.
         rank_bb.bb_begin = rank_domain_offsets;
-        rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false);
+        rank_bb.bb_end = rank_domain_offsets.addElements(opts->_rank_sizes, false);
         rank_bb.update_bb("rank", *this, true, &os);
 
         // BB may be extended for wave-fronts.
@@ -659,8 +664,8 @@ namespace yask {
         // Find BB for each pack.
         for (auto sp : stPacks) {
             auto& spbb = sp->getBB();
-            spbb.bb_begin = _dims->_domain_dims;
-            spbb.bb_end = _dims->_domain_dims;
+            spbb.bb_begin = domain_dims;
+            spbb.bb_end = domain_dims;
 
             // Find BB for each bundle in this pack.
             for (auto sb : *sp) {
@@ -699,8 +704,8 @@ namespace yask {
 
     // Copy BB vars from another.
     void StencilBundleBase::copy_bounding_box(const StencilBundleBase* src) {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("copy_bounding_box for '" << get_name() << "' from '" <<
+        STATE_VARS(this);
+        TRACE_MSG("copy_bounding_box for '" << get_name() << "' from '" <<
                    src->get_name() << "'...");
 
         _bundle_bb = src->_bundle_bb;
@@ -713,11 +718,12 @@ namespace yask {
     // Step-vars are tested dynamically for each step
     // as it is executed.
     void StencilBundleBase::find_bounding_box() {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("find_bounding_box for '" << get_name() << "'...");
+        STATE_VARS(this);
+        TRACE_MSG("find_bounding_box for '" << get_name() << "'...");
 
         // Init overall bundle BB to that of parent and clear list.
-        _bundle_bb = cp->ext_bb;
+        assert(_context);
+        _bundle_bb = _context->ext_bb;
         assert(_bundle_bb.bb_valid);
         _bb_list.clear();
         
@@ -727,7 +733,7 @@ namespace yask {
 
         // If there is no condition, just add full BB to list.
         if (!is_sub_domain_expr()) {
-            TRACE_MSG3("adding 1 sub-BB: [" << _bundle_bb.bb_begin.makeDimValStr() <<
+            TRACE_MSG("adding 1 sub-BB: [" << _bundle_bb.bb_begin.makeDimValStr() <<
                        " ... " << _bundle_bb.bb_end.makeDimValStr() << ")");
             _bb_list.push_back(_bundle_bb);
             return;
@@ -744,7 +750,7 @@ namespace yask {
         idx_t outer_len = _bundle_bb.bb_len[odim];
         idx_t nthreads = yask_get_num_threads();
         idx_t len_per_thr = CEIL_DIV(outer_len, nthreads);
-        TRACE_MSG3("find_bounding_box: running " << nthreads << " thread(s) over " <<
+        TRACE_MSG("find_bounding_box: running " << nthreads << " thread(s) over " <<
                    outer_len << " point(s) in outer dim");
 
         // List of full BBs for each thread.
@@ -818,7 +824,7 @@ namespace yask {
                             while (do_scan) {
                                 do_scan = false;
 
-                                TRACE_MSG3("scanning " << scan_len.makeDimValStr(" * ") <<
+                                TRACE_MSG("scanning " << scan_len.makeDimValStr(" * ") <<
                                            " starting at " << bdpt.makeDimValStr());
                                 scan_len.visitAllPoints
                                     ([&](const IdxTuple& eofs, size_t eidx) {
@@ -874,14 +880,14 @@ namespace yask {
                                         return true; // keep looking for invalid point.
                                     }); // Looking for invalid point.
                             } // while scan is adjusted.
-                            TRACE_MSG3("found BB " << scan_len.makeDimValStr(" * ") <<
+                            TRACE_MSG("found BB " << scan_len.makeDimValStr(" * ") <<
                                        " starting at " << bdpt.makeDimValStr());
 
                             // 'scan_len' now contains sizes of the new BB.
                             BoundingBox new_bb;
                             new_bb.bb_begin = bdpt;
                             new_bb.bb_end = bdpt.addElements(scan_len);
-                            new_bb.update_bb("sub-bb", *cp, true);
+                            new_bb.update_bb("sub-bb", *_context, true);
                             cur_bb_list.push_back(new_bb);
                             
                         } // new rect found.
@@ -889,7 +895,7 @@ namespace yask {
                         return true;  // from labmda; keep looking.
                     }); // Looking for new rects.
             }); // threads/slices.
-        TRACE_MSG3("sub-bbs found in " <<
+        TRACE_MSG("sub-bbs found in " <<
                    bbtimer.get_secs_since_start() << " secs.");
         // At this point, we have a set of full BBs.
 
@@ -900,13 +906,13 @@ namespace yask {
         // TODO: merge in a parallel binary tree instead of sequentially.
         for (int n = 0; n < nthreads; n++) {
             auto& cur_bb_list = bb_lists[n];
-            TRACE_MSG3("processing " << cur_bb_list.size() <<
+            TRACE_MSG("processing " << cur_bb_list.size() <<
                        " sub-BB(s) in bundle '" << get_name() <<
                        "' from thread " << n);
 
             // BBs in slice 'n'.
             for (auto& bbn : cur_bb_list) {
-                TRACE_MSG3(" sub-BB: [" << bbn.bb_begin.makeDimValStr() <<
+                TRACE_MSG(" sub-BB: [" << bbn.bb_begin.makeDimValStr() <<
                            " ... " << bbn.bb_end.makeDimValStr() << ")");
 
                 // Don't bother with empty BB.
@@ -948,9 +954,9 @@ namespace yask {
 
                         // Merge by just increasing the size of 'bb'.
                         bb.bb_end[odim] = bbn.bb_end[odim];
-                        TRACE_MSG3("  merging to form [" << bb.bb_begin.makeDimValStr() <<
+                        TRACE_MSG("  merging to form [" << bb.bb_begin.makeDimValStr() <<
                                    " ... " << bb.bb_end.makeDimValStr() << ")");
-                        bb.update_bb("sub-bb", *cp, true);
+                        bb.update_bb("sub-bb", *_context, true);
                         break;
                     }
                 }
@@ -958,15 +964,15 @@ namespace yask {
                 // If not merged, add 'bbn' as new.
                 if (!do_merge) {
                     _bb_list.push_back(bbn);
-                    TRACE_MSG3("  adding as final sub-BB #" << _bb_list.size());
+                    TRACE_MSG("  adding as final sub-BB #" << _bb_list.size());
                 }
             }
         }
 
         // Finalize overall BB.
-        _bundle_bb.update_bb(get_name(), *cp, false);
+        _bundle_bb.update_bb(get_name(), *_context, false);
         bbtimer.stop();
-        TRACE_MSG3("find-bounding-box: done in " <<
+        TRACE_MSG("find-bounding-box: done in " <<
                    bbtimer.get_elapsed_secs() << " secs.");
     }
 
diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp
index ad966dd1..f2c53d72 100644
--- a/src/kernel/lib/soln_apis.cpp
+++ b/src/kernel/lib/soln_apis.cpp
@@ -36,43 +36,45 @@ namespace yask {
 
 #define GET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \
     idx_t StencilContext::api_name(const string& dim) const {           \
+        STATE_VARS(this);                                               \
         if (prep_req && !rank_bb.bb_valid)                              \
             THROW_YASK_EXCEPTION("Error: '" #api_name \
                                  "()' called before calling 'prepare_solution()'"); \
-        checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok);      \
+        dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \
         return expr;                                                    \
     }
-    GET_SOLN_API(get_num_ranks, _opts->_num_ranks[dim], false, true, false, false)
+    GET_SOLN_API(get_num_ranks, opts->_num_ranks[dim], false, true, false, false)
     GET_SOLN_API(get_overall_domain_size, overall_domain_sizes[dim], false, true, false, true)
-    GET_SOLN_API(get_rank_domain_size, _opts->_rank_sizes[dim], false, true, false, false)
-    GET_SOLN_API(get_region_size, _opts->_region_sizes[dim], true, true, false, false)
-    GET_SOLN_API(get_block_size, _opts->_block_sizes[dim], true, true, false, false)
+    GET_SOLN_API(get_rank_domain_size, opts->_rank_sizes[dim], false, true, false, false)
+    GET_SOLN_API(get_region_size, opts->_region_sizes[dim], true, true, false, false)
+    GET_SOLN_API(get_block_size, opts->_block_sizes[dim], true, true, false, false)
     GET_SOLN_API(get_first_rank_domain_index, rank_bb.bb_begin[dim], false, true, false, true)
     GET_SOLN_API(get_last_rank_domain_index, rank_bb.bb_end[dim] - 1, false, true, false, true)
-    GET_SOLN_API(get_min_pad_size, _opts->_min_pad_sizes[dim], false, true, false, false)
-    GET_SOLN_API(get_rank_index, _opts->_rank_indices[dim], false, true, false, true)
+    GET_SOLN_API(get_min_pad_size, opts->_min_pad_sizes[dim], false, true, false, false)
+    GET_SOLN_API(get_rank_index, opts->_rank_indices[dim], false, true, false, true)
 #undef GET_SOLN_API
 
     // The grid sizes updated any time these settings are changed.
 #define SET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, reset_prep) \
     void StencilContext::api_name(const string& dim, idx_t n) {         \
-        checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok);      \
+        STATE_VARS(this);                                               \
+        dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \
         expr;                                                           \
         update_grid_info();                                             \
         if (reset_prep) rank_bb.bb_valid = ext_bb.bb_valid = false;     \
     }
-    SET_SOLN_API(set_rank_index, _opts->_rank_indices[dim] = n, false, true, false, true)
-    SET_SOLN_API(set_num_ranks, _opts->_num_ranks[dim] = n, false, true, false, true)
-    SET_SOLN_API(set_rank_domain_size, _opts->_rank_sizes[dim] = n, false, true, false, true)
-    SET_SOLN_API(set_region_size, _opts->_region_sizes[dim] = n, true, true, false, true)
-    SET_SOLN_API(set_block_size, _opts->_block_sizes[dim] = n, true, true, false, true)
-    SET_SOLN_API(set_min_pad_size, _opts->_min_pad_sizes[dim] = n, false, true, false, false)
+    SET_SOLN_API(set_rank_index, opts->_rank_indices[dim] = n, false, true, false, true)
+    SET_SOLN_API(set_num_ranks, opts->_num_ranks[dim] = n, false, true, false, true)
+    SET_SOLN_API(set_rank_domain_size, opts->_rank_sizes[dim] = n, false, true, false, true)
+    SET_SOLN_API(set_region_size, opts->_region_sizes[dim] = n, true, true, false, true)
+    SET_SOLN_API(set_block_size, opts->_block_sizes[dim] = n, true, true, false, true)
+    SET_SOLN_API(set_min_pad_size, opts->_min_pad_sizes[dim] = n, false, true, false, false)
 #undef SET_SOLN_API
 
     // Allocate grids and MPI bufs.
     // Initialize some data structures.
     void StencilContext::prepare_solution() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Don't continue until all ranks are this far.
         env->global_barrier();
@@ -99,22 +101,22 @@ namespace yask {
         // Adjust all settings before setting MPI buffers or sizing grids.
         // Prints adjusted settings.
         // TODO: print settings again after auto-tuning.
-        _opts->adjustSettings(os, _env);
+        opts->adjustSettings(os);
 
         // Copy current settings to packs.
         // Needed here because settings may have been changed via APIs
         // since last call to prepare_solution().
         // This will wipe out any previous auto-tuning.
         for (auto& sp : stPacks)
-            sp->getLocalSettings() = *_opts;
+            sp->getLocalSettings() = *opts;
 
         // Init auto-tuner to run silently during normal operation.
         reset_auto_tuner(true, false);
 
         // Report ranks.
         os << endl;
-        os << "Num MPI ranks:            " << _env->get_num_ranks() << endl;
-        os << "This MPI rank index:      " << _env->get_rank_index() << endl;
+        os << "Num MPI ranks:            " << env->get_num_ranks() << endl;
+        os << "This MPI rank index:      " << env->get_rank_index() << endl;
 
         // report threads.
         {
@@ -189,36 +191,36 @@ namespace yask {
     }
     
     void StencilContext::print_info() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Calc and report total allocation and domain sizes.
         rank_nbytes = get_num_bytes();
-        tot_nbytes = sumOverRanks(rank_nbytes, _env->comm);
+        tot_nbytes = sumOverRanks(rank_nbytes, env->comm);
         rank_domain_pts = rank_bb.bb_num_points;
-        tot_domain_pts = sumOverRanks(rank_domain_pts, _env->comm);
+        tot_domain_pts = sumOverRanks(rank_domain_pts, env->comm);
         os <<
             "\nDomain size in this rank (points):          " << makeNumStr(rank_domain_pts) <<
             "\nTotal allocation in this rank:              " << makeByteStr(rank_nbytes) <<
-            "\nOverall problem size in " << _env->num_ranks << " rank(s) (points): " <<
+            "\nOverall problem size in " << env->num_ranks << " rank(s) (points): " <<
             makeNumStr(tot_domain_pts) <<
-            "\nTotal overall allocation in " << _env->num_ranks << " rank(s):      " <<
+            "\nTotal overall allocation in " << env->num_ranks << " rank(s):      " <<
             makeByteStr(tot_nbytes) <<
             endl;
 
         // Report some sizes and settings.
         os << "\nWork-unit sizes in points (from smallest to largest):\n"
-            " vector-size:           " << _dims->_fold_pts.makeDimValStr(" * ") << endl <<
-            " cluster-size:          " << _dims->_cluster_pts.makeDimValStr(" * ") << endl <<
-            " sub-block-size:        " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl <<
-            " mini-block-size:       " << _opts->_mini_block_sizes.makeDimValStr(" * ") << endl <<
-            " block-size:            " << _opts->_block_sizes.makeDimValStr(" * ") << endl <<
-            " region-size:           " << _opts->_region_sizes.makeDimValStr(" * ") << endl <<
-            " rank-domain-size:      " << _opts->_rank_sizes.makeDimValStr(" * ") << endl <<
+            " vector-size:           " << dims->_fold_pts.makeDimValStr(" * ") << endl <<
+            " cluster-size:          " << dims->_cluster_pts.makeDimValStr(" * ") << endl <<
+            " sub-block-size:        " << opts->_sub_block_sizes.makeDimValStr(" * ") << endl <<
+            " mini-block-size:       " << opts->_mini_block_sizes.makeDimValStr(" * ") << endl <<
+            " block-size:            " << opts->_block_sizes.makeDimValStr(" * ") << endl <<
+            " region-size:           " << opts->_region_sizes.makeDimValStr(" * ") << endl <<
+            " rank-domain-size:      " << opts->_rank_sizes.makeDimValStr(" * ") << endl <<
             " overall-problem-size:  " << overall_domain_sizes.makeDimValStr(" * ") << endl;
 #ifdef SHOW_GROUPS
         os << 
-            " sub-block-group-size:  " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl <<
-            " block-group-size:      " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl <<
+            " sub-block-group-size:  " << opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl <<
+            " block-group-size:      " << opts->_block_group_sizes.makeDimValStr(" * ") << endl <<
 #endif
         os << "\nOther settings:\n"
             " yask-version:          " << yask_get_version_string() << endl <<
@@ -228,8 +230,8 @@ namespace yask {
             " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl;
 #ifdef USE_MPI
         os <<
-            " num-ranks:             " << _opts->_num_ranks.makeDimValStr(" * ") << endl <<
-            " rank-indices:          " << _opts->_rank_indices.makeDimValStr() << endl <<
+            " num-ranks:             " << opts->_num_ranks.makeDimValStr(" * ") << endl <<
+            " rank-indices:          " << opts->_rank_indices.makeDimValStr() << endl <<
             " rank-domain-offsets:   " << rank_domain_offsets.makeDimValOffsetStr() << endl;
         if (opts->overlap_comms)
             os <<
@@ -238,8 +240,8 @@ namespace yask {
 #endif
         os <<
             " vector-len:            " << VLEN << endl <<
-            " extra-padding:         " << _opts->_extra_pad_sizes.makeDimValStr() << endl <<
-            " minimum-padding:       " << _opts->_min_pad_sizes.makeDimValStr() << endl <<
+            " extra-padding:         " << opts->_extra_pad_sizes.makeDimValStr() << endl <<
+            " minimum-padding:       " << opts->_min_pad_sizes.makeDimValStr() << endl <<
             " L1-prefetch-distance:  " << PFD_L1 << endl <<
             " L2-prefetch-distance:  " << PFD_L2 << endl <<
             " max-halos:             " << max_halos.makeDimValStr() << endl;
@@ -259,7 +261,7 @@ namespace yask {
 
     // Dealloc grids, etc.
     void StencilContext::end_solution() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Final halo exchange (usually not needed).
         exchange_halos();
@@ -294,10 +296,11 @@ namespace yask {
     }
 
     string StencilContext::apply_command_line_options(const string& args) {
+        STATE_VARS(this);
 
         // Create a parser and add base options to it.
         CommandLineParser parser;
-        _opts->add_options(parser);
+        opts->add_options(parser);
 
         // Tokenize default args.
         vector<string> argsv;
@@ -318,7 +321,7 @@ namespace yask {
 
     // Add a new grid to the containers.
     void StencilContext::addGrid(YkGridPtr gp, bool is_output) {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
         assert(gp);
         auto& gname = gp->get_name();
         if (gridMap.count(gname))
@@ -345,7 +348,7 @@ namespace yask {
 
     /// Get statistics associated with preceding calls to run_solution().
     yk_stats_ptr StencilContext::get_stats() {
-        CONTEXT_VARS(this);
+        STATE_VARS(this);
 
         // Numbers of threads.
         int rthr, bthr;
diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp
index 4b50cb87..50d0b80c 100644
--- a/src/kernel/lib/stencil_calc.cpp
+++ b/src/kernel/lib/stencil_calc.cpp
@@ -40,8 +40,8 @@ namespace yask {
     void StencilBundleBase::calc_mini_block(int region_thread_idx,
                                             KernelSettings& settings,
                                             const ScanIndices& mini_block_idxs) {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("calc_mini_block('" << get_name() << "'): [" <<
+        STATE_VARS(this);
+        TRACE_MSG("calc_mini_block('" << get_name() << "'): [" <<
                    mini_block_idxs.begin.makeValStr(nsdims) << " ... " <<
                    mini_block_idxs.end.makeValStr(nsdims) << ") by " <<
                    mini_block_idxs.step.makeValStr(nsdims) <<
@@ -57,7 +57,7 @@ namespace yask {
 
         // Nothing to do if outer BB is empty.
         if (_bundle_bb.bb_num_points == 0) {
-            TRACE_MSG3("calc_mini_block: empty BB");
+            TRACE_MSG("calc_mini_block: empty BB");
             return;
         }
         
@@ -75,7 +75,7 @@ namespace yask {
         // Loop through each solid BB for this bundle.
         // For each BB, calc intersection between it and 'mini_block_idxs'.
         // If this is non-empty, apply the bundle to all its required sub-blocks.
-        TRACE_MSG3("calc_mini_block('" << get_name() << "'): checking " <<
+        TRACE_MSG("calc_mini_block('" << get_name() << "'): checking " <<
                    _bb_list.size() << " BB(s)");
         int bbn = 0;
   	for (auto& bb : _bb_list) {
@@ -106,12 +106,12 @@ namespace yask {
 
             // nothing to do?
             if (!bb_ok) {
-                TRACE_MSG3("calc_mini_block for bundle '" << get_name() <<
+                TRACE_MSG("calc_mini_block for bundle '" << get_name() <<
                            "': no overlap between bundle " << bbn << " and current block");
                 continue; // to next BB.
             }
             
-            TRACE_MSG3("calc_mini_block('" << get_name() <<
+            TRACE_MSG("calc_mini_block('" << get_name() <<
                        "'): after trimming for BB " << bbn << ": [" <<
                        mb_idxs.begin.makeValStr(nsdims) <<
                        " ... " << mb_idxs.end.makeValStr(nsdims) << ")");
@@ -128,7 +128,7 @@ namespace yask {
                 // eventually work on a separate sub-block.  This is nested within
                 // an OMP region thread.  If there is only one block per thread,
                 // nested OMP is disabled, and this OMP pragma does nothing.
-                int nbt = cp->set_block_threads();
+                int nbt = _context->set_block_threads();
                 bool bind_threads = nbt > 1 && settings.bind_block_threads;
                 _Pragma("omp parallel proc_bind(spread)") {
                     int block_thread_idx = (nbt <= 1) ? 0 : omp_get_thread_num();
@@ -158,7 +158,7 @@ namespace yask {
                             adj_mb_idxs.step[i] = adj_mb_idxs.end[i] - adj_mb_idxs.begin[i];
                     }
 
-                    TRACE_MSG3("calc_mini_block('" << get_name() << "'): " <<
+                    TRACE_MSG("calc_mini_block('" << get_name() << "'): " <<
                                " for reqd bundle '" << sg->get_name() << "': [" <<
                                adj_mb_idxs.begin.makeValStr(nsdims) << " ... " <<
                                adj_mb_idxs.end.makeValStr(nsdims) << ") by " <<
@@ -205,8 +205,8 @@ namespace yask {
                                                   int block_thread_idx,
                                                   KernelSettings& settings,
                                                   const ScanIndices& mini_block_idxs) {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("calc_sub_block_scalar for bundle '" << get_name() << "': [" <<
+        STATE_VARS(this);
+        TRACE_MSG("calc_sub_block_scalar for bundle '" << get_name() << "': [" <<
                    mini_block_idxs.start.makeValStr(nsdims) <<
                    " ... " << mini_block_idxs.stop.makeValStr(nsdims) <<
                    ") by region thread " << region_thread_idx <<
@@ -243,8 +243,8 @@ namespace yask {
                                                int block_thread_idx,
                                                KernelSettings& settings,
                                                const ScanIndices& mini_block_idxs) {
-        CONTEXT_VARS(_generic_context);        
-        TRACE_MSG3("calc_sub_block_vec for bundle '" << get_name() << "': [" <<
+        STATE_VARS(this);
+        TRACE_MSG("calc_sub_block_vec for bundle '" << get_name() << "': [" <<
                    mini_block_idxs.start.makeValStr(nsdims) <<
                    " ... " << mini_block_idxs.stop.makeValStr(nsdims) <<
                    ") by region thread " << region_thread_idx <<
@@ -308,7 +308,7 @@ namespace yask {
         DOMAIN_VAR_LOOP(i, j) {
 
             // Rank offset.
-            auto rofs = cp->rank_domain_offsets[j];
+            auto rofs = _context->rank_domain_offsets[j];
 
             // Begin/end of rank-relative scalar elements in this dim.
             auto ebgn = sub_block_idxs.begin[i] - rofs;
@@ -459,7 +459,7 @@ namespace yask {
 
         // Full rectilinear polytope of aligned clusters: use optimized code.
         if (do_clusters) {
-            TRACE_MSG3("calc_sub_block_vec:  using cluster code for [" <<
+            TRACE_MSG("calc_sub_block_vec:  using cluster code for [" <<
                        sub_block_fcidxs.begin.makeValStr(nsdims) <<
                        " ... " << sub_block_fcidxs.end.makeValStr(nsdims) <<
                        ") by region thread " << region_thread_idx <<
@@ -486,7 +486,7 @@ namespace yask {
         // Full and partial peel/remainder vectors in all dims except
         // the inner one.
         if (do_vectors) {
-            TRACE_MSG3("calc_sub_block_vec:  using vector code for [" <<
+            TRACE_MSG("calc_sub_block_vec:  using vector code for [" <<
                        sub_block_vidxs.begin.makeValStr(nsdims) <<
                        " ... " << sub_block_vidxs.end.makeValStr(nsdims) <<
                        ") *not* within full vector-clusters at [" <<
@@ -570,7 +570,7 @@ namespace yask {
             misc_idxs.step.setFromConst(1);
             misc_idxs.align.setFromConst(1);
 
-            TRACE_MSG3("calc_sub_block_vec:  using scalar code for [" <<
+            TRACE_MSG("calc_sub_block_vec:  using scalar code for [" <<
                        misc_idxs.begin.makeValStr(nsdims) << " ... " <<
                        misc_idxs.end.makeValStr(nsdims) <<
                        ") *not* within vectors at [" <<
@@ -587,13 +587,13 @@ namespace yask {
 #define MISC_FN(pt_idxs)  do {                                          \
                 bool ok = false;                                        \
                 DOMAIN_VAR_LOOP(i, j) {                                 \
-                    auto rofs = cp->rank_domain_offsets[j];             \
+                    auto rofs = _context->rank_domain_offsets[j];       \
                     if (pt_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \
                         pt_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \
                         ok = true; break; }                             \
                 }                                                       \
                 if (ok) {                                               \
-                    TRACE_MSG3("calc_sub_block_vec:   at pt " <<            \
+                    TRACE_MSG("calc_sub_block_vec:   at pt " <<            \
                                pt_idxs.start.makeValStr(nsdims));       \
                     calc_scalar(region_thread_idx, pt_idxs.start);      \
                 }                                                       \
@@ -615,8 +615,8 @@ namespace yask {
     void StencilBundleBase::calc_loop_of_clusters(int region_thread_idx,
                                                   int block_thread_idx,
                                                   const ScanIndices& loop_idxs) {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("calc_loop_of_clusters: local vector-indices [" <<
+        STATE_VARS(this);
+        TRACE_MSG("calc_loop_of_clusters: local vector-indices [" <<
                    loop_idxs.start.makeValStr(nsdims) <<
                    " ... " << loop_idxs.stop.makeValStr(nsdims) <<
                    ") by region thread " << region_thread_idx <<
@@ -649,8 +649,8 @@ namespace yask {
                                                  int block_thread_idx,
                                                  const ScanIndices& loop_idxs,
                                                  idx_t write_mask) {
-        CONTEXT_VARS(_generic_context);
-        TRACE_MSG3("calc_loop_of_vectors: local vector-indices [" <<
+        STATE_VARS(this);
+        TRACE_MSG("calc_loop_of_vectors: local vector-indices [" <<
                    loop_idxs.start.makeValStr(nsdims) <<
                    " ... " << loop_idxs.stop.makeValStr(nsdims) <<
                    ") w/write-mask = 0x" << hex << write_mask << dec <<
@@ -688,7 +688,7 @@ namespace yask {
     // Returns adjusted indices.
     ScanIndices StencilBundleBase::adjust_span(int region_thread_idx,
                                                const ScanIndices& idxs) const {
-        CONTEXT_VARS(_generic_context);
+        STATE_VARS(this);
         ScanIndices adj_idxs(idxs);
 
         // Loop thru vecs of scratch grids for this bundle.
@@ -723,7 +723,7 @@ namespace yask {
                     adj_idxs.end[i] = idxs.end[i] + rh;
 
                     // Make sure grid covers index bounds.
-                    TRACE_MSG3("adjust_span: mini-blk [" << 
+                    TRACE_MSG("adjust_span: mini-blk [" << 
                                idxs.begin[i] << "..." <<
                                idxs.end[i] << ") adjusted to [" << 
                                adj_idxs.begin[i] << "..." <<
@@ -817,6 +817,11 @@ namespace yask {
                 "  num reqd scratch bundles:   " << (sg_list.size() - 1) << endl;
             // TODO: add info on scratch bundles here.
 
+            if (sg->is_sub_domain_expr())
+                os << "  sub-domain expr:            '" << sg->get_domain_description() << "'\n";
+            if (sg->is_step_cond_expr())
+                os << "  step-condition expr:        '" << sg->get_step_cond_description() << "'\n";
+
             os <<
                 "  bundle size (points):       " << makeNumStr(bb.bb_size) << endl;
             if (bb.bb_size) {
diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp
index 7695cf0d..29edce67 100644
--- a/src/kernel/lib/stencil_calc.hpp
+++ b/src/kernel/lib/stencil_calc.hpp
@@ -32,9 +32,10 @@ namespace yask {
     // A stencil context contains one or more packs.
 
     // A pure-virtual class base for a stencil bundle.
-    class StencilBundleBase {
+    class StencilBundleBase : 
+        public ContextLinker {
+
     protected:
-        StencilContext* _generic_context = 0;
         std::string _name;
         int _scalar_fp_ops = 0;
         int _scalar_points_read = 0;
@@ -65,7 +66,7 @@ namespace yask {
         // Ranks offsets must already be subtracted.
         // Each dim in 'orig' must be a multiple of corresponding vec len.
         void normalize_indices(const Indices& orig, Indices& norm) const {
-            CONTEXT_VARS(_generic_context);
+            STATE_VARS(this);
             assert(orig.getNumDims() == nsdims);
             assert(norm.getNumDims() == nsdims);
 
@@ -97,20 +98,9 @@ namespace yask {
 
         // ctor, dtor.
         StencilBundleBase(StencilContext* context) :
-            _generic_context(context) {
-            //CONTEXT_VARS(context);
-        }
-
+            ContextLinker(context) { }
         virtual ~StencilBundleBase() { }
 
-        // Access to dims and MPI info.
-        DimsPtr& get_dims() const {
-            return _generic_context->get_dims();
-        }
-        MPIInfoPtr& get_mpi_info() {
-            return _generic_context->get_mpi_info();
-        }
-
         // Get name of this bundle.
         const std::string& get_name() const { return _name; }
 
@@ -172,13 +162,17 @@ namespace yask {
         virtual bool
         is_in_valid_domain(const Indices& idxs) const =0;
 
-        // Return true if there is a non-default sub-domain expression.
+        // Return true if there is a non-default conditions.
         virtual bool
         is_sub_domain_expr() const { return false; }
+        virtual bool
+        is_step_cond_expr() const { return false; }
 
-        // Return human-readable description of sub-domain.
+        // Return human-readable description of conditions.
         virtual std::string
         get_domain_description() const =0;
+        virtual std::string
+        get_step_cond_description() const =0;
         
         // Determine whether step index is enabled.
         virtual bool
@@ -277,20 +271,18 @@ namespace yask {
     // "Independent" implies that they may be evaluated
     // in any order.
     class BundlePack :
+        public ContextLinker,
         public std::vector<StencilBundleBase*> {
 
     protected:
         std::string _name;
 
-        // Parent solution.
-        StencilContext* _context = 0;
-        
         // Union of bounding boxes for all bundles in this pack.
         BoundingBox _pack_bb;
 
         // Local pack settings.
         // Only some of these will be used.
-        KernelSettings _opts;
+        KernelSettings _pack_opts;
 
         // Auto-tuner for pack settings.
         AutoTuner _at;
@@ -312,12 +304,12 @@ namespace yask {
         idx_t tot_writes_per_step = 0;
         idx_t tot_fpops_per_step = 0;
         
-        BundlePack(const std::string& name,
-                   StencilContext* ctx) :
+        BundlePack(StencilContext* context,
+                   const std::string& name) :
+            ContextLinker(context),
             _name(name),
-            _context(ctx),
-            _opts(*ctx->get_settings()), // make a copy of the context settings.
-            _at(ctx, &_opts, name) { }
+            _pack_opts(*context->get_state()->_opts), // init w/a copy of the base settings.
+            _at(context, &_pack_opts, name) { }
         virtual ~BundlePack() { }
 
         const std::string& get_name() {
@@ -342,13 +334,14 @@ namespace yask {
         // Accessors.
         BoundingBox& getBB() { return _pack_bb; }
         AutoTuner& getAT() { return _at; }
-        KernelSettings& getLocalSettings() { return _opts; }
+        KernelSettings& getLocalSettings() { return _pack_opts; }
 
         // If using separate pack tuners, return local settings.
         // Otherwise, return one in context.
         KernelSettings& getActiveSettings() {
-            return _context->use_pack_tuners() ? _opts :
-                *_context->get_settings().get(); }
+            STATE_VARS(this);
+            return use_pack_tuners() ? _pack_opts : *opts;
+        }
 
         // Perf-tracking methods.
         void start_timers();
diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp
index 74374b86..75a5a95a 100644
--- a/src/kernel/lib/yask.hpp
+++ b/src/kernel/lib/yask.hpp
@@ -154,15 +154,8 @@ typedef std::uint64_t uidx_t;
 #define TRACE_MSG0(os, msg) ((void)0)
 #endif
 
-// macro for debug message from a StencilContext method.
-#define TRACE_MSG1(msg) TRACE_MSG0(get_ostr(), msg)
-#define TRACE_MSG(msg) TRACE_MSG1(msg)
-
-// macro for debug message when _context ptr is defined.
-#define TRACE_MSG2(msg) TRACE_MSG0(_context->get_ostr(), msg)
-
-// macro for debug message when _generic_context ptr is defined.
-#define TRACE_MSG3(msg) TRACE_MSG0(_generic_context->get_ostr(), msg)
+// macro for debug message when 'os' is defined.
+#define TRACE_MSG(msg) TRACE_MSG0(os, msg)
 
 // breakpoint.
 #define INT3 asm volatile("int $3")
diff --git a/src/kernel/lib/yask_stencil.hpp b/src/kernel/lib/yask_stencil.hpp
index b5356875..57faa0ec 100644
--- a/src/kernel/lib/yask_stencil.hpp
+++ b/src/kernel/lib/yask_stencil.hpp
@@ -72,6 +72,7 @@ IN THE SOFTWARE.
 #include "realv.hpp"
 
 // Base types for stencil context, etc.
+#include "indices.hpp"
 #include "settings.hpp"
 #include "generic_grids.hpp"
 #include "realv_grids.hpp"
diff --git a/src/kernel/tests/grid_test.cpp b/src/kernel/tests/grid_test.cpp
index ae05af13..00770eac 100644
--- a/src/kernel/tests/grid_test.cpp
+++ b/src/kernel/tests/grid_test.cpp
@@ -70,10 +70,10 @@ int main(int argc, char** argv) {
         os << "0-D test...\n";
         GridDimNames gdims;
         string name = "test grid";
-        YkGridPtr g0 = make_shared<YkElemGrid<Layout_0d, false>>(dims, name, gdims, &settings, &osp);
+        YkGridPtr g0 = make_shared<YkElemGrid<Layout_0d, false>>(*context, name, gdims);
         g0->alloc_storage();
         os << g0->make_info_string() << endl;
-        YkGridPtr g1 = make_shared<YkElemGrid<Layout_0d, false>>(dims, name, gdims, &settings, &osp);
+        YkGridPtr g1 = make_shared<YkElemGrid<Layout_0d, false>>(*context, name, gdims);
         g1->alloc_storage();
         os << g1->make_info_string() << endl;
 
@@ -91,8 +91,8 @@ int main(int argc, char** argv) {
         os << "3-D test...\n";
         GridDimNames gdims = {"x", "y", "z"};
         string name = "test grid";
-        YkGridPtr g3 = make_shared<YkElemGrid<Layout_321, false>>(dims, name, gdims, &settings, &osp);
-        YkGridPtr g3f = make_shared<YkVecGrid<Layout_123, false, VLEN_X, VLEN_Y, VLEN_Z>>(dims, name, gdims, &settings, &osp);
+        YkGridPtr g3 = make_shared<YkElemGrid<Layout_321, false>>(*context, name, gdims);
+        YkGridPtr g3f = make_shared<YkVecGrid<Layout_123, false, VLEN_X, VLEN_Y, VLEN_Z>>(*context, name, gdims);
         int i = 0;
         int min_pad = 3;
         for (auto dname : gdims) {
diff --git a/src/kernel/tests/yask_kernel_api_exception_test.py b/src/kernel/tests/yask_kernel_api_exception_test.py
index a48086ba..f2ab9c1b 100755
--- a/src/kernel/tests/yask_kernel_api_exception_test.py
+++ b/src/kernel/tests/yask_kernel_api_exception_test.py
@@ -339,7 +339,7 @@ def init_grid(grid, timestep) :
     print("Running the solution for 10 more steps...")
     soln.run_solution(1, 10)
 
-    # Print final result at timestep 11.
+    # Print final result at timestep 11, assuming update was to t+1.
     for grid in soln.get_grids() :
         read_grid(grid, 11)
 
diff --git a/src/kernel/tests/yask_kernel_api_test.cpp b/src/kernel/tests/yask_kernel_api_test.cpp
index 80e1074b..f1167018 100644
--- a/src/kernel/tests/yask_kernel_api_test.cpp
+++ b/src/kernel/tests/yask_kernel_api_test.cpp
@@ -113,19 +113,35 @@ int main() {
 
         // Print out some info about the grids and init their data.
         for (auto grid : soln->get_grids()) {
-            os << "    " << grid->get_name() << "(";
-            for (auto dname : grid->get_dim_names())
-                os << " '" << dname << "'";
-            os << " )\n";
+            os << "    grid-var '" << grid->get_name() << ":\n";
             for (auto dname : grid->get_dim_names()) {
+                os << "      '" << dname << "' dim:\n";
+                os << "        alloc-size on this rank: " <<
+                    grid->get_alloc_size(dname) << endl;
+                
+                // Is this a domain dim?
                 if (domain_dim_set.count(dname)) {
-                    os << "      '" << dname << "' domain index range on this rank: " <<
+                    os << "        domain index range on this rank: " <<
                         grid->get_first_rank_domain_index(dname) << " ... " <<
                         grid->get_last_rank_domain_index(dname) << endl;
-                    os << "      '" << dname << "' allowed index range on this rank: " <<
+                    os << "        domain+halo index range on this rank: " <<
+                        grid->get_first_rank_halo_index(dname) << " ... " <<
+                        grid->get_last_rank_halo_index(dname) << endl;
+                    os << "        allowed index range on this rank: " <<
                         grid->get_first_rank_alloc_index(dname) << " ... " <<
                         grid->get_last_rank_alloc_index(dname) << endl;
                 }
+
+                // Step dim?
+                else if (dname == soln->get_step_dim_name()) {
+                }
+
+                // Misc dim?
+                else {
+                    os << "        misc index range: " <<
+                        grid->get_first_misc_index(dname) << " ... " <<
+                        grid->get_last_misc_index(dname) << endl;
+                }
             }
 
             // First, just init all the elements to the same value.
@@ -138,35 +154,39 @@ int main() {
             // Create indices describing a subset of the overall domain.
             vector<idx_t> first_indices, last_indices;
             for (auto dname : grid->get_dim_names()) {
+                idx_t first_idx = 0, last_idx = 0;
 
                 // Is this a domain dim?
                 if (domain_dim_set.count(dname)) {
 
                     // Set indices to create a small cube (assuming 3D)
                     // in center of overall problem.
+                    // Using global indices.
                     idx_t psize = soln->get_overall_domain_size(dname);
-                    idx_t first_idx = psize/2 - 30;
-                    idx_t last_idx = psize/2 + 30;
-                    first_indices.push_back(first_idx);
-                    last_indices.push_back(last_idx);
+                    first_idx = psize/2 - 30;
+                    last_idx = psize/2 + 30;
                 }
 
                 // Step dim?
                 else if (dname == soln->get_step_dim_name()) {
 
-                    // Add indices for timestep zero (0) only.
-                    first_indices.push_back(0);
-                    last_indices.push_back(0);
+                    // Set indices for one time-step.
+                    first_idx = 0;
+                    last_idx = 0;
                 }
 
                 // Misc dim?
                 else {
 
-                    // Add indices to set all allowed values.
-                    // (This isn't really meaningful; it's just illustrative.)
-                    first_indices.push_back(grid->get_first_misc_index(dname));
-                    last_indices.push_back(grid->get_last_misc_index(dname));
+                    // Set indices to set all allowed values.
+                    first_idx = grid->get_first_misc_index(dname);
+                    last_idx = grid->get_last_misc_index(dname);
+                    assert(last_idx - first_idx + 1 == grid->get_alloc_size(dname));
                 }
+
+                // Add indices to index vectors.
+                first_indices.push_back(first_idx);
+                last_indices.push_back(last_idx);
             }
 
             // Init the values using the indices created above.
diff --git a/src/kernel/tests/yask_kernel_api_test.py b/src/kernel/tests/yask_kernel_api_test.py
index f2341b08..2c74fdf9 100755
--- a/src/kernel/tests/yask_kernel_api_test.py
+++ b/src/kernel/tests/yask_kernel_api_test.py
@@ -30,58 +30,69 @@
 import argparse
 import yask_kernel as yk
 
-# Read data from grid using NumPy ndarray.
-def read_grid(grid, timestep) :
-
-    # Ignore with fixed-sized grids.
-    if grid.is_fixed_size():
-        return
-
-    print("Reading grid '" + grid.get_name() + "' at time " + repr(timestep) + "...")
+# Prepare an NymPy ndarray to hold a slice of 'grid'.
+def make_ndarray(grid, timestep) :
 
     # Create indices for YASK and shape for NumPy.
     first_indices = []
     last_indices = []
     shape = []
+    point = ()
     nelems = 1
     for dname in grid.get_dim_names() :
 
         if dname == soln.get_step_dim_name() :
 
-            # Read one timestep only.
-            # So, we don't need to add a time axis in 'shape'.
-            first_indices += [timestep]
-            last_indices += [timestep]
+            # Slice in requested time.
+            first_idx = timestep
+            last_idx = timestep
 
         # Domain dim?
         elif dname in soln.get_domain_dim_names() :
 
-            # Full domain in this rank.
-            first_idx = soln.get_first_rank_domain_index(dname)
-            last_idx = soln.get_last_rank_domain_index(dname)
-
-            # Read one point in the halo, too.
-            first_idx -= 1
-            last_idx += 1
-
-            first_indices += [first_idx]
-            last_indices += [last_idx]
-            shape += [last_idx - first_idx + 1]
-            nelems *= last_idx - first_idx + 1
+            # Cover full alloc in this rank.
+            first_idx = grid.get_first_rank_alloc_index(dname)
+            last_idx = grid.get_last_rank_alloc_index(dname)
 
         # Misc dim?
         else :
 
-            # Read first index only.
-            first_indices += [grid.get_first_misc_index(dname)]
-            last_indices += [grid.get_first_misc_index(dname)]
+            # Cover all misc values.
+            first_idx = grid.get_first_misc_index(dname)
+            last_idx = grid.get_last_misc_index(dname)
+
+        # Add indices to API vars.
+        first_indices += [first_idx]
+        last_indices += [last_idx]
+        ne = last_idx - first_idx + 1
+        shape += [ne]
+        nelems *= ne
+
+        # Define first point in ndarray.
+        point += (0, )
 
     # Create a NumPy ndarray to hold the extracted data.
-    ndarray1 = np.empty(shape, dtype, 'C');
+    print("Creating a NumPy ndarray with shape " + repr(shape) + " and " +
+          repr(nelems) + " element(s)...")
+    ndarray = np.zeros(shape, dtype, 'C');
+    return ndarray, first_indices, last_indices, point
 
-    print("Reading " + repr(nelems) + " element(s)...")
-    nread = grid.get_elements_in_slice(ndarray1.data, first_indices, last_indices)
-    print(ndarray1)
+# Read data from grid using NumPy ndarray.
+def read_grid(grid, timestep) :
+
+    # Ignore with fixed-sized grids.
+    if grid.is_fixed_size():
+        return
+    print("Testing reading grid '" + grid.get_name() + "' at time " + repr(timestep) + "...")
+    ndarray, first_indices, last_indices, point = make_ndarray(grid, timestep)
+
+    print("Reading 1 element...")
+    val1 = grid.get_element(first_indices)
+    print("Read value " + repr(val1))
+
+    print("Reading all element(s) in ndarray...")
+    nread = grid.get_elements_in_slice(ndarray.data, first_indices, last_indices)
+    print(ndarray)
 
     # Raw access to this grid.
     if soln.get_element_bytes() == 4 :
@@ -92,66 +103,53 @@ def read_grid(grid, timestep) :
     fp_ptr = ct.cast(int(raw_ptr), ptype)
     num_elems = grid.get_num_storage_elements()
     print("Raw data: " + repr(fp_ptr[0]) + ", ..., " + repr(fp_ptr[num_elems-1]))
-    #ndarray2 = np.fromiter(fp_ptr, dtype, num_elems); print(ndarray2)
 
 # Init grid using NumPy ndarray.
 def init_grid(grid, timestep) :
     print("Initializing grid '" + grid.get_name() + "' at time " + repr(timestep) + "...")
-
-    # Create indices for YASK, shape & point for NumPy.
-    first_indices = []
-    last_indices = []
-    shape = []
-    point = ()
-    nelems = 1
-    for dname in grid.get_dim_names() :
-
-        if dname == soln.get_step_dim_name() :
-
-            # Write one timestep only.
-            # So, we don't need to add a time axis in 'shape'.
-            first_indices += [timestep]
-            last_indices += [timestep]
-
-        # Domain dim?
-        elif dname in soln.get_domain_dim_names() :
-
-            # Full domain in this rank.
-            first_idx = soln.get_first_rank_domain_index(dname)
-            last_idx = soln.get_last_rank_domain_index(dname)
-
-            # Write one point in the halo, too.
-            first_idx -= 1
-            last_idx += 1
-
-            first_indices += [first_idx]
-            last_indices += [last_idx]
-            shape += [last_idx - first_idx + 1]
-            nelems *= last_idx - first_idx + 1
-
-            # Since the array covers one layer of points in the halo
-            # starting at 0,..,0, 1,..,1 is the first point in the
-            # computable domain.
-            point += (1,)
-
-        # Misc dim?
-        else :
-
-            # Write first index only.
-            first_indices += [grid.get_first_misc_index(dname)]
-            last_indices += [grid.get_first_misc_index(dname)]
-
-    # Create a NumPy ndarray to hold the data.
-    ndarray = np.zeros(shape, dtype, 'C');
+    ndarray, first_indices, last_indices, point = make_ndarray(grid, timestep)
 
     # Set one point to a non-zero value.
-    ndarray[point] = 21.0;
+    val1 = 21.0
+    ndarray[point] = val1
     print(ndarray)
 
-    print("Writing " + repr(nelems) + " element(s)...")
+    print("Setting grid from all element(s) in ndarray...")
     nset = grid.set_elements_in_slice(ndarray.data, first_indices, last_indices)
     print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index()))
 
+    # Check that set worked.
+    print("Reading those element(s)...")
+    val2 = grid.get_element(first_indices)
+    assert val2 == val1
+    val2 = grid.get_element(last_indices)
+    if nset > 1 :
+        assert val2 == 0.0
+    else :
+        assert val2 == val1  # Only 1 val => first == last.
+    ndarray2 = ndarray
+    ndarray2.fill(5.0)
+    nread = grid.get_elements_in_slice(ndarray2.data, first_indices, last_indices)
+    assert nread == ndarray2.size
+    assert ndarray2[point] == val1
+    assert ndarray2.sum() == val1  # One point is val1; others are zero.
+
+    # Test element set.
+    print("Testing setting 1 point at " + repr(last_indices) + "...")
+    val1 += 1.0
+    nset = grid.set_element(val1, last_indices);
+    assert nset == 1
+    val2 = grid.get_element(last_indices)
+    assert val2 == val1
+
+    # Test add.
+    val3 = 2.0
+    print("Testing adding to 1 point at " + repr(last_indices) + "...")
+    nset = grid.add_to_element(val3, last_indices);
+    assert nset == 1
+    val2 = grid.get_element(last_indices)
+    assert val2 == val1 + val3
+    
 # Main script.
 if __name__ == "__main__":
 
@@ -242,8 +240,9 @@ def init_grid(grid, timestep) :
             continue
         
         # Init timestep 0 using NumPy.
-        # This will set one point in each rank.
         init_grid(grid, 0)
+
+        # Print out the values.
         read_grid(grid, 0)
 
         # Simple one-index example.
@@ -302,9 +301,8 @@ def init_grid(grid, timestep) :
         nset = grid.set_elements_in_slice_same(0.5, first_indices, last_indices)
         print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index()))
 
-        # Print the initial contents of the grid at timesteps 0 and 1.
+        # Print the initial contents of the grid.
         read_grid(grid, 0)
-        read_grid(grid, 1)
 
     # Apply the stencil solution to the data.
     env.global_barrier()
@@ -318,7 +316,7 @@ def init_grid(grid, timestep) :
     print("Running the solution for 10 more steps...")
     soln.run_solution(1, 10)
 
-    # Print final result at timestep 11.
+    # Print final result at timestep 11, assuming update was to t+1.
     for grid in soln.get_grids() :
         read_grid(grid, 11)
 
diff --git a/src/stencils/SimpleTestStencils.cpp b/src/stencils/SimpleTestStencils.cpp
index 826960f8..d278e979 100644
--- a/src/stencils/SimpleTestStencils.cpp
+++ b/src/stencils/SimpleTestStencils.cpp
@@ -518,50 +518,51 @@ class TestSubdomainStencil3 : public StencilRadiusBase {
 
 REGISTER_STENCIL(TestSubdomainStencil3);
 
-class TestStepCondStencil1 : public StencilRadiusBase {
+// Test step condition.
+class TestStepCondStencil1 : public Test1dBase {
 
 protected:
 
-    // Indices & dimensions.
-    MAKE_STEP_INDEX(t);           // step in time dim.
-    MAKE_DOMAIN_INDEX(x);         // spatial dim.
+    // Indices.
+    MAKE_MISC_INDEX(b);
 
     // Vars.
     MAKE_GRID(A, t, x); // time-varying grid.
+    MAKE_ARRAY(B, b);
 
 public:
 
     TestStepCondStencil1(StencilList& stencils, int radius=2) :
-        StencilRadiusBase("test_step_cond_1d", stencils, radius) { }
+        Test1dBase("test_step_cond_1d", stencils, radius) { }
 
     // Define equation to apply to all points in 'A' grid.
     virtual void define() {
 
         // Time condition.
         Condition tc0 = (t % 2 == 0);
+
+        // Var condition.
+        Condition vc0 = (B(0) > B(1));
         
-        // Set A w/different stencils.
+        // Set A w/different stencils depending on the conditions.  It is
+        // the programmer's responsibility to ensure that the conditions are
+        // exclusive when necessary. It is not checked at compile or
+        // run-time.
 
-        GridValue u = A(t, x);
-        for (int r = 1; r <= _radius; r++)
-            u += A(t, x-r);
-        for (int r = 1; r <= _radius + 1; r++)
-            u += A(t, x+r);
-        A(t+1, x) EQUALS u / (_radius * 2 + 2) IF_STEP tc0;
+        // Use this equation when t is even.
+        A(t+1, x) EQUALS def_1d(A, t, x, 0, 0) IF_STEP tc0;
 
-        GridValue v = A(t, x);
-        for (int r = 1; r <= _radius + 3; r++)
-            v += A(t, x-r);
-        for (int r = 1; r <= _radius + 2; r++)
-            v += A(t, x+r);
-        A(t+1, x) EQUALS v / (_radius * 2 + 6) IF_STEP !tc0;
+        // Use this equation when t is odd and B(0) > B(1).
+        A(t+1, x) EQUALS def_1d(A, t, x, 1, 2) IF_STEP !tc0 && vc0;
+
+        // Use this equation when t is even and B(0) <= B(1).
+        A(t+1, x) EQUALS def_1d(A, t, x, 2, 3) IF_STEP !tc0 && !vc0;
     }
 };
 
 REGISTER_STENCIL(TestStepCondStencil1);
 
 // Test the use of conditional updates with scratch-pad grids.
-
 class TestScratchSubdomainStencil1 : public Test1dBase {
 
 protected:
diff --git a/utils/bin/gen_layouts.pl b/utils/bin/gen_layouts.pl
index b7c33370..b9352f43 100755
--- a/utils/bin/gen_layouts.pl
+++ b/utils/bin/gen_layouts.pl
@@ -276,8 +276,8 @@ END
       my $wrap = $w ? "true" : "false";
     
       # Creation.
-      print " else if (ndims == $n && do_wrap == $wrap)\n",
-        "  gp = make_shared<YkElemGrid<$layout, $wrap>>(_dims, name, dims, &_opts, &_ostr);\n";
+      print " else if (ndims == $n && step_used == $wrap)\n",
+        "  gp = make_shared<YkElemGrid<$layout, $wrap>>(*this, name, gdims);\n";
     }
   }