diff --git a/README.md b/README.md index b816ed51..beccfedb 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ YASK contains a domain-specific compiler to convert scalar stencil code to SIMD- for functional testing if you don't have native support for any given instruction set. ### Backward-compatibility notices: +* Version 2.16.03 moved the position of the log-file name to the last column in the CSV output of `utils/bin/yask_log_to_csv.pl`. * Version 2.15.04 required a call to `yc_grid::set_dynamic_step_alloc(true)` to allow changing the allocation in the step (time) dimension for grid variables created at YASK compile-time. * Version 2.15.02 required all "misc" indices to be yask-compiler-time constants. diff --git a/docs/YASK-intro.pdf b/docs/YASK-intro.pdf index a4c56e86..98c14207 100644 Binary files a/docs/YASK-intro.pdf and b/docs/YASK-intro.pdf differ diff --git a/include/yc_node_api.hpp b/include/yc_node_api.hpp index 5e477661..b0c62fc7 100644 --- a/include/yc_node_api.hpp +++ b/include/yc_node_api.hpp @@ -178,20 +178,22 @@ namespace yask { LHS. An optional domain condition may be provided to define the sub-domain - to which this equation applies. See new_first_domain_index() - for more information and an example. - Conditions are always evaluated with respect to the overall + to which this equation applies. + Domain conditions are always evaluated with respect to the overall problem domain, i.e., independent of any specific MPI domain decomposition that might occur at run-time. - If a condition is not provided, the equation applies to the + If a domain condition is not provided, the equation applies to the entire problem domain. A domain condition can be added to an equation after its creation via yc_equation_node.set_cond(). + See yc_equation_node.set_cond() for more information and an example. - A step-index condition is similar to a domain condition, but - applies to the current step (usually time). - A step-index condition can be added to an equation after its creation + A step condition is similar to a domain condition, but + enables or disables the entire equation based on the current step (usually time) + and/or other values. + A step condition can only be added to an equation after its creation via yc_equation_node.set_step_cond(). + See yc_equation_node.set_step_cond() for more information and an example. @returns Pointer to new \ref yc_equation_node object. */ @@ -200,7 +202,7 @@ namespace yask { /**< [in] Grid-point before EQUALS operator. */, yc_number_node_ptr rhs /**< [in] Expression after EQUALS operator. */, - yc_bool_node_ptr cond = nullptr + yc_bool_node_ptr sub_domain_cond = nullptr /**< [in] Optional expression defining sub-domain where `lhs EQUALS rhs` is valid. */ ); @@ -291,35 +293,7 @@ namespace yask { domain in `dim` dimension. The `dim` argument is created via new_domain_index(). - Typical C++ usage: - - \code{.cpp} - auto x = node_fac.new_domain_index("x"); - - // Create boolean expression for the - // boundary sub-domain "x < first_x + 10". - auto first_x = node_fac.new_first_domain_index(x); - auto left_bc_cond = node_fac.new_less_than_node(x, first_x + 10); - - // Create a new equation that is valid in this range. - auto left_bc_eq = - node_fac.new_equation_node(grid_pt_expr, left_bc_expr, left_bc_cond); - \endcode - - Specification of the "interior" part of a 2-D domain could be - represented by an expression similar to - `x >= new_first_domain_index(x) + 20 && - x <= new_last_domain_index(x) - 20 && - y >= new_first_domain_index(y) + 20 && - y <= new_last_domain_index(y) - 20`. - - @note The entire domain in dimension "x" would be represented by - `x >= new_first_domain_index(x) && x <= new_last_domain_index(x)`, but - that is the default condition so does not need to be specified. - - @note Be sure to use an expression like "x < first_x + 10" - instead of merely "x < 10" to avoid the assumption that - the first index is always zero (0). + See yc_equation_node.set_cond() for more information and an example. @returns Pointer to new \ref yc_index_node object. */ @@ -333,6 +307,8 @@ namespace yask { domain in `dim` dimension. The `dim` argument is created via new_domain_index(). + See yc_equation_node.set_cond() for more information and an example. + @returns Pointer to new \ref yc_index_node object. */ virtual yc_number_node_ptr @@ -466,15 +442,80 @@ namespace yask { `nullptr` if not defined. */ virtual yc_bool_node_ptr get_cond() =0; - /// Set the condition describing the sub-domain. - /** See yc_node_factory::new_equation_node(). */ - virtual void set_cond(yc_bool_node_ptr cond - /**< [in] Boolean expression describing the sub-domain + /// Set the condition describing the sub-domain for this equation. + /** + See yc_node_factory::new_equation_node() for an overall description + of conditions. + + Typical C++ usage to create a sub-domain condition: + + \code{.cpp} + auto x = node_fac.new_domain_index("x"); + + // Create boolean expression for a 10-point wide left boundary area. + auto first_x = node_fac.new_first_domain_index(x); + auto left_bc_cond = x < first_x + 10; + + // Indicate that an expression is valid only in this area. + // (Assumes left_bc_expr was already defined.) + left_bc_expr.set_cond(left_bc_cond); + \endcode + + Specification of the "interior" part of a 2-D domain could be + represented by an expression like + `(x >= node_fac.new_first_domain_index(x) + 20) && (x <= node_fac.new_last_domain_index(x) - 20) && (y >= node_fac.new_first_domain_index(y) + 20) && (y <= node_fac.new_last_domain_index(y) - 20)`. + + @warning For performance, sub-domain expressions are only + evaluated once when yk_solution::prepare_solution() is called, + and the results are analyzed and cached internally. Thus, + sub-domain expressions should not include a step index or a + reference to any other varible that might change during or + between time-steps. See set_step_cond() for the mechanism to + enable equations based on variables that can change between + time-steps. + + @note The entire domain in dimension "x" would be represented by + `(x >= node_fac.new_first_domain_index(x)) && (x <= node_fac.new_last_domain_index(x))`, but + that is the default condition so does not need to be specified. + + @note Be sure to use an expression like `x < first_x + 10` + instead of merely `x < 10` to avoid the assumption that + the first index is always zero (0). More importantly, use + an expression like `x > last_x - 10` instead of hard-coding + the last index. + */ + virtual void set_cond(yc_bool_node_ptr sub_domain_cond + /**< [in] Boolean expression describing + where in the sub-domain this expression is valid or `nullptr` to remove the condition. */ ) =0; - /// Set the condition describing the valid step indices. + /// Set the condition describing when the equation is valid. + /** + See yc_node_factory::new_equation_node() for an overall description + of conditions. + + Typical C++ usage to create a step condition: + + \code{.cpp} + auto t = node_fac.new_step_index("t"); + + // Create boolean expression that is true every third step. + auto my_step_cond = (t % 3 == 0); + + // Indicate that an expression is valid only when step_cond is true. + // (Assumes my_expr was already defined.) + my_expr.set_step_cond(my_step_cond); + \endcode + + Step conditions may also refer to elements in grid variables including + scalars (1-D) and arrays (2-D). For non-scalar variables, indices + used in a step condition _cannot_ include domain variables like `x` or `y`, but + constants are allowed. In this way, equations can be enabled or + disabled programmatically by setting elements in the tested variables. + */ virtual void set_step_cond(yc_bool_node_ptr step_cond - /**< [in] Boolean expression describing a valid step + /**< [in] Boolean expression describing + when the expression is valid or `nullptr` to remove the condition. */ ) =0; /// Create a deep copy of AST starting with this node. diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index cd1378a9..5040b96c 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -46,7 +46,7 @@ namespace yask { // for numbers above 9 (at least up to 99). // Format: "major.minor.patch". - const string version = "2.16.02"; + const string version = "2.16.03"; string yask_get_version_string() { return version; diff --git a/src/compiler/lib/Cpp.cpp b/src/compiler/lib/Cpp.cpp index 4a967a98..a16ccf93 100644 --- a/src/compiler/lib/Cpp.cpp +++ b/src/compiler/lib/Cpp.cpp @@ -34,6 +34,10 @@ namespace yask { // Format a real, preserving precision. string CppPrintHelper::formatReal(double v) { + // Int representation equivalent? + if (double(int(v)) == v) + return to_string(int(v)); + // IEEE double gives 15-17 significant decimal digits of precision per // https://en.wikipedia.org/wiki/Double-precision_floating-point_format. // Some precision might be lost if/when cast to a float, but that's ok. diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index 8a067d9b..c4c84425 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -39,10 +39,6 @@ namespace yask { // and its input grids and points. class PointVisitor : public ExprVisitor { - // A set of all points to ensure pointers to each - // unique point have same value. - set _all_pts; - // A type to hold a mapping of equations to a set of grids in each. typedef unordered_set GridSet; typedef unordered_map GridMap; @@ -58,79 +54,105 @@ namespace yask { PointMap _lhs_pts; // outputs of eqs. PointSetMap _rhs_pts; // inputs of eqs. + PointSetMap _cond_pts; // sub-domain expr inputs. + PointSetMap _step_cond_pts; // step-cond expr inputs. + PointSetMap _all_pts; // all points in each eq (union of above). - EqualsExpr* _eq=0; // Current equation. + // Vars for indexing data. + EqualsExpr* _eq = 0; // Current equation. + enum State { _in_lhs, _in_rhs, _in_cond, _in_step_cond } _state; public: // Ctor. - // 'pts' contains offsets from each point to create. PointVisitor() {} virtual ~PointVisitor() {} + // Get access to grids per eq. GridMap& getOutputGrids() { return _lhs_grids; } GridSetMap& getInputGrids() { return _rhs_grids; } + + // Get access to pts per eq. + // Contains unique ptrs to pts, but pts may not + // be unique. PointMap& getOutputPts() { return _lhs_pts; } PointSetMap& getInputPts() { return _rhs_pts; } - int getNumEqs() const { return (int)_lhs_pts.size(); } + PointSetMap& getCondPts() { return _cond_pts; } + PointSetMap& getStepCondPts() { return _step_cond_pts; } + PointSetMap& getAllPts() { return _all_pts; } - // Determine whether 2 sets have any common points. - virtual bool do_sets_intersect(const GridSet& a, - const GridSet& b) { - for (auto ai : a) { - if (b.count(ai) > 0) - return true; - } - return false; - } - virtual bool do_sets_intersect(const PointSet& a, - const PointSet& b) { - for (auto ai : a) { - if (b.count(ai) > 0) - return true; - } - return false; - } + int getNumEqs() const { return (int)_lhs_pts.size(); } // Callback at an equality. - // Handles LHS grid pt explicitly, then visits RHS. + // Visits all parts that might have grid points. virtual string visit(EqualsExpr* ee) { // Set this equation as current one. _eq = ee; - // Make sure map entries exist for this eq. + // Make sure all map entries exist for this eq. _lhs_grids[_eq]; _rhs_grids[_eq]; _lhs_pts[_eq]; _rhs_pts[_eq]; - - // Store LHS point. - auto* lhs = ee->getLhs().get(); - _lhs_pts[_eq] = lhs; - - // Add grid. - auto* g = lhs->getGrid(); - _lhs_grids[_eq] = g; - + _cond_pts[_eq]; + _step_cond_pts[_eq]; + _all_pts[_eq]; + + // visit LHS. + auto& lhs = ee->getLhs(); + _state = _in_lhs; + lhs->accept(this); + // visit RHS. NumExprPtr rhs = ee->getRhs(); + _state = _in_rhs; rhs->accept(this); - // Don't visit LHS because we've already saved it. + // visit conds. + auto& cp = ee->getCond(); + if (cp) { + _state = _in_cond; + cp->accept(this); + } + auto& scp = ee->getStepCond(); + if (scp) { + _state = _in_step_cond; + scp->accept(this); + } return ""; } - // Callback at a grid point on the RHS. + // Callback at a grid point. virtual string visit(GridPoint* gp) { assert(_eq); + auto* g = gp->getGrid(); + _all_pts[_eq].insert(gp); - // Store RHS point. - _rhs_pts[_eq].insert(gp); + // Save pt and/or grid based on state. + switch (_state) { - // Add grid. - auto* g = gp->getGrid(); - _rhs_grids[_eq].insert(g); + case _in_lhs: + _lhs_pts[_eq] = gp; + _lhs_grids[_eq] = g; + break; + + case _in_rhs: + _rhs_pts[_eq].insert(gp); + _rhs_grids[_eq].insert(g); + break; + + case _in_cond: + _cond_pts[_eq].insert(gp); + break; + + case _in_step_cond: + _step_cond_pts[_eq].insert(gp); + break; + + default: + assert(0 && "illegal state"); + } return ""; } }; @@ -155,6 +177,8 @@ namespace yask { auto& inGrids = pt_vis.getInputGrids(); auto& outPts = pt_vis.getOutputPts(); auto& inPts = pt_vis.getInputPts(); + //auto& condPts = pt_vis.getCondPts(); + //auto& stepCondPts = pt_vis.getStepCondPts(); // 1. Check each eq internally. os << "\nProcessing " << getNum() << " stencil equation(s)...\n"; @@ -317,10 +341,10 @@ namespace yask { argn->getIntVal(); // throws exception if not an integer. } } - } + } // input pts. // TODO: check to make sure cond1 depends only on domain indices. - // TODO: check to make sure stcond1 depends only on step index. + // TODO: check to make sure stcond1 does not depend on domain indices. } // for all eqs. // 2. Check each pair of eqs. @@ -520,6 +544,8 @@ namespace yask { SetVecVisitor(const Dimensions& dims) : _dims(dims) { _visitEqualsLhs = true; + _visitGridPointArgs = true; + _visitConds = true; } // Check each grid point in expr. @@ -529,7 +555,7 @@ namespace yask { // Never vectorize scalars. if (grid->get_num_dims() == 0) { gp->setVecType(GridPoint::VEC_NONE); - return ""; + return ""; // Also, no args to visit. } // Amount of vectorization allowed primarily depends on number @@ -550,6 +576,8 @@ namespace yask { assert(fdoffsets <= grid_nfd); // All folded dims are vectorizable? + // NB: this will always be the case when there is + // no folding in the soln. if (fdoffsets == soln_nfd) gp->setVecType(GridPoint::VEC_FULL); // all good. @@ -560,7 +588,9 @@ namespace yask { // Uses no folded dims, so scalar only. else gp->setVecType(GridPoint::VEC_NONE); - return ""; + + // Also check args of this grid point. + return ExprVisitor::visit(gp); } }; @@ -662,13 +692,8 @@ namespace yask { // Analyze each eq. for (auto& eq : getAll()) { - // Get sets of points for this eq. - auto* outPt1 = pv.getOutputPts().at(eq.get()); - auto& inPts1 = pv.getInputPts().at(eq.get()); - - // Union of all input and output points for 'eq'. - auto allPts1 = inPts1; - allPts1.insert(outPt1); + // Get all grid points touched by this eq. + auto& allPts1 = pv.getAllPts().at(eq.get()); // Update stats of each grid accessed in 'eq'. for (auto ap : allPts1) { diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 57fd895f..aa685a61 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -518,16 +518,6 @@ namespace yask { return expr; } - // Unused operators. - EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, const NumExprPtr rhs) { - THROW_YASK_EXCEPTION("Error: operator '==' is not allowed on a grid point;" - " use 'EQUALS' to assert equality between a grid point (LHS)" - " and an equation (RHS)"); - } - EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, double rhs) { - return gpp EQUALS_OPER constNum(rhs); - } - // Define the value of a grid point. // Add this equation to the list of eqs for this stencil. EqualsExprPtr operator EQUALS_OPER(GridPointPtr gpp, const NumExprPtr rhs) { @@ -759,7 +749,7 @@ namespace yask { } string GridPoint::getGridPtr() const { string gname = _grid->getName(); - string expr = "static_cast<_context_type::" + gname + "_type*>(_context->" + gname; + string expr = "static_cast<_context_type::" + gname + "_type*>(_context_data->" + gname; if (_grid->isScratch()) expr += "_list[region_thread_idx].get()"; expr += ")"; diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index 61e6b292..e31bcab7 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -68,6 +68,8 @@ namespace yask { class NumExpr; typedef shared_ptr NumExprPtr; typedef vector NumExprPtrVec; + class GridPoint; + typedef shared_ptr GridPointPtr; class IndexExpr; typedef shared_ptr IndexExprPtr; typedef vector IndexExprPtrVec; @@ -79,8 +81,6 @@ namespace yask { // More forward-decls. class ExprVisitor; class Grid; - class GridPoint; - typedef shared_ptr GridPointPtr; class StencilSolution; struct Dimensions; @@ -1129,26 +1129,35 @@ namespace yask { #define IS_EQUIV_TO EQUALS_OPER #define IS_EQUIVALENT_TO EQUALS_OPER - // Catch use of the old '==' operator. -#define BAD_OPER1 == - EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, const NumExprPtr rhs); - EqualsExprPtr operator BAD_OPER1(GridPointPtr gpp, double rhs); - - // Binary numerical-to-boolean operators. - // Must provide explicit IndexExprPtr operands to keep compiler from - // using built-in pointer comparison. + // Binary numerical-to-boolean operators. Must provide more explicit + // ptr-type operands than used with math operators to keep compiler from + // using built-in pointer comparison. This means we need all + // permutations of {NumExpr,IndexExpr,GridPoint}Ptr. Const values must + // be on RHS of operator, e.g., 'x > 5' is ok, but '5 < x' is not. #define BOOL_OPER(oper, type, implType) \ inline BoolExprPtr operator oper(const NumExprPtr lhs, const NumExprPtr rhs) { \ return make_shared(lhs, rhs); } \ - inline BoolExprPtr operator oper(const IndexExprPtr lhs, const NumExprPtr rhs) { \ - return make_shared(lhs, rhs); } \ inline BoolExprPtr operator oper(const NumExprPtr lhs, const IndexExprPtr rhs) { \ return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const NumExprPtr lhs, const GridPointPtr rhs) { \ + return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const IndexExprPtr lhs, const NumExprPtr rhs) { \ + return make_shared(lhs, rhs); } \ inline BoolExprPtr operator oper(const IndexExprPtr lhs, const IndexExprPtr rhs) { \ return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const IndexExprPtr lhs, const GridPointPtr rhs) { \ + return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const GridPointPtr lhs, const NumExprPtr rhs) { \ + return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const GridPointPtr lhs, const IndexExprPtr rhs) { \ + return make_shared(lhs, rhs); } \ + inline BoolExprPtr operator oper(const GridPointPtr lhs, const GridPointPtr rhs) { \ + return make_shared(lhs, rhs); } \ inline BoolExprPtr operator oper(const NumExprPtr lhs, double rhs) { \ return make_shared(lhs, constNum(rhs)); } \ inline BoolExprPtr operator oper(const IndexExprPtr lhs, double rhs) { \ + return make_shared(lhs, constNum(rhs)); } \ + inline BoolExprPtr operator oper(const GridPointPtr lhs, double rhs) { \ return make_shared(lhs, constNum(rhs)); } BOOL_OPER(==, IsEqualExpr, yc_equals_node) @@ -1196,15 +1205,9 @@ namespace yask { // - A variable within the global or current namespace where it will be used. // - A local variable in the 'value' method; in this case, the value // of the local var must be evaluated and inserted in the expr. - // Example code: - // GridValue v; - // SET_VALUE_FROM_EXPR(v =, "_context->temp * " << 0.2); - // SET_VALUE_FROM_EXPR(v +=, "_context->coeff[" << r << "]"); - // This example would generate the following partial expression (when r=9): - // (_context->temp * 2.00000000000000000e-01) + (_context->coeff[9]) #define SET_VALUE_FROM_EXPR(lhs, rhs) do { \ ostringstream oss; \ - oss << setprecision(17) << scientific; \ + oss << setprecision(15) << scientific; \ oss << "(" << rhs << ")"; \ lhs make_shared(oss.str()); \ } while(0) diff --git a/src/compiler/lib/Print.cpp b/src/compiler/lib/Print.cpp index 79549d05..ce7f63cd 100644 --- a/src/compiler/lib/Print.cpp +++ b/src/compiler/lib/Print.cpp @@ -441,11 +441,11 @@ namespace yask { // Comment about condition. // Null ptr => no condition. if (ee->getCond()) { - string cond = ee->getCond()->accept(this); + string cond = ee->getCond()->makeStr(); _os << " IF (" << cond << ")"; } if (ee->getStepCond()) { - string cond = ee->getStepCond()->accept(this); + string cond = ee->getStepCond()->makeStr(); _os << " IF_STEP (" << cond << ")"; } _os << ".\n"; diff --git a/src/compiler/lib/Soln.hpp b/src/compiler/lib/Soln.hpp index 736c0c0a..e07d3979 100644 --- a/src/compiler/lib/Soln.hpp +++ b/src/compiler/lib/Soln.hpp @@ -91,7 +91,8 @@ namespace yask { StencilSolution(const string& name) : _name(name) { yask_output_factory ofac; - set_debug_output(ofac.new_stdout_output()); + auto so = ofac.new_stdout_output(); + set_debug_output(so); } virtual ~StencilSolution() {} diff --git a/src/compiler/lib/Visitor.hpp b/src/compiler/lib/Visitor.hpp index 3b65b215..fb3cd706 100644 --- a/src/compiler/lib/Visitor.hpp +++ b/src/compiler/lib/Visitor.hpp @@ -38,6 +38,8 @@ namespace yask { class ExprVisitor { protected: bool _visitEqualsLhs = false; // whether to visit LHS of EQUALS. + bool _visitGridPointArgs = false; // whether to visit exprs in grid point args. + bool _visitConds = false; // whether to visit conditional exprs. public: virtual ~ExprVisitor() { } @@ -46,7 +48,16 @@ namespace yask { virtual string visit(ConstExpr* ce) { return ""; } virtual string visit(CodeExpr* ce) { return ""; } virtual string visit(IndexExpr* ie) { return ""; } - virtual string visit(GridPoint* gp) { return ""; } // does NOT visit arg exprs by default. + + // Visit grid-point args only if flag is set. + virtual string visit(GridPoint* gp) { + string res; + if (_visitGridPointArgs) { + for (auto& arg : gp->getArgs()) + res = arg->accept(this); + } + return res; + } // By default, a unary visitor visits its operand. virtual string visit(UnaryNumExpr* ue) { @@ -88,14 +99,22 @@ namespace yask { res = ep->accept(this); return res; } - virtual string visit(EqualsExpr* ee) { - // Visit LHS if flag is set. + // Visit RHS of equals and LHS and conditions per flags. + virtual string visit(EqualsExpr* ee) { if (_visitEqualsLhs) ee->getLhs()->accept(this); + if (_visitConds) { + auto& cp = ee->getCond(); + if (cp) + cp->accept(this); + auto& scp = ee->getStepCond(); + if (scp) + scp->accept(this); + } + + // Always visit RHS. return ee->getRhs()->accept(this); - - // Conditions are not visited. } }; diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index ea5c1f8f..9d2d0e81 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -336,7 +336,7 @@ namespace yask { ctorCode += " " + grid + "_dim_names = {" + gdims.makeDimStr(", ", "\"", "\"") + "};\n"; string initCode = " " + grid + "_ptr = std::make_shared<" + typeDef + - ">(_dims, \"" + grid + "\", " + grid + "_dim_names, &_opts, &_ostr);\n" + ">(*this, \"" + grid + "\", " + grid + "_dim_names);\n" " assert(" + grid + "_ptr);\n"; // Grid vars. @@ -448,7 +448,7 @@ namespace yask { if (!firstGrid) newGridCode += " else"; newGridCode += " if (dims == " + grid + "_dim_names) gp = std::make_shared<" + - typeDef + ">(_dims, name, dims, &_opts, &_ostr);\n"; + typeDef + ">(*this, name, dims);\n"; } } // grids. @@ -503,7 +503,7 @@ namespace yask { "\n class " << egsName << " : public StencilBundleBase {\n" " protected:\n" " typedef " << _context_base << " _context_type;\n" - " _context_type* _context = 0;\n" + " _context_type* _context_data = 0;\n" " public:\n"; // Stats for this eqBundle. @@ -518,7 +518,7 @@ namespace yask { { os << " " << egsName << "(" << _context_base << "* context) :\n" " StencilBundleBase(context),\n" - " _context(context) {\n" + " _context_data(context) {\n" " _name = \"" << egName << "\";\n" " _scalar_fp_ops = " << stats.getNumOps() << ";\n" " _scalar_points_read = " << stats.getNumReads() << ";\n" @@ -529,9 +529,9 @@ namespace yask { os << "\n // The following grid(s) are read by " << egsName << endl; for (auto gp : eq->getInputGrids()) { if (gp->isScratch()) - os << " inputScratchVecs.push_back(&_context->" << gp->getName() << "_list);\n"; + os << " inputScratchVecs.push_back(&_context_data->" << gp->getName() << "_list);\n"; else - os << " inputGridPtrs.push_back(_context->" << gp->getName() << "_ptr);\n"; + os << " inputGridPtrs.push_back(_context_data->" << gp->getName() << "_ptr);\n"; } os << "\n // The following grid(s) are written by " << egsName; if (eq->step_expr) @@ -539,9 +539,9 @@ namespace yask { os << ".\n"; for (auto gp : eq->getOutputGrids()) { if (gp->isScratch()) - os << " outputScratchVecs.push_back(&_context->" << gp->getName() << "_list);\n"; + os << " outputScratchVecs.push_back(&_context_data->" << gp->getName() << "_list);\n"; else - os << " outputGridPtrs.push_back(_context->" << gp->getName() << "_ptr);\n"; + os << " outputGridPtrs.push_back(_context_data->" << gp->getName() << "_ptr);\n"; } os << " } // Ctor." << endl; } @@ -557,7 +557,9 @@ namespace yask { os << " return " << eq->cond->makeStr() << ";\n"; else os << " return true; // full domain.\n"; - os << " }\n" + os << " }\n"; + + os << "\n // Return whether there is a sub-domain expression.\n" " virtual bool is_sub_domain_expr() const {\n" " return " << (eq->cond ? "true" : "false") << ";\n }\n"; @@ -573,16 +575,40 @@ namespace yask { // Step condition. { - os << endl << " // Determine whether " << egsName << " is valid at the step " << - _dims._stencilDims.makeDimStr() << ".\n" - " // Return true if indices are within the valid sub-domain or false otherwise.\n" + os << endl << " // Determine whether " << egsName << + " is valid at the step input_step_index.\n" << + " // Return true if valid or false otherwise.\n" " virtual bool is_in_valid_step(idx_t input_step_index) const final {\n"; - if (eq->step_cond) + if (eq->step_cond) { os << " idx_t " << _dims._stepDim << " = input_step_index;\n" - " return " << eq->step_cond->makeStr() << ";\n"; + "\n // " << eq->step_cond->makeStr() << "\n"; + + // C++ scalar print assistant. + CounterVisitor cv; + eq->step_cond->accept(&cv); + CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n"); + + // Generate the code. + PrintVisitorTopDown pcv(os, *sp); + string expr = eq->step_cond->accept(&pcv); + os << " return " << expr << ";\n"; + } else os << " return true; // any step.\n"; os << " }\n"; + + os << "\n // Return whether there is a step-condition expression.\n" + " virtual bool is_step_cond_expr() const {\n" + " return " << (eq->step_cond ? "true" : "false") << + ";\n }\n"; + + os << "\n // Return human-readable description of step condition.\n" + " virtual std::string get_step_cond_description() const {\n"; + if (eq->step_cond) + os << " return \"" << eq->step_cond->makeStr() << "\";\n"; + else + os << " return \"true\"; // any step.\n"; + os << " }\n"; } // LHS step index. @@ -620,8 +646,8 @@ namespace yask { // C++ scalar print assistant. CounterVisitor cv; - eq->visitEqs(&cv); - CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n"); + eq->visitEqs(&cv); + CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n"); // Generate the code. PrintVisitorBottomUp pcv(os, *sp); @@ -823,8 +849,8 @@ namespace yask { if (bp->isScratch()) continue; string bpName = bp->getName(); - os << " auto " << bpName << " = std::make_shared(\"" << - bpName << "\", this);\n"; + os << " auto " << bpName << " = std::make_shared(this, \"" << + bpName << "\");\n"; for (auto& eg : bp->getBundles()) { if (eg->isScratch()) continue; diff --git a/src/kernel/Makefile b/src/kernel/Makefile index 0ea0f25a..d3e07e43 100644 --- a/src/kernel/Makefile +++ b/src/kernel/Makefile @@ -417,10 +417,16 @@ ifneq ($(filter -O0,$(YK_CXXOPT)),) check ?= 1 endif -# Turn on checking macro. +# Turn on debug macros. ifeq ($(check),1) MACROS += CHECK endif +ifeq ($(trace),1) + MACROS += TRACE +endif +ifeq ($(trace_mem),1) + MACROS += TRACE_MEM +endif # Set MACROS based on individual makefile vars. MACROS += PFD_L1=$(pfd_l1) PFD_L2=$(pfd_l2) @@ -1008,8 +1014,8 @@ help: echo " "; \ echo "Example debug builds of kernel cmd-line tool:"; \ echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 # No OpenMP or MPI, internal checking"; \ - echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=test_2d mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 EXTRA_MACROS='TRACE' # TRACE is a useful debug setting!"; \ - echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' check=1 EXTRA_MACROS='TRACE TRACE_MEM TRACE_INTRINSICS'"; \ + echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=test_2d mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 trace=1 # trace is a useful debug setting!"; \ + echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' check=1 trace=1 trace_mem=1"; \ echo " "; \ echo "Example builds with test runs:"; \ echo " $(MAKE) -j all # Normal full API and stencil tests"; \ diff --git a/src/kernel/lib/alloc.cpp b/src/kernel/lib/alloc.cpp index 9b5d87ae..833bad3e 100644 --- a/src/kernel/lib/alloc.cpp +++ b/src/kernel/lib/alloc.cpp @@ -55,7 +55,7 @@ namespace yask { const map & ngrids, map >& data_buf, const std::string& type) { - CONTEXT_VARS(this); + STATE_VARS(this); // Loop through each mem type. for (const auto& i : nbytes) { @@ -119,7 +119,7 @@ namespace yask { // Allocate memory for grids that do not already have storage. void StencilContext::allocGridData() { - CONTEXT_VARS(this); + STATE_VARS(this); // Allocate I/O grids before read-only grids. GridPtrs sortedGridPtrs; @@ -231,7 +231,7 @@ namespace yask { // Determine the size and shape of all MPI buffers. // Create buffers and allocate them. void StencilContext::allocMpiData() { - CONTEXT_VARS(this); + STATE_VARS(this); // Remove any old MPI data. env->global_barrier(); @@ -245,11 +245,11 @@ namespace yask { map num_exchanges; // send/recv => count. map num_elems; // send/recv => count. - auto me = _env->my_rank; + auto me = env->my_rank; // Need to determine the size and shape of all MPI buffers. // Loop thru all neighbors of this rank. - _mpiInfo->visitNeighbors + mpiInfo->visitNeighbors ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { if (neigh_rank == MPI_PROC_NULL) return; // from lambda fn. @@ -269,7 +269,7 @@ namespace yask { maxdist = NUM_STENCIL_DIMS - 1; // Manhattan dist. of current neighbor. - int mandist = _mpiInfo->man_dists.at(neigh_idx); + int mandist = mpiInfo->man_dists.at(neigh_idx); // Check distance. if (mandist > maxdist) { @@ -282,8 +282,8 @@ namespace yask { // Both my rank and neighbor rank must have *all* domain sizes // of vector multiples. bool vec_ok = allow_vec_exchange && - _mpiInfo->has_all_vlen_mults[_mpiInfo->my_neighbor_index] && - _mpiInfo->has_all_vlen_mults[neigh_idx]; + mpiInfo->has_all_vlen_mults[mpiInfo->my_neighbor_index] && + mpiInfo->has_all_vlen_mults[neigh_idx]; // Determine size of MPI buffers between neigh_rank and my // rank for each grid and create those that are needed. It @@ -306,7 +306,7 @@ namespace yask { IdxTuple my_halo_sizes, neigh_halo_sizes; IdxTuple first_inner_idx, last_inner_idx; IdxTuple first_outer_idx, last_outer_idx; - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); // Only consider domain dims that are used in this grid. @@ -324,9 +324,9 @@ namespace yask { idx_t lidx = gp->get_last_rank_domain_index(dname); first_inner_idx.addDimBack(dname, fidx); last_inner_idx.addDimBack(dname, lidx); - if (_opts->is_first_rank(dname)) + if (opts->is_first_rank(dname)) fidx -= lhalo; // extend into left halo. - if (_opts->is_last_rank(dname)) + if (opts->is_last_rank(dname)) lidx += rhalo; // extend into right halo. first_outer_idx.addDimBack(dname, fidx); last_outer_idx.addDimBack(dname, lidx); @@ -414,7 +414,7 @@ namespace yask { // to be so. // TODO: add a heuristic to avoid increasing by a large factor. if (grid_vec_ok) { - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); if (gp->is_dim_used(dname)) { auto vlen = gp->_get_vec_len(dname); @@ -448,7 +448,7 @@ namespace yask { IdxTuple copy_end = gp->get_allocs(); // one past last! // Adjust along domain dims in this grid. - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); if (gp->is_dim_used(dname)) { @@ -480,7 +480,7 @@ namespace yask { // Adjust LHS of interior. idx_t ext_end = ROUND_UP(first_inner_idx[dname] + max(min_ext, neigh_halo_sizes[dname]), - _dims->_fold_pts[dname]); + dims->_fold_pts[dname]); mpi_interior.bb_begin[dname] = max(mpi_interior.bb_begin[dname], ext_end); } @@ -495,7 +495,7 @@ namespace yask { // Adjust RHS of interior. idx_t ext_begin = ROUND_DOWN(last_inner_idx[dname] + 1 - max(min_ext, neigh_halo_sizes[dname]), - _dims->_fold_pts[dname]); + dims->_fold_pts[dname]); mpi_interior.bb_end[dname] = min(mpi_interior.bb_end[dname], ext_begin); } @@ -538,7 +538,7 @@ namespace yask { idx_t dsize = 1; // domain dim? - if (_dims->_domain_dims.lookup(dname)) { + if (domain_dims.lookup(dname)) { dsize = copy_end[dname] - copy_begin[dname]; // Check whether alignment and size are multiple of vlen. @@ -600,7 +600,7 @@ namespace yask { IdxTuple copy_last = copy_end.subElements(1); // Make MPI data entry for this grid. - auto gbp = mpiData.emplace(gname, _mpiInfo); + auto gbp = mpiData.emplace(gname, state->_mpiInfo); auto& gbi = gbp.first; // iterator from pair returned by emplace(). auto& gbv = gbi->second; // value from iterator. auto& buf = gbv.getBuf(MPIBufs::BufDir(bd), neigh_offsets); @@ -793,7 +793,7 @@ namespace yask { // Allocate memory for scratch grids based on number of threads and // block sizes. void StencilContext::allocScratchData() { - CONTEXT_VARS(this); + STATE_VARS(this); // Remove any old scratch data. freeScratchData(); @@ -815,7 +815,7 @@ namespace yask { // Find the max mini-block size across all packs. // They can be different across packs when pack-specific // auto-tuning has been used. - IdxTuple mblksize(_dims->_domain_dims); + IdxTuple mblksize(domain_dims); for (auto& sp : stPacks) { auto& psettings = sp->getActiveSettings(); DOMAIN_VAR_LOOP(i, j) { @@ -851,7 +851,7 @@ namespace yask { int numa_pref = gp->get_numa_preferred(); // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); if (gp->is_dim_used(dname)) { @@ -866,8 +866,8 @@ namespace yask { // Pads. // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]); } } // dims. diff --git a/src/kernel/lib/auto_tuner.cpp b/src/kernel/lib/auto_tuner.cpp index 1dfbd4ed..452873d8 100644 --- a/src/kernel/lib/auto_tuner.cpp +++ b/src/kernel/lib/auto_tuner.cpp @@ -31,11 +31,24 @@ using namespace std; namespace yask { + // Ctor. + AutoTuner::AutoTuner(StencilContext* context, + KernelSettings* settings, + const std::string& name) : + ContextLinker(context), + _settings(settings), + _name("auto-tuner") { + assert(settings); + if (name.length()) + _name += "(" + name + ")"; + } + // Eval auto-tuner for given number of steps. void StencilContext::eval_auto_tuner(idx_t num_steps) { + STATE_VARS(this); _at.steps_done += num_steps; - if (_use_pack_tuners) { + if (state->_use_pack_tuners) { for (auto& sp : stPacks) sp->getAT().eval(); } @@ -52,8 +65,9 @@ namespace yask { // Determine if any auto tuners are running. bool StencilContext::is_auto_tuner_enabled() const { + STATE_VARS(this); bool done = true; - if (_use_pack_tuners) { + if (state->_use_pack_tuners) { for (auto& sp : stPacks) if (!sp->getAT().is_done()) done = false; @@ -65,7 +79,7 @@ namespace yask { // Apply auto-tuning immediately, i.e., not as part of normal processing. // Will alter data in grids. void StencilContext::run_auto_tuner_now(bool verbose) { - CONTEXT_VARS(this); + STATE_VARS(this); if (!rank_bb.bb_valid) THROW_YASK_EXCEPTION("Error: run_auto_tuner_now() called without calling prepare_solution() first"); @@ -90,7 +104,7 @@ namespace yask { // Determine number of steps to run. // If wave-fronts are enabled, run a max number of these steps. - idx_t step_dir = _dims->_step_dir; // +/- 1. + idx_t step_dir = dims->_step_dir; // +/- 1. idx_t step_t = min(max(wf_steps, idx_t(1)), +AutoTuner::max_step_t) * step_dir; // Run time-steps until AT converges. @@ -106,7 +120,7 @@ namespace yask { // Wait for all ranks to finish. os << "Waiting for auto-tuner to converge on all ranks...\n"; - _env->global_barrier(); + env->global_barrier(); // reenable normal operation. #ifndef NO_HALO_EXCHANGE @@ -118,7 +132,7 @@ namespace yask { at_timer.stop(); os << "Auto-tuner done after " << steps_done << " step(s) in " << at_timer.get_elapsed_secs() << " secs.\n"; - if (_use_pack_tuners) { + if (state->_use_pack_tuners) { for (auto& sp : stPacks) sp->getAT().print_settings(os); } else @@ -142,9 +156,8 @@ namespace yask { // Reset the auto-tuner. void AutoTuner::clear(bool mark_done, bool verbose) { + STATE_VARS(this); - // Output. - ostream& os = _context->get_ostr(); #ifdef TRACE this->verbose = true; #else @@ -176,15 +189,14 @@ namespace yask { steps_done = 0; // Set min blocks to number of region threads. - min_blks = _context->set_region_threads(); + min_blks = set_region_threads(); // Adjust starting block if needed. - auto& opts = _context->get_settings(); for (auto dim : center_block.getDims()) { auto& dname = dim.getName(); auto& dval = dim.getVal(); - if (dname == opts->_dims->_step_dim) { + if (dname == step_dim) { block_steps = opts->_block_sizes[dname]; center_block[dname] = block_steps; } else { @@ -194,15 +206,15 @@ namespace yask { } } if (!done) { - TRACE_MSG2(_name << ": starting block-size: " << + TRACE_MSG(_name << ": starting block-size: " << center_block.makeDimValStr(" * ")); - TRACE_MSG2(_name << ": starting search radius: " << radius); + TRACE_MSG(_name << ": starting search radius: " << radius); } } // clear. // Evaluate the previous run and take next auto-tuner step. void AutoTuner::eval() { - CONTEXT_VARS(_context); + STATE_VARS(this); // Get elapsed time and reset. double etime = timer.get_elapsed_secs(); @@ -248,10 +260,7 @@ namespace yask { csteps << " steps(s) in " << ctime << " secs (" << rate << " steps/sec) with block-size " << - _settings->_block_sizes.makeDimValStr(" * "); - if (_context->tb_steps > 0) - os << ", " << _context->tb_steps << " TB step(s)"; - os << endl; + _settings->_block_sizes.makeDimValStr(" * ") << endl; csteps = 0; ctime = 0.; @@ -335,7 +344,7 @@ namespace yask { bsize[dname] = sz; } // domain dims. - TRACE_MSG2(_name << ": checking block-size " << + TRACE_MSG(_name << ": checking block-size " << bsize.makeDimValStr(" * ")); // Too small? @@ -346,7 +355,6 @@ namespace yask { // Too few? else if (ok) { - auto& opts = _context->get_settings(); idx_t nblks = get_num_domain_points(opts->_region_sizes) / get_num_domain_points(bsize); if (nblks < min_blks) { @@ -393,10 +401,10 @@ namespace yask { os << _name << ": done" << endl; return; } - TRACE_MSG2(_name << ": new search radius=" << radius); + TRACE_MSG(_name << ": new search radius=" << radius); } else { - TRACE_MSG2(_name << ": continuing search from block " << + TRACE_MSG(_name << ": continuing search from block " << center_block.makeDimValStr(" * ")); } } // beyond next neighbor of center. @@ -406,13 +414,13 @@ namespace yask { // Assumption is that block size in one pack doesn't affect // perf in another pack. apply(); - TRACE_MSG2(_name << ": next block-size " << + TRACE_MSG(_name << ": next block-size " << _settings->_block_sizes.makeDimValStr(" * ")); } // eval. // Apply auto-tuner settings to prepare for a run. void AutoTuner::apply() { - CONTEXT_VARS(_context); + STATE_VARS(this); // Restore step-dim value for block. _settings->_block_sizes[step_posn] = block_steps; @@ -426,19 +434,23 @@ namespace yask { _settings->_mini_block_group_sizes.setValsSame(0); _settings->_block_group_sizes.setValsSame(0); + // Save debug output and set to null. + auto saved_op = get_debug_output(); + set_debug_output(nullop); + // Make sure everything is resized based on block size. - auto saved_op = cp->get_debug_output(); - cp->set_debug_output(nullop); - _settings->adjustSettings(cp->get_env()); + _settings->adjustSettings(); // Update temporal blocking info. - cp->update_tb_info(); + _context->update_tb_info(); - // Reallocate scratch data based on new block size. + // Reallocate scratch data based on new mini-block size. // TODO: only do this when blocks have increased or // decreased by a certain percentage. _context->allocScratchData(); - cp->set_debug_output(saved_op); + + // Restore debug output. + set_debug_output(saved_op); } } // namespace yask. diff --git a/src/kernel/lib/auto_tuner.hpp b/src/kernel/lib/auto_tuner.hpp index b49218b9..9712b29f 100644 --- a/src/kernel/lib/auto_tuner.hpp +++ b/src/kernel/lib/auto_tuner.hpp @@ -28,10 +28,16 @@ IN THE SOFTWARE. namespace yask { // Auto-tuner for setting block size. - class AutoTuner { + class AutoTuner : + public ContextLinker { + protected: - StencilContext* _context = 0; - KernelSettings* _settings = 0; // settings to change. + + // Settings to change. May be pointer to solution settings + // or local settings for a pack. + KernelSettings* _settings = 0; + + // Name of tuner. std::string _name; // Null stream to throw away debug info. @@ -75,15 +81,10 @@ namespace yask { public: static constexpr idx_t max_step_t = 4; - AutoTuner(StencilContext* ctx, + AutoTuner(StencilContext* context, KernelSettings* settings, - const std::string& name = "") : - _context(ctx), - _settings(settings) { - _name = "auto-tuner"; - if (name.length()) - _name += "(" + name + ")"; - } + const std::string& name = ""); + virtual ~AutoTuner() {} // Start & stop this timer to track elapsed time. YaskTimer timer; diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 90eb756d..078f296d 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -31,24 +31,12 @@ using namespace std; namespace yask { - // Set debug output to cout if my_rank == msg_rank - // or a null stream otherwise. - ostream& StencilContext::set_ostr() { - yask_output_factory yof; - if (_env->my_rank == _opts->msg_rank) - set_debug_output(yof.new_stdout_output()); - else - set_debug_output(yof.new_null_output()); - assert(_ostr); - return *_ostr; - } - ///// Top-level methods for evaluating reference and optimized stencils. // Eval stencil bundle(s) over grid(s) using reference scalar code. void StencilContext::run_ref(idx_t first_step_index, idx_t last_step_index) { - CONTEXT_VARS(this); + STATE_VARS(this); run_time.start(); // Determine step dir from order of first/last. @@ -69,10 +57,10 @@ namespace yask { // Begin & end tuples. // Based on rank bounding box, not extended // BB because we don't use wave-fronts in the ref code. - IdxTuple begin(_dims->_stencil_dims); + IdxTuple begin(stencil_dims); begin.setVals(rank_bb.bb_begin, false); begin[step_dim] = begin_t; - IdxTuple end(_dims->_stencil_dims); + IdxTuple end(stencil_dims); end.setVals(rank_bb.bb_end, false); end[step_dim] = end_t; @@ -81,16 +69,16 @@ namespace yask { // Force sub-sizes to whole rank size so that scratch // grids will be large enough. Turn off any temporal blocking. - _opts->_region_sizes.setValsSame(0); - _opts->_block_sizes.setValsSame(0); - _opts->_mini_block_sizes.setValsSame(0); - _opts->_sub_block_sizes.setValsSame(0); - _opts->adjustSettings(get_env()); + opts->_region_sizes.setValsSame(0); + opts->_block_sizes.setValsSame(0); + opts->_mini_block_sizes.setValsSame(0); + opts->_sub_block_sizes.setValsSame(0); + opts->adjustSettings(); update_grid_info(); // Copy these settings to packs and realloc scratch grids. for (auto& sp : stPacks) - sp->getLocalSettings() = *_opts; + sp->getLocalSettings() = *opts; allocScratchData(); // Use only one set of scratch grids. @@ -98,7 +86,7 @@ namespace yask { // Indices to loop through. // Init from begin & end tuples. - ScanIndices rank_idxs(*_dims, false, &rank_domain_offsets); + ScanIndices rank_idxs(*dims, false, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; @@ -201,7 +189,7 @@ namespace yask { void StencilContext::run_solution(idx_t first_step_index, idx_t last_step_index) { - CONTEXT_VARS(this); + STATE_VARS(this); run_time.start(); // Determine step dir from order of first/last. @@ -219,14 +207,14 @@ namespace yask { // Begin, end, step tuples. // Based on overall bounding box, which includes // any needed extensions for wave-fronts. - IdxTuple begin(_dims->_stencil_dims); + IdxTuple begin(stencil_dims); begin.setVals(ext_bb.bb_begin, false); begin[step_posn] = begin_t; - IdxTuple end(_dims->_stencil_dims); + IdxTuple end(stencil_dims); end.setVals(ext_bb.bb_end, false); end[step_posn] = end_t; - IdxTuple step(_dims->_stencil_dims); - step.setVals(_opts->_region_sizes, false); // step by region sizes. + IdxTuple step(stencil_dims); + step.setVals(opts->_region_sizes, false); // step by region sizes. step[step_posn] = step_t; TRACE_MSG("run_solution: [" << @@ -241,8 +229,7 @@ namespace yask { } #ifdef MODEL_CACHE - ostream& os = get_ostr(); - if (context.my_rank != context.msg_rank) + if (env.my_rank != env.msg_rank) cache_model.disable(); if (cache_model.isEnabled()) os << "Modeling cache...\n"; @@ -303,7 +290,7 @@ namespace yask { // calculations are made. // Indices needed for the 'rank' loops. - ScanIndices rank_idxs(*_dims, true, &rank_domain_offsets); + ScanIndices rank_idxs(*dims, true, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; rank_idxs.step = step; @@ -573,7 +560,7 @@ namespace yask { // eval only the one pointed to. void StencilContext::calc_region(BundlePackPtr& sel_bp, const ScanIndices& rank_idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); TRACE_MSG("calc_region: region [" << rank_idxs.start.makeValStr(nsdims) << " ... " << rank_idxs.stop.makeValStr(nsdims) << ") within rank [" << @@ -587,7 +574,7 @@ namespace yask { int_time.start(); // Init region begin & end from rank start & stop indices. - ScanIndices region_idxs(*_dims, true, &rank_domain_offsets); + ScanIndices region_idxs(*dims, true, &rank_domain_offsets); region_idxs.initFromOuter(rank_idxs); // Time range. @@ -698,7 +685,7 @@ namespace yask { BundlePackPtr bp; // Steps within a region are based on rank block sizes. - auto& settings = *_opts; + auto& settings = *opts; region_idxs.step = settings._block_sizes; region_idxs.step[step_posn] = step_t; @@ -780,7 +767,7 @@ namespace yask { const ScanIndices& rank_idxs, const ScanIndices& region_idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); auto* bp = sel_bp.get(); int region_thread_idx = omp_get_thread_num(); TRACE_MSG("calc_block: phase " << phase << ", block [" << @@ -830,7 +817,7 @@ namespace yask { #endif // Init block begin & end from region start & stop indices. - ScanIndices block_idxs(*_dims, true, 0); + ScanIndices block_idxs(*dims, true, 0); block_idxs.initFromOuter(region_idxs); // Time range. @@ -901,7 +888,7 @@ namespace yask { block_idxs.stop[step_posn] = end_t; // Steps within a block are based on rank mini-block sizes. - auto& settings = *_opts; + auto& settings = *opts; block_idxs.step = settings._mini_block_sizes; block_idxs.step[step_posn] = step_dir; @@ -982,7 +969,7 @@ namespace yask { const ScanIndices& base_block_idxs, const ScanIndices& adj_block_idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); TRACE_MSG("calc_mini_block: phase " << phase << ", shape " << shape << ", mini-block [" << @@ -1003,7 +990,7 @@ namespace yask { } // Init mini-block begin & end from blk start & stop indices. - ScanIndices mini_block_idxs(*_dims, true, 0); + ScanIndices mini_block_idxs(*dims, true, 0); mini_block_idxs.initFromOuter(adj_block_idxs); // Time range. @@ -1100,7 +1087,7 @@ namespace yask { // Update offsets of scratch grids based on the current // mini-block location. if (scratchVecs.size()) - cp->update_scratch_grid_info(region_thread_idx, mini_block_idxs.begin); + update_scratch_grid_info(region_thread_idx, mini_block_idxs.begin); // Call calc_mini_block() for each non-scratch bundle. for (auto* sb : *bp) @@ -1132,7 +1119,7 @@ namespace yask { idx_t shift_num, BundlePackPtr& bp, ScanIndices& idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); // For wavefront adjustments, see conceptual diagram in // run_solution(). At each pack and time-step, the parallelogram @@ -1322,7 +1309,7 @@ namespace yask { idx_t nshapes, idx_t shape, const BridgeMask& bridge_mask, ScanIndices& idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); auto npacks = stPacks.size(); bool ok = true; @@ -1479,7 +1466,7 @@ namespace yask { // the grids' local offsets. void StencilContext::update_scratch_grid_info(int thread_idx, const Indices& idxs) { - CONTEXT_VARS(this); + STATE_VARS(this); // Loop thru vecs of scratch grids. for (auto* sv : scratchVecs) { @@ -1525,7 +1512,7 @@ namespace yask { // Compare grids in contexts. // Return number of mis-compares. idx_t StencilContext::compareData(const StencilContext& ref) const { - CONTEXT_VARS_CONST(this); + STATE_VARS_CONST(this); os << "Comparing grid(s) in '" << name << "' to '" << ref.name << "'..." << endl; if (gridPtrs.size() != ref.gridPtrs.size()) { @@ -1544,10 +1531,10 @@ namespace yask { // Call MPI_Test() on all unfinished requests to promote MPI progress. // TODO: replace with more direct and less intrusive techniques. void StencilContext::poke_halo_exchange() { - CONTEXT_VARS(this); + STATE_VARS(this); #ifdef USE_MPI - if (!enable_halo_exchange || _env->num_ranks < 2) + if (!enable_halo_exchange || env->num_ranks < 2) return; test_time.start(); @@ -1601,11 +1588,11 @@ namespace yask { // Exchange dirty halo data for all grids and all steps. void StencilContext::exchange_halos() { - + #ifdef USE_MPI - if (!enable_halo_exchange || _env->num_ranks < 2) + STATE_VARS(this); + if (!enable_halo_exchange || env->num_ranks < 2) return; - CONTEXT_VARS(this); halo_time.start(); double wait_delta = 0.; @@ -1737,7 +1724,7 @@ namespace yask { auto& r = grid_recv_reqs[ni]; MPI_Irecv(buf, nbytes, MPI_BYTE, neighbor_rank, int(gi), - _env->comm, &r); + env->comm, &r); num_recv_reqs++; } } @@ -1788,7 +1775,7 @@ namespace yask { last, lastStepsToSwap[gp]); else nelems = gp->get_elements_in_slice(buf, first, last); - idx_t nbytes = nelems * cp->get_element_bytes(); + idx_t nbytes = nelems * get_element_bytes(); if (using_shm) { TRACE_MSG("exchange_halos: no send req due to shm"); @@ -1801,7 +1788,7 @@ namespace yask { TRACE_MSG("exchange_halos: sending " << makeByteStr(nbytes)); auto& r = grid_send_reqs[ni]; MPI_Isend(buf, nbytes, MPI_BYTE, - neighbor_rank, int(gi), _env->comm, &r); + neighbor_rank, int(gi), env->comm, &r); num_send_reqs++; } } @@ -1926,6 +1913,7 @@ namespace yask { // TODO: track sub-domain of grid that is dirty. void StencilContext::mark_grids_dirty(const BundlePackPtr& sel_bp, idx_t start, idx_t stop) { + STATE_VARS(this); idx_t step = (start > stop) ? -1 : 1; map> grids_done; diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index c2b7bc76..95256822 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -149,43 +149,18 @@ namespace yask { // This is a pure-virtual class that must be implemented // for a specific problem. class StencilContext : + public KernelStateBase, public virtual yk_solution { protected: - // Output stream for messages. - std::ostream* _ostr = 0; - yask_output_ptr _debug; - - // Env. - KernelEnvPtr _env; - - // Command-line and env parameters. - KernelSettingsPtr _opts; - - // Problem dims. - DimsPtr _dims; - - // MPI info. - MPIInfoPtr _mpiInfo; - // Auto-tuner for global settings. AutoTuner _at; - bool _use_pack_tuners = false; // Bytes between each buffer to help avoid aliasing // in the HW. static constexpr size_t _data_buf_pad = YASK_PAD_BYTES; - // Check whether dim is appropriate type. - virtual void checkDimType(const std::string& dim, - const std::string& fn_name, - bool step_ok, - bool domain_ok, - bool misc_ok) const { - _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok); - } - // Alloc given bytes on each NUMA node. virtual void _alloc_data(const std::map & nbytes, const std::map & ngrids, @@ -217,14 +192,6 @@ namespace yask { bool do_mpi_right = true; // right exterior in given dim. idx_t mpi_exterior_dim = -1; // which domain dim in left/right. - // Position of inner domain dim in stencil-dims tuple. - // Misc dims will follow this if/when using interleaving. - int _inner_posn = -1; // -1 => not set. - - // Position of outer domain dim in stencil-dims tuple. - // For 1D stencils, _outer_posn == _inner_posn. - int _outer_posn = -1; // -1 => not set. - // Is overlap currently enabled? inline bool is_overlap_active() const { bool active = !do_mpi_interior || !do_mpi_left || !do_mpi_right; @@ -316,8 +283,8 @@ namespace yask { std::map mpiData; // Constructor. - StencilContext(KernelEnvPtr env, - KernelSettingsPtr settings); + StencilContext(KernelEnvPtr& env, + KernelSettingsPtr& settings); // Destructor. virtual ~StencilContext() { @@ -327,42 +294,17 @@ namespace yask { get_stats(); } - // Set debug output to cout if my_rank == msg_rank - // or a null stream otherwise. - std::ostream& set_ostr(); - - // Get the messsage output stream. - std::ostream& get_ostr() const { - assert(_ostr); - return *_ostr; + // Modify settings in shared state and auto-tuner. + void set_settings(KernelSettingsPtr opts) { + _state->_opts = opts; + _at.set_settings(opts.get()); } // Reset elapsed times to zero. void clear_timers(); - // Access to settings. - KernelSettingsPtr& get_settings() { - assert(_opts); - return _opts; - } - const KernelSettingsPtr& get_settings() const { - assert(_opts); - return _opts; - } - void set_settings(KernelSettingsPtr opts) { - _opts = opts; - _at.set_settings(_opts.get()); - } - // Misc accessors. - KernelEnvPtr& get_env() { return _env; } - const KernelEnvPtr& get_env() const { return _env; } - DimsPtr& get_dims() { return _dims; } - const DimsPtr& get_dims() const { return _dims; } - MPIInfoPtr& get_mpi_info() { return _mpiInfo; } - const MPIInfoPtr& get_mpi_info() const { return _mpiInfo; } AutoTuner& getAT() { return _at; } - bool use_pack_tuners() const { return _use_pack_tuners; } // Add a new grid to the containers. virtual void addGrid(YkGridPtr gp, bool is_output); @@ -468,92 +410,6 @@ namespace yask { // Return number of mis-compares. virtual idx_t compareData(const StencilContext& ref) const; - // Set number of threads w/o using thread-divisor. - // Return number of threads. - // Do nothing and return 0 if not properly initialized. - int set_max_threads() { - - // Get max number of threads. - int mt = std::max(_opts->max_threads, 1); - - // Reset number of OMP threads to max allowed. - omp_set_num_threads(mt); - return mt; - } - - // Get total number of computation threads to use. - int get_num_comp_threads(int& region_threads, int& blk_threads) const { - - // Max threads / divisor. - int mt = std::max(_opts->max_threads, 1); - int td = std::max(_opts->thread_divisor, 1); - int at = mt / td; - at = std::max(at, 1); - - // Blk threads per region thread. - int bt = std::max(_opts->num_block_threads, 1); - bt = std::min(bt, at); // Cannot be > 'at'. - blk_threads = bt; - - // Region threads. - int rt = at / bt; - rt = std::max(rt, 1); - region_threads = rt; - - // Total number of block threads. - return bt * rt; - } - - // Set number of threads to use for something other than a region. - // Return number of threads. - // Do nothing and return 0 if not properly initialized. - int set_all_threads() { - int rt, bt; - int at = get_num_comp_threads(rt, bt); - omp_set_num_threads(at); - return at; - } - - // Set number of threads to use for a region. - // Enable nested OMP if there are >1 block threads, - // disable otherwise. - // Return number of threads. - // Do nothing and return 0 if not properly initialized. - int set_region_threads() { - int rt, bt; - int at = get_num_comp_threads(rt, bt); - - // Limit outer nesting to allow num_block_threads per nested - // block loop. - yask_num_threads[0] = rt; - - if (bt > 1) { - omp_set_nested(1); - omp_set_max_active_levels(2); - yask_num_threads[1] = bt; - } - else { - omp_set_nested(0); - omp_set_max_active_levels(1); - yask_num_threads[1] = 0; - } - - omp_set_num_threads(rt); - return rt; - } - - // Set number of threads for a block. - // Return number of threads. - // Do nothing and return 0 if not properly initialized. - int set_block_threads() { - int rt, bt; - int at = get_num_comp_threads(rt, bt); - - if (omp_get_max_active_levels() > 1) - omp_set_num_threads(bt); - return bt; - } - // Reference stencil calculations. void run_ref(idx_t first_step_index, idx_t last_step_index); @@ -630,20 +486,14 @@ namespace yask { const GridDimNames& dims, const GridDimSizes* sizes); - // Get output object. - virtual yask_output_ptr get_debug_output() const { - return _debug; - } - // APIs. // See yask_kernel_api.hpp. - virtual void set_debug_output(yask_output_ptr debug) { - _debug = debug; // to share ownership of referent. - _ostr = &debug->get_ostream(); - } virtual const std::string& get_name() const { return name; } + virtual void set_debug_output(yask_output_ptr debug) { + KernelStateBase::set_debug_output(debug); + } virtual int get_element_bytes() const { return REAL_BYTES; } @@ -691,22 +541,26 @@ namespace yask { } virtual std::string get_step_dim_name() const { - return _dims->_step_dim; + STATE_VARS_CONST(this); + return dims->_step_dim; } virtual int get_num_domain_dims() const { - return _dims->_domain_dims.getNumDims(); + STATE_VARS_CONST(this); + return dims->_domain_dims.getNumDims(); } virtual std::vector get_domain_dim_names() const { - std::vector dims; - for (auto& dim : _dims->_domain_dims.getDims()) - dims.push_back(dim.getName()); - return dims; + STATE_VARS_CONST(this); + std::vector ddims; + for (auto& dim : dims->_domain_dims.getDims()) + ddims.push_back(dim.getName()); + return ddims; } virtual std::vector get_misc_dim_names() const { - std::vector dims; - for (auto& dim : _dims->_misc_dims.getDims()) - dims.push_back(dim.getName()); - return dims; + STATE_VARS_CONST(this); + std::vector mdims; + for (auto& dim : dims->_misc_dims.getDims()) + mdims.push_back(dim.getName()); + return mdims; } virtual idx_t get_first_rank_domain_index(const std::string& dim) const; @@ -735,16 +589,18 @@ namespace yask { virtual idx_t get_rank_index(const std::string& dim) const; virtual std::string apply_command_line_options(const std::string& args); virtual bool set_default_numa_preferred(int numa_node) { + STATE_VARS(this); #ifdef USE_NUMA - _opts->_numa_pref = numa_node; + opts->_numa_pref = numa_node; return true; #else - _opts->_numa_pref = yask_numa_none; + opts->_numa_pref = yask_numa_none; return numa_node == yask_numa_none; #endif } virtual int get_default_numa_preferred() const { - return _opts->_numa_pref; + STATE_VARS_CONST(this); + return opts->_numa_pref; } // Auto-tuner methods. @@ -757,27 +613,4 @@ namespace yask { }; // StencilContext. - // Macro to get commonly-needed vars for stencil calcs efficiently. - // *_posn vars are positions in stencil_dims. -#define CONTEXT_VARS0(ctx_p, pfx) \ - pfx auto* cp = ctx_p; \ - auto& os = cp->get_ostr(); \ - pfx auto* opts = cp->get_settings().get(); \ - pfx auto* mpiInfo = cp->get_mpi_info().get(); \ - pfx auto* dims = cp->get_dims().get(); \ - pfx auto* env = cp->get_env().get(); \ - const auto& step_dim = dims->_step_dim; \ - const auto& domain_dims = dims->_domain_dims; \ - constexpr int nddims = NUM_DOMAIN_DIMS; \ - assert(nddims == domain_dims.size()); \ - const auto& stencil_dims = dims->_stencil_dims; \ - constexpr int nsdims = NUM_STENCIL_DIMS; \ - assert(nsdims == stencil_dims.size()); \ - constexpr int step_posn = 0; \ - assert(step_posn == +Indices::step_posn); \ - constexpr int outer_posn = 1; \ - const int inner_posn = cp->_inner_posn -#define CONTEXT_VARS(ctx_p) CONTEXT_VARS0(ctx_p,) -#define CONTEXT_VARS_CONST(ctx_p) CONTEXT_VARS0(ctx_p, const) - } // yask namespace. diff --git a/src/kernel/lib/generic_grids.cpp b/src/kernel/lib/generic_grids.cpp index 03c5a757..9a263ec2 100644 --- a/src/kernel/lib/generic_grids.cpp +++ b/src/kernel/lib/generic_grids.cpp @@ -29,16 +29,16 @@ using namespace std; namespace yask { // Ctor. No allocation is done. See notes on default_alloc(). - GenericGridBase::GenericGridBase(string name, + GenericGridBase::GenericGridBase(KernelStateBase& state, + const string& name, Layout& layout_base, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - ostream** ostr) : - _name(name), _layout_base(&layout_base), _opts(settings), _ostr(ostr) { + const GridDimNames& dimNames) : + KernelStateBase(state), + _name(name), + _layout_base(&layout_base) { for (auto& dn : dimNames) - _dims.addDimBack(dn, 1); + _grid_dims.addDimBack(dn, 1); _sync_layout_with_dims(); - } // Perform default allocation. @@ -46,7 +46,7 @@ namespace yask { // programmer should call get_num_elems() or get_num_bytes() and // then provide allocated memory via set_storage(). void GenericGridBase::default_alloc() { - auto& os = get_ostr(); + STATE_VARS(this); // Release any old data if last owner. release_storage(); @@ -74,11 +74,11 @@ namespace yask { // Make some descriptive info. string GenericGridBase::make_info_string(const string& elem_name) const { stringstream oss; - if (_dims.getNumDims() == 0) + if (_grid_dims.getNumDims() == 0) oss << "scalar"; else - oss << _dims.getNumDims() << "-D grid (" << - _dims.makeDimValStr(" * ") << ")"; + oss << _grid_dims.getNumDims() << "-D grid (" << + _grid_dims.makeDimValStr(" * ") << ")"; oss << " '" << _name << "'"; if (_elems) oss << " with data at " << _elems << " containing "; @@ -147,7 +147,7 @@ namespace yask { return get_num_elems(); // Dims & sizes same? - if (_dims != p->_dims) + if (_grid_dims != p->_grid_dims) return get_num_elems(); // Count abs diffs > epsilon. diff --git a/src/kernel/lib/generic_grids.hpp b/src/kernel/lib/generic_grids.hpp index 30e3905b..010d7e03 100644 --- a/src/kernel/lib/generic_grids.hpp +++ b/src/kernel/lib/generic_grids.hpp @@ -33,7 +33,8 @@ namespace yask { // A base class for a generic n-D grid. // This class does not define a type or memory layout. - class GenericGridBase { + class GenericGridBase : + public KernelStateBase { protected: std::string _name; // name for grid. @@ -52,36 +53,42 @@ namespace yask { // Note that both _dims and *_layout_base hold dimensions unless this // is a scalar. For a scalar, _dims is empty and _layout_base = 0. - IdxTuple _dims; // names and lengths of grid dimensions. + IdxTuple _grid_dims; // names and lengths of grid dimensions. Layout* _layout_base = 0; // memory layout. - // Command-line and env parameters. - KernelSettingsPtr* _opts; - - // Output stream for messages. - // Pointer-to-pointer to let it follow a parent's pointer. - std::ostream** _ostr = 0; - void _sync_dims_with_layout() { Indices idxs(_layout_base->get_sizes()); - idxs.setTupleVals(_dims); + idxs.setTupleVals(_grid_dims); } void _sync_layout_with_dims() { - Indices idxs(_dims); + Indices idxs(_grid_dims); _layout_base->set_sizes(idxs); } public: // Ctor. No allocation is done. See notes on default_alloc(). - GenericGridBase(std::string name, + GenericGridBase(KernelStateBase& state, + const std::string& name, Layout& layout_base, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - std::ostream** ostr); + const GridDimNames& dimNames); virtual ~GenericGridBase() { } + // Get state info. + KernelStatePtr& get_state() { + assert(_state); + return _state; + } + const KernelStatePtr& get_state() const { + assert(_state); + return _state; + } + std::ostream& get_ostr() const { + STATE_VARS(this); + return os; + } + // Perform default allocation. // For other options, // programmer should call get_num_elems() or get_num_bytes() and @@ -94,8 +101,9 @@ namespace yask { // NUMA accessors. virtual int get_numa_pref() const { + STATE_VARS_CONST(this); return (_numa_pref != _numa_unset) ? - _numa_pref : (*_opts)->_numa_pref; + _numa_pref : opts->_numa_pref; } virtual bool set_numa_pref(int numa_node) { #ifdef USE_NUMA @@ -107,19 +115,12 @@ namespace yask { #endif } - // Access dims. - const IdxTuple& get_dims() const { return _dims; } - - // Get the messsage output stream. - virtual std::ostream& get_ostr() const { - assert(_ostr); - assert(*_ostr); - return **_ostr; - } + // Access dims of this grid. + const IdxTuple& get_dims() const { return _grid_dims; } // Get number of elements. virtual idx_t get_num_elems() const { - return _dims.product(); + return _grid_dims.product(); } // Get size of one element. @@ -130,25 +131,25 @@ namespace yask { // Get number of dimensions. virtual int get_num_dims() const { - return _dims.getNumDims(); + return _grid_dims.getNumDims(); } // Get the nth dim name. virtual const std::string& get_dim_name(int n) const { - return _dims.getDimName(n); + return _grid_dims.getDimName(n); } // Is dim used? virtual bool is_dim_used(const std::string& dim) const { - return _dims.lookup(dim) != 0; + return _grid_dims.lookup(dim) != 0; } // Access nth dim size. idx_t get_dim_size(int n) const { - return _dims.getVal(n); + return _grid_dims.getVal(n); } void set_dim_size(int n, idx_t size) { - _dims.setVal(n, size); + _grid_dims.setVal(n, size); _sync_layout_with_dims(); } @@ -157,15 +158,15 @@ namespace yask { return _layout_base->get_sizes(); } void set_dim_sizes(const Indices& sizes) { - for (int i = 0; i < _dims.size(); i++) - _dims.setVal(i, sizes[i]); + for (int i = 0; i < _grid_dims.size(); i++) + _grid_dims.setVal(i, sizes[i]); _sync_layout_with_dims(); } // Return 'true' if dimensions are same names // and sizes, 'false' otherwise. inline bool are_dims_and_sizes_same(const GenericGridBase& src) { - return _dims == src._dims; + return _grid_dims == src._grid_dims; } // Print some descriptive info. @@ -174,7 +175,7 @@ namespace yask { // Get linear index. virtual idx_t get_index(const Indices& idxs, bool check=true) const =0; virtual idx_t get_index(const IdxTuple& pt, bool check=true) const { - assert(_dims.areDimsSame(pt)); + assert(_grid_dims.areDimsSame(pt)); Indices idxs(pt); return get_index(idxs, check); } @@ -213,12 +214,11 @@ namespace yask { public: // Ctor. No allocation is done. See notes on default_alloc(). - GenericGridTemplate(std::string name, + GenericGridTemplate(KernelStateBase& state, + const std::string& name, Layout& layout_base, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - std::ostream** ostr) : - GenericGridBase(name, layout_base, dimNames, settings, ostr) { } + const GridDimNames& dimNames) : + GenericGridBase(state, name, layout_base, dimNames) { } // Dealloc _base when last pointer to it is destructed. virtual ~GenericGridTemplate() { @@ -282,11 +282,10 @@ namespace yask { public: // Construct an unallocated grid. - GenericGrid(std::string name, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - std::ostream** ostr) : - GenericGridTemplate(name, _layout, dimNames, settings, ostr) { + GenericGrid(KernelStateBase& state, + std::string name, + const GridDimNames& dimNames) : + GenericGridTemplate(state, name, _layout, dimNames) { assert(int(dimNames.size()) == _layout.get_num_sizes()); } @@ -306,10 +305,10 @@ namespace yask { virtual idx_t get_index(const Indices& idxs, bool check=true) const final { #ifdef CHECK if (check) { - for (int i = 0; i < this->_dims.size(); i++) { + for (int i = 0; i < this->_grid_dims.size(); i++) { idx_t j = idxs[i]; assert(j >= 0); - assert(j < this->_dims.getVal(i)); + assert(j < this->_grid_dims.getVal(i)); } } #endif @@ -347,8 +346,9 @@ namespace yask { public: // Construct an unallocated scalar. - GenericScalar(std::string name) : - GenericGrid(name, _dimNames) { } + GenericScalar(KernelStateBase& state, + std::string name) : + GenericGrid(state, name, _dimNames) { } }; } // namespace yask. diff --git a/src/kernel/lib/grid_apis.cpp b/src/kernel/lib/grid_apis.cpp index 1b677d0b..7bf9c1eb 100644 --- a/src/kernel/lib/grid_apis.cpp +++ b/src/kernel/lib/grid_apis.cpp @@ -37,7 +37,8 @@ namespace yask { // APIs to get info from vars. #define GET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ idx_t YkGridBase::api_name(const string& dim) const { \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + STATE_VARS(this); \ + dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ int posn = get_dim_posn(dim, true, #api_name); \ idx_t mbit = 1LL << posn; \ if (prep_req && _rank_offsets[posn] < 0) \ @@ -47,6 +48,7 @@ namespace yask { return rtn; \ } \ idx_t YkGridBase::api_name(int posn) const { \ + STATE_VARS(this); \ idx_t mbit = 1LL << posn; \ auto rtn = expr; \ return rtn; \ @@ -63,8 +65,8 @@ namespace yask { GET_GRID_API(get_alloc_size, _allocs[posn], true, true, true, false) GET_GRID_API(get_first_rank_domain_index, _rank_offsets[posn], false, true, false, true) GET_GRID_API(get_last_rank_domain_index, _rank_offsets[posn] + _domains[posn] - 1, false, true, false, true) - GET_GRID_API(get_first_rank_halo_index, _rank_offsets[posn] - _left_halos[posn], false, false, true, true) - GET_GRID_API(get_last_rank_halo_index, _rank_offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, false, true, true) + GET_GRID_API(get_first_rank_halo_index, _rank_offsets[posn] - _left_halos[posn], false, true, false, true) + GET_GRID_API(get_last_rank_halo_index, _rank_offsets[posn] + _domains[posn] + _right_halos[posn] - 1, false, true, false, true) GET_GRID_API(get_first_rank_alloc_index, _rank_offsets[posn] + _local_offsets[posn] - _actl_left_pads[posn], false, true, false, true) GET_GRID_API(get_last_rank_alloc_index, _rank_offsets[posn] + _local_offsets[posn] + _domains[posn] + _actl_right_pads[posn] - 1, false, true, false, true) GET_GRID_API(_get_left_wf_ext, _left_wf_exts[posn], true, true, true, false) @@ -84,14 +86,16 @@ namespace yask { #define COMMA , #define SET_GRID_API(api_name, expr, step_ok, domain_ok, misc_ok) \ void YkGridBase::api_name(const string& dim, idx_t n) { \ - TRACE_MSG0(get_ostr(), "grid '" << get_name() << "'." \ + STATE_VARS(this); \ + TRACE_MSG("grid '" << get_name() << "'." \ #api_name "('" << dim << "', " << n << ")"); \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ int posn = get_dim_posn(dim, true, #api_name); \ idx_t mbit = 1LL << posn; \ expr; \ } \ void YkGridBase::api_name(int posn, idx_t n) { \ + STATE_VARS(this); \ idx_t mbit = 1LL << posn; \ int dim = posn; \ expr; \ @@ -160,6 +164,7 @@ namespace yask { } void YkGridBase::share_storage(yk_grid_ptr source) { + STATE_VARS(this); auto sp = dynamic_pointer_cast(source); assert(sp); @@ -193,7 +198,7 @@ namespace yask { } // Not a domain dim? - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + bool is_domain = domain_dims.lookup(dname) != 0; if (!is_domain) { auto tas = get_alloc_size(dname); auto sas = sp->get_alloc_size(dname); @@ -239,7 +244,7 @@ namespace yask { // Copy pad sizes. for (int i = 0; i < get_num_dims(); i++) { auto dname = get_dim_name(i); - bool is_domain = _dims->_domain_dims.lookup(dname) != 0; + bool is_domain = domain_dims.lookup(dname) != 0; if (is_domain) { _actl_left_pads[i] = sp->_actl_left_pads[i]; _actl_right_pads[i] = sp->_actl_right_pads[i]; diff --git a/src/kernel/lib/indices.hpp b/src/kernel/lib/indices.hpp new file mode 100644 index 00000000..dabb51bc --- /dev/null +++ b/src/kernel/lib/indices.hpp @@ -0,0 +1,404 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kernel +Copyright (c) 2014-2019, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +#pragma once + +namespace yask { + + typedef Tuple IdxTuple; + typedef std::vector GridIndices; + typedef std::vector GridDimSizes; + typedef std::vector GridDimNames; + + // A class to hold up to a given number of sizes or indices efficiently. + // Similar to a Tuple, but less overhead and doesn't keep names. + // Make sure this stays non-virtual. + // TODO: make this a template with _ndims as a parameter. + // TODO: ultimately, combine with Tuple w/o loss of efficiency. + class Indices { + + public: + + // Max number of indices that can be held. + // Note use of "+max_idxs" in code below to avoid compiler + // trying to take a reference to it, resulting in a undefined + // symbol (sometimes). + static constexpr int max_idxs = MAX_DIMS; + + // Step dim is always in [0] of an Indices type (if it is used). + static constexpr int step_posn = 0; + + protected: + idx_t _idxs[max_idxs]; + int _ndims; + + public: + // Ctors. + Indices() : _ndims(0) { } + Indices(int ndims) : _ndims(ndims) { } // NB: _idxs remain uninit! + Indices(const IdxTuple& src) { + setFromTuple(src); + } + Indices(const GridIndices& src) { + setFromVec(src); + } + Indices(const std::initializer_list& src) { + setFromInitList(src); + } + Indices(const idx_t src[], int ndims) { + setFromArray(src, ndims); + } + Indices(idx_t src, int ndims) { + setFromConst(src, ndims); + } + + // Default copy ctor, copy operator should be okay. + + // Access size. + inline int getNumDims() const { + return _ndims; + } + inline void setNumDims(int n) { + _ndims = n; + } + + // Access indices. + inline idx_t& operator[](int i) { + assert(i >= 0); + assert(i < _ndims); + return _idxs[i]; + } + inline const idx_t& operator[](int i) const { + assert(i >= 0); + assert(i < _ndims); + return _idxs[i]; + } + + // Write to an IdxTuple. + // The 'tgt' must have the same number of dims. + void setTupleVals(IdxTuple& tgt) const { + assert(tgt.size() == _ndims); + for (int i = 0; i < _ndims; i++) + if (i < tgt.size()) + tgt.setVal(i, _idxs[i]); + } + + // Read from an IdxTuple. + void setFromTuple(const IdxTuple& src) { + assert(src.size() <= +max_idxs); + int n = int(src.size()); + for (int i = 0; i < n; i++) + _idxs[i] = src.getVal(i); + _ndims = n; + } + + // Other inits. + void setFromVec(const GridIndices& src) { + assert(src.size() <= +max_idxs); + int n = int(src.size()); + for (int i = 0; i < n; i++) + _idxs[i] = src[i]; + _ndims = n; + } + + // default n => don't change _ndims. + void setFromArray(const idx_t src[], int n = -1) { + if (n < 0) + n = _ndims; + assert(n <= +max_idxs); + for (int i = 0; i < n; i++) + _idxs[i] = src[i]; + _ndims = n; + } + void setFromInitList(const std::initializer_list& src) { + assert(src.size() <= +max_idxs); + int i = 0; + for (auto idx : src) + _idxs[i++] = idx; + _ndims = i; + } + + // default n => don't change _ndims. + void setFromConst(idx_t val, int n = -1) { + if (n < 0) + n = _ndims; + assert(n <= +max_idxs); + for (int i = 0; i < n; i++) + _idxs[i] = val; + _ndims = n; + } + + // Some comparisons. + // These assume all the indices are valid or + // initialized to the same value. + bool operator==(const Indices& rhs) const { + if (_ndims != rhs._ndims) + return false; + for (int i = 0; i < _ndims; i++) + if (_idxs[i] != rhs._idxs[i]) + return false; + return true; + } + bool operator!=(const Indices& rhs) const { + return !operator==(rhs); + } + bool operator<(const Indices& rhs) const { + if (_ndims < rhs._ndims) + return true; + else if (_ndims > rhs._ndims) + return false; + for (int i = 0; i < _ndims; i++) + if (_idxs[i] < rhs._idxs[i]) + return true; + else if (_idxs[i] > rhs._idxs[i]) + return false; + return false; // equal, so not less than. + } + bool operator>(const Indices& rhs) const { + if (_ndims > rhs._ndims) + return true; + else if (_ndims < rhs._ndims) + return false; + for (int i = 0; i < _ndims; i++) + if (_idxs[i] > rhs._idxs[i]) + return true; + else if (_idxs[i] < rhs._idxs[i]) + return false; + return false; // equal, so not greater than. + } + + // Generic element-wise operator. + // Returns a new object. + inline Indices combineElements(std::function func, + const Indices& other) const { + Indices res(*this); + +#if EXACT_INDICES + // Use just the used elements. + for (int i = 0; i < _ndims; i++) +#else + // Use all to allow unroll and avoid jumps. +#pragma unroll + for (int i = 0; i < max_idxs; i++) +#endif + func(res._idxs[i], other._idxs[i]); + return res; + } + + // Some element-wise operators. + // These all return a new set of Indices rather + // than modifying this object. + inline Indices addElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, + other); + } + inline Indices subElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, + other); + } + inline Indices mulElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, + other); + } + inline Indices divElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, + other); + } + inline Indices minElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, + other); + } + inline Indices maxElements(const Indices& other) const { + return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, + other); + } + + // Generic element-wise operator with RHS const. + // Returns a new object. + inline Indices mapElements(std::function func, + idx_t crhs) const { + Indices res(*this); + +#if EXACT_INDICES + // Use just the used elements. + for (int i = 0; i < _ndims; i++) +#else + // Use all to allow unroll and avoid jumps. +#pragma unroll + for (int i = 0; i < max_idxs; i++) +#endif + func(res._idxs[i], crhs); + return res; + } + + // Operate on all elements. + Indices addConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, + crhs); + } + Indices subConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, + crhs); + } + Indices mulConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, + crhs); + } + Indices divConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, + crhs); + } + Indices minConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, + crhs); + } + Indices maxConst(idx_t crhs) const { + return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, + crhs); + } + + // Reduce over all elements. + idx_t sum() const { + idx_t res = 0; + for (int i = 0; i < _ndims; i++) + res += _idxs[i]; + return res; + } + idx_t product() const { + idx_t res = 1; + for (int i = 0; i < _ndims; i++) + res *= _idxs[i]; + return res; + } + + // Make string like "x=4, y=8". + std::string makeDimValStr(const GridDimNames& names, + std::string separator=", ", + std::string infix="=", + std::string prefix="", + std::string suffix="") const { + assert((int)names.size() == _ndims); + + // Make a Tuple from names. + IdxTuple tmp; + for (int i = 0; i < int(names.size()); i++) + tmp.addDimBack(names[i], _idxs[i]); + return tmp.makeDimValStr(separator, infix, prefix, suffix); + } + + // Make string like "4, 3, 2". + std::string makeValStr(int nvals, + std::string separator=", ", + std::string prefix="", + std::string suffix="") const { + assert(nvals == _ndims); + + // Make a Tuple w/o useful names. + IdxTuple tmp; + for (int i = 0; i < nvals; i++) + tmp.addDimBack(std::string(1, '0' + char(i)), _idxs[i]); + return tmp.makeValStr(separator, prefix, suffix); + } + }; + + // Define OMP reductions on Indices. +#pragma omp declare reduction(min_idxs : Indices : \ + omp_out = omp_out.minElements(omp_in) ) \ + initializer (omp_priv = omp_orig) +#pragma omp declare reduction(max_idxs : Indices : \ + omp_out = omp_out.maxElements(omp_in) ) \ + initializer (omp_priv = omp_orig) + + // Layout algorithms using Indices. +#include "yask_layouts.hpp" + + // Forward defns. + struct Dims; + + // A group of Indices needed for generated loops. + // See the help message from gen_loops.pl for the + // documentation of the indices. + // Make sure this stays non-virtual. + struct ScanIndices { + int ndims = 0; + + // Input values; not modified. + Indices begin, end; // first and end (beyond last) range of each index. + Indices step; // step value within range. + Indices align; // alignment of steps after first one. + Indices align_ofs; // adjustment for alignment (see below). + Indices group_size; // proximity grouping within range. + + // Alignment: when possible, start positions after the first + // in each dim will be aligned such that ((start - align_ofs) % align) == 0. + + // Output values; set once for entire range. + Indices num_indices; // number of indices in each dim. + idx_t linear_indices = 0; // total indices over all dims (product of num_indices). + + // Output values; set for each index by loop code. + Indices start, stop; // first and last+1 for this sub-range. + Indices index; // 0-based unique index for each sub-range in each dim. + idx_t linear_index = 0; // 0-based index over all dims. + + // Example w/3 sub-ranges in overall range: + // begin end + // |--------------------------------------------| + // |------------------|------------------|------| + // start stop (index = 0) + // start stop (index = 1) + // start stop (index = 2) + + // Ctor. + ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs); + + // Init from outer-loop indices. + // Start..stop from point in outer loop become begin..end + // for this loop. + // + // Example: + // begin (outer) end + // |--------------------------------------------| + // |------------------|------------------|------| + // start | stop + // V + // begin (this) end + // |------------------| + // start stop (may be sub-dividied later) + void initFromOuter(const ScanIndices& outer) { + + // Begin & end set from start & stop of outer loop. + begin = start = outer.start; + end = stop = outer.stop; + + // Pass some values through. + align = outer.align; + align_ofs = outer.align_ofs; + + // Leave others alone. + } + }; + +} // yask namespace. diff --git a/src/kernel/lib/new_grid.cpp b/src/kernel/lib/new_grid.cpp index 559554bf..3adadc54 100644 --- a/src/kernel/lib/new_grid.cpp +++ b/src/kernel/lib/new_grid.cpp @@ -30,75 +30,76 @@ namespace yask { // Make a new grid. YkGridPtr StencilContext::newGrid(const std::string& name, - const GridDimNames& dims, + const GridDimNames& gdims, const GridDimSizes* sizes) { + STATE_VARS(this); // Check parameters. bool got_sizes = sizes != NULL; if (got_sizes) { - if (dims.size() != sizes->size()) { + if (gdims.size() != sizes->size()) { FORMAT_AND_THROW_YASK_EXCEPTION("Error: attempt to create grid '" << name << "' with " << - dims.size() << " dimension names but " << sizes->size() << + gdims.size() << " dimension names but " << sizes->size() << " dimension sizes"); } } // First, try to make a grid that matches the layout in // the stencil. - YkGridPtr gp = newStencilGrid(name, dims); + YkGridPtr gp = newStencilGrid(name, gdims); // No match. if (!gp) { // Tuple of dims. IdxTuple dtup; - for (auto& d : dims) + for (auto& d : gdims) dtup.addDimBack(d, 0); #if ALLOW_NEW_GRIDS // Allow new grid types. // Check dims. - int ndims = dims.size(); - int step_posn = -1; // -1 => not used. + int ndims = gdims.size(); + bool step_used = false; set seenDims; for (int i = 0; i < ndims; i++) { + auto& gdim = gdims[i]; // Already used? - if (seenDims.count(dims[i])) { + if (seenDims.count(gdim)) { THROW_YASK_EXCEPTION("Error: cannot create grid '" + name + - "' because dimension '" + dims[i] + + "' because dimension '" + gdim + "' is used more than once"); } // Step dim? - if (dims[i] == _dims->_step_dim) { - step_posn = i; - if (i > 0) { + if (gdim == step_dim) { + step_used = true; + if (i != step_posn) { THROW_YASK_EXCEPTION("Error: cannot create grid '" + name + - "' because step dimension '" + dims[i] + - "' must be first dimension"); + "' because step dimension '" + gdim + + "' is not first dimension"); } } // Domain dim? - else if (_dims->_domain_dims.lookup(dims[i])) { + else if (domain_dims.lookup(gdim)) { } // Known misc dim? - else if (_dims->_misc_dims.lookup(dims[i])) { + else if (misc_dims.lookup(gdim)) { } // New misc dim? else { - _dims->_misc_dims.addDimBack(dims[i], 0); + misc_dims.addDimBack(gdim, 0); } } - bool do_wrap = step_posn >= 0; // Scalar? if (ndims == 0) - gp = make_shared>(_dims, name, dims, &_opts, &_ostr); + gp = make_shared>(*this, name, gdims); // Include auto-gen code for all other cases. #include "yask_grid_code.hpp" @@ -123,20 +124,20 @@ namespace yask { // Add to context. addGrid(gp, false); // mark as non-output grid. - // Set sizes as provided or via solution settings. + // Set sizes as provided. if (got_sizes) { - int ndims = dims.size(); + int ndims = gdims.size(); for (int i = 0; i < ndims; i++) { - auto& dname = dims[i]; + auto& gdim = gdims[i]; // Domain size. gp->_set_domain_size(i, sizes->at(i)); // Pads. // Set via both 'extra' and 'min'; larger result will be used. - if (_dims->_domain_dims.lookup(dname)) { - gp->set_extra_pad_size(i, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(i, _opts->_min_pad_sizes[dname]); + if (domain_dims.lookup(gdim)) { + gp->set_extra_pad_size(i, opts->_extra_pad_sizes[gdim]); + gp->set_min_pad_size(i, opts->_min_pad_sizes[gdim]); } // Offsets. @@ -145,9 +146,11 @@ namespace yask { } } + // Set sizes based on solution settings. else update_grid_info(); + // Mark as created via API. gp->set_new_grid(true); return gp; } diff --git a/src/kernel/lib/realv_grids.cpp b/src/kernel/lib/realv_grids.cpp index b3c7437c..7b3f350e 100644 --- a/src/kernel/lib/realv_grids.cpp +++ b/src/kernel/lib/realv_grids.cpp @@ -31,13 +31,14 @@ using namespace std; namespace yask { // Ctor. - YkGridBase::YkGridBase(GenericGridBase* ggb, - const GridDimNames& dimNames, - DimsPtr dims) : - _ggb(ggb), _dims(dims) { - + // Important: '*ggb' is NOT yet constructed. + YkGridBase::YkGridBase(KernelStateBase& stateb, + GenericGridBase* ggb, + const GridDimNames& dimNames) : + KernelStateBase(stateb), + _ggb(ggb) { + STATE_VARS(this); assert(ggb); - assert(dims.get()); // Init indices. int n = int(dimNames.size()); @@ -62,9 +63,9 @@ namespace yask { for (int i = 0; i < dimNames.size(); i++) { idx_t mbit = 1LL << i; auto& dname = dimNames[i]; - if (dname == _dims->_step_dim) + if (dname == step_dim) _step_dim_mask |= mbit; - else if (_dims->_domain_dims.lookup(dname)) + else if (domain_dims.lookup(dname)) _domain_dim_mask |= mbit; else _misc_dim_mask |= mbit; @@ -127,6 +128,7 @@ namespace yask { // Does not include user-specified min padding or // final rounding for left pad. Indices YkGridBase::getReqdPad(const Indices& halos, const Indices& wf_exts) const { + STATE_VARS(this); // Start with halos plus WF exts. Indices mp = halos.addElements(wf_exts); @@ -144,7 +146,7 @@ namespace yask { for (int i = 0; i < get_num_dims(); i++) { if (mp[i] >= 1) { auto& dname = get_dim_name(i); - auto* p = _dims->_fold_pts.lookup(dname); + auto* p = dims->_fold_pts.lookup(dname); if (p) { assert (*p >= 1); mp[i] += *p - 1; @@ -158,6 +160,7 @@ namespace yask { // Modifies _pads and _allocs. // Fails if mem different and already alloc'd. void YkGridBase::resize() { + STATE_VARS(this); // Original size. auto p = get_raw_storage_buffer(); @@ -216,7 +219,7 @@ namespace yask { // Make inner dim an odd number of vecs. // This reportedly helps avoid some uarch aliasing. - if (!p && get_dim_name(i) == _dims->_inner_dim && + if (!p && get_dim_name(i) == inner_dim && (new_allocs[i] / _vec_lens[i]) % 2 == 0) { new_right_pads[i] += _vec_lens[i]; new_allocs[i] += _vec_lens[i]; @@ -269,7 +272,7 @@ namespace yask { if (old_allocs != new_allocs || old_dirty != new_dirty) { Indices first_allocs = _rank_offsets.subElements(_actl_left_pads); Indices end_allocs = first_allocs.addElements(_allocs); - TRACE_MSG0(get_ostr(), "grid '" << get_name() << "' resized from " << + TRACE_MSG("grid '" << get_name() << "' resized from " << makeIndexString(old_allocs, " * ") << " to " << makeIndexString(new_allocs, " * ") << " at [" << makeIndexString(first_allocs) << " ... " << @@ -288,18 +291,19 @@ namespace yask { bool step_ok, bool domain_ok, bool misc_ok) const { + STATE_VARS(this); if (!is_dim_used(dim)) THROW_YASK_EXCEPTION("Error in " + fn_name + "(): dimension '" + dim + "' not found in " + make_info_string()); - _dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok); + dims->checkDimType(dim, fn_name, step_ok, domain_ok, misc_ok); } // Check for equality. // Return number of mismatches greater than epsilon. idx_t YkGridBase::compare(const YkGridBase* ref, real_t epsilon, - int maxPrint, - std::ostream& os) const { + int maxPrint) const { + STATE_VARS(this); if (!ref) { os << "** mismatch: no reference grid.\n"; return get_num_storage_elements(); @@ -316,7 +320,7 @@ namespace yask { // same values in extra-padding area. // TODO: check layout. idx_t errs = _ggb->count_diffs(ref->_ggb, epsilon); - TRACE_MSG0(get_ostr(), "count_diffs() returned " << errs); + TRACE_MSG("count_diffs() returned " << errs); if (!errs) return 0; @@ -345,7 +349,7 @@ namespace yask { // Don't compare points outside the domain. // TODO: check points in halo. auto& dname = pt.getDimName(i); - if (_dims->_domain_dims.lookup(dname)) { + if (domain_dims.lookup(dname)) { auto first_ok = get_first_rank_domain_index(dname); auto last_ok = get_last_rank_domain_index(dname); if (opt[i] < first_ok || opt[i] > last_ok) @@ -374,7 +378,7 @@ namespace yask { } return true; // keep visiting. }); - TRACE_MSG0(get_ostr(), "detailed compare returned " << errs); + TRACE_MSG("detailed compare returned " << errs); return errs; } @@ -492,6 +496,7 @@ namespace yask { const real_vec_t& val, int line, bool newline) const { + STATE_VARS(this); // Convert to elem indices. Indices eidxs = idxs.mulElements(_vec_lens); @@ -503,7 +508,7 @@ namespace yask { eidxs.setTupleVals(idxs2); // set vals from eidxs. // Visit every point in fold. - IdxTuple folds = _dims->_fold_pts; + IdxTuple folds = dims->_fold_pts; folds.visitAllPoints([&](const IdxTuple& fofs, size_t idx) { diff --git a/src/kernel/lib/realv_grids.hpp b/src/kernel/lib/realv_grids.hpp index 2a3a99a7..11fcd957 100644 --- a/src/kernel/lib/realv_grids.hpp +++ b/src/kernel/lib/realv_grids.hpp @@ -34,6 +34,7 @@ namespace yask { // Base class implementing all yk_grids. Can be used for grids // that contain either individual elements or vectors. class YkGridBase : + public KernelStateBase, public virtual yk_grid { // Rank and local offsets in domain dim: @@ -61,9 +62,6 @@ namespace yask { // actual data, message stream. GenericGridBase* _ggb = 0; - // Problem dimensions. (NOT grid dims.) - DimsPtr _dims; - // The following masks have one bit for each dim in the grid. idx_t _step_dim_mask = 0; idx_t _domain_dim_mask = 0; @@ -83,7 +81,7 @@ namespace yask { Indices _allocs; // actual grid alloc in reals | same. // Sizes in vectors for sizes that are always vec lens (to avoid division). - // Each entry _vec_lens may be same as _dims->_fold_pts or one, depending + // Each entry _vec_lens may be same as dims->_fold_pts or one, depending // on whether grid is fully vectorized. Indices _vec_lens; // num reals in each elem | one. Indices _vec_left_pads; // same as _actl_left_pads. @@ -178,9 +176,9 @@ namespace yask { const Indices& last_indices) const; public: - YkGridBase(GenericGridBase* ggb, - const GridDimNames& dimNames, - DimsPtr dims); + YkGridBase(KernelStateBase& state, + GenericGridBase* ggb, + const GridDimNames& dimNames); virtual ~YkGridBase() { } // Halo-exchange flag accessors. @@ -295,8 +293,7 @@ namespace yask { // Return number of mismatches greater than epsilon. virtual idx_t compare(const YkGridBase* ref, real_t epsilon = EPSILON, - int maxPrint = 20, - std::ostream& os = std::cerr) const; + int maxPrint = 20) const; // Make sure indices are in range. // Optionally fix them to be in range and return in 'fixed_indices'. @@ -610,13 +607,11 @@ namespace yask { } public: - YkElemGrid(DimsPtr dims, + YkElemGrid(KernelStateBase& state, std::string name, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - std::ostream** ostr) : - YkGridBase(&_data, dimNames, dims), - _data(name, dimNames, settings, ostr) { + const GridDimNames& dimNames) : + YkGridBase(state, &_data, dimNames), + _data(state, name, dimNames) { _has_step_dim = _use_step_idx; resize(); } @@ -730,14 +725,13 @@ namespace yask { } public: - YkVecGrid(DimsPtr dims, + YkVecGrid(KernelStateBase& stateb, const std::string& name, - const GridDimNames& dimNames, - KernelSettingsPtr* settings, - std::ostream** ostr) : - YkGridBase(&_data, dimNames, dims), - _data(name, dimNames, settings, ostr), + const GridDimNames& dimNames) : + YkGridBase(stateb, &_data, dimNames), + _data(stateb, name, dimNames), _vec_fold_posns(idx_t(0), int(dimNames.size())) { + STATE_VARS(this); _has_step_dim = _use_step_idx; // Template vec lengths. @@ -805,9 +799,10 @@ namespace yask { virtual const real_t* getElemPtr(const Indices& idxs, idx_t alloc_step_idx, bool checkBounds=true) const final { + STATE_VARS_CONST(this); #ifdef TRACE_MEM - _data.get_ostr() << get_name() << "." << "YkVecGrid::getElemPtr(" << + ostr << get_name() << "." << "YkVecGrid::getElemPtr(" << idxs.makeValStr(get_num_dims()) << ")"; #endif // Use template vec lengths instead of run-time values for @@ -865,13 +860,13 @@ namespace yask { } // Get 1D element index into vector. - auto i = _dims.get()->getElemIndexInVec(fold_ofs); + auto i = dims->getElemIndexInVec(fold_ofs); #ifdef DEBUG_LAYOUT // Compare to more explicit offset extraction. IdxTuple eofs = get_allocs(); // get dims for this grid. elem_ofs.setTupleVals(eofs); // set vals from elem_ofs. - auto i2 = _dims->getElemIndexInVec(eofs); + auto i2 = dims->getElemIndexInVec(eofs); assert(i == i2); #endif @@ -1029,6 +1024,7 @@ namespace yask { idx_t first_alloc_step_idx, const Indices& last_indices, idx_t last_alloc_step_idx) { + STATE_VARS(this); if (!is_storage_allocated()) return 0; Indices firstv, lastv; @@ -1037,7 +1033,7 @@ namespace yask { // Find range. IdxTuple numVecsTuple = get_slice_range(firstv, lastv); - TRACE_MSG0(get_ostr(), "set_vecs_in_slice: setting " << + TRACE_MSG("set_vecs_in_slice: setting " << numVecsTuple.makeDimValStr(" * ") << " vecs at [" << makeIndexString(firstv) << " ... " << makeIndexString(lastv) << "]"); @@ -1083,22 +1079,17 @@ namespace yask { idx_t first_alloc_step_idx, const Indices& last_indices, idx_t last_alloc_step_idx) const { - if (!is_storage_allocated()) { - yask_exception e; - std::stringstream err; - err << "Error: call to 'get_vecs_in_slice' with no data allocated for grid '" << - get_name() << "'.\n"; - e.add_message(err.str()); - throw e; - //exit_yask(1); - } + STATE_VARS(this); + if (!is_storage_allocated()) + FORMAT_AND_THROW_YASK_EXCEPTION("Error: call to 'get_vecs_in_slice' with no data allocated for grid '" << + get_name()); Indices firstv, lastv; checkIndices(first_indices, "get_vecs_in_slice", true, true, &firstv); checkIndices(last_indices, "get_vecs_in_slice", true, true, &lastv); // Find range. IdxTuple numVecsTuple = get_slice_range(firstv, lastv); - TRACE_MSG0(get_ostr(), "get_vecs_in_slice: getting " << + TRACE_MSG("get_vecs_in_slice: getting " << numVecsTuple.makeDimValStr(" * ") << " vecs at " << makeIndexString(firstv) << " ... " << makeIndexString(lastv)); diff --git a/src/kernel/lib/settings.cpp b/src/kernel/lib/settings.cpp index fd56c46f..379e957b 100644 --- a/src/kernel/lib/settings.cpp +++ b/src/kernel/lib/settings.cpp @@ -28,6 +28,20 @@ using namespace std; namespace yask { + // Set debug output to cout if my_rank == msg_rank + // or a null stream otherwise. + ostream& KernelStateBase::set_ostr() { + assert(_state); + assert(_state->_env); + assert(_state->_opts); + yask_output_factory yof; + if (_state->_env->my_rank == _state->_opts->msg_rank) + set_debug_output(yof.new_stdout_output()); + else + set_debug_output(yof.new_null_output()); + return get_debug_output()->get_ostream(); + } + // Check whether dim is of allowed type. void Dims::checkDimType(const std::string& dim, const std::string& fn_name, @@ -66,8 +80,7 @@ namespace yask { return new_env(MPI_COMM_NULL); } - ///// KernelEnv functions: - + // KernelEnv global lock objects. omp_lock_t KernelEnv::_debug_lock; bool KernelEnv::_debug_lock_init_done = false; @@ -221,6 +234,52 @@ namespace yask { return bufs[i].bufs[bd]; } + // Settings ctor. + KernelSettings::KernelSettings(DimsPtr dims, KernelEnvPtr env) : + _dims(dims), max_threads(env->max_threads) { + auto& step_dim = dims->_step_dim; + + // Use both step and domain dims for all size tuples. + _rank_sizes = dims->_stencil_dims; + _rank_sizes.setValsSame(def_rank); // size of rank. + _rank_sizes.setVal(step_dim, 0); // not used. + + _region_sizes = dims->_stencil_dims; + _region_sizes.setValsSame(0); // 0 => default settings. + + _block_group_sizes = dims->_stencil_dims; + _block_group_sizes.setValsSame(0); // 0 => min size. + + _block_sizes = dims->_stencil_dims; + _block_sizes.setValsSame(def_block); // size of block. + _block_sizes.setVal(step_dim, 0); // 0 => default. + + _mini_block_group_sizes = dims->_stencil_dims; + _mini_block_group_sizes.setValsSame(0); // 0 => min size. + + _mini_block_sizes = dims->_stencil_dims; + _mini_block_sizes.setValsSame(0); // 0 => default settings. + + _sub_block_group_sizes = dims->_stencil_dims; + _sub_block_group_sizes.setValsSame(0); // 0 => min size. + + _sub_block_sizes = dims->_stencil_dims; + _sub_block_sizes.setValsSame(0); // 0 => default settings. + + _min_pad_sizes = dims->_stencil_dims; + _min_pad_sizes.setValsSame(0); + + _extra_pad_sizes = dims->_stencil_dims; + _extra_pad_sizes.setValsSame(0); + + // Use only domain dims for MPI tuples. + _num_ranks = dims->_domain_dims; + _num_ranks.setValsSame(1); + + _rank_indices = dims->_domain_dims; + _rank_indices.setValsSame(0); + } + // Add options to set one domain var to a cmd-line parser. void KernelSettings::_add_domain_option(CommandLineParser& parser, const std::string& prefix, @@ -515,7 +574,7 @@ namespace yask { // Make sure all user-provided settings are valid and finish setting up some // other vars before allocating memory. // Called from prepare_solution(), during auto-tuning, etc. - void KernelSettings::adjustSettings(std::ostream& os, KernelEnvPtr env) { + void KernelSettings::adjustSettings(std::ostream& os) { auto& step_dim = _dims->_step_dim; auto& inner_dim = _dims->_inner_dim; @@ -711,4 +770,44 @@ namespace yask { #endif } + // Ctor. + KernelStateBase::KernelStateBase(KernelEnvPtr& kenv, + KernelSettingsPtr& ksettings) + { + // Create state. All other objects that need to share + // this state should use a shared ptr to it. + _state = make_shared(); + + // Share passed ptrs. + _state->_env = kenv; + _state->_opts = ksettings; + _state->_dims = ksettings->_dims; + + // Set _state->_debug per settings. + set_ostr(); + + // Create MPI Info object. + _state->_mpiInfo = make_shared(ksettings->_dims); + + // Set vars after above inits. + STATE_VARS(this); + + // Find index posns in stencil dims. + DOMAIN_VAR_LOOP(i, j) { + auto& dname = stencil_dims.getDimName(i); + if (state->_outer_posn < 0) + state->_outer_posn = i; + if (dname == dims->_inner_dim) + state->_inner_posn = i; + } + assert(outer_posn == state->_outer_posn); + } + + // ContextLinker ctor. + ContextLinker::ContextLinker(StencilContext* context) : + KernelStateBase(context->get_state()), + _context(context) { + assert(context); + } + } // namespace yask. diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index 6113a980..c979c0e3 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -27,315 +27,8 @@ IN THE SOFTWARE. namespace yask { - typedef Tuple IdxTuple; - typedef std::vector GridIndices; - typedef std::vector GridDimSizes; - typedef std::vector GridDimNames; - - // A class to hold up to a given number of sizes or indices efficiently. - // Similar to a Tuple, but less overhead and doesn't keep names. - // Make sure this stays non-virtual. - // TODO: make this a template with _ndims as a parameter. - // TODO: ultimately, combine with Tuple w/o loss of efficiency. - class Indices { - - public: - - // Max number of indices that can be held. - // Note use of "+max_idxs" in code below to avoid compiler - // trying to take a reference to it, resulting in a undefined - // symbol (sometimes). - static constexpr int max_idxs = MAX_DIMS; - - // Step dim is always in [0] of an Indices type (if it is used). - static constexpr int step_posn = 0; - - protected: - idx_t _idxs[max_idxs]; - int _ndims; - - public: - // Ctors. - Indices() : _ndims(0) { } - Indices(int ndims) : _ndims(ndims) { } // NB: _idxs remain uninit! - Indices(const IdxTuple& src) { - setFromTuple(src); - } - Indices(const GridIndices& src) { - setFromVec(src); - } - Indices(const std::initializer_list& src) { - setFromInitList(src); - } - Indices(const idx_t src[], int ndims) { - setFromArray(src, ndims); - } - Indices(idx_t src, int ndims) { - setFromConst(src, ndims); - } - - // Default copy ctor, copy operator should be okay. - - // Access size. - inline int getNumDims() const { - return _ndims; - } - inline void setNumDims(int n) { - _ndims = n; - } - - // Access indices. - inline idx_t& operator[](int i) { - assert(i >= 0); - assert(i < _ndims); - return _idxs[i]; - } - inline const idx_t& operator[](int i) const { - assert(i >= 0); - assert(i < _ndims); - return _idxs[i]; - } - - // Write to an IdxTuple. - // The 'tgt' must have the same number of dims. - void setTupleVals(IdxTuple& tgt) const { - assert(tgt.size() == _ndims); - for (int i = 0; i < _ndims; i++) - if (i < tgt.size()) - tgt.setVal(i, _idxs[i]); - } - - // Read from an IdxTuple. - void setFromTuple(const IdxTuple& src) { - assert(src.size() <= +max_idxs); - int n = int(src.size()); - for (int i = 0; i < n; i++) - _idxs[i] = src.getVal(i); - _ndims = n; - } - - // Other inits. - void setFromVec(const GridIndices& src) { - assert(src.size() <= +max_idxs); - int n = int(src.size()); - for (int i = 0; i < n; i++) - _idxs[i] = src[i]; - _ndims = n; - } - - // default n => don't change _ndims. - void setFromArray(const idx_t src[], int n = -1) { - if (n < 0) - n = _ndims; - assert(n <= +max_idxs); - for (int i = 0; i < n; i++) - _idxs[i] = src[i]; - _ndims = n; - } - void setFromInitList(const std::initializer_list& src) { - assert(src.size() <= +max_idxs); - int i = 0; - for (auto idx : src) - _idxs[i++] = idx; - _ndims = i; - } - - // default n => don't change _ndims. - void setFromConst(idx_t val, int n = -1) { - if (n < 0) - n = _ndims; - assert(n <= +max_idxs); - for (int i = 0; i < n; i++) - _idxs[i] = val; - _ndims = n; - } - - // Some comparisons. - // These assume all the indices are valid or - // initialized to the same value. - bool operator==(const Indices& rhs) const { - if (_ndims != rhs._ndims) - return false; - for (int i = 0; i < _ndims; i++) - if (_idxs[i] != rhs._idxs[i]) - return false; - return true; - } - bool operator!=(const Indices& rhs) const { - return !operator==(rhs); - } - bool operator<(const Indices& rhs) const { - if (_ndims < rhs._ndims) - return true; - else if (_ndims > rhs._ndims) - return false; - for (int i = 0; i < _ndims; i++) - if (_idxs[i] < rhs._idxs[i]) - return true; - else if (_idxs[i] > rhs._idxs[i]) - return false; - return false; // equal, so not less than. - } - bool operator>(const Indices& rhs) const { - if (_ndims > rhs._ndims) - return true; - else if (_ndims < rhs._ndims) - return false; - for (int i = 0; i < _ndims; i++) - if (_idxs[i] > rhs._idxs[i]) - return true; - else if (_idxs[i] < rhs._idxs[i]) - return false; - return false; // equal, so not greater than. - } - - // Generic element-wise operator. - // Returns a new object. - inline Indices combineElements(std::function func, - const Indices& other) const { - Indices res(*this); - -#if EXACT_INDICES - // Use just the used elements. - for (int i = 0; i < _ndims; i++) -#else - // Use all to allow unroll and avoid jumps. -#pragma unroll - for (int i = 0; i < max_idxs; i++) -#endif - func(res._idxs[i], other._idxs[i]); - return res; - } - - // Some element-wise operators. - // These all return a new set of Indices rather - // than modifying this object. - inline Indices addElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, - other); - } - inline Indices subElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, - other); - } - inline Indices mulElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, - other); - } - inline Indices divElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, - other); - } - inline Indices minElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, - other); - } - inline Indices maxElements(const Indices& other) const { - return combineElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, - other); - } - - // Generic element-wise operator with RHS const. - // Returns a new object. - inline Indices mapElements(std::function func, - idx_t crhs) const { - Indices res(*this); - -#if EXACT_INDICES - // Use just the used elements. - for (int i = 0; i < _ndims; i++) -#else - // Use all to allow unroll and avoid jumps. -#pragma unroll - for (int i = 0; i < max_idxs; i++) -#endif - func(res._idxs[i], crhs); - return res; - } - - // Operate on all elements. - Indices addConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, - crhs); - } - Indices subConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, - crhs); - } - Indices mulConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, - crhs); - } - Indices divConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, - crhs); - } - Indices minConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, - crhs); - } - Indices maxConst(idx_t crhs) const { - return mapElements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, - crhs); - } - - // Reduce over all elements. - idx_t sum() const { - idx_t res = 0; - for (int i = 0; i < _ndims; i++) - res += _idxs[i]; - return res; - } - idx_t product() const { - idx_t res = 1; - for (int i = 0; i < _ndims; i++) - res *= _idxs[i]; - return res; - } - - // Make string like "x=4, y=8". - std::string makeDimValStr(const GridDimNames& names, - std::string separator=", ", - std::string infix="=", - std::string prefix="", - std::string suffix="") const { - assert((int)names.size() == _ndims); - - // Make a Tuple from names. - IdxTuple tmp; - for (int i = 0; i < int(names.size()); i++) - tmp.addDimBack(names[i], _idxs[i]); - return tmp.makeDimValStr(separator, infix, prefix, suffix); - } - - // Make string like "4, 3, 2". - std::string makeValStr(int nvals, - std::string separator=", ", - std::string prefix="", - std::string suffix="") const { - assert(nvals == _ndims); - - // Make a Tuple w/o useful names. - IdxTuple tmp; - for (int i = 0; i < nvals; i++) - tmp.addDimBack(std::string(1, '0' + char(i)), _idxs[i]); - return tmp.makeValStr(separator, prefix, suffix); - } - }; - - // Define OMP reductions on Indices. -#pragma omp declare reduction(min_idxs : Indices : \ - omp_out = omp_out.minElements(omp_in) ) \ - initializer (omp_priv = omp_orig) -#pragma omp declare reduction(max_idxs : Indices : \ - omp_out = omp_out.maxElements(omp_in) ) \ - initializer (omp_priv = omp_orig) - - // Layout algorithms using Indices. -#include "yask_layouts.hpp" - // Forward defns. - struct StencilContext; + class StencilContext; class YkGridBase; // Some derivations from grid types. @@ -480,96 +173,6 @@ namespace yask { }; typedef std::shared_ptr DimsPtr; - // A group of Indices needed for generated loops. - // See the help message from gen_loops.pl for the - // documentation of the indices. - // Make sure this stays non-virtual. - struct ScanIndices { - int ndims = 0; - - // Input values; not modified. - Indices begin, end; // first and end (beyond last) range of each index. - Indices step; // step value within range. - Indices align; // alignment of steps after first one. - Indices align_ofs; // adjustment for alignment (see below). - Indices group_size; // proximity grouping within range. - - // Alignment: when possible, start positions after the first - // in each dim will be aligned such that ((start - align_ofs) % align) == 0. - - // Output values; set once for entire range. - Indices num_indices; // number of indices in each dim. - idx_t linear_indices = 0; // total indices over all dims (product of num_indices). - - // Output values; set for each index by loop code. - Indices start, stop; // first and last+1 for this sub-range. - Indices index; // 0-based unique index for each sub-range in each dim. - idx_t linear_index = 0; // 0-based index over all dims. - - // Example w/3 sub-ranges in overall range: - // begin end - // |--------------------------------------------| - // |------------------|------------------|------| - // start stop (index = 0) - // start stop (index = 1) - // start stop (index = 2) - - // Default init. - ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) : - ndims(NUM_STENCIL_DIMS), - begin(idx_t(0), ndims), - end(idx_t(0), ndims), - step(idx_t(1), ndims), - align(idx_t(1), ndims), - align_ofs(idx_t(0), ndims), - group_size(idx_t(1), ndims), - num_indices(idx_t(1), ndims), - start(idx_t(0), ndims), - stop(idx_t(0), ndims), - index(idx_t(0), ndims) { - - // i: index for stencil dims, j: index for domain dims. - DOMAIN_VAR_LOOP(i, j) { - - // Set alignment to vector lengths. - if (use_vec_align) - align[i] = fold_pts[j]; - - // Set alignment offset. - if (ofs) { - assert(ofs->getNumDims() == ndims - 1); - align_ofs[i] = ofs->getVal(j); - } - } - } - - // Init from outer-loop indices. - // Start..stop from point in outer loop become begin..end - // for this loop. - // - // Example: - // begin (outer) end - // |--------------------------------------------| - // |------------------|------------------|------| - // start | stop - // V - // begin (this) end - // |------------------| - // start stop (may be sub-dividied later) - void initFromOuter(const ScanIndices& outer) { - - // Begin & end set from start & stop of outer loop. - begin = start = outer.start; - end = stop = outer.stop; - - // Pass some values through. - align = outer.align; - align_ofs = outer.align_ofs; - - // Leave others alone. - } - }; - // MPI neighbor info. class MPIInfo { @@ -900,52 +503,8 @@ namespace yask { int _numa_pref = NUMA_PREF; int _numa_pref_max = 128; // GiB to alloc before using PMEM. - // Ctor. - // TODO: move code to settings.cpp. - KernelSettings(DimsPtr dims, KernelEnvPtr env) : - _dims(dims), max_threads(env->max_threads) { - auto& step_dim = dims->_step_dim; - - // Use both step and domain dims for all size tuples. - _rank_sizes = dims->_stencil_dims; - _rank_sizes.setValsSame(def_rank); // size of rank. - _rank_sizes.setVal(step_dim, 0); // not used. - - _region_sizes = dims->_stencil_dims; - _region_sizes.setValsSame(0); // 0 => default settings. - - _block_group_sizes = dims->_stencil_dims; - _block_group_sizes.setValsSame(0); // 0 => min size. - - _block_sizes = dims->_stencil_dims; - _block_sizes.setValsSame(def_block); // size of block. - _block_sizes.setVal(step_dim, 0); // 0 => default. - - _mini_block_group_sizes = dims->_stencil_dims; - _mini_block_group_sizes.setValsSame(0); // 0 => min size. - - _mini_block_sizes = dims->_stencil_dims; - _mini_block_sizes.setValsSame(0); // 0 => default settings. - - _sub_block_group_sizes = dims->_stencil_dims; - _sub_block_group_sizes.setValsSame(0); // 0 => min size. - - _sub_block_sizes = dims->_stencil_dims; - _sub_block_sizes.setValsSame(0); // 0 => default settings. - - _min_pad_sizes = dims->_stencil_dims; - _min_pad_sizes.setValsSame(0); - - _extra_pad_sizes = dims->_stencil_dims; - _extra_pad_sizes.setValsSame(0); - - // Use only domain dims for MPI tuples. - _num_ranks = dims->_domain_dims; - _num_ranks.setValsSame(1); - - _rank_indices = dims->_domain_dims; - _rank_indices.setValsSame(0); - } + // Ctor/dtor. + KernelSettings(DimsPtr dims, KernelEnvPtr env); virtual ~KernelSettings() { } protected: @@ -976,9 +535,9 @@ namespace yask { // values as needed. // Called from prepare_solution(), so it doesn't normally need to be called from user code. // Prints informational info to 'os'. - virtual void adjustSettings(std::ostream& os, KernelEnvPtr env); - virtual void adjustSettings(KernelEnvPtr env) { - adjustSettings(nullop->get_ostream(), env); + virtual void adjustSettings(std::ostream& os); + virtual void adjustSettings() { + adjustSettings(nullop->get_ostream()); } // Determine if this is the first or last rank in given dim. @@ -991,4 +550,222 @@ namespace yask { }; typedef std::shared_ptr KernelSettingsPtr; + // A collection of solution meta-data whose ownership is shared between + // various objects. + struct KernelState { + virtual ~KernelState() { } + + // Output stream for messages. + yask_output_ptr _debug; + + // Env. + KernelEnvPtr _env; + + // Command-line and env parameters. + KernelSettingsPtr _opts; + bool _use_pack_tuners = false; + + // Problem dims. + DimsPtr _dims; + + // Position of inner domain dim in stencil-dims tuple. + // Misc dims will follow this if/when using interleaving. + // TODO: move to Dims. + int _inner_posn = -1; // -1 => not set. + + // Position of outer domain dim in stencil-dims tuple. + // For 1D stencils, _outer_posn == _inner_posn. + // TODO: move to Dims. + int _outer_posn = -1; // -1 => not set. + + // MPI info. + MPIInfoPtr _mpiInfo; + }; + typedef std::shared_ptr KernelStatePtr; + + // Macro to define and set commonly-needed state vars efficiently. + // 'parent_p' is pointer to object containing 'KernelStatePtr _state'. + // '*_posn' vars are positions in stencil_dims. +#define STATE_VARS0(parent_p, pfx) \ + pfx auto* pp = parent_p; \ + assert(pp); \ + pfx auto* state = pp->_state.get(); \ + assert(state); \ + assert(state->_debug.get()); \ + auto& os = state->_debug.get()->get_ostream(); \ + pfx auto* env = state->_env.get(); \ + assert(env); \ + pfx auto* opts = state->_opts.get(); \ + assert(opts); \ + pfx auto* dims = state->_dims.get(); \ + assert(dims); \ + pfx auto* mpiInfo = state->_mpiInfo.get(); \ + assert(mpiInfo); \ + const auto& step_dim = dims->_step_dim; \ + const auto& inner_dim = dims->_inner_dim; \ + const auto& domain_dims = dims->_domain_dims; \ + constexpr int nddims = NUM_DOMAIN_DIMS; \ + assert(nddims == domain_dims.size()); \ + const auto& stencil_dims = dims->_stencil_dims; \ + constexpr int nsdims = NUM_STENCIL_DIMS; \ + assert(nsdims == stencil_dims.size()); \ + auto& misc_dims = dims->_misc_dims; \ + constexpr int step_posn = 0; \ + assert(step_posn == +Indices::step_posn); \ + constexpr int outer_posn = 1; \ + const int inner_posn = state->_inner_posn +#define STATE_VARS(parent_p) STATE_VARS0(parent_p,) +#define STATE_VARS_CONST(parent_p) STATE_VARS0(parent_p, const) + + // A base class containing a shared pointer to a kernel state. + // Used to ensure that the shared state object stays allocated when + // at least one of its owners exists. + class KernelStateBase { + protected: + + // Common state. This is a separate object to allow + // multiple objects to keep it alive via shared ptrs. + KernelStatePtr _state; + + public: + KernelStateBase(KernelStatePtr& state) : + _state(state) {} + KernelStateBase(KernelEnvPtr& env, + KernelSettingsPtr& settings); + virtual ~KernelStateBase() {} + + // Access to state. + KernelStatePtr& get_state() { + assert(_state); + return _state; + } + const KernelStatePtr& get_state() const { + assert(_state); + return _state; + } + KernelSettingsPtr& get_settings() { return _state->_opts; } + const KernelSettingsPtr& get_settings() const { return _state->_opts; } + KernelEnvPtr& get_env() { return _state->_env; } + const KernelEnvPtr& get_env() const { return _state->_env; } + DimsPtr& get_dims() { return _state->_dims; } + const DimsPtr& get_dims() const { return _state->_dims; } + MPIInfoPtr& get_mpi_info() { return _state->_mpiInfo; } + const MPIInfoPtr& get_mpi_info() const { return _state->_mpiInfo; } + bool use_pack_tuners() const { return _state->_use_pack_tuners; } + virtual yask_output_ptr get_debug_output() const { return _state->_debug; } + virtual void set_debug_output(yask_output_ptr debug) { _state->_debug = debug; } + + // Set debug output to cout if my_rank == msg_rank + // or a null stream otherwise. + std::ostream& set_ostr(); + std::ostream& get_ostr() const { + STATE_VARS(this); + return os; + } + + // Set number of threads w/o using thread-divisor. + // Return number of threads. + // Do nothing and return 0 if not properly initialized. + int set_max_threads() { + STATE_VARS(this); + + // Get max number of threads. + int mt = std::max(opts->max_threads, 1); + + // Reset number of OMP threads to max allowed. + omp_set_num_threads(mt); + return mt; + } + + // Get total number of computation threads to use. + int get_num_comp_threads(int& region_threads, int& blk_threads) const { + STATE_VARS(this); + + // Max threads / divisor. + int mt = std::max(opts->max_threads, 1); + int td = std::max(opts->thread_divisor, 1); + int at = mt / td; + at = std::max(at, 1); + + // Blk threads per region thread. + int bt = std::max(opts->num_block_threads, 1); + bt = std::min(bt, at); // Cannot be > 'at'. + blk_threads = bt; + + // Region threads. + int rt = at / bt; + rt = std::max(rt, 1); + region_threads = rt; + + // Total number of block threads. + return bt * rt; + } + + // Set number of threads to use for something other than a region. + // Return number of threads. + // Do nothing and return 0 if not properly initialized. + int set_all_threads() { + int rt, bt; + int at = get_num_comp_threads(rt, bt); + omp_set_num_threads(at); + return at; + } + + // Set number of threads to use for a region. + // Enable nested OMP if there are >1 block threads, + // disable otherwise. + // Return number of threads. + // Do nothing and return 0 if not properly initialized. + int set_region_threads() { + int rt, bt; + int at = get_num_comp_threads(rt, bt); + + // Limit outer nesting to allow num_block_threads per nested + // block loop. + yask_num_threads[0] = rt; + + if (bt > 1) { + omp_set_nested(1); + omp_set_max_active_levels(2); + yask_num_threads[1] = bt; + } + else { + omp_set_nested(0); + omp_set_max_active_levels(1); + yask_num_threads[1] = 0; + } + + omp_set_num_threads(rt); + return rt; + } + + // Set number of threads for a block. + // Return number of threads. + // Do nothing and return 0 if not properly initialized. + int set_block_threads() { + int rt, bt; + int at = get_num_comp_threads(rt, bt); + + if (omp_get_max_active_levels() > 1) + omp_set_num_threads(bt); + return bt; + } + + }; + + // An object that is created from a context, shares ownership of the + // state, and keeps a pointer back to the context. However, it does not + // share ownership of the context itself. That would create an ownership + // loop that would not allow the context to be deleted. + class ContextLinker : + public KernelStateBase { + + protected: + StencilContext* _context; + + public: + ContextLinker(StencilContext* context); + virtual ~ContextLinker() { } + }; + } // yask namespace. diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index cc6a69ce..ab69aef5 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -31,37 +31,42 @@ using namespace std; namespace yask { - // Constructor. - StencilContext::StencilContext(KernelEnvPtr kenv, - KernelSettingsPtr ksettings) : - _ostr(&std::cout), - _env(kenv), - _opts(ksettings), - _dims(ksettings->_dims), - _at(this, _opts.get()) - { - // Set debug output object. - yask_output_factory yof; - set_debug_output(yof.new_stdout_output()); - - // Create MPI Info object. - _mpiInfo = std::make_shared(ksettings->_dims); - - // Set output to msg-rank per settings. - set_ostr(); + // ScanIndices ctor. + ScanIndices::ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) : + ndims(NUM_STENCIL_DIMS), + begin(idx_t(0), ndims), + end(idx_t(0), ndims), + step(idx_t(1), ndims), + align(idx_t(1), ndims), + align_ofs(idx_t(0), ndims), + group_size(idx_t(1), ndims), + num_indices(idx_t(1), ndims), + start(idx_t(0), ndims), + stop(idx_t(0), ndims), + index(idx_t(0), ndims) { + + // i: index for stencil dims, j: index for domain dims. + DOMAIN_VAR_LOOP(i, j) { - // Set standard context vars after above init code. - CONTEXT_VARS(this); + // Set alignment to vector lengths. + if (use_vec_align) + align[i] = fold_pts[j]; - // Find index posns in stencil dims. - DOMAIN_VAR_LOOP(i, j) { - auto& dname = stencil_dims.getDimName(i); - if (_outer_posn < 0) - _outer_posn = i; - if (dname == dims->_inner_dim) - _inner_posn = i; + // Set alignment offset. + if (ofs) { + assert(ofs->getNumDims() == ndims - 1); + align_ofs[i] = ofs->getVal(j); + } } - assert(outer_posn == _outer_posn); + } + + // Context ctor. + StencilContext::StencilContext(KernelEnvPtr& kenv, + KernelSettingsPtr& ksettings) : + KernelStateBase(kenv, ksettings), + _at(this, ksettings.get()) + { + STATE_VARS(this); // Init various tuples to make sure they have the correct dims. rank_domain_offsets = domain_dims; @@ -83,7 +88,7 @@ namespace yask { // if not using MPI to properly init these vars. Called from // prepare_solution(), so it doesn't normally need to be called from user code. void StencilContext::setupRank() { - CONTEXT_VARS(this); + STATE_VARS(this); auto me = env->my_rank; int num_neighbors = 0; @@ -129,9 +134,9 @@ namespace yask { // Exchange coord and size info between all ranks. for (int rn = 0; rn < env->num_ranks; rn++) { MPI_Bcast(&coords[rn][0], nddims, MPI_INTEGER8, - rn, _env->comm); + rn, env->comm); MPI_Bcast(&rsizes[rn][0], nddims, MPI_INTEGER8, - rn, _env->comm); + rn, env->comm); } // Now, the tables are filled in for all ranks. @@ -205,7 +210,7 @@ namespace yask { auto mysz = rsizes[me][dj]; auto rnsz = rsizes[rn][dj]; if (mysz != rnsz) { - auto& dnamej = _opts->_rank_indices.getDimName(dj); + auto& dnamej = opts->_rank_indices.getDimName(dj); FORMAT_AND_THROW_YASK_EXCEPTION ("Error: rank " << rn << " and " << me << " are both at rank-index " << coords[me][di] << @@ -236,11 +241,11 @@ namespace yask { assert(roffsets.max() <= 2); // Convert the offsets into a 1D index. - auto rn_ofs = _mpiInfo->getNeighborIndex(roffsets); - TRACE_MSG("neighborhood size = " << _mpiInfo->neighborhood_sizes.makeDimValStr() << + auto rn_ofs = mpiInfo->getNeighborIndex(roffsets); + TRACE_MSG("neighborhood size = " << mpiInfo->neighborhood_sizes.makeDimValStr() << " & roffsets of rank " << rn << " = " << roffsets.makeDimValStr() << " => " << rn_ofs); - assert(idx_t(rn_ofs) < _mpiInfo->neighborhood_size); + assert(idx_t(rn_ofs) < mpiInfo->neighborhood_size); // Save rank of this neighbor into the MPI info object. mpiInfo->my_neighbors.at(rn_ofs) = rn; @@ -283,7 +288,7 @@ namespace yask { auto rnsz = rsizes[rn][j]; auto vlen = fold_pts[j]; if (rnsz % vlen != 0) { - auto& dname = _opts->_rank_indices.getDimName(j); + auto& dname = opts->_rank_indices.getDimName(j); TRACE_MSG("cannot use vector halo exchange with rank " << rn << " because its size in '" << dname << "' is " << rnsz); vlen_mults = false; @@ -291,7 +296,7 @@ namespace yask { } // Save vec-mult flag. - _mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; + mpiInfo->has_all_vlen_mults.at(rn_ofs) = vlen_mults; } // self or immediate neighbor in any direction. @@ -312,7 +317,7 @@ namespace yask { // Set wave-front settings. // This should be called anytime a setting or rank offset is changed. void StencilContext::update_grid_info() { - CONTEXT_VARS(this); + STATE_VARS(this); // If we haven't finished constructing the context, it's too early // to do this. @@ -331,18 +336,18 @@ namespace yask { continue; // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); if (gp->is_dim_used(dname)) { // Rank domains. - gp->_set_domain_size(dname, _opts->_rank_sizes[dname]); + gp->_set_domain_size(dname, opts->_rank_sizes[dname]); // Pads. // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, _opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, _opts->_min_pad_sizes[dname]); + gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]); + gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]); // Offsets. gp->_set_rank_offset(dname, rank_domain_offsets[dname]); @@ -358,9 +363,9 @@ namespace yask { // Calculate wave-front shifts. // See the wavefront diagram in run_solution() for description // of angles and extensions. - idx_t tb_steps = _opts->_block_sizes[step_dim]; // use requested size; actual may be less. + idx_t tb_steps = opts->_block_sizes[step_dim]; // use requested size; actual may be less. assert(tb_steps >= 0); - wf_steps = _opts->_region_sizes[step_dim]; + wf_steps = opts->_region_sizes[step_dim]; wf_steps = max(wf_steps, tb_steps); // round up WF steps if less than TB steps. assert(wf_steps >= 0); num_wf_shifts = 0; @@ -377,18 +382,18 @@ namespace yask { assert(num_wf_shifts >= 0); // Determine whether separate tuners can be used. - _use_pack_tuners = (tb_steps == 0) && (stPacks.size() > 1); + state->_use_pack_tuners = (tb_steps == 0) && (stPacks.size() > 1); // Calculate angles and related settings. - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); - auto rnsize = _opts->_region_sizes[dname]; - auto rksize = _opts->_rank_sizes[dname]; - auto nranks = _opts->_num_ranks[dname]; + auto rnsize = opts->_region_sizes[dname]; + auto rksize = opts->_rank_sizes[dname]; + auto nranks = opts->_num_ranks[dname]; // Req'd shift in this dim based on max halos. // TODO: use different angle for L & R side of each pack. - idx_t angle = ROUND_UP(max_halos[dname], _dims->_fold_pts[dname]); + idx_t angle = ROUND_UP(max_halos[dname], dims->_fold_pts[dname]); // Determine the spatial skewing angles for WF tiling. We // only need non-zero angles if the region size is less than the @@ -409,7 +414,7 @@ namespace yask { // Is domain size at least as large as halo + wf_ext in direction // when there are multiple ranks? auto min_size = max_halos[dname] + shifts; - if (_opts->_num_ranks[dname] > 1 && rksize < min_size) { + if (opts->_num_ranks[dname] > 1 && rksize < min_size) { FORMAT_AND_THROW_YASK_EXCEPTION("Error: rank-domain size of " << rksize << " in '" << dname << "' dim is less than minimum size of " << min_size << ", which is based on stencil halos and temporal wave-front sizes"); @@ -417,11 +422,11 @@ namespace yask { // If there is another rank to the left, set wave-front // extension on the left. - left_wf_exts[dname] = _opts->is_first_rank(dname) ? 0 : shifts; + left_wf_exts[dname] = opts->is_first_rank(dname) ? 0 : shifts; // If there is another rank to the right, set wave-front // extension on the right. - right_wf_exts[dname] = _opts->is_last_rank(dname) ? 0 : shifts; + right_wf_exts[dname] = opts->is_last_rank(dname) ? 0 : shifts; } // Now that wave-front settings are known, we can push this info @@ -436,7 +441,7 @@ namespace yask { continue; // Loop through each domain dim. - for (auto& dim : _dims->_domain_dims.getDims()) { + for (auto& dim : domain_dims.getDims()) { auto& dname = dim.getName(); if (gp->is_dim_used(dname)) { @@ -459,7 +464,7 @@ namespace yask { // considering temporal conditions; this assumes worst-case, which is // all packs always done. void StencilContext::update_tb_info() { - CONTEXT_VARS(this); + STATE_VARS(this); // Get requested size. tb_steps = opts->_block_sizes[step_dim]; @@ -485,21 +490,21 @@ namespace yask { // Loop through each domain dim. DOMAIN_VAR_LOOP(i, j) { - auto& dim = _dims->_domain_dims.getDim(j); + auto& dim = domain_dims.getDim(j); auto& dname = dim.getName(); - auto rnsize = _opts->_region_sizes[i]; + auto rnsize = opts->_region_sizes[i]; // There must be only one block size when using TB, so get // sizes from context settings instead of packs. - assert(_use_pack_tuners == false); - auto blksize = _opts->_block_sizes[i]; - auto mblksize = _opts->_mini_block_sizes[i]; + assert(state->_use_pack_tuners == false); + auto blksize = opts->_block_sizes[i]; + auto mblksize = opts->_mini_block_sizes[i]; // Req'd shift in this dim based on max halos. // Can't use separate L & R shift because of possible data reuse in grids. // Can't use separate shifts for each pack for same reason. // TODO: make round-up optional. - auto fpts = _dims->_fold_pts[j]; + auto fpts = dims->_fold_pts[j]; idx_t angle = ROUND_UP(max_halos[j], fpts); // Determine the spatial skewing angles for MB. @@ -591,7 +596,7 @@ namespace yask { // TODO: use actual number of shifts dynamically instead of this // max. DOMAIN_VAR_LOOP(i, j) { - auto blk_sz = _opts->_block_sizes[i]; + auto blk_sz = opts->_block_sizes[i]; auto tb_angle = tb_angles[j]; tb_widths[j] = blk_sz; tb_tops[j] = blk_sz; @@ -601,7 +606,7 @@ namespace yask { if (num_tb_shifts > 0 && tb_angle > 0) { // See equations above for block size. - auto fpts = _dims->_fold_pts[j]; + auto fpts = dims->_fold_pts[j]; idx_t min_top_sz = fpts; idx_t sa = num_tb_shifts * tb_angle; idx_t min_blk_width = min_top_sz + 2 * sa; @@ -619,7 +624,7 @@ namespace yask { // Init all grids & params by calling initFn. void StencilContext::initValues(function realInitFn) { - CONTEXT_VARS(this); + STATE_VARS(this); real_t seed = 0.1; os << "Initializing grids...\n" << flush; @@ -637,7 +642,7 @@ namespace yask { // Set the bounding-box for each stencil-bundle and whole domain. void StencilContext::find_bounding_boxes() { - ostream& os = get_ostr(); + STATE_VARS(this); os << "Constructing bounding boxes for " << stBundles.size() << " stencil-bundles(s)...\n" << flush; YaskTimer bbtimer; @@ -645,7 +650,7 @@ namespace yask { // Rank BB is based only on rank offsets and rank domain sizes. rank_bb.bb_begin = rank_domain_offsets; - rank_bb.bb_end = rank_domain_offsets.addElements(_opts->_rank_sizes, false); + rank_bb.bb_end = rank_domain_offsets.addElements(opts->_rank_sizes, false); rank_bb.update_bb("rank", *this, true, &os); // BB may be extended for wave-fronts. @@ -659,8 +664,8 @@ namespace yask { // Find BB for each pack. for (auto sp : stPacks) { auto& spbb = sp->getBB(); - spbb.bb_begin = _dims->_domain_dims; - spbb.bb_end = _dims->_domain_dims; + spbb.bb_begin = domain_dims; + spbb.bb_end = domain_dims; // Find BB for each bundle in this pack. for (auto sb : *sp) { @@ -699,8 +704,8 @@ namespace yask { // Copy BB vars from another. void StencilBundleBase::copy_bounding_box(const StencilBundleBase* src) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("copy_bounding_box for '" << get_name() << "' from '" << + STATE_VARS(this); + TRACE_MSG("copy_bounding_box for '" << get_name() << "' from '" << src->get_name() << "'..."); _bundle_bb = src->_bundle_bb; @@ -713,11 +718,12 @@ namespace yask { // Step-vars are tested dynamically for each step // as it is executed. void StencilBundleBase::find_bounding_box() { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("find_bounding_box for '" << get_name() << "'..."); + STATE_VARS(this); + TRACE_MSG("find_bounding_box for '" << get_name() << "'..."); // Init overall bundle BB to that of parent and clear list. - _bundle_bb = cp->ext_bb; + assert(_context); + _bundle_bb = _context->ext_bb; assert(_bundle_bb.bb_valid); _bb_list.clear(); @@ -727,7 +733,7 @@ namespace yask { // If there is no condition, just add full BB to list. if (!is_sub_domain_expr()) { - TRACE_MSG3("adding 1 sub-BB: [" << _bundle_bb.bb_begin.makeDimValStr() << + TRACE_MSG("adding 1 sub-BB: [" << _bundle_bb.bb_begin.makeDimValStr() << " ... " << _bundle_bb.bb_end.makeDimValStr() << ")"); _bb_list.push_back(_bundle_bb); return; @@ -744,7 +750,7 @@ namespace yask { idx_t outer_len = _bundle_bb.bb_len[odim]; idx_t nthreads = yask_get_num_threads(); idx_t len_per_thr = CEIL_DIV(outer_len, nthreads); - TRACE_MSG3("find_bounding_box: running " << nthreads << " thread(s) over " << + TRACE_MSG("find_bounding_box: running " << nthreads << " thread(s) over " << outer_len << " point(s) in outer dim"); // List of full BBs for each thread. @@ -818,7 +824,7 @@ namespace yask { while (do_scan) { do_scan = false; - TRACE_MSG3("scanning " << scan_len.makeDimValStr(" * ") << + TRACE_MSG("scanning " << scan_len.makeDimValStr(" * ") << " starting at " << bdpt.makeDimValStr()); scan_len.visitAllPoints ([&](const IdxTuple& eofs, size_t eidx) { @@ -874,14 +880,14 @@ namespace yask { return true; // keep looking for invalid point. }); // Looking for invalid point. } // while scan is adjusted. - TRACE_MSG3("found BB " << scan_len.makeDimValStr(" * ") << + TRACE_MSG("found BB " << scan_len.makeDimValStr(" * ") << " starting at " << bdpt.makeDimValStr()); // 'scan_len' now contains sizes of the new BB. BoundingBox new_bb; new_bb.bb_begin = bdpt; new_bb.bb_end = bdpt.addElements(scan_len); - new_bb.update_bb("sub-bb", *cp, true); + new_bb.update_bb("sub-bb", *_context, true); cur_bb_list.push_back(new_bb); } // new rect found. @@ -889,7 +895,7 @@ namespace yask { return true; // from labmda; keep looking. }); // Looking for new rects. }); // threads/slices. - TRACE_MSG3("sub-bbs found in " << + TRACE_MSG("sub-bbs found in " << bbtimer.get_secs_since_start() << " secs."); // At this point, we have a set of full BBs. @@ -900,13 +906,13 @@ namespace yask { // TODO: merge in a parallel binary tree instead of sequentially. for (int n = 0; n < nthreads; n++) { auto& cur_bb_list = bb_lists[n]; - TRACE_MSG3("processing " << cur_bb_list.size() << + TRACE_MSG("processing " << cur_bb_list.size() << " sub-BB(s) in bundle '" << get_name() << "' from thread " << n); // BBs in slice 'n'. for (auto& bbn : cur_bb_list) { - TRACE_MSG3(" sub-BB: [" << bbn.bb_begin.makeDimValStr() << + TRACE_MSG(" sub-BB: [" << bbn.bb_begin.makeDimValStr() << " ... " << bbn.bb_end.makeDimValStr() << ")"); // Don't bother with empty BB. @@ -948,9 +954,9 @@ namespace yask { // Merge by just increasing the size of 'bb'. bb.bb_end[odim] = bbn.bb_end[odim]; - TRACE_MSG3(" merging to form [" << bb.bb_begin.makeDimValStr() << + TRACE_MSG(" merging to form [" << bb.bb_begin.makeDimValStr() << " ... " << bb.bb_end.makeDimValStr() << ")"); - bb.update_bb("sub-bb", *cp, true); + bb.update_bb("sub-bb", *_context, true); break; } } @@ -958,15 +964,15 @@ namespace yask { // If not merged, add 'bbn' as new. if (!do_merge) { _bb_list.push_back(bbn); - TRACE_MSG3(" adding as final sub-BB #" << _bb_list.size()); + TRACE_MSG(" adding as final sub-BB #" << _bb_list.size()); } } } // Finalize overall BB. - _bundle_bb.update_bb(get_name(), *cp, false); + _bundle_bb.update_bb(get_name(), *_context, false); bbtimer.stop(); - TRACE_MSG3("find-bounding-box: done in " << + TRACE_MSG("find-bounding-box: done in " << bbtimer.get_elapsed_secs() << " secs."); } diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp index ad966dd1..f2c53d72 100644 --- a/src/kernel/lib/soln_apis.cpp +++ b/src/kernel/lib/soln_apis.cpp @@ -36,43 +36,45 @@ namespace yask { #define GET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ idx_t StencilContext::api_name(const string& dim) const { \ + STATE_VARS(this); \ if (prep_req && !rank_bb.bb_valid) \ THROW_YASK_EXCEPTION("Error: '" #api_name \ "()' called before calling 'prepare_solution()'"); \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ return expr; \ } - GET_SOLN_API(get_num_ranks, _opts->_num_ranks[dim], false, true, false, false) + GET_SOLN_API(get_num_ranks, opts->_num_ranks[dim], false, true, false, false) GET_SOLN_API(get_overall_domain_size, overall_domain_sizes[dim], false, true, false, true) - GET_SOLN_API(get_rank_domain_size, _opts->_rank_sizes[dim], false, true, false, false) - GET_SOLN_API(get_region_size, _opts->_region_sizes[dim], true, true, false, false) - GET_SOLN_API(get_block_size, _opts->_block_sizes[dim], true, true, false, false) + GET_SOLN_API(get_rank_domain_size, opts->_rank_sizes[dim], false, true, false, false) + GET_SOLN_API(get_region_size, opts->_region_sizes[dim], true, true, false, false) + GET_SOLN_API(get_block_size, opts->_block_sizes[dim], true, true, false, false) GET_SOLN_API(get_first_rank_domain_index, rank_bb.bb_begin[dim], false, true, false, true) GET_SOLN_API(get_last_rank_domain_index, rank_bb.bb_end[dim] - 1, false, true, false, true) - GET_SOLN_API(get_min_pad_size, _opts->_min_pad_sizes[dim], false, true, false, false) - GET_SOLN_API(get_rank_index, _opts->_rank_indices[dim], false, true, false, true) + GET_SOLN_API(get_min_pad_size, opts->_min_pad_sizes[dim], false, true, false, false) + GET_SOLN_API(get_rank_index, opts->_rank_indices[dim], false, true, false, true) #undef GET_SOLN_API // The grid sizes updated any time these settings are changed. #define SET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, reset_prep) \ void StencilContext::api_name(const string& dim, idx_t n) { \ - checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ + STATE_VARS(this); \ + dims->checkDimType(dim, #api_name, step_ok, domain_ok, misc_ok); \ expr; \ update_grid_info(); \ if (reset_prep) rank_bb.bb_valid = ext_bb.bb_valid = false; \ } - SET_SOLN_API(set_rank_index, _opts->_rank_indices[dim] = n, false, true, false, true) - SET_SOLN_API(set_num_ranks, _opts->_num_ranks[dim] = n, false, true, false, true) - SET_SOLN_API(set_rank_domain_size, _opts->_rank_sizes[dim] = n, false, true, false, true) - SET_SOLN_API(set_region_size, _opts->_region_sizes[dim] = n, true, true, false, true) - SET_SOLN_API(set_block_size, _opts->_block_sizes[dim] = n, true, true, false, true) - SET_SOLN_API(set_min_pad_size, _opts->_min_pad_sizes[dim] = n, false, true, false, false) + SET_SOLN_API(set_rank_index, opts->_rank_indices[dim] = n, false, true, false, true) + SET_SOLN_API(set_num_ranks, opts->_num_ranks[dim] = n, false, true, false, true) + SET_SOLN_API(set_rank_domain_size, opts->_rank_sizes[dim] = n, false, true, false, true) + SET_SOLN_API(set_region_size, opts->_region_sizes[dim] = n, true, true, false, true) + SET_SOLN_API(set_block_size, opts->_block_sizes[dim] = n, true, true, false, true) + SET_SOLN_API(set_min_pad_size, opts->_min_pad_sizes[dim] = n, false, true, false, false) #undef SET_SOLN_API // Allocate grids and MPI bufs. // Initialize some data structures. void StencilContext::prepare_solution() { - CONTEXT_VARS(this); + STATE_VARS(this); // Don't continue until all ranks are this far. env->global_barrier(); @@ -99,22 +101,22 @@ namespace yask { // Adjust all settings before setting MPI buffers or sizing grids. // Prints adjusted settings. // TODO: print settings again after auto-tuning. - _opts->adjustSettings(os, _env); + opts->adjustSettings(os); // Copy current settings to packs. // Needed here because settings may have been changed via APIs // since last call to prepare_solution(). // This will wipe out any previous auto-tuning. for (auto& sp : stPacks) - sp->getLocalSettings() = *_opts; + sp->getLocalSettings() = *opts; // Init auto-tuner to run silently during normal operation. reset_auto_tuner(true, false); // Report ranks. os << endl; - os << "Num MPI ranks: " << _env->get_num_ranks() << endl; - os << "This MPI rank index: " << _env->get_rank_index() << endl; + os << "Num MPI ranks: " << env->get_num_ranks() << endl; + os << "This MPI rank index: " << env->get_rank_index() << endl; // report threads. { @@ -189,36 +191,36 @@ namespace yask { } void StencilContext::print_info() { - CONTEXT_VARS(this); + STATE_VARS(this); // Calc and report total allocation and domain sizes. rank_nbytes = get_num_bytes(); - tot_nbytes = sumOverRanks(rank_nbytes, _env->comm); + tot_nbytes = sumOverRanks(rank_nbytes, env->comm); rank_domain_pts = rank_bb.bb_num_points; - tot_domain_pts = sumOverRanks(rank_domain_pts, _env->comm); + tot_domain_pts = sumOverRanks(rank_domain_pts, env->comm); os << "\nDomain size in this rank (points): " << makeNumStr(rank_domain_pts) << "\nTotal allocation in this rank: " << makeByteStr(rank_nbytes) << - "\nOverall problem size in " << _env->num_ranks << " rank(s) (points): " << + "\nOverall problem size in " << env->num_ranks << " rank(s) (points): " << makeNumStr(tot_domain_pts) << - "\nTotal overall allocation in " << _env->num_ranks << " rank(s): " << + "\nTotal overall allocation in " << env->num_ranks << " rank(s): " << makeByteStr(tot_nbytes) << endl; // Report some sizes and settings. os << "\nWork-unit sizes in points (from smallest to largest):\n" - " vector-size: " << _dims->_fold_pts.makeDimValStr(" * ") << endl << - " cluster-size: " << _dims->_cluster_pts.makeDimValStr(" * ") << endl << - " sub-block-size: " << _opts->_sub_block_sizes.makeDimValStr(" * ") << endl << - " mini-block-size: " << _opts->_mini_block_sizes.makeDimValStr(" * ") << endl << - " block-size: " << _opts->_block_sizes.makeDimValStr(" * ") << endl << - " region-size: " << _opts->_region_sizes.makeDimValStr(" * ") << endl << - " rank-domain-size: " << _opts->_rank_sizes.makeDimValStr(" * ") << endl << + " vector-size: " << dims->_fold_pts.makeDimValStr(" * ") << endl << + " cluster-size: " << dims->_cluster_pts.makeDimValStr(" * ") << endl << + " sub-block-size: " << opts->_sub_block_sizes.makeDimValStr(" * ") << endl << + " mini-block-size: " << opts->_mini_block_sizes.makeDimValStr(" * ") << endl << + " block-size: " << opts->_block_sizes.makeDimValStr(" * ") << endl << + " region-size: " << opts->_region_sizes.makeDimValStr(" * ") << endl << + " rank-domain-size: " << opts->_rank_sizes.makeDimValStr(" * ") << endl << " overall-problem-size: " << overall_domain_sizes.makeDimValStr(" * ") << endl; #ifdef SHOW_GROUPS os << - " sub-block-group-size: " << _opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << - " block-group-size: " << _opts->_block_group_sizes.makeDimValStr(" * ") << endl << + " sub-block-group-size: " << opts->_sub_block_group_sizes.makeDimValStr(" * ") << endl << + " block-group-size: " << opts->_block_group_sizes.makeDimValStr(" * ") << endl << #endif os << "\nOther settings:\n" " yask-version: " << yask_get_version_string() << endl << @@ -228,8 +230,8 @@ namespace yask { " ... " << rank_bb.bb_end.subElements(1).makeDimValStr() << endl; #ifdef USE_MPI os << - " num-ranks: " << _opts->_num_ranks.makeDimValStr(" * ") << endl << - " rank-indices: " << _opts->_rank_indices.makeDimValStr() << endl << + " num-ranks: " << opts->_num_ranks.makeDimValStr(" * ") << endl << + " rank-indices: " << opts->_rank_indices.makeDimValStr() << endl << " rank-domain-offsets: " << rank_domain_offsets.makeDimValOffsetStr() << endl; if (opts->overlap_comms) os << @@ -238,8 +240,8 @@ namespace yask { #endif os << " vector-len: " << VLEN << endl << - " extra-padding: " << _opts->_extra_pad_sizes.makeDimValStr() << endl << - " minimum-padding: " << _opts->_min_pad_sizes.makeDimValStr() << endl << + " extra-padding: " << opts->_extra_pad_sizes.makeDimValStr() << endl << + " minimum-padding: " << opts->_min_pad_sizes.makeDimValStr() << endl << " L1-prefetch-distance: " << PFD_L1 << endl << " L2-prefetch-distance: " << PFD_L2 << endl << " max-halos: " << max_halos.makeDimValStr() << endl; @@ -259,7 +261,7 @@ namespace yask { // Dealloc grids, etc. void StencilContext::end_solution() { - CONTEXT_VARS(this); + STATE_VARS(this); // Final halo exchange (usually not needed). exchange_halos(); @@ -294,10 +296,11 @@ namespace yask { } string StencilContext::apply_command_line_options(const string& args) { + STATE_VARS(this); // Create a parser and add base options to it. CommandLineParser parser; - _opts->add_options(parser); + opts->add_options(parser); // Tokenize default args. vector argsv; @@ -318,7 +321,7 @@ namespace yask { // Add a new grid to the containers. void StencilContext::addGrid(YkGridPtr gp, bool is_output) { - CONTEXT_VARS(this); + STATE_VARS(this); assert(gp); auto& gname = gp->get_name(); if (gridMap.count(gname)) @@ -345,7 +348,7 @@ namespace yask { /// Get statistics associated with preceding calls to run_solution(). yk_stats_ptr StencilContext::get_stats() { - CONTEXT_VARS(this); + STATE_VARS(this); // Numbers of threads. int rthr, bthr; diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index 4b50cb87..50d0b80c 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -40,8 +40,8 @@ namespace yask { void StencilBundleBase::calc_mini_block(int region_thread_idx, KernelSettings& settings, const ScanIndices& mini_block_idxs) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("calc_mini_block('" << get_name() << "'): [" << + STATE_VARS(this); + TRACE_MSG("calc_mini_block('" << get_name() << "'): [" << mini_block_idxs.begin.makeValStr(nsdims) << " ... " << mini_block_idxs.end.makeValStr(nsdims) << ") by " << mini_block_idxs.step.makeValStr(nsdims) << @@ -57,7 +57,7 @@ namespace yask { // Nothing to do if outer BB is empty. if (_bundle_bb.bb_num_points == 0) { - TRACE_MSG3("calc_mini_block: empty BB"); + TRACE_MSG("calc_mini_block: empty BB"); return; } @@ -75,7 +75,7 @@ namespace yask { // Loop through each solid BB for this bundle. // For each BB, calc intersection between it and 'mini_block_idxs'. // If this is non-empty, apply the bundle to all its required sub-blocks. - TRACE_MSG3("calc_mini_block('" << get_name() << "'): checking " << + TRACE_MSG("calc_mini_block('" << get_name() << "'): checking " << _bb_list.size() << " BB(s)"); int bbn = 0; for (auto& bb : _bb_list) { @@ -106,12 +106,12 @@ namespace yask { // nothing to do? if (!bb_ok) { - TRACE_MSG3("calc_mini_block for bundle '" << get_name() << + TRACE_MSG("calc_mini_block for bundle '" << get_name() << "': no overlap between bundle " << bbn << " and current block"); continue; // to next BB. } - TRACE_MSG3("calc_mini_block('" << get_name() << + TRACE_MSG("calc_mini_block('" << get_name() << "'): after trimming for BB " << bbn << ": [" << mb_idxs.begin.makeValStr(nsdims) << " ... " << mb_idxs.end.makeValStr(nsdims) << ")"); @@ -128,7 +128,7 @@ namespace yask { // eventually work on a separate sub-block. This is nested within // an OMP region thread. If there is only one block per thread, // nested OMP is disabled, and this OMP pragma does nothing. - int nbt = cp->set_block_threads(); + int nbt = _context->set_block_threads(); bool bind_threads = nbt > 1 && settings.bind_block_threads; _Pragma("omp parallel proc_bind(spread)") { int block_thread_idx = (nbt <= 1) ? 0 : omp_get_thread_num(); @@ -158,7 +158,7 @@ namespace yask { adj_mb_idxs.step[i] = adj_mb_idxs.end[i] - adj_mb_idxs.begin[i]; } - TRACE_MSG3("calc_mini_block('" << get_name() << "'): " << + TRACE_MSG("calc_mini_block('" << get_name() << "'): " << " for reqd bundle '" << sg->get_name() << "': [" << adj_mb_idxs.begin.makeValStr(nsdims) << " ... " << adj_mb_idxs.end.makeValStr(nsdims) << ") by " << @@ -205,8 +205,8 @@ namespace yask { int block_thread_idx, KernelSettings& settings, const ScanIndices& mini_block_idxs) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("calc_sub_block_scalar for bundle '" << get_name() << "': [" << + STATE_VARS(this); + TRACE_MSG("calc_sub_block_scalar for bundle '" << get_name() << "': [" << mini_block_idxs.start.makeValStr(nsdims) << " ... " << mini_block_idxs.stop.makeValStr(nsdims) << ") by region thread " << region_thread_idx << @@ -243,8 +243,8 @@ namespace yask { int block_thread_idx, KernelSettings& settings, const ScanIndices& mini_block_idxs) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("calc_sub_block_vec for bundle '" << get_name() << "': [" << + STATE_VARS(this); + TRACE_MSG("calc_sub_block_vec for bundle '" << get_name() << "': [" << mini_block_idxs.start.makeValStr(nsdims) << " ... " << mini_block_idxs.stop.makeValStr(nsdims) << ") by region thread " << region_thread_idx << @@ -308,7 +308,7 @@ namespace yask { DOMAIN_VAR_LOOP(i, j) { // Rank offset. - auto rofs = cp->rank_domain_offsets[j]; + auto rofs = _context->rank_domain_offsets[j]; // Begin/end of rank-relative scalar elements in this dim. auto ebgn = sub_block_idxs.begin[i] - rofs; @@ -459,7 +459,7 @@ namespace yask { // Full rectilinear polytope of aligned clusters: use optimized code. if (do_clusters) { - TRACE_MSG3("calc_sub_block_vec: using cluster code for [" << + TRACE_MSG("calc_sub_block_vec: using cluster code for [" << sub_block_fcidxs.begin.makeValStr(nsdims) << " ... " << sub_block_fcidxs.end.makeValStr(nsdims) << ") by region thread " << region_thread_idx << @@ -486,7 +486,7 @@ namespace yask { // Full and partial peel/remainder vectors in all dims except // the inner one. if (do_vectors) { - TRACE_MSG3("calc_sub_block_vec: using vector code for [" << + TRACE_MSG("calc_sub_block_vec: using vector code for [" << sub_block_vidxs.begin.makeValStr(nsdims) << " ... " << sub_block_vidxs.end.makeValStr(nsdims) << ") *not* within full vector-clusters at [" << @@ -570,7 +570,7 @@ namespace yask { misc_idxs.step.setFromConst(1); misc_idxs.align.setFromConst(1); - TRACE_MSG3("calc_sub_block_vec: using scalar code for [" << + TRACE_MSG("calc_sub_block_vec: using scalar code for [" << misc_idxs.begin.makeValStr(nsdims) << " ... " << misc_idxs.end.makeValStr(nsdims) << ") *not* within vectors at [" << @@ -587,13 +587,13 @@ namespace yask { #define MISC_FN(pt_idxs) do { \ bool ok = false; \ DOMAIN_VAR_LOOP(i, j) { \ - auto rofs = cp->rank_domain_offsets[j]; \ + auto rofs = _context->rank_domain_offsets[j]; \ if (pt_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ pt_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ ok = true; break; } \ } \ if (ok) { \ - TRACE_MSG3("calc_sub_block_vec: at pt " << \ + TRACE_MSG("calc_sub_block_vec: at pt " << \ pt_idxs.start.makeValStr(nsdims)); \ calc_scalar(region_thread_idx, pt_idxs.start); \ } \ @@ -615,8 +615,8 @@ namespace yask { void StencilBundleBase::calc_loop_of_clusters(int region_thread_idx, int block_thread_idx, const ScanIndices& loop_idxs) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("calc_loop_of_clusters: local vector-indices [" << + STATE_VARS(this); + TRACE_MSG("calc_loop_of_clusters: local vector-indices [" << loop_idxs.start.makeValStr(nsdims) << " ... " << loop_idxs.stop.makeValStr(nsdims) << ") by region thread " << region_thread_idx << @@ -649,8 +649,8 @@ namespace yask { int block_thread_idx, const ScanIndices& loop_idxs, idx_t write_mask) { - CONTEXT_VARS(_generic_context); - TRACE_MSG3("calc_loop_of_vectors: local vector-indices [" << + STATE_VARS(this); + TRACE_MSG("calc_loop_of_vectors: local vector-indices [" << loop_idxs.start.makeValStr(nsdims) << " ... " << loop_idxs.stop.makeValStr(nsdims) << ") w/write-mask = 0x" << hex << write_mask << dec << @@ -688,7 +688,7 @@ namespace yask { // Returns adjusted indices. ScanIndices StencilBundleBase::adjust_span(int region_thread_idx, const ScanIndices& idxs) const { - CONTEXT_VARS(_generic_context); + STATE_VARS(this); ScanIndices adj_idxs(idxs); // Loop thru vecs of scratch grids for this bundle. @@ -723,7 +723,7 @@ namespace yask { adj_idxs.end[i] = idxs.end[i] + rh; // Make sure grid covers index bounds. - TRACE_MSG3("adjust_span: mini-blk [" << + TRACE_MSG("adjust_span: mini-blk [" << idxs.begin[i] << "..." << idxs.end[i] << ") adjusted to [" << adj_idxs.begin[i] << "..." << @@ -817,6 +817,11 @@ namespace yask { " num reqd scratch bundles: " << (sg_list.size() - 1) << endl; // TODO: add info on scratch bundles here. + if (sg->is_sub_domain_expr()) + os << " sub-domain expr: '" << sg->get_domain_description() << "'\n"; + if (sg->is_step_cond_expr()) + os << " step-condition expr: '" << sg->get_step_cond_description() << "'\n"; + os << " bundle size (points): " << makeNumStr(bb.bb_size) << endl; if (bb.bb_size) { diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp index 7695cf0d..29edce67 100644 --- a/src/kernel/lib/stencil_calc.hpp +++ b/src/kernel/lib/stencil_calc.hpp @@ -32,9 +32,10 @@ namespace yask { // A stencil context contains one or more packs. // A pure-virtual class base for a stencil bundle. - class StencilBundleBase { + class StencilBundleBase : + public ContextLinker { + protected: - StencilContext* _generic_context = 0; std::string _name; int _scalar_fp_ops = 0; int _scalar_points_read = 0; @@ -65,7 +66,7 @@ namespace yask { // Ranks offsets must already be subtracted. // Each dim in 'orig' must be a multiple of corresponding vec len. void normalize_indices(const Indices& orig, Indices& norm) const { - CONTEXT_VARS(_generic_context); + STATE_VARS(this); assert(orig.getNumDims() == nsdims); assert(norm.getNumDims() == nsdims); @@ -97,20 +98,9 @@ namespace yask { // ctor, dtor. StencilBundleBase(StencilContext* context) : - _generic_context(context) { - //CONTEXT_VARS(context); - } - + ContextLinker(context) { } virtual ~StencilBundleBase() { } - // Access to dims and MPI info. - DimsPtr& get_dims() const { - return _generic_context->get_dims(); - } - MPIInfoPtr& get_mpi_info() { - return _generic_context->get_mpi_info(); - } - // Get name of this bundle. const std::string& get_name() const { return _name; } @@ -172,13 +162,17 @@ namespace yask { virtual bool is_in_valid_domain(const Indices& idxs) const =0; - // Return true if there is a non-default sub-domain expression. + // Return true if there is a non-default conditions. virtual bool is_sub_domain_expr() const { return false; } + virtual bool + is_step_cond_expr() const { return false; } - // Return human-readable description of sub-domain. + // Return human-readable description of conditions. virtual std::string get_domain_description() const =0; + virtual std::string + get_step_cond_description() const =0; // Determine whether step index is enabled. virtual bool @@ -277,20 +271,18 @@ namespace yask { // "Independent" implies that they may be evaluated // in any order. class BundlePack : + public ContextLinker, public std::vector { protected: std::string _name; - // Parent solution. - StencilContext* _context = 0; - // Union of bounding boxes for all bundles in this pack. BoundingBox _pack_bb; // Local pack settings. // Only some of these will be used. - KernelSettings _opts; + KernelSettings _pack_opts; // Auto-tuner for pack settings. AutoTuner _at; @@ -312,12 +304,12 @@ namespace yask { idx_t tot_writes_per_step = 0; idx_t tot_fpops_per_step = 0; - BundlePack(const std::string& name, - StencilContext* ctx) : + BundlePack(StencilContext* context, + const std::string& name) : + ContextLinker(context), _name(name), - _context(ctx), - _opts(*ctx->get_settings()), // make a copy of the context settings. - _at(ctx, &_opts, name) { } + _pack_opts(*context->get_state()->_opts), // init w/a copy of the base settings. + _at(context, &_pack_opts, name) { } virtual ~BundlePack() { } const std::string& get_name() { @@ -342,13 +334,14 @@ namespace yask { // Accessors. BoundingBox& getBB() { return _pack_bb; } AutoTuner& getAT() { return _at; } - KernelSettings& getLocalSettings() { return _opts; } + KernelSettings& getLocalSettings() { return _pack_opts; } // If using separate pack tuners, return local settings. // Otherwise, return one in context. KernelSettings& getActiveSettings() { - return _context->use_pack_tuners() ? _opts : - *_context->get_settings().get(); } + STATE_VARS(this); + return use_pack_tuners() ? _pack_opts : *opts; + } // Perf-tracking methods. void start_timers(); diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index 74374b86..75a5a95a 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -154,15 +154,8 @@ typedef std::uint64_t uidx_t; #define TRACE_MSG0(os, msg) ((void)0) #endif -// macro for debug message from a StencilContext method. -#define TRACE_MSG1(msg) TRACE_MSG0(get_ostr(), msg) -#define TRACE_MSG(msg) TRACE_MSG1(msg) - -// macro for debug message when _context ptr is defined. -#define TRACE_MSG2(msg) TRACE_MSG0(_context->get_ostr(), msg) - -// macro for debug message when _generic_context ptr is defined. -#define TRACE_MSG3(msg) TRACE_MSG0(_generic_context->get_ostr(), msg) +// macro for debug message when 'os' is defined. +#define TRACE_MSG(msg) TRACE_MSG0(os, msg) // breakpoint. #define INT3 asm volatile("int $3") diff --git a/src/kernel/lib/yask_stencil.hpp b/src/kernel/lib/yask_stencil.hpp index b5356875..57faa0ec 100644 --- a/src/kernel/lib/yask_stencil.hpp +++ b/src/kernel/lib/yask_stencil.hpp @@ -72,6 +72,7 @@ IN THE SOFTWARE. #include "realv.hpp" // Base types for stencil context, etc. +#include "indices.hpp" #include "settings.hpp" #include "generic_grids.hpp" #include "realv_grids.hpp" diff --git a/src/kernel/tests/grid_test.cpp b/src/kernel/tests/grid_test.cpp index ae05af13..00770eac 100644 --- a/src/kernel/tests/grid_test.cpp +++ b/src/kernel/tests/grid_test.cpp @@ -70,10 +70,10 @@ int main(int argc, char** argv) { os << "0-D test...\n"; GridDimNames gdims; string name = "test grid"; - YkGridPtr g0 = make_shared>(dims, name, gdims, &settings, &osp); + YkGridPtr g0 = make_shared>(*context, name, gdims); g0->alloc_storage(); os << g0->make_info_string() << endl; - YkGridPtr g1 = make_shared>(dims, name, gdims, &settings, &osp); + YkGridPtr g1 = make_shared>(*context, name, gdims); g1->alloc_storage(); os << g1->make_info_string() << endl; @@ -91,8 +91,8 @@ int main(int argc, char** argv) { os << "3-D test...\n"; GridDimNames gdims = {"x", "y", "z"}; string name = "test grid"; - YkGridPtr g3 = make_shared>(dims, name, gdims, &settings, &osp); - YkGridPtr g3f = make_shared>(dims, name, gdims, &settings, &osp); + YkGridPtr g3 = make_shared>(*context, name, gdims); + YkGridPtr g3f = make_shared>(*context, name, gdims); int i = 0; int min_pad = 3; for (auto dname : gdims) { diff --git a/src/kernel/tests/yask_kernel_api_exception_test.py b/src/kernel/tests/yask_kernel_api_exception_test.py index a48086ba..f2ab9c1b 100755 --- a/src/kernel/tests/yask_kernel_api_exception_test.py +++ b/src/kernel/tests/yask_kernel_api_exception_test.py @@ -339,7 +339,7 @@ def init_grid(grid, timestep) : print("Running the solution for 10 more steps...") soln.run_solution(1, 10) - # Print final result at timestep 11. + # Print final result at timestep 11, assuming update was to t+1. for grid in soln.get_grids() : read_grid(grid, 11) diff --git a/src/kernel/tests/yask_kernel_api_test.cpp b/src/kernel/tests/yask_kernel_api_test.cpp index 80e1074b..f1167018 100644 --- a/src/kernel/tests/yask_kernel_api_test.cpp +++ b/src/kernel/tests/yask_kernel_api_test.cpp @@ -113,19 +113,35 @@ int main() { // Print out some info about the grids and init their data. for (auto grid : soln->get_grids()) { - os << " " << grid->get_name() << "("; - for (auto dname : grid->get_dim_names()) - os << " '" << dname << "'"; - os << " )\n"; + os << " grid-var '" << grid->get_name() << ":\n"; for (auto dname : grid->get_dim_names()) { + os << " '" << dname << "' dim:\n"; + os << " alloc-size on this rank: " << + grid->get_alloc_size(dname) << endl; + + // Is this a domain dim? if (domain_dim_set.count(dname)) { - os << " '" << dname << "' domain index range on this rank: " << + os << " domain index range on this rank: " << grid->get_first_rank_domain_index(dname) << " ... " << grid->get_last_rank_domain_index(dname) << endl; - os << " '" << dname << "' allowed index range on this rank: " << + os << " domain+halo index range on this rank: " << + grid->get_first_rank_halo_index(dname) << " ... " << + grid->get_last_rank_halo_index(dname) << endl; + os << " allowed index range on this rank: " << grid->get_first_rank_alloc_index(dname) << " ... " << grid->get_last_rank_alloc_index(dname) << endl; } + + // Step dim? + else if (dname == soln->get_step_dim_name()) { + } + + // Misc dim? + else { + os << " misc index range: " << + grid->get_first_misc_index(dname) << " ... " << + grid->get_last_misc_index(dname) << endl; + } } // First, just init all the elements to the same value. @@ -138,35 +154,39 @@ int main() { // Create indices describing a subset of the overall domain. vector first_indices, last_indices; for (auto dname : grid->get_dim_names()) { + idx_t first_idx = 0, last_idx = 0; // Is this a domain dim? if (domain_dim_set.count(dname)) { // Set indices to create a small cube (assuming 3D) // in center of overall problem. + // Using global indices. idx_t psize = soln->get_overall_domain_size(dname); - idx_t first_idx = psize/2 - 30; - idx_t last_idx = psize/2 + 30; - first_indices.push_back(first_idx); - last_indices.push_back(last_idx); + first_idx = psize/2 - 30; + last_idx = psize/2 + 30; } // Step dim? else if (dname == soln->get_step_dim_name()) { - // Add indices for timestep zero (0) only. - first_indices.push_back(0); - last_indices.push_back(0); + // Set indices for one time-step. + first_idx = 0; + last_idx = 0; } // Misc dim? else { - // Add indices to set all allowed values. - // (This isn't really meaningful; it's just illustrative.) - first_indices.push_back(grid->get_first_misc_index(dname)); - last_indices.push_back(grid->get_last_misc_index(dname)); + // Set indices to set all allowed values. + first_idx = grid->get_first_misc_index(dname); + last_idx = grid->get_last_misc_index(dname); + assert(last_idx - first_idx + 1 == grid->get_alloc_size(dname)); } + + // Add indices to index vectors. + first_indices.push_back(first_idx); + last_indices.push_back(last_idx); } // Init the values using the indices created above. diff --git a/src/kernel/tests/yask_kernel_api_test.py b/src/kernel/tests/yask_kernel_api_test.py index f2341b08..2c74fdf9 100755 --- a/src/kernel/tests/yask_kernel_api_test.py +++ b/src/kernel/tests/yask_kernel_api_test.py @@ -30,58 +30,69 @@ import argparse import yask_kernel as yk -# Read data from grid using NumPy ndarray. -def read_grid(grid, timestep) : - - # Ignore with fixed-sized grids. - if grid.is_fixed_size(): - return - - print("Reading grid '" + grid.get_name() + "' at time " + repr(timestep) + "...") +# Prepare an NymPy ndarray to hold a slice of 'grid'. +def make_ndarray(grid, timestep) : # Create indices for YASK and shape for NumPy. first_indices = [] last_indices = [] shape = [] + point = () nelems = 1 for dname in grid.get_dim_names() : if dname == soln.get_step_dim_name() : - # Read one timestep only. - # So, we don't need to add a time axis in 'shape'. - first_indices += [timestep] - last_indices += [timestep] + # Slice in requested time. + first_idx = timestep + last_idx = timestep # Domain dim? elif dname in soln.get_domain_dim_names() : - # Full domain in this rank. - first_idx = soln.get_first_rank_domain_index(dname) - last_idx = soln.get_last_rank_domain_index(dname) - - # Read one point in the halo, too. - first_idx -= 1 - last_idx += 1 - - first_indices += [first_idx] - last_indices += [last_idx] - shape += [last_idx - first_idx + 1] - nelems *= last_idx - first_idx + 1 + # Cover full alloc in this rank. + first_idx = grid.get_first_rank_alloc_index(dname) + last_idx = grid.get_last_rank_alloc_index(dname) # Misc dim? else : - # Read first index only. - first_indices += [grid.get_first_misc_index(dname)] - last_indices += [grid.get_first_misc_index(dname)] + # Cover all misc values. + first_idx = grid.get_first_misc_index(dname) + last_idx = grid.get_last_misc_index(dname) + + # Add indices to API vars. + first_indices += [first_idx] + last_indices += [last_idx] + ne = last_idx - first_idx + 1 + shape += [ne] + nelems *= ne + + # Define first point in ndarray. + point += (0, ) # Create a NumPy ndarray to hold the extracted data. - ndarray1 = np.empty(shape, dtype, 'C'); + print("Creating a NumPy ndarray with shape " + repr(shape) + " and " + + repr(nelems) + " element(s)...") + ndarray = np.zeros(shape, dtype, 'C'); + return ndarray, first_indices, last_indices, point - print("Reading " + repr(nelems) + " element(s)...") - nread = grid.get_elements_in_slice(ndarray1.data, first_indices, last_indices) - print(ndarray1) +# Read data from grid using NumPy ndarray. +def read_grid(grid, timestep) : + + # Ignore with fixed-sized grids. + if grid.is_fixed_size(): + return + print("Testing reading grid '" + grid.get_name() + "' at time " + repr(timestep) + "...") + ndarray, first_indices, last_indices, point = make_ndarray(grid, timestep) + + print("Reading 1 element...") + val1 = grid.get_element(first_indices) + print("Read value " + repr(val1)) + + print("Reading all element(s) in ndarray...") + nread = grid.get_elements_in_slice(ndarray.data, first_indices, last_indices) + print(ndarray) # Raw access to this grid. if soln.get_element_bytes() == 4 : @@ -92,66 +103,53 @@ def read_grid(grid, timestep) : fp_ptr = ct.cast(int(raw_ptr), ptype) num_elems = grid.get_num_storage_elements() print("Raw data: " + repr(fp_ptr[0]) + ", ..., " + repr(fp_ptr[num_elems-1])) - #ndarray2 = np.fromiter(fp_ptr, dtype, num_elems); print(ndarray2) # Init grid using NumPy ndarray. def init_grid(grid, timestep) : print("Initializing grid '" + grid.get_name() + "' at time " + repr(timestep) + "...") - - # Create indices for YASK, shape & point for NumPy. - first_indices = [] - last_indices = [] - shape = [] - point = () - nelems = 1 - for dname in grid.get_dim_names() : - - if dname == soln.get_step_dim_name() : - - # Write one timestep only. - # So, we don't need to add a time axis in 'shape'. - first_indices += [timestep] - last_indices += [timestep] - - # Domain dim? - elif dname in soln.get_domain_dim_names() : - - # Full domain in this rank. - first_idx = soln.get_first_rank_domain_index(dname) - last_idx = soln.get_last_rank_domain_index(dname) - - # Write one point in the halo, too. - first_idx -= 1 - last_idx += 1 - - first_indices += [first_idx] - last_indices += [last_idx] - shape += [last_idx - first_idx + 1] - nelems *= last_idx - first_idx + 1 - - # Since the array covers one layer of points in the halo - # starting at 0,..,0, 1,..,1 is the first point in the - # computable domain. - point += (1,) - - # Misc dim? - else : - - # Write first index only. - first_indices += [grid.get_first_misc_index(dname)] - last_indices += [grid.get_first_misc_index(dname)] - - # Create a NumPy ndarray to hold the data. - ndarray = np.zeros(shape, dtype, 'C'); + ndarray, first_indices, last_indices, point = make_ndarray(grid, timestep) # Set one point to a non-zero value. - ndarray[point] = 21.0; + val1 = 21.0 + ndarray[point] = val1 print(ndarray) - print("Writing " + repr(nelems) + " element(s)...") + print("Setting grid from all element(s) in ndarray...") nset = grid.set_elements_in_slice(ndarray.data, first_indices, last_indices) print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index())) + # Check that set worked. + print("Reading those element(s)...") + val2 = grid.get_element(first_indices) + assert val2 == val1 + val2 = grid.get_element(last_indices) + if nset > 1 : + assert val2 == 0.0 + else : + assert val2 == val1 # Only 1 val => first == last. + ndarray2 = ndarray + ndarray2.fill(5.0) + nread = grid.get_elements_in_slice(ndarray2.data, first_indices, last_indices) + assert nread == ndarray2.size + assert ndarray2[point] == val1 + assert ndarray2.sum() == val1 # One point is val1; others are zero. + + # Test element set. + print("Testing setting 1 point at " + repr(last_indices) + "...") + val1 += 1.0 + nset = grid.set_element(val1, last_indices); + assert nset == 1 + val2 = grid.get_element(last_indices) + assert val2 == val1 + + # Test add. + val3 = 2.0 + print("Testing adding to 1 point at " + repr(last_indices) + "...") + nset = grid.add_to_element(val3, last_indices); + assert nset == 1 + val2 = grid.get_element(last_indices) + assert val2 == val1 + val3 + # Main script. if __name__ == "__main__": @@ -242,8 +240,9 @@ def init_grid(grid, timestep) : continue # Init timestep 0 using NumPy. - # This will set one point in each rank. init_grid(grid, 0) + + # Print out the values. read_grid(grid, 0) # Simple one-index example. @@ -302,9 +301,8 @@ def init_grid(grid, timestep) : nset = grid.set_elements_in_slice_same(0.5, first_indices, last_indices) print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index())) - # Print the initial contents of the grid at timesteps 0 and 1. + # Print the initial contents of the grid. read_grid(grid, 0) - read_grid(grid, 1) # Apply the stencil solution to the data. env.global_barrier() @@ -318,7 +316,7 @@ def init_grid(grid, timestep) : print("Running the solution for 10 more steps...") soln.run_solution(1, 10) - # Print final result at timestep 11. + # Print final result at timestep 11, assuming update was to t+1. for grid in soln.get_grids() : read_grid(grid, 11) diff --git a/src/stencils/SimpleTestStencils.cpp b/src/stencils/SimpleTestStencils.cpp index 826960f8..d278e979 100644 --- a/src/stencils/SimpleTestStencils.cpp +++ b/src/stencils/SimpleTestStencils.cpp @@ -518,50 +518,51 @@ class TestSubdomainStencil3 : public StencilRadiusBase { REGISTER_STENCIL(TestSubdomainStencil3); -class TestStepCondStencil1 : public StencilRadiusBase { +// Test step condition. +class TestStepCondStencil1 : public Test1dBase { protected: - // Indices & dimensions. - MAKE_STEP_INDEX(t); // step in time dim. - MAKE_DOMAIN_INDEX(x); // spatial dim. + // Indices. + MAKE_MISC_INDEX(b); // Vars. MAKE_GRID(A, t, x); // time-varying grid. + MAKE_ARRAY(B, b); public: TestStepCondStencil1(StencilList& stencils, int radius=2) : - StencilRadiusBase("test_step_cond_1d", stencils, radius) { } + Test1dBase("test_step_cond_1d", stencils, radius) { } // Define equation to apply to all points in 'A' grid. virtual void define() { // Time condition. Condition tc0 = (t % 2 == 0); + + // Var condition. + Condition vc0 = (B(0) > B(1)); - // Set A w/different stencils. + // Set A w/different stencils depending on the conditions. It is + // the programmer's responsibility to ensure that the conditions are + // exclusive when necessary. It is not checked at compile or + // run-time. - GridValue u = A(t, x); - for (int r = 1; r <= _radius; r++) - u += A(t, x-r); - for (int r = 1; r <= _radius + 1; r++) - u += A(t, x+r); - A(t+1, x) EQUALS u / (_radius * 2 + 2) IF_STEP tc0; + // Use this equation when t is even. + A(t+1, x) EQUALS def_1d(A, t, x, 0, 0) IF_STEP tc0; - GridValue v = A(t, x); - for (int r = 1; r <= _radius + 3; r++) - v += A(t, x-r); - for (int r = 1; r <= _radius + 2; r++) - v += A(t, x+r); - A(t+1, x) EQUALS v / (_radius * 2 + 6) IF_STEP !tc0; + // Use this equation when t is odd and B(0) > B(1). + A(t+1, x) EQUALS def_1d(A, t, x, 1, 2) IF_STEP !tc0 && vc0; + + // Use this equation when t is even and B(0) <= B(1). + A(t+1, x) EQUALS def_1d(A, t, x, 2, 3) IF_STEP !tc0 && !vc0; } }; REGISTER_STENCIL(TestStepCondStencil1); // Test the use of conditional updates with scratch-pad grids. - class TestScratchSubdomainStencil1 : public Test1dBase { protected: diff --git a/utils/bin/gen_layouts.pl b/utils/bin/gen_layouts.pl index b7c33370..b9352f43 100755 --- a/utils/bin/gen_layouts.pl +++ b/utils/bin/gen_layouts.pl @@ -276,8 +276,8 @@ END my $wrap = $w ? "true" : "false"; # Creation. - print " else if (ndims == $n && do_wrap == $wrap)\n", - " gp = make_shared>(_dims, name, dims, &_opts, &_ostr);\n"; + print " else if (ndims == $n && step_used == $wrap)\n", + " gp = make_shared>(*this, name, gdims);\n"; } }