From e97bb9ee35871cae7f591361dc323fec873ad0a4 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Mon, 3 Jun 2024 08:34:42 +0200 Subject: [PATCH] [df] Ensure execution order of Define operations is guaranteed In the RDataFrame computation graph execution, the `RAction::Run` method is responsible for requesting the values from the readers that are necessary for that action. This is done by calling `RColumnReaderBase::Get` with the correct template type given by the variadic argument type list of the action, which corresponds to the list of column types that action requires. So far, this variadic template function call was done by simply unpacking the argument pack as the arguments to the `Exec` function of the action helper data member. This meant that the execution order was never guaranteed (undefined until C++17, unspecified since), according to the explanation at https://en.cppreference.com/w/cpp/language/eval_order. This commit ensures that the execution order of the calls to the `Get` method of the readers is done left to right, following the same order of the template parameter pack. This is done by using the sequenced-before feature of the list-initialization of any C++ struct/class. A simple struct helper is used to this end. --- tree/dataframe/inc/ROOT/RDF/RAction.hxx | 3 ++- tree/dataframe/inc/ROOT/RDF/Utils.hxx | 17 +++++++++++++++++ tree/dataframe/src/RDataFrame.cxx | 16 ++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RAction.hxx b/tree/dataframe/inc/ROOT/RDF/RAction.hxx index 35b2d0237e127..c2f9917a6b24b 100644 --- a/tree/dataframe/inc/ROOT/RDF/RAction.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RAction.hxx @@ -101,7 +101,8 @@ public: template void CallExec(unsigned int slot, Long64_t entry, TypeList, std::index_sequence) { - fHelper.Exec(slot, fValues[slot][S]->template Get(entry)...); + ROOT::Internal::RDF::CallGuaranteedOrder{[&](auto &&...args) { return fHelper.Exec(slot, args...); }, + fValues[slot][S]->template Get(entry)...}; (void)entry; // avoid unused parameter warning (gcc 12.1) } diff --git a/tree/dataframe/inc/ROOT/RDF/Utils.hxx b/tree/dataframe/inc/ROOT/RDF/Utils.hxx index e50b89437f71c..868056ec1cef3 100644 --- a/tree/dataframe/inc/ROOT/RDF/Utils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/Utils.hxx @@ -300,6 +300,23 @@ public: auto Insert(const std::string &string) -> decltype(fStrings)::const_iterator; }; +/** + * \brief Struct to wrap the call to a function with a guaranteed order of + * execution of its arguments. + * \tparam F Type of the callable. + * \tparam Args Variadic types of the arguments to the callable. + * + * The execution order is guaranteed by calling the function in the constructor + * thus enabling the exploitation of the list-initialization sequenced-before + * feature (See rule 9 at https://en.cppreference.com/w/cpp/language/eval_order). + */ +struct CallGuaranteedOrder { + template + CallGuaranteedOrder(F &&f, Args &&...args) + { + f(std::forward(args)...); + } +}; } // end NS RDF } // end NS Internal } // end NS ROOT diff --git a/tree/dataframe/src/RDataFrame.cxx b/tree/dataframe/src/RDataFrame.cxx index 91d72f6e2466b..670c87dcca34d 100644 --- a/tree/dataframe/src/RDataFrame.cxx +++ b/tree/dataframe/src/RDataFrame.cxx @@ -614,6 +614,22 @@ for el in df.Take[int]("x"): print(f"Element: {el}") ~~~ +### Actions and readers + +An action that needs values for its computations will request it from a reader, e.g. a column created via `Define` or +available from the input dataset. The action will request values from each column of the list of input columns (either +inferred or specified by the user), in order. For example: + +~~~{.cpp} +ROOT::RDataFrame df{1}; +auto df1 = df.Define("x", []{ return 11; }); +auto df2 = df1.Define("y", []{ return 22; }); +auto graph = df2.Graph("x","y"); +~~~ + +The `Graph` action is going to request first the value from column "x", then that of column "y". Specifically, the order +of execution of the operations of nodes in this branch of the computation graph is guaranteed to be top to bottom. + \anchor distrdf ## Distributed execution