From 7465986654fb53a278cc7d103c310042b005e24a Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Fri, 15 May 2026 17:26:23 +0200 Subject: [PATCH 1/2] [RDF][docs] Group the RDataFrame interface into member groups. Create groups for transformations and actions. This makes it easier to look at the RDF interface at a glance, because base-class functions are inlined in the list of functions. This required reordering of the code, but this is an NFC. --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 394 +++++++++++---------- 1 file changed, 214 insertions(+), 180 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index 3da4f47624608..a1be74557da86 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -166,9 +166,6 @@ public: { } - /// \name Transformation - /// \{ - //////////////////////////////////////////////////////////////////////////// /// \brief Cast any RDataFrame node to a common type ROOT::RDF::RNode. /// Different RDataFrame methods return different C++ types. All nodes, however, @@ -192,6 +189,14 @@ public: return RNode(std::static_pointer_cast<::ROOT::Detail::RDF::RNodeBase>(fProxiedPtr), *fLoopManager, fColRegister); } + /// \name Transformations + /// These functions transform the columns of the dataframe, such as filtering events or defining columns. + /// Transformations can be chained, for example + /// ~~~{.cpp} + /// auto filtered = rdf.Filter(...).Define(...).Define(...); + /// ~~~ + /// \{ + //////////////////////////////////////////////////////////////////////////// /// \brief Append a filter to the call graph. /// \param[in] f Function, lambda expression, functor class or any other callable object. 
It must return a `bool` @@ -1314,6 +1319,55 @@ public: return newInterface; } + // clang-format off + //////////////////////////////////////////////////////////////////////////// + /// \brief Creates a node that filters entries based on range: [begin, end). + /// \param[in] begin Initial entry number considered for this range. + /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset. + /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0. + /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries. + /// + /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset. + /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries + /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards + /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3 + /// ~~~ + // clang-format on + RInterface> Range(unsigned int begin, unsigned int end, unsigned int stride = 1) + { + // check invariants + if (stride == 0 || (end != 0 && end < begin)) + throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin."); + CheckIMTDisabled("Range"); + + using Range_t = RDFDetail::RRange; + auto rangePtr = std::make_shared(begin, end, stride, fProxiedPtr); + RInterface> newInterface(std::move(rangePtr), *fLoopManager, fColRegister); + return newInterface; + } + + // clang-format off + //////////////////////////////////////////////////////////////////////////// + /// \brief Creates a node that filters entries based on range. 
+ /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset. + /// \return a node of the computation graph for which the range is defined. + /// + /// See the other Range overload for a detailed description. + // clang-format on + RInterface> Range(unsigned int end) { return Range(0, end, 1); } + + /// \} + // --------------------------------------------------------------------------------- + // End of the doxygen group for Transformations + + /// \name Immediate Actions + /// Immediate Actions start the event loop and generate a type of result. + /// \{ + template [[deprecated("Snapshot is not any more a template. You can safely remove the template parameters.")]] RResultPtr> @@ -1723,46 +1777,6 @@ public: return Cache(selectedColumns); } - // clang-format off - //////////////////////////////////////////////////////////////////////////// - /// \brief Creates a node that filters entries based on range: [begin, end). - /// \param[in] begin Initial entry number considered for this range. - /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset. - /// \param[in] stride Process one entry of the [begin, end) range every `stride` entries. Must be strictly greater than 0. - /// \return the first node of the computation graph for which the event loop is limited to a certain range of entries. - /// - /// Note that in case of previous Ranges and Filters the selected range refers to the transformed dataset. - /// Ranges are only available if EnableImplicitMT has _not_ been called. Multi-thread ranges are not supported. 
- /// - /// ### Example usage: - /// ~~~{.cpp} - /// auto d_0_30 = d.Range(0, 30); // Pick the first 30 entries - /// auto d_15_end = d.Range(15, 0); // Pick all entries from 15 onwards - /// auto d_15_end_3 = d.Range(15, 0, 3); // Stride: from event 15, pick an event every 3 - /// ~~~ - // clang-format on - RInterface> Range(unsigned int begin, unsigned int end, unsigned int stride = 1) - { - // check invariants - if (stride == 0 || (end != 0 && end < begin)) - throw std::runtime_error("Range: stride must be strictly greater than 0 and end must be greater than begin."); - CheckIMTDisabled("Range"); - - using Range_t = RDFDetail::RRange; - auto rangePtr = std::make_shared(begin, end, stride, fProxiedPtr); - RInterface> newInterface(std::move(rangePtr), *fLoopManager, fColRegister); - return newInterface; - } - - // clang-format off - //////////////////////////////////////////////////////////////////////////// - /// \brief Creates a node that filters entries based on range. - /// \param[in] end Final entry number (excluded) considered for this range. 0 means that the range goes until the end of the dataset. - /// \return a node of the computation graph for which the range is defined. - /// - /// See the other Range overload for a detailed description. - // clang-format on - RInterface> Range(unsigned int end) { return Range(0, end, 1); } // clang-format off //////////////////////////////////////////////////////////////////////////// @@ -1828,64 +1842,13 @@ public: fLoopManager->Run(); } - // clang-format off - //////////////////////////////////////////////////////////////////////////// - /// \brief Execute a user-defined reduce operation on the values of a column. - /// \tparam F The type of the reduce callable. Automatically deduced. - /// \tparam T The type of the column to apply the reduction to. Automatically deduced. - /// \param[in] f A callable with signature `T(T,T)` - /// \param[in] columnName The column to be reduced. 
If omitted, the first default column is used instead. - /// \return the reduced quantity wrapped in a ROOT::RDF:RResultPtr. - /// - /// A reduction takes two values of a column and merges them into one (e.g. - /// by summing them, taking the maximum, etc). This action performs the - /// specified reduction operation on all processed column values, returning - /// a single value of the same type. The callable f must satisfy the general - /// requirements of a *processing function* besides having signature `T(T,T)` - /// where `T` is the type of column columnName. - /// - /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a - /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific - /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this - /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce` - /// overload. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol"); - /// ~~~ - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - // clang-format on - template ::ret_type> - RResultPtr Reduce(F f, std::string_view columnName = "") - { - static_assert( - std::is_default_constructible::value, - "reduce object cannot be default-constructed. Please provide an initialisation value (redIdentity)"); - return Reduce(std::move(f), columnName, T()); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Execute a user-defined reduce operation on the values of a column. - /// \tparam F The type of the reduce callable. Automatically deduced. - /// \tparam T The type of the column to apply the reduction to. Automatically deduced. 
- /// \param[in] f A callable with signature `T(T,T)` - /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead. - /// \param[in] redIdentity The reduced object of each thread is initialized to this value. - /// \return the reduced quantity wrapped in a RResultPtr. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42); - /// ~~~ - /// See the description of the first Reduce overload for more information. - template ::ret_type> - RResultPtr Reduce(F f, std::string_view columnName, const T &redIdentity) - { - return Aggregate(f, f, columnName, redIdentity); - } + /// \} + // End of doxygen group for immediate actions + // ---------------------------------------------------------------------------------------- + /// \name Actions + /// Actions declare a type of result to be produced, for example histograms or summary statistics. + /// Actions are lazy, i.e. they are only executed once a result is requested. + /// \{ //////////////////////////////////////////////////////////////////////////// /// \brief Return the number of entries processed (*lazy action*). @@ -3449,6 +3412,105 @@ public: return MakeResultPtr(rep, *fLoopManager, std::move(action)); } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \tparam ColumnTypes variadic list of branch/column types. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// This function returns a `RResultPtr` containing all the entries to be displayed, organized in a tabular + /// form. 
RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will + /// return a complete version through `RDisplay::AsString()`. + /// + /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see + /// RResultPtr. + /// + /// Example usage: + /// ~~~{.cpp} + /// // Preparing the RResultPtr object with all columns and default number of entries + /// auto d1 = rdf.Display(""); + /// // Preparing the RResultPtr object with two columns and 128 entries + /// auto d2 = d.Display({"x", "y"}, 128); + /// // Printing the short representations, the event loop will run + /// d1->Print(); + /// d2->Print(); + /// ~~~ + template + RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + CheckIMTDisabled("Display"); + auto newCols = columnList; + newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column + auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); + using displayHelperArgs_t = std::pair>; + // Need to add ULong64_t type corresponding to the first column rdfentry_ + return CreateAction( + std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// This overload automatically infers the column types. + /// See the previous overloads for further details. 
+ /// + /// Invoked when no types are specified to Display + RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + CheckIMTDisabled("Display"); + auto newCols = columnList; + newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column + auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); + using displayHelperArgs_t = std::pair>; + return CreateAction( + std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr, + columnList.size() + 1); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnNameRegexp A regular expression to select the columns. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// The existing columns are matched against the regular expression. If the string provided + /// is empty, all columns are selected. + /// See the previous overloads for further details. + RResultPtr + Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + const auto columnNames = GetColumnNames(); + const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display"); + return Display(selectedColumns, nRows, nMaxCollectionElements); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Number of maximum elements in collection. 
+ /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// See the previous overloads for further details. + RResultPtr + Display(std::initializer_list columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + ColumnNames_t selectedColumns(columnList); + return Display(selectedColumns, nRows, nMaxCollectionElements); + } + + /// \} + // End of the doxygen group for actions + // ---------------------------------------------------------------------------------------- + /// \brief Returns the names of the filters created. /// \return the container of filters names. /// @@ -3465,6 +3527,11 @@ public: /// std::vector GetFilterNames() { return RDFInternal::GetFilterNames(fProxiedPtr); } + /// \name User-defined Actions (lazy) + /// Pass user-defined functions to be applied to the data and create results. + /// These actions are lazy, i.e., they only run once a result is actually requested. + /// \{ + // clang-format off //////////////////////////////////////////////////////////////////////////// /// \brief Execute a user-defined accumulation operation on the processed column values in each processing slot. @@ -3640,101 +3707,68 @@ public: } } + + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \tparam ColumnTypes variadic list of branch/column types. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// \brief Execute a user-defined reduce operation on the values of a column. + /// \tparam F The type of the reduce callable. Automatically deduced. + /// \tparam T The type of the column to apply the reduction to. Automatically deduced. 
+ /// \param[in] f A callable with signature `T(T,T)` + /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead. + /// \return the reduced quantity wrapped in a ROOT::RDF::RResultPtr. /// - /// This function returns a `RResultPtr` containing all the entries to be displayed, organized in a tabular - /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will - /// return a complete version through `RDisplay::AsString()`. + /// A reduction takes two values of a column and merges them into one (e.g. + /// by summing them, taking the maximum, etc). This action performs the + /// specified reduction operation on all processed column values, returning + /// a single value of the same type. The callable f must satisfy the general + /// requirements of a *processing function* besides having signature `T(T,T)` + /// where `T` is the type of column columnName. /// - /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see - /// RResultPtr. + /// The returned reduced value of each thread (e.g. the initial value of a sum) is initialized to a + /// default-constructed T object. This is commonly expected to be the neutral/identity element for the specific + /// reduction operation `f` (e.g. 0 for a sum, 1 for a product). If a default-constructed T does not satisfy this + /// requirement, users should explicitly specify an initialization value for T by calling the appropriate `Reduce` + /// overload. 
/// - /// Example usage: + /// ### Example usage: /// ~~~{.cpp} - /// // Preparing the RResultPtr object with all columns and default number of entries - /// auto d1 = rdf.Display(""); - /// // Preparing the RResultPtr object with two columns and 128 entries - /// auto d2 = d.Display({"x", "y"}, 128); - /// // Printing the short representations, the event loop will run - /// d1->Print(); - /// d2->Print(); + /// auto sumOfIntCol = d.Reduce([](int x, int y) { return x + y; }, "intCol"); /// ~~~ - template - RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) - { - CheckIMTDisabled("Display"); - auto newCols = columnList; - newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column - auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); - using displayHelperArgs_t = std::pair>; - // Need to add ULong64_t type corresponding to the first column rdfentry_ - return CreateAction( - std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. - /// - /// This overload automatically infers the column types. - /// See the previous overloads for further details. 
/// - /// Invoked when no types are specified to Display - RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) - { - CheckIMTDisabled("Display"); - auto newCols = columnList; - newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column - auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); - using displayHelperArgs_t = std::pair>; - return CreateAction( - std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr, - columnList.size() + 1); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnNameRegexp A regular expression to select the columns. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. - /// - /// The existing columns are matched against the regular expression. If the string provided - /// is empty, all columns are selected. - /// See the previous overloads for further details. - RResultPtr - Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + // clang-format on + template ::ret_type> + RResultPtr Reduce(F f, std::string_view columnName = "") { - const auto columnNames = GetColumnNames(); - const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display"); - return Display(selectedColumns, nRows, nMaxCollectionElements); + static_assert( + std::is_default_constructible::value, + "reduce object cannot be default-constructed. 
Please provide an initialisation value (redIdentity)"); + return Reduce(std::move(f), columnName, T()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Number of maximum elements in collection. - /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// \brief Execute a user-defined reduce operation on the values of a column. + /// \tparam F The type of the reduce callable. Automatically deduced. + /// \tparam T The type of the column to apply the reduction to. Automatically deduced. + /// \param[in] f A callable with signature `T(T,T)` + /// \param[in] columnName The column to be reduced. If omitted, the first default column is used instead. + /// \param[in] redIdentity The reduced object of each thread is initialized to this value. + /// \return the reduced quantity wrapped in a RResultPtr. /// - /// See the previous overloads for further details. - RResultPtr - Display(std::initializer_list columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// ### Example usage: + /// ~~~{.cpp} + /// auto sumOfIntColWithOffset = d.Reduce([](int x, int y) { return x + y; }, "intCol", 42); + /// ~~~ + /// See the description of the first Reduce overload for more information. 
+ template ::ret_type> + RResultPtr Reduce(F f, std::string_view columnName, const T &redIdentity) { - ColumnNames_t selectedColumns(columnList); - return Display(selectedColumns, nRows, nMaxCollectionElements); + return Aggregate(f, f, columnName, redIdentity); } /// \} + // End of the doxygen group for user-defined actions private: template ::ret_type> From f07dc90a368b008c8894ac5f55da7e4d461f7038 Mon Sep 17 00:00:00 2001 From: Stephan Hageboeck Date: Wed, 20 May 2026 13:09:07 +0200 Subject: [PATCH 2/2] [RDF][NFC] Move immediate actions after actions in RInterface.hxx - Also explain that these are eager. --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 3317 ++++++++++---------- 1 file changed, 1659 insertions(+), 1658 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index a1be74557da86..aa3bb93285d00 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1364,2151 +1364,2152 @@ public: // --------------------------------------------------------------------------------- // End of the doxygen group for Transformations - /// \name Immediate Actions - /// Immediate Actions start the event loop and generate a type of result. + /// \name Actions + /// Actions declare a type of result to be produced, for example histograms or summary statistics. + /// Actions are lazy, i.e. they are only executed once a result is requested. /// \{ - template - [[deprecated("Snapshot is not any more a template. You can safely remove the template parameters.")]] - RResultPtr> - Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, - const RSnapshotOptions &options = RSnapshotOptions()) - { - return Snapshot(treename, filename, columnList, options); - } - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. 
- /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. - /// \param[in] columnList The list of names of the columns/branches/fields to be written. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. + /// \brief Return the number of entries processed (*lazy action*). + /// \return the number of entries wrapped in a RResultPtr. /// - /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.') - /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot. - /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also - /// written out and it appears before the array in the columnList. + /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`). + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. /// - /// By default, in case of TTree, TChain or RNTuple inputs, Snapshot will try to write out all top-level branches. - /// For other types of inputs, all columns returned by GetColumnNames() will be written out. Systematic variations of - /// columns will be included if the corresponding flag is set in RSnapshotOptions. See \ref snapshot-with-variations - /// "Snapshot with Variations" for more details. If friend trees or chains are present, by default all friend - /// top-level branches that have names that do not collide with names of branches in the main TTree/TChain will be - /// written out. 
Since v6.24, Snapshot will also write out friend branches with the same names of branches in the - /// main TTree/TChain with names of the form - /// `<friendname>_<branchname>` in order to differentiate them from the branches in the main tree/chain. + /// ### Example usage: + /// ~~~{.cpp} + /// auto nEntriesAfterCuts = myFilteredDf.Count(); + /// ~~~ /// - /// ### Writing to a sub-directory + RResultPtr Count() + { + const auto nSlots = fLoopManager->GetNSlots(); + auto cSPtr = std::make_shared(0); + using Helper_t = RDFInternal::CountHelper; + using Action_t = RDFInternal::RAction; + auto action = std::make_unique(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr, + RDFInternal::RColumnRegister(fColRegister)); + return MakeResultPtr(cSPtr, *fLoopManager, std::move(action)); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default). + /// \tparam T The type of the column. + /// \tparam COLL The type of collection used to store the values. + /// \param[in] column The name of the column to collect the values of. + /// \return the content of the selected column wrapped in a RResultPtr. /// - /// Snapshot supports writing the TTree or RNTuple in a sub-directory inside the TFile. It is sufficient to specify - /// the directory path as part of the TTree or RNTuple name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree - /// `t` in the sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed). + /// The collection type to be specified for C-style array columns is `RVec<T>`: + /// in this case the returned collection is a `std::vector<RVec<T>>`. 
+ /// ### Example usage: + /// ~~~{.cpp} + /// // In this case intCol is a std::vector + /// auto intCol = rdf.Take("integerColumn"); + /// // Same content as above but in this case taken as a RVec + /// auto intColAsRVec = rdf.Take>("integerColumn"); + /// // In this case intCol is a std::vector>, a collection of collections + /// auto cArrayIntCol = rdf.Take>("cArrayInt"); + /// ~~~ + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + template > + RResultPtr Take(std::string_view column = "") + { + const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)}); + + const auto validColumnNames = GetValidatedColumnNames(1, columns); + CheckAndFillDSColumns(validColumnNames, TTraits::TypeList()); + + using Helper_t = RDFInternal::TakeHelper; + using Action_t = RDFInternal::RAction; + auto valuesPtr = std::make_shared(); + const auto nSlots = fLoopManager->GetNSlots(); + + auto action = + std::make_unique(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister); + return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action)); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*). + /// \tparam V The type of the column used to fill the histogram. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] vName The name of the column that will fill the histogram. + /// \return the monodimensional histogram wrapped in a RResultPtr. /// - /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of - /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled - /// with respect to the input TTree. 
Using such "shuffled" TTrees as friends of the original trees would result in - /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will - /// error out if such a "shuffled" TTree is used in a friendship. + /// Columns can be of a container type (e.g. `std::vector`), in which case the histogram + /// is filled with each one of the elements of the container. In case multiple columns of container type + /// are provided (e.g. values and weights) they must have the same length for each one of the events (but + /// possibly different lengths between events). + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. /// - /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the - /// requested output TTree or RNTuple to the file, with all the branches requested to preserve the dataset schema. + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column type (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn"); + /// // Explicit column type + /// auto myHist2 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn"); + /// ~~~ /// - /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns - /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are - /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an - /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`. 
+ /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). + template + RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "") + { + const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)}); + + const auto validatedColumns = GetValidatedColumnNames(1, userColumns); + + std::shared_ptr<::TH1D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + } + + if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin()) + h->SetCanExtend(::TH1::kAllAxes); + return CreateAction(validatedColumns, h, h, fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*). + /// \tparam V The type of the column used to fill the histogram. + /// \param[in] vName The name of the column that will fill the histogram. + /// \return the monodimensional histogram wrapped in a RResultPtr. /// - /// ### Example invocations: + /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.). + /// The "name" and "title" strings are built starting from the input column name. + /// See the description of the first Histo1D() overload for more details. 
/// + /// ### Example usage: /// ~~~{.cpp} - /// // No need to specify column types, they are automatically deduced thanks - /// // to information coming from the data source - /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"}); + /// // Deduce column type (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo1D("myColumn"); + /// // Explicit column type + /// auto myHist2 = myDf.Histo1D("myColumn"); /// ~~~ + template + RResultPtr<::TH1D> Histo1D(std::string_view vName) + { + const auto h_name = std::string(vName); + const auto h_title = h_name + ";" + h_name + ";count"; + return Histo1D({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*). + /// \tparam V The type of the column used to fill the histogram. + /// \tparam W The type of the column used as weights. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] vName The name of the column that will fill the histogram. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the monodimensional histogram wrapped in a RResultPtr. /// - /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in - /// `RSnapshotOptions`: - /// ~~~{.cpp} - /// RSnapshotOptions opts; - /// opts.fLazy = true; - /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts); - /// ~~~ + /// See the description of the first Histo1D() overload for more details. 
/// - /// To snapshot to the RNTuple data format, the `fOutputFormat` option in `RSnapshotOptions` needs to be set - /// accordingly: + /// ### Example usage: /// ~~~{.cpp} - /// RSnapshotOptions opts; - /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; - /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts); + /// // Deduce column type (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight"); + /// // Explicit column type + /// auto myHist2 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight"); /// ~~~ + template + RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName) + { + const std::vector columnViews = {vName, wName}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + std::shared_ptr<::TH1D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + } + + if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin()) + h->SetCanExtend(::TH1::kAllAxes); + return CreateAction(userColumns, h, h, fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*). + /// \tparam V The type of the column used to fill the histogram. + /// \tparam W The type of the column used as weights. + /// \param[in] vName The name of the column that will fill the histogram. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the monodimensional histogram wrapped in a RResultPtr. 
/// - /// Snapshot systematic variations resulting from a Vary() call (see details \ref snapshot-with-variations "here"): + /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.). + /// The "name" and "title" strings are built starting from the input column names. + /// See the description of the first Histo1D() overload for more details. + /// + /// ### Example usage: /// ~~~{.cpp} - /// RSnapshotOptions opts; - /// opts.fIncludeVariations = true; - /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo1D("myValue", "myweight"); + /// // Explicit column types + /// auto myHist2 = myDf.Histo1D("myValue", "myweight"); /// ~~~ - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - const ColumnNames_t &columnList, - const RSnapshotOptions &options = RSnapshotOptions()) + template + RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName) { - // like columnList but with `#var` columns removed - auto colListNoPoundSizes = RDFInternal::FilterArraySizeColNames(columnList, "Snapshot"); - // like columnListWithoutSizeColumns but with aliases resolved - auto colListNoAliases = GetValidatedColumnNames(colListNoPoundSizes.size(), colListNoPoundSizes); - RDFInternal::CheckForDuplicateSnapshotColumns(colListNoAliases); - // like validCols but with missing size branches required by array branches added in the right positions - const auto pairOfColumnLists = - RDFInternal::AddSizeBranches(GetDataSource(), std::move(colListNoAliases), std::move(colListNoPoundSizes)); - const auto &colListNoAliasesWithSizeBranches = pairOfColumnLists.first; - const auto &colListWithAliasesAndSizeBranches = pairOfColumnLists.second; - - const auto fullTreeName = treename; - const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName); - treename = parsedTreePath.fTreeName; - const auto &dirname = 
parsedTreePath.fDirName; - - ::TDirectory::TContext ctxt; - - RResultPtr> resPtr; - - auto retrieveTypeID = [](const std::string &colName, const std::string &colTypeName, - bool isRNTuple = false) -> const std::type_info * { - try { - return &ROOT::Internal::RDF::TypeName2TypeID(colTypeName); - } catch (const std::runtime_error &err) { - if (isRNTuple) - return &typeid(ROOT::Internal::RDF::UseNativeDataType); - - if (std::string(err.what()).find("Cannot extract type_info of type") != std::string::npos) { - // We could not find RTTI for this column, thus we cannot write it out at the moment. - std::string trueTypeName{colTypeName}; - if (colTypeName.rfind("CLING_UNKNOWN_TYPE", 0) == 0) - trueTypeName = colTypeName.substr(19); - std::string msg{"No runtime type information is available for column \"" + colName + - "\" with type name \"" + trueTypeName + - "\". Thus, it cannot be written to disk with Snapshot. Make sure to generate and load " - "ROOT dictionaries for the type of this column."}; - - throw std::runtime_error(msg); - } else { - throw; - } - } - }; - - RDFInternal::CheckSnapshotOptionsFormatCompatibility(options); - - if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) { - // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one - // without a data source for now, and set it once the actual data source can be created (i.e., after - // writing the RNTuple). 
- auto newRDF = std::make_shared>(std::make_shared(colListNoPoundSizes)); - - auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ - std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false}); - - auto &&nColumns = colListNoAliasesWithSizeBranches.size(); - const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); - - const auto nSlots = fLoopManager->GetNSlots(); - std::vector colTypeIDs; - colTypeIDs.reserve(nColumns); - for (decltype(nColumns) i{}; i < nColumns; i++) { - const auto &colName = validColumnNames[i]; - const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName( - colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec); - const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName, /*isRNTuple*/ true); - colTypeIDs.push_back(colTypeID); - } - // Crucial e.g. if the column names do not correspond to already-available column readers created by the data - // source - CheckAndFillDSColumns(validColumnNames, colTypeIDs); - - auto action = - RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs); - resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action)); - } else { - if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" && - options.fOutputFormat == ESnapshotOutputFormat::kDefault) { - Warning("Snapshot", - "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you " - "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in " - "RSnapshotOptions. Note that this current default behaviour might change in the future."); - } - - // We create an RLoopManager without a data source. 
This needs to be initialised when the output TTree dataset - // has actually been created and written to TFile, i.e. at the end of the Snapshot execution. - auto newRDF = std::make_shared>( - std::make_shared(colListNoAliasesWithSizeBranches)); - - auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ - std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, - options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations}); - - auto &&nColumns = colListNoAliasesWithSizeBranches.size(); - const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); - - const auto nSlots = fLoopManager->GetNSlots(); - std::vector colTypeIDs; - colTypeIDs.reserve(nColumns); - for (decltype(nColumns) i{}; i < nColumns; i++) { - const auto &colName = validColumnNames[i]; - const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName( - colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec); - const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName); - colTypeIDs.push_back(colTypeID); - } - // Crucial e.g. if the column names do not correspond to already-available column readers created by the data - // source - CheckAndFillDSColumns(validColumnNames, colTypeIDs); - - auto action = - RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs); - resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action)); - } - - if (!options.fLazy) - *resPtr; - return resPtr; - } - - // clang-format off - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. - /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. 
- /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. - /// - /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - std::string_view columnNameRegexp = "", - const RSnapshotOptions &options = RSnapshotOptions()) - { - const auto definedColumns = fColRegister.GenerateColumnNames(); - - const auto dsColumns = GetDataSource() ? ROOT::Internal::RDF::GetTopLevelFieldNames(*GetDataSource()) : ColumnNames_t{}; - // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those - ColumnNames_t dsColumnsWithoutSizeColumns; - std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns), - [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; }); - ColumnNames_t columnNames; - columnNames.reserve(definedColumns.size() + dsColumnsWithoutSizeColumns.size()); - columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end()); - columnNames.insert(columnNames.end(), dsColumnsWithoutSizeColumns.begin(), dsColumnsWithoutSizeColumns.end()); - - // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd. 
- // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. - RDFInternal::RemoveDuplicates(columnNames); - - std::vector selectedColumns; - try { - selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); - } - catch (const std::runtime_error &e){ - // No columns were found, try again but consider all input data source columns - if (auto ds = GetDataSource()) - selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot"); - else - throw e; - } - - if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") { - RDFInternal::RemoveRNTupleSubfields(selectedColumns); - } - - return Snapshot(treename, filename, selectedColumns, options); + // We build name and title based on the value and weight column names + std::string str_vName{vName}; + std::string str_wName{wName}; + const auto h_name = str_vName + "_weighted_" + str_wName; + const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName; + return Histo1D({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName); } - // clang-format on - // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. - /// \param[in] treename The name of the output TTree or RNTuple. - /// \param[in] filename The name of the output TFile. - /// \param[in] columnList The list of names of the columns/branches to be written. - /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. - /// \return a `RDataFrame` that wraps the snapshotted dataset. - /// - /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. - /// The types of the columns are automatically inferred and do not need to be specified. 
+ /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*). + /// \tparam V The type of the column used to fill the histogram. + /// \tparam W The type of the column used as weights. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \return the monodimensional histogram wrapped in a RResultPtr. /// - /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. - RResultPtr> Snapshot(std::string_view treename, std::string_view filename, - std::initializer_list columnList, - const RSnapshotOptions &options = RSnapshotOptions()) + /// This overload will use the first two default columns as column names. + /// See the description of the first Histo1D() overload for more details. + template + RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}) { - ColumnNames_t selectedColumns(columnList); - return Snapshot(treename, filename, selectedColumns, options); + return Histo1D(model, "", ""); } - // clang-format on //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns in memory. - /// \tparam ColumnTypes variadic list of branch/column types. - /// \param[in] columnList columns to be cached in memory. - /// \return a `RDataFrame` that wraps the cached dataset. - /// - /// This action returns a new `RDataFrame` object, completely detached from - /// the originating `RDataFrame`. The new dataframe only contains the cached - /// columns and stores their content in memory for fast, zero-copy subsequent access. - /// - /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that - /// fits in memory and that will be accessed many times. + /// \brief Fill and return a two-dimensional histogram (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. 
+ /// \tparam V2 The type of the column used to fill the y axis of the histogram. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \return the bidimensional histogram wrapped in a RResultPtr. /// - /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns - /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are - /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an - /// Alias(): `df.Alias("nbar", "#bar").Cache(..., {"nbar"})`. + /// Columns can be of a container type (e.g. std::vector), in which case the histogram + /// is filled with each one of the elements of the container. In case multiple columns of container type + /// are provided (e.g. values and weights) they must have the same length for each one of the events (but + /// possibly different lengths between events). + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. 
/// /// ### Example usage: - /// - /// **Types and columns specified:** /// ~~~{.cpp} - /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY"); + /// // Explicit column types + /// auto myHist2 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY"); /// ~~~ /// - /// **Types inferred and columns specified (this invocation relies on jitting):** - /// ~~~{.cpp} - /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"}); - /// ~~~ /// - /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):** - /// ~~~{.cpp} - /// auto cache_all_cols_df = df.Cache(myRegexp); - /// ~~~ - template - RInterface Cache(const ColumnNames_t &columnList) + /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). + template + RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "") { - auto staticSeq = std::make_index_sequence(); - return CacheImpl(columnList, staticSeq); + std::shared_ptr<::TH2D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + } + if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) { + throw std::runtime_error("2D histograms with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? 
ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns in memory. - /// \param[in] columnList columns to be cached in memory - /// \return a `RDataFrame` that wraps the cached dataset. + /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. + /// \tparam V2 The type of the column used to fill the y axis of the histogram. + /// \tparam W The type of the column used for the weights of the histogram. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the bidimensional histogram wrapped in a RResultPtr. /// - /// See the previous overloads for more information. - RInterface Cache(const ColumnNames_t &columnList) + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column types (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight"); + /// // Explicit column types + /// auto myHist2 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight"); + /// ~~~ + /// + /// See the documentation of the first Histo2D() overload for more details. 
+ template + RResultPtr<::TH2D> + Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName) { - // Early return: if the list of columns is empty, just return an empty RDF - // If we proceed, the jitted call will not compile! - if (columnList.empty()) { - auto nEntries = *this->Count(); - RInterface emptyRDF(std::make_shared(nEntries)); - return emptyRDF; + std::shared_ptr<::TH2D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); } - - std::stringstream cacheCall; - auto upcastNode = RDFInternal::UpcastNode(fProxiedPtr); - RInterface> upcastInterface(fProxiedPtr, *fLoopManager, - fColRegister); - // build a string equivalent to - // "(RInterface*)(this)->Cache(*(ColumnNames_t*)(&columnList))" - RInterface resRDF(std::make_shared(0)); - cacheCall << "*reinterpret_cast*>(" - << RDFInternal::PrettyPrintAddr(&resRDF) - << ") = reinterpret_cast*>(" - << RDFInternal::PrettyPrintAddr(&upcastInterface) << ")->Cache<"; - - const auto columnListWithoutSizeColumns = RDFInternal::FilterArraySizeColNames(columnList, "Cache"); - - const auto validColumnNames = - GetValidatedColumnNames(columnListWithoutSizeColumns.size(), columnListWithoutSizeColumns); - const auto colTypes = - GetValidatedArgTypes(validColumnNames, fColRegister, nullptr, GetDataSource(), "Cache", /*vector2RVec=*/false); - for (const auto &colType : colTypes) - cacheCall << colType << ", "; - if (!columnListWithoutSizeColumns.empty()) - cacheCall.seekp(-2, cacheCall.cur); // remove the last ", - cacheCall << ">(*reinterpret_cast*>(" // vector should be ColumnNames_t - << RDFInternal::PrettyPrintAddr(&columnListWithoutSizeColumns) << "));"; - - // book the code to jit with the RLoopManager and trigger the event loop - fLoopManager->ToJitExec(cacheCall.str()); - fLoopManager->Jit(); - - return resRDF; + if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) { + throw std::runtime_error("2D histograms with no axes 
limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, wName}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); } - //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns in memory. - /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns. - /// \return a `RDataFrame` that wraps the cached dataset. - /// - /// The existing columns are matched against the regular expression. If the string provided - /// is empty, all columns are selected. See the previous overloads for more information. - RInterface Cache(std::string_view columnNameRegexp = "") + template + RResultPtr<::TH2D> Histo2D(const TH2DModel &model) { - const auto definedColumns = fColRegister.GenerateColumnNames(); - const auto dsColumns = GetDataSource() ? 
GetDataSource()->GetColumnNames() : ColumnNames_t{}; - // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those - ColumnNames_t dsColumnsWithoutSizeColumns; - std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns), - [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; }); - ColumnNames_t columnNames; - columnNames.reserve(definedColumns.size() + dsColumns.size()); - columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end()); - columnNames.insert(columnNames.end(), dsColumns.begin(), dsColumns.end()); - const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Cache"); - return Cache(selectedColumns); + return Histo2D(model, "", "", ""); } //////////////////////////////////////////////////////////////////////////// - /// \brief Save selected columns in memory. - /// \param[in] columnList columns to be cached in memory. - /// \return a `RDataFrame` that wraps the cached dataset. + /// \brief Fill and return a three-dimensional histogram (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. + /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. + /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] v3Name The name of the column that will fill the z axis. + /// \return the tridimensional histogram wrapped in a RResultPtr. /// - /// See the previous overloads for more information. 
- RInterface Cache(std::initializer_list columnList) + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column types (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, + /// "myValueX", "myValueY", "myValueZ"); + /// // Explicit column types + /// auto myHist2 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, + /// "myValueX", "myValueY", "myValueZ"); + /// ~~~ + /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D + /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in + /// the RDataFrame description. + /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). + template + RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "", + std::string_view v3Name = "") { - ColumnNames_t selectedColumns(columnList); - return Cache(selectedColumns); + std::shared_ptr<::TH3D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + } + if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) { + throw std::runtime_error("3D histograms with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, v3Name}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? 
ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); } - - // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Execute a user-defined function on each entry (*instant action*). - /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations. - /// \param[in] columns Names of the columns/branches in input to the user function. + /// \brief Fill and return a three-dimensional histogram (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. + /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. + /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. + /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] v3Name The name of the column that will fill the z axis. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the tridimensional histogram wrapped in a RResultPtr. /// - /// The callable `f` is invoked once per entry. This is an *instant action*: - /// upon invocation, an event loop as well as execution of all scheduled actions - /// is triggered. - /// Users are responsible for the thread-safety of this callable when executing - /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT). + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. 
/// /// ### Example usage: /// ~~~{.cpp} - /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, + /// "myValueX", "myValueY", "myValueZ", "myWeight"); + /// // Explicit column types + /// using d_t = double; + /// auto myHist2 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, + /// "myValueX", "myValueY", "myValueZ", "myWeight"); /// ~~~ - // clang-format on - template - void Foreach(F f, const ColumnNames_t &columns = {}) + /// + /// + /// See the documentation of the first Histo2D() overload for more details. + template + RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, + std::string_view v3Name, std::string_view wName) { - using arg_types = typename TTraits::CallableTraits::arg_types_nodecay; - using ret_type = typename TTraits::CallableTraits::ret_type; - ForeachSlot(RDFInternal::AddSlotParameter(f, arg_types()), columns); + std::shared_ptr<::TH3D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + } + if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) { + throw std::runtime_error("3D histograms with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, v3Name, wName}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); + } + + template + RResultPtr<::TH3D> Histo3D(const TH3DModel &model) + { + return Histo3D(model, "", "", "", ""); } - // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*). 
- /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations. - /// \param[in] columns Names of the columns/branches in input to the user function. + /// \brief Fill and return an N-dimensional histogram (*lazy action*). + /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not + /// present. + /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the + /// object. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] columnList + /// A list containing the names of the columns that will be passed when calling `Fill`. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the N-dimensional histogram wrapped in a RResultPtr. /// - /// Same as `Foreach`, but the user-defined function takes an extra - /// `unsigned int` as its first parameter, the *processing slot index*. - /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`, - /// for each thread of execution. - /// This is meant as a helper in writing thread-safe `Foreach` - /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`. - /// The user-defined processing callable is able to follow different - /// *streams of processing* indexed by the first parameter. - /// `ForeachSlot` works just as well with single-thread execution: in that - /// case `slot` will always be `0`. + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. See RResultPtr documentation. 
/// /// ### Example usage: /// ~~~{.cpp} - /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"}); + /// auto myFilledObj = myDf.HistoND({"name","title", 4, + /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, + /// {"col0", "col1", "col2", "col3"}); /// ~~~ - // clang-format on - template - void ForeachSlot(F f, const ColumnNames_t &columns = {}) + /// + /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new + /// argument `wName`: `HistoND(model, cols, weightCol)`. + /// + template // need FirstColumn to disambiguate overloads + RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") { - using ColTypes_t = TypeTraits::RemoveFirstParameter_t::arg_types>; - constexpr auto nColumns = ColTypes_t::list_size; + std::shared_ptr<::THnD> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + const auto hDims = h->GetNdimensions(); + decltype(hDims) nCols = columnList.size(); - const auto validColumnNames = GetValidatedColumnNames(nColumns, columns); - CheckAndFillDSColumns(validColumnNames, ColTypes_t()); + if (!wName.empty() && nCols == hDims + 1) + throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " + "input columns contains one column more than the number of dimensions of the " + "histogram. Call as 'HistoND(model, cols, weightCol)'."); - using Helper_t = RDFInternal::ForeachSlotHelper; - using Action_t = RDFInternal::RAction; + if (nCols == hDims + 1) + Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. " + "Instead, pass it as a separate argument, e.g. 
'HistoND(model, cols, weightCol)'."); - auto action = std::make_unique(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister); + if (!wName.empty() || nCols == hDims + 1) + h->Sumw2(); - fLoopManager->Run(); - } + if (nCols != hDims + 1 && nCols != hDims) + throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); + } - /// \} - // End of doxygen group for immediate actions - // ---------------------------------------------------------------------------------------- - /// \name Actions - /// Actions declare a type of result to be produced, for example histograms or summary statistics. - /// Actions are lazy, i.e. they are only executed once a result is requested. - /// \{ + if (!wName.empty()) { + // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of + // passed arguments is one more the number of dimensions of the histogram. + ColumnNames_t userColumns = columnList; + userColumns.push_back(std::string{wName}); + return CreateAction(userColumns, h, h, + fProxiedPtr); + } + return CreateAction(columnList, h, h, + fProxiedPtr); + } //////////////////////////////////////////////////////////////////////////// - /// \brief Return the number of entries processed (*lazy action*). - /// \return the number of entries wrapped in a RResultPtr. + /// \brief Fill and return an N-dimensional histogram (*lazy action*). + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \param[in] wName The name of the column that will provide the weights. + /// \return the N-dimensional histogram wrapped in a RResultPtr. /// - /// Useful e.g. for counting the number of entries passing a certain filter (see also `Report`). /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. 
Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto nEntriesAfterCuts = myFilteredDf.Count(); + /// auto myFilledObj = myDf.HistoND({"name","title", 4, + /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, + /// {"col0", "col1", "col2", "col3"}); /// ~~~ /// - RResultPtr Count() + /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new + /// argument `wName`: `HistoND(model, cols, weightCol)`. + /// + RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") { - const auto nSlots = fLoopManager->GetNSlots(); - auto cSPtr = std::make_shared(0); - using Helper_t = RDFInternal::CountHelper; - using Action_t = RDFInternal::RAction; - auto action = std::make_unique(Helper_t(cSPtr, nSlots), ColumnNames_t({}), fProxiedPtr, - RDFInternal::RColumnRegister(fColRegister)); - return MakeResultPtr(cSPtr, *fLoopManager, std::move(action)); + std::shared_ptr<::THnD> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + const auto hDims = h->GetNdimensions(); + decltype(hDims) nCols = columnList.size(); + + if (!wName.empty() && nCols == hDims + 1) + throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " + "input columns contains one column more than the number of dimensions of the " + "histogram. Call as 'HistoND(model, cols, weightCol)'."); + + if (nCols == hDims + 1) + Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. " + "Instead, pass it as a separate argument, e.g. 
'HistoND(model, cols, weightCol)'."); + + if (!wName.empty() || nCols == hDims + 1) + h->Sumw2(); + + if (nCols != hDims + 1 && nCols != hDims) + throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); + } + + if (!wName.empty()) { + // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of + // passed arguments is one more the number of dimensions of the histogram. + ColumnNames_t userColumns = columnList; + userColumns.push_back(std::string{wName}); + return CreateAction(userColumns, h, h, fProxiedPtr, + userColumns.size()); + } + return CreateAction(columnList, h, h, fProxiedPtr, + columnList.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Return a collection of values of a column (*lazy action*, returns a std::vector by default). - /// \tparam T The type of the column. - /// \tparam COLL The type of collection used to store the values. - /// \param[in] column The name of the column to collect the values of. - /// \return the content of the selected column wrapped in a RResultPtr. + /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*). + /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not + /// present. + /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the + /// object. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] columnList + /// A list containing the names of the columns that will be passed when calling `Fill`. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the N-dimensional histogram wrapped in a RResultPtr. + /// + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. See RResultPtr documentation. 
/// - /// The collection type to be specified for C-style array columns is `RVec`: - /// in this case the returned collection is a `std::vector>`. /// ### Example usage: /// ~~~{.cpp} - /// // In this case intCol is a std::vector - /// auto intCol = rdf.Take("integerColumn"); - /// // Same content as above but in this case taken as a RVec - /// auto intColAsRVec = rdf.Take>("integerColumn"); - /// // In this case intCol is a std::vector>, a collection of collections - /// auto cArrayIntCol = rdf.Take>("cArrayInt"); + /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4, + /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, + /// {"col0", "col1", "col2", "col3"}); /// ~~~ - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - template > - RResultPtr Take(std::string_view column = "") + /// + /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new + /// argument `wName`: `HistoND(model, cols, weightCol)`. + /// + template // need FirstColumn to disambiguate overloads + RResultPtr<::THnSparseD> + HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") { - const auto columns = column.empty() ? ColumnNames_t() : ColumnNames_t({std::string(column)}); + std::shared_ptr<::THnSparseD> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetHistogram(); + const auto hDims = h->GetNdimensions(); + decltype(hDims) nCols = columnList.size(); - const auto validColumnNames = GetValidatedColumnNames(1, columns); - CheckAndFillDSColumns(validColumnNames, TTraits::TypeList()); + if (!wName.empty() && nCols == hDims + 1) + throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " + "input columns contains one column more than the number of dimensions of the " + "histogram. 
Call as 'HistoNSparseD(model, cols, weightCol)'."); - using Helper_t = RDFInternal::TakeHelper; - using Action_t = RDFInternal::RAction; - auto valuesPtr = std::make_shared(); - const auto nSlots = fLoopManager->GetNSlots(); + if (nCols == hDims + 1) + Warning("HistoNSparseD", + "Passing the column with the weights as the last column in the list is deprecated. " + "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'."); - auto action = - std::make_unique(Helper_t(valuesPtr, nSlots), validColumnNames, fProxiedPtr, fColRegister); - return MakeResultPtr(valuesPtr, *fLoopManager, std::move(action)); + if (!wName.empty() || nCols == hDims + 1) + h->Sumw2(); + + if (nCols != hDims + 1 && nCols != hDims) + throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); + } + + if (!wName.empty()) { + // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of + // passed arguments is one more the number of dimensions of the histogram. + ColumnNames_t userColumns = columnList; + userColumns.push_back(std::string{wName}); + return CreateAction(userColumns, h, h, + fProxiedPtr); + } + return CreateAction(columnList, h, h, + fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*). - /// \tparam V The type of the column used to fill the histogram. + /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*). /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] vName The name of the column that will fill the histogram. - /// \return the monodimensional histogram wrapped in a RResultPtr. 
+ /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \param[in] wName The name of the column that will provide the weights. + /// \return the N-dimensional histogram wrapped in a RResultPtr. /// - /// Columns can be of a container type (e.g. `std::vector`), in which case the histogram - /// is filled with each one of the elements of the container. In case multiple columns of container type - /// are provided (e.g. values and weights) they must have the same length for each one of the events (but - /// possibly different lengths between events). /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn"); - /// // Explicit column type - /// auto myHist2 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myColumn"); + /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4, + /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, + /// {"col0", "col1", "col2", "col3"}); /// ~~~ /// - /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). - template - RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}, std::string_view vName = "") + /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new + /// argument `wName`: `HistoND(model, cols, weightCol)`. 
+ /// + RResultPtr<::THnSparseD> + HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") { - const auto userColumns = vName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(vName)}); - - const auto validatedColumns = GetValidatedColumnNames(1, userColumns); - - std::shared_ptr<::TH1D> h(nullptr); + std::shared_ptr<::THnSparseD> h(nullptr); { ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); h = model.GetHistogram(); + const auto hDims = h->GetNdimensions(); + decltype(hDims) nCols = columnList.size(); + + if (!wName.empty() && nCols == hDims + 1) + throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " + "input columns contains one column more than the number of dimensions of the " + "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'."); + + if (nCols == hDims + 1) + Warning("HistoNSparseD", + "Passing the column with the weights as the last column in the list is deprecated. " + "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'."); + + if (!wName.empty() || nCols == hDims + 1) + h->Sumw2(); + + if (nCols != hDims + 1 && nCols != hDims) + throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); } - if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin()) - h->SetCanExtend(::TH1::kAllAxes); - return CreateAction(validatedColumns, h, h, fProxiedPtr); + if (!wName.empty()) { + // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of + // passed arguments is one more the number of dimensions of the histogram. 
+ ColumnNames_t userColumns = columnList; + userColumns.push_back(std::string{wName}); + return CreateAction( + userColumns, h, h, fProxiedPtr, userColumns.size()); + } + return CreateAction( + columnList, h, h, fProxiedPtr, columnList.size()); } +#ifdef R__HAS_ROOT7 //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional histogram with the values of a column (*lazy action*). - /// \tparam V The type of the column used to fill the histogram. + /// \brief Fill and return a one-dimensional RHist (*lazy action*). + /// \tparam BinContentType The bin content type of the returned RHist. + /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins. + /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive). /// \param[in] vName The name of the column that will fill the histogram. - /// \return the monodimensional histogram wrapped in a RResultPtr. + /// \return the histogram wrapped in a RResultPtr. /// - /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.). - /// The "name" and "title" strings are built starting from the input column name. - /// See the description of the first Histo1D() overload for more details. + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. 
/// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo1D("myColumn"); - /// // Explicit column type - /// auto myHist2 = myDf.Histo1D("myColumn"); + /// auto myHist = myDf.Hist(10, {5, 15}, "col0"); /// ~~~ - template - RResultPtr<::TH1D> Histo1D(std::string_view vName) + template + RResultPtr> + Hist(std::uint64_t nNormalBins, std::pair interval, std::string_view vName) { - const auto h_name = std::string(vName); - const auto h_title = h_name + ";" + h_name + ";count"; - return Histo1D({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName); + std::shared_ptr h = std::make_shared>(nNormalBins, interval); + + const ColumnNames_t columnList = {std::string(vName)}; + + return Hist(h, columnList); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*). - /// \tparam V The type of the column used to fill the histogram. - /// \tparam W The type of the column used as weights. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] vName The name of the column that will fill the histogram. - /// \param[in] wName The name of the column that will provide the weights. - /// \return the monodimensional histogram wrapped in a RResultPtr. + /// \brief Fill and return an RHist (*lazy action*). + /// \tparam BinContentType The bin content type of the returned RHist. + /// \param[in] axes The returned histogram will be constructed using these axes. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \return the histogram wrapped in a RResultPtr. /// - /// See the description of the first Histo1D() overload for more details. + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. 
Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight"); - /// // Explicit column type - /// auto myHist2 = myDf.Histo1D({"histName", "histTitle", 64u, 0., 128.}, "myValue", "myweight"); + /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist({axis}, {"col0"}); /// ~~~ - template - RResultPtr<::TH1D> Histo1D(const TH1DModel &model, std::string_view vName, std::string_view wName) + template + RResultPtr> + Hist(std::vector axes, const ColumnNames_t &columnList) { - const std::vector columnViews = {vName, wName}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - std::shared_ptr<::TH1D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); + if (axes.size() != columnList.size()) { + std::string msg = "Wrong number of columns for the specified number of histogram axes: "; + msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); } - if (h->GetXaxis()->GetXmax() == h->GetXaxis()->GetXmin()) - h->SetCanExtend(::TH1::kAllAxes); - return CreateAction(userColumns, h, h, fProxiedPtr); + std::shared_ptr h = std::make_shared>(std::move(axes)); + + return Hist(h, columnList); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional histogram with the weighted values of a column (*lazy action*). - /// \tparam V The type of the column used to fill the histogram. - /// \tparam W The type of the column used as weights. - /// \param[in] vName The name of the column that will fill the histogram. - /// \param[in] wName The name of the column that will provide the weights. 
- /// \return the monodimensional histogram wrapped in a RResultPtr. + /// \brief Fill the provided RHist (*lazy action*). + /// \param[in] h The histogram that should be filled. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \return the histogram wrapped in a RResultPtr. /// - /// This overload uses a default model histogram TH1D(name, title, 128u, 0., 0.). - /// The "name" and "title" strings are built starting from the input column names. - /// See the description of the first Histo1D() overload for more details. + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// During execution of the computation graph, the passed histogram must only be accessed with methods that are + /// allowed during concurrent filling. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo1D("myValue", "myweight"); - /// // Explicit column types - /// auto myHist2 = myDf.Histo1D("myValue", "myweight"); + /// auto h = std::make_shared>(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist(h, {"col0"}); /// ~~~ - template - RResultPtr<::TH1D> Histo1D(std::string_view vName, std::string_view wName) + template + RResultPtr> + Hist(std::shared_ptr> h, const ColumnNames_t &columnList) { - // We build name and title based on the value and weight column names - std::string str_vName{vName}; - std::string str_wName{wName}; - const auto h_name = str_vName + "_weighted_" + str_wName; - const auto h_title = str_vName + ", weights: " + str_wName + ";" + str_vName + ";count * " + str_wName; - return Histo1D({h_name.c_str(), h_title.c_str(), 128u, 0., 0.}, vName, wName); - } + RDFInternal::WarnHist(); - //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional histogram with the weighted values of 
a column (*lazy action*). - /// \tparam V The type of the column used to fill the histogram. - /// \tparam W The type of the column used as weights. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \return the monodimensional histogram wrapped in a RResultPtr. - /// - /// This overload will use the first two default columns as column names. - /// See the description of the first Histo1D() overload for more details. - template - RResultPtr<::TH1D> Histo1D(const TH1DModel &model = {"", "", 128u, 0., 0.}) - { - return Histo1D(model, "", ""); + if (h->GetNDimensions() != columnList.size()) { + std::string msg = "Wrong number of columns for the passed histogram: "; + msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); + } + + return CreateAction(columnList, h, h, fProxiedPtr, + columnList.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a two-dimensional histogram (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \return the bidimensional histogram wrapped in a RResultPtr. + /// \brief Fill and return a one-dimensional RHist with weights (*lazy action*). + /// \tparam BinContentType The bin content type of the returned RHist. + /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins. + /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive). 
+ /// \param[in] vName The name of the column that will fill the histogram. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the histogram wrapped in a RResultPtr. /// - /// Columns can be of a container type (e.g. std::vector), in which case the histogram - /// is filled with each one of the elements of the container. In case multiple columns of container type - /// are provided (e.g. values and weights) they must have the same length for each one of the events (but - /// possibly different lengths between events). /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY"); - /// // Explicit column types - /// auto myHist2 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY"); + /// auto myHist = myDf.Hist(10, {5, 15}, "col0", "colW"); /// ~~~ - /// - /// - /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). 
- template - RResultPtr<::TH2D> Histo2D(const TH2DModel &model, std::string_view v1Name = "", std::string_view v2Name = "") + template + RResultPtr> + Hist(std::uint64_t nNormalBins, std::pair interval, std::string_view vName, std::string_view wName) { - std::shared_ptr<::TH2D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - } - if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) { - throw std::runtime_error("2D histograms with no axes limits are not supported yet."); - } - const std::vector columnViews = {v1Name, v2Name}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); + std::shared_ptr h = std::make_shared>(nNormalBins, interval); + + const ColumnNames_t columnList = {std::string(vName)}; + + return Hist(h, columnList, wName); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a weighted two-dimensional histogram (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. - /// \tparam W The type of the column used for the weights of the histogram. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. + /// \brief Fill and return an RHist with weights (*lazy action*). + /// \tparam BinContentType The bin content type of the returned RHist. + /// \param[in] axes The returned histogram will be constructed using these axes. 
+ /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` /// \param[in] wName The name of the column that will provide the weights. - /// \return the bidimensional histogram wrapped in a RResultPtr. + /// \return the histogram wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// + /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). + /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight"); - /// // Explicit column types - /// auto myHist2 = myDf.Histo2D({"histName", "histTitle", 64u, 0., 128., 32u, -4., 4.}, "myValueX", "myValueY", "myWeight"); + /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist({axis}, {"col0"}, "colW"); /// ~~~ - /// - /// See the documentation of the first Histo2D() overload for more details. 
- template - RResultPtr<::TH2D> - Histo2D(const TH2DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName) + template + RResultPtr> + Hist(std::vector axes, const ColumnNames_t &columnList, std::string_view wName) { - std::shared_ptr<::TH2D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - } - if (!RDFInternal::HistoUtils<::TH2D>::HasAxisLimits(*h)) { - throw std::runtime_error("2D histograms with no axes limits are not supported yet."); + static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, + "weighted filling is not supported for integral bin content types"); + + if (axes.size() != columnList.size()) { + std::string msg = "Wrong number of columns for the specified number of histogram axes: "; + msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); } - const std::vector columnViews = {v1Name, v2Name, wName}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); - } - template - RResultPtr<::TH2D> Histo2D(const TH2DModel &model) - { - return Histo2D(model, "", "", ""); + std::shared_ptr h = std::make_shared>(std::move(axes)); + + return Hist(h, columnList, wName); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a three-dimensional histogram (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. - /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. - /// \param[in] model The returned histogram will be constructed using this as a model. 
- /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \param[in] v3Name The name of the column that will fill the z axis. - /// \return the tridimensional histogram wrapped in a RResultPtr. + /// \brief Fill the provided RHist with weights (*lazy action*). + /// \param[in] h The histogram that should be filled. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \param[in] wName The name of the column that will provide the weights. + /// \return the histogram wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// + /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). + /// + /// During execution of the computation graph, the passed histogram must only be accessed with methods that are + /// allowed during concurrent filling. + /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, - /// "myValueX", "myValueY", "myValueZ"); - /// // Explicit column types - /// auto myHist2 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, - /// "myValueX", "myValueY", "myValueZ"); + /// auto h = std::make_shared>(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist(h, {"col0"}, "colW"); /// ~~~ - /// \note If three-dimensional histograms consume too much memory in multithreaded runs, the cloning of TH3D - /// per thread can be reduced using ROOT::RDF::Experimental::ThreadsPerTH3(). See the section "Memory Usage" in - /// the RDataFrame description. 
- /// \note Differently from other ROOT interfaces, the returned histogram is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). - template - RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name = "", std::string_view v2Name = "", - std::string_view v3Name = "") + template + RResultPtr> + Hist(std::shared_ptr> h, const ColumnNames_t &columnList, + std::string_view wName) { - std::shared_ptr<::TH3D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - } - if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) { - throw std::runtime_error("3D histograms with no axes limits are not supported yet."); + static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, + "weighted filling is not supported for integral bin content types"); + + RDFInternal::WarnHist(); + + if (h->GetNDimensions() != columnList.size()) { + std::string msg = "Wrong number of columns for the passed histogram: "; + msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); } - const std::vector columnViews = {v1Name, v2Name, v3Name}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); + + // Add the weight column to the list of argument columns to pass it through the infrastructure. 
+ ColumnNames_t columnListWithWeights(columnList); + columnListWithWeights.push_back(std::string(wName)); + + return CreateAction( + columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a three-dimensional histogram (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. - /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. - /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \param[in] v3Name The name of the column that will fill the z axis. - /// \param[in] wName The name of the column that will provide the weights. - /// \return the tridimensional histogram wrapped in a RResultPtr. + /// \brief Fill the provided RHistEngine (*lazy action*). + /// \param[in] h The histogram that should be filled. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` + /// \return the histogram wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// + /// During execution of the computation graph, the passed histogram must only be accessed with methods that are + /// allowed during concurrent filling. 
+ /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myHist1 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, - /// "myValueX", "myValueY", "myValueZ", "myWeight"); - /// // Explicit column types - /// using d_t = double; - /// auto myHist2 = myDf.Histo3D({"name", "title", 64u, 0., 128., 32u, -4., 4., 8u, -2., 2.}, - /// "myValueX", "myValueY", "myValueZ", "myWeight"); + /// auto h = std::make_shared>(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist(h, {"col0"}); /// ~~~ - /// - /// - /// See the documentation of the first Histo2D() overload for more details. - template - RResultPtr<::TH3D> Histo3D(const TH3DModel &model, std::string_view v1Name, std::string_view v2Name, - std::string_view v3Name, std::string_view wName) + template + RResultPtr> + Hist(std::shared_ptr> h, const ColumnNames_t &columnList) { - std::shared_ptr<::TH3D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - } - if (!RDFInternal::HistoUtils<::TH3D>::HasAxisLimits(*h)) { - throw std::runtime_error("3D histograms with no axes limits are not supported yet."); + RDFInternal::WarnHist(); + + if (h->GetNDimensions() != columnList.size()) { + std::string msg = "Wrong number of columns for the passed histogram: "; + msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); } - const std::vector columnViews = {v1Name, v2Name, v3Name, wName}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? 
ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); - } - template - RResultPtr<::TH3D> Histo3D(const TH3DModel &model) - { - return Histo3D(model, "", "", "", ""); + return CreateAction(columnList, h, h, fProxiedPtr, + columnList.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return an N-dimensional histogram (*lazy action*). - /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not - /// present. - /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the - /// object. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] columnList - /// A list containing the names of the columns that will be passed when calling `Fill`. + /// \brief Fill the provided RHistEngine with weights (*lazy action*). + /// \param[in] h The histogram that should be filled. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` /// \param[in] wName The name of the column that will provide the weights. - /// \return the N-dimensional histogram wrapped in a RResultPtr. + /// \return the histogram wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. See RResultPtr documentation. + /// booked but not executed. Also see RResultPtr. + /// + /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). + /// + /// During execution of the computation graph, the passed histogram must only be accessed with methods that are + /// allowed during concurrent filling. 
/// /// ### Example usage: /// ~~~{.cpp} - /// auto myFilledObj = myDf.HistoND({"name","title", 4, - /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, - /// {"col0", "col1", "col2", "col3"}); + /// auto h = std::make_shared>(10, {5.0, 15.0}); + /// auto myHist = myDf.Hist(h, {"col0"}, "colW"); /// ~~~ - /// - /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new - /// argument `wName`: `HistoND(model, cols, weightCol)`. - /// - template // need FirstColumn to disambiguate overloads - RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") + template + RResultPtr> + Hist(std::shared_ptr> h, const ColumnNames_t &columnList, + std::string_view wName) { - std::shared_ptr<::THnD> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - const auto hDims = h->GetNdimensions(); - decltype(hDims) nCols = columnList.size(); - - if (!wName.empty() && nCols == hDims + 1) - throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " - "input columns contains one column more than the number of dimensions of the " - "histogram. Call as 'HistoND(model, cols, weightCol)'."); - - if (nCols == hDims + 1) - Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. " - "Instead, pass it as a separate argument, e.g. 
'HistoND(model, cols, weightCol)'."); + static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, + "weighted filling is not supported for integral bin content types"); - if (!wName.empty() || nCols == hDims + 1) - h->Sumw2(); + RDFInternal::WarnHist(); - if (nCols != hDims + 1 && nCols != hDims) - throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); + if (h->GetNDimensions() != columnList.size()) { + std::string msg = "Wrong number of columns for the passed histogram: "; + msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); + throw std::invalid_argument(msg); } - if (!wName.empty()) { - // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of - // passed arguments is one more the number of dimensions of the histogram. - ColumnNames_t userColumns = columnList; - userColumns.push_back(std::string{wName}); - return CreateAction(userColumns, h, h, - fProxiedPtr); - } - return CreateAction(columnList, h, h, - fProxiedPtr); + // Add the weight column to the list of argument columns to pass it through the infrastructure. + ColumnNames_t columnListWithWeights(columnList); + columnListWithWeights.push_back(std::string(wName)); + + return CreateAction( + columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size()); } +#endif //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return an N-dimensional histogram (*lazy action*). - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \param[in] wName The name of the column that will provide the weights. - /// \return the N-dimensional histogram wrapped in a RResultPtr. + /// \brief Fill and return a TGraph object (*lazy action*). 
+ /// \tparam X The type of the column used to fill the x axis. + /// \tparam Y The type of the column used to fill the y axis. + /// \param[in] x The name of the column that will fill the x axis. + /// \param[in] y The name of the column that will fill the y axis. + /// \return the TGraph wrapped in a RResultPtr. + /// + /// Columns can be of a container type (e.g. std::vector), in which case the TGraph + /// is filled with each one of the elements of the container. + /// If Multithreading is enabled, the order in which points are inserted is undefined. + /// If the Graph has to be drawn, it is suggested to the user to sort it on the x before printing. + /// A name and a title to the TGraph is given based on the input column names. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto myFilledObj = myDf.HistoND({"name","title", 4, - /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, - /// {"col0", "col1", "col2", "col3"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myGraph1 = myDf.Graph("xValues", "yValues"); + /// // Explicit column types + /// auto myGraph2 = myDf.Graph("xValues", "yValues"); /// ~~~ /// - /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new - /// argument `wName`: `HistoND(model, cols, weightCol)`. - /// - RResultPtr<::THnD> HistoND(const THnDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") + /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). 
+ template + RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "") { - std::shared_ptr<::THnD> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - const auto hDims = h->GetNdimensions(); - decltype(hDims) nCols = columnList.size(); - - if (!wName.empty() && nCols == hDims + 1) - throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " - "input columns contains one column more than the number of dimensions of the " - "histogram. Call as 'HistoND(model, cols, weightCol)'."); + auto graph = std::make_shared<::TGraph>(); + const std::vector columnViews = {x, y}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); - if (nCols == hDims + 1) - Warning("HistoND", "Passing the column with the weights as the last column in the list is deprecated. " - "Instead, pass it as a separate argument, e.g. 'HistoND(model, cols, weightCol)'."); + const auto validatedColumns = GetValidatedColumnNames(2, userColumns); - if (!wName.empty() || nCols == hDims + 1) - h->Sumw2(); + // We build a default name and title based on the input columns + const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0]; + const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0]; + graph->SetNameTitle(g_name.c_str(), g_title.c_str()); + graph->GetXaxis()->SetTitle(validatedColumns[0].c_str()); + graph->GetYaxis()->SetTitle(validatedColumns[1].c_str()); - if (nCols != hDims + 1 && nCols != hDims) - throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); - } - - if (!wName.empty()) { - // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of - // passed arguments is one more the number of dimensions of the histogram. 
- ColumnNames_t userColumns = columnList; - userColumns.push_back(std::string{wName}); - return CreateAction(userColumns, h, h, fProxiedPtr, - userColumns.size()); - } - return CreateAction(columnList, h, h, fProxiedPtr, - columnList.size()); - } + return CreateAction(validatedColumns, graph, graph, fProxiedPtr); + } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*). - /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred if not - /// present. - /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the - /// object. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] columnList - /// A list containing the names of the columns that will be passed when calling `Fill`. - /// \param[in] wName The name of the column that will provide the weights. - /// \return the N-dimensional histogram wrapped in a RResultPtr. + /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*). + /// \param[in] x The name of the column that will fill the x axis. + /// \param[in] y The name of the column that will fill the y axis. + /// \param[in] exl The name of the column of X low errors + /// \param[in] exh The name of the column of X high errors + /// \param[in] eyl The name of the column of Y low errors + /// \param[in] eyh The name of the column of Y high errors + /// \return the TGraphAsymmErrors wrapped in a RResultPtr. + /// + /// Columns can be of a container type (e.g. std::vector), in which case the graph + /// is filled with each one of the elements of the container. + /// If Multithreading is enabled, the order in which points are inserted is undefined. /// /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. See RResultPtr documentation. 
+ /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4, - /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, - /// {"col0", "col1", "col2", "col3"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); + /// // Explicit column types + /// using f = float; + /// auto myGAE2 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); /// ~~~ /// - /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new - /// argument `wName`: `HistoND(model, cols, weightCol)`. + /// `GraphAsymmErrors` should also be used for the cases in which values associated only with + /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. + /// In such cases, user should do the following: + /// ~~~{.cpp} + /// // Create a column of zeros in RDataFrame + /// auto rdf_withzeros = rdf.Define("zero", "0"); + /// // or alternatively: + /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;}); + /// // Create the graph with y errors only + /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh"); + /// ~~~ /// - template // need FirstColumn to disambiguate overloads - RResultPtr<::THnSparseD> - HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") + /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
+ template + RResultPtr<::TGraphAsymmErrors> + GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "", + std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "") { - std::shared_ptr<::THnSparseD> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - const auto hDims = h->GetNdimensions(); - decltype(hDims) nCols = columnList.size(); - - if (!wName.empty() && nCols == hDims + 1) - throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " - "input columns contains one column more than the number of dimensions of the " - "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'."); - - if (nCols == hDims + 1) - Warning("HistoNSparseD", - "Passing the column with the weights as the last column in the list is deprecated. " - "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'."); + auto graph = std::make_shared<::TGraphAsymmErrors>(); + const std::vector columnViews = {x, y, exl, exh, eyl, eyh}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? 
ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); - if (!wName.empty() || nCols == hDims + 1) - h->Sumw2(); + const auto validatedColumns = GetValidatedColumnNames(6, userColumns); - if (nCols != hDims + 1 && nCols != hDims) - throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); - } + // We build a default name and title based on the input columns + const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0]; + const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0]; + graph->SetNameTitle(g_name.c_str(), g_title.c_str()); + graph->GetXaxis()->SetTitle(validatedColumns[0].c_str()); + graph->GetYaxis()->SetTitle(validatedColumns[1].c_str()); - if (!wName.empty()) { - // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of - // passed arguments is one more the number of dimensions of the histogram. - ColumnNames_t userColumns = columnList; - userColumns.push_back(std::string{wName}); - return CreateAction(userColumns, h, h, - fProxiedPtr); - } - return CreateAction(columnList, h, h, - fProxiedPtr); + return CreateAction(validatedColumns, graph, + graph, fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a sparse N-dimensional histogram (*lazy action*). - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \param[in] wName The name of the column that will provide the weights. - /// \return the N-dimensional histogram wrapped in a RResultPtr. + /// \brief Fill and return a one-dimensional profile (*lazy action*). + /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present. 
+ /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present. + /// \param[in] model The model to be considered to build the new return value. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \return the monodimensional profile wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto myFilledObj = myDf.HistoNSparseD({"name","title", 4, - /// {40,40,40,40}, {20.,20.,20.,20.}, {60.,60.,60.,60.}}, - /// {"col0", "col1", "col2", "col3"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues"); + /// // Explicit column types + /// auto myProf2 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues"); /// ~~~ /// - /// \note A column with event weights should not be passed as part of `columnList`, but instead be passed in the new - /// argument `wName`: `HistoND(model, cols, weightCol)`. - /// - RResultPtr<::THnSparseD> - HistoNSparseD(const THnSparseDModel &model, const ColumnNames_t &columnList, std::string_view wName = "") + /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas).
+ template + RResultPtr<::TProfile> + Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "") { - std::shared_ptr<::THnSparseD> h(nullptr); + std::shared_ptr<::TProfile> h(nullptr); { ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetHistogram(); - const auto hDims = h->GetNdimensions(); - decltype(hDims) nCols = columnList.size(); - - if (!wName.empty() && nCols == hDims + 1) - throw std::invalid_argument("The weight column was passed as an argument and at the same time the list of " - "input columns contains one column more than the number of dimensions of the " - "histogram. Call as 'HistoNSparseD(model, cols, weightCol)'."); - - if (nCols == hDims + 1) - Warning("HistoNSparseD", - "Passing the column with the weights as the last column in the list is deprecated. " - "Instead, pass it as a separate argument, e.g. 'HistoNSparseD(model, cols, weightCol)'."); - - if (!wName.empty() || nCols == hDims + 1) - h->Sumw2(); - - if (nCols != hDims + 1 && nCols != hDims) - throw std::invalid_argument("Wrong number of columns for the specified number of histogram axes."); + h = model.GetProfile(); } - if (!wName.empty()) { - // The action helper will invoke THnBase::Fill overload that performs weighted filling in case the number of - // passed arguments is one more the number of dimensions of the histogram. - ColumnNames_t userColumns = columnList; - userColumns.push_back(std::string{wName}); - return CreateAction( - userColumns, h, h, fProxiedPtr, userColumns.size()); + if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) { + throw std::runtime_error("Profiles with no axes limits are not supported yet."); } - return CreateAction( - columnList, h, h, fProxiedPtr, columnList.size()); + const std::vector columnViews = {v1Name, v2Name}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? 
ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); } -#ifdef R__HAS_ROOT7 //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional RHist (*lazy action*). - /// \tparam BinContentType The bin content type of the returned RHist. - /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins. - /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive). - /// \param[in] vName The name of the column that will fill the histogram. - /// \return the histogram wrapped in a RResultPtr. + /// \brief Fill and return a one-dimensional profile (*lazy action*). + /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present. + /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present. + /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present. + /// \param[in] model The model to be considered to build the new return value. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] wName The name of the column that will provide the weights. + /// \return the monodimensional profile wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. 
/// /// ### Example usage: /// ~~~{.cpp} - /// auto myHist = myDf.Hist(10, {5, 15}, "col0"); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight"); + /// // Explicit column types + /// auto myProf2 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, + /// "xValues", "yValues", "weight"); /// ~~~ - template - RResultPtr> - Hist(std::uint64_t nNormalBins, std::pair interval, std::string_view vName) + /// + /// See the first Profile1D() overload for more details. + template + RResultPtr<::TProfile> + Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName) { - std::shared_ptr h = std::make_shared>(nNormalBins, interval); + std::shared_ptr<::TProfile> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetProfile(); + } - const ColumnNames_t columnList = {std::string(vName)}; + if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) { + throw std::runtime_error("Profile histograms with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, wName}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); + } - return Hist(h, columnList); + //////////////////////////////////////////////////////////////////////////// + /// \brief Fill and return a one-dimensional profile (*lazy action*). + /// See the first Profile1D() overload for more details. + template + RResultPtr<::TProfile> Profile1D(const TProfile1DModel &model) + { + return Profile1D(model, "", "", ""); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return an RHist (*lazy action*). 
- /// \tparam BinContentType The bin content type of the returned RHist. - /// \param[in] axes The returned histogram will be constructed using these axes. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \return the histogram wrapped in a RResultPtr. + /// \brief Fill and return a two-dimensional profile (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. + /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. + /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. + /// \param[in] model The returned profile will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] v3Name The name of the column that will fill the z axis. + /// \return the bidimensional profile wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist({axis}, {"col0"}); - /// ~~~ - template - RResultPtr> - Hist(std::vector axes, const ColumnNames_t &columnList) - { - if (axes.size() != columnList.size()) { - std::string msg = "Wrong number of columns for the specified number of histogram axes: "; - msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); - } - - std::shared_ptr h = std::make_shared>(std::move(axes)); - - return Hist(h, columnList); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Fill the provided RHist (*lazy action*). 
- /// \param[in] h The histogram that should be filled. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \return the histogram wrapped in a RResultPtr. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - /// - /// During execution of the computation graph, the passed histogram must only be accessed with methods that are - /// allowed during concurrent filling. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// auto h = std::make_shared>(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist(h, {"col0"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, + /// "xValues", "yValues", "zValues"); + /// // Explicit column types + /// auto myProf2 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, + /// "xValues", "yValues", "zValues"); /// ~~~ - template - RResultPtr> - Hist(std::shared_ptr> h, const ColumnNames_t &columnList) + /// + /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory + /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that + /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). 
+ template + RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "", + std::string_view v2Name = "", std::string_view v3Name = "") { - RDFInternal::WarnHist(); - - if (h->GetNDimensions() != columnList.size()) { - std::string msg = "Wrong number of columns for the passed histogram: "; - msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); + std::shared_ptr<::TProfile2D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetProfile(); } - return CreateAction(columnList, h, h, fProxiedPtr, - columnList.size()); + if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) { + throw std::runtime_error("2D profiles with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, v3Name}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional RHist with weights (*lazy action*). - /// \tparam BinContentType The bin content type of the returned RHist. - /// \param[in] nNormalBins The returned histogram will be constructed using this number of normal bins. - /// \param[in] interval The axis interval of the constructed histogram (lower end inclusive, upper end exclusive). - /// \param[in] vName The name of the column that will fill the histogram. + /// \brief Fill and return a two-dimensional profile (*lazy action*). + /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. + /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. 
+ /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. + /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present. + /// \param[in] model The returned histogram will be constructed using this as a model. + /// \param[in] v1Name The name of the column that will fill the x axis. + /// \param[in] v2Name The name of the column that will fill the y axis. + /// \param[in] v3Name The name of the column that will fill the z axis. /// \param[in] wName The name of the column that will provide the weights. - /// \return the histogram wrapped in a RResultPtr. + /// \return the bidimensional profile wrapped in a RResultPtr. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto myHist = myDf.Hist(10, {5, 15}, "col0", "colW"); + /// // Deduce column types (this invocation needs jitting internally) + /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, + /// "xValues", "yValues", "zValues", "weight"); + /// // Explicit column types + /// auto myProf2 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, + /// "xValues", "yValues", "zValues", "weight"); /// ~~~ - template - RResultPtr> - Hist(std::uint64_t nNormalBins, std::pair interval, std::string_view vName, std::string_view wName) + /// + /// See the first Profile2D() overload for more details. 
+ template + RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, + std::string_view v3Name, std::string_view wName) { - std::shared_ptr h = std::make_shared>(nNormalBins, interval); + std::shared_ptr<::TProfile2D> h(nullptr); + { + ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); + h = model.GetProfile(); + } - const ColumnNames_t columnList = {std::string(vName)}; + if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) { + throw std::runtime_error("2D profiles with no axes limits are not supported yet."); + } + const std::vector columnViews = {v1Name, v2Name, v3Name, wName}; + const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) + ? ColumnNames_t() + : ColumnNames_t(columnViews.begin(), columnViews.end()); + return CreateAction(userColumns, h, h, fProxiedPtr); + } - return Hist(h, columnList, wName); + /// \brief Fill and return a two-dimensional profile (*lazy action*). + /// See the first Profile2D() overload for more details. + template + RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model) + { + return Profile2D(model, "", "", "", ""); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return an RHist with weights (*lazy action*). - /// \tparam BinContentType The bin content type of the returned RHist. - /// \param[in] axes The returned histogram will be constructed using these axes. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \param[in] wName The name of the column that will provide the weights. - /// \return the histogram wrapped in a RResultPtr. + /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*). /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. 
+ /// Type T must provide at least: + /// - a copy-constructor + /// - a `Fill` method that accepts as many arguments and with the same types as the columns listed in columnList + /// (these types can also be passed as template parameters to this method) + /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector&)` that merges the + /// objects passed as argument into the object on which `Merge` was called (analogous to TH1::Merge). Note that + /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in + /// the TCollection*). /// - /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). + /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred together with OtherColumns if not present. + /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object. + /// \tparam T The type of the object to fill. Automatically deduced. + /// \param[in] model The model to be considered to build the new return value. + /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill`. + /// \return the filled object wrapped in a RResultPtr. + /// + /// The user gives up ownership of the model object. + /// The list of column names to be used for filling must always be specified. + /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. + /// Also see RResultPtr.
/// /// ### Example usage: /// ~~~{.cpp} - /// ROOT::Experimental::RRegularAxis axis(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist({axis}, {"col0"}, "colW"); + /// MyClass obj; + /// // Deduce column types (this invocation needs jitting internally, and in this case + /// // MyClass needs to be known to the interpreter) + /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"}); + /// // explicit column types + /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"}); /// ~~~ - template - RResultPtr> - Hist(std::vector axes, const ColumnNames_t &columnList, std::string_view wName) + /// + template + RResultPtr> Fill(T &&model, const ColumnNames_t &columnList) { - static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, - "weighted filling is not supported for integral bin content types"); - - if (axes.size() != columnList.size()) { - std::string msg = "Wrong number of columns for the specified number of histogram axes: "; - msg += "expected " + std::to_string(axes.size()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); + auto h = std::make_shared>(std::forward(model)); + if (!RDFInternal::HistoUtils::HasAxisLimits(*h)) { + throw std::runtime_error("The absence of axes limits is not supported yet."); } - - std::shared_ptr h = std::make_shared>(std::move(axes)); - - return Hist(h, columnList, wName); + return CreateAction(columnList, h, h, fProxiedPtr, + columnList.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill the provided RHist with weights (*lazy action*). - /// \param[in] h The histogram that should be filled. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \param[in] wName The name of the column that will provide the weights. - /// \return the histogram wrapped in a RResultPtr. 
- /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - /// - /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). + /// \brief Return a TStatistic object, filled once per event (*lazy action*). /// - /// During execution of the computation graph, the passed histogram must only be accessed with methods that are - /// allowed during concurrent filling. + /// \tparam V The type of the value column + /// \param[in] value The name of the column with the values to fill the statistics with. + /// \return the filled TStatistic object wrapped in a RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// auto h = std::make_shared>(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist(h, {"col0"}, "colW"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto stats0 = myDf.Stats("values"); + /// // Explicit column type + /// auto stats1 = myDf.Stats("values"); /// ~~~ - template - RResultPtr> - Hist(std::shared_ptr> h, const ColumnNames_t &columnList, - std::string_view wName) + /// + template + RResultPtr Stats(std::string_view value = "") { - static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, - "weighted filling is not supported for integral bin content types"); - - RDFInternal::WarnHist(); - - if (h->GetNDimensions() != columnList.size()) { - std::string msg = "Wrong number of columns for the passed histogram: "; - msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); + ColumnNames_t columns; + if (!value.empty()) { + columns.emplace_back(std::string(value)); + } + const auto validColumnNames = GetValidatedColumnNames(1, columns); + if (std::is_same::value) { + return Fill(TStatistic(), validColumnNames); + } else { + return Fill(TStatistic(), validColumnNames); } - - // Add the weight column to the 
list of argument columns to pass it through the infrastructure. - ColumnNames_t columnListWithWeights(columnList); - columnListWithWeights.push_back(std::string(wName)); - - return CreateAction( - columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill the provided RHistEngine (*lazy action*). - /// \param[in] h The histogram that should be filled. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \return the histogram wrapped in a RResultPtr. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. + /// \brief Return a TStatistic object, filled once per event (*lazy action*). /// - /// During execution of the computation graph, the passed histogram must only be accessed with methods that are - /// allowed during concurrent filling. + /// \tparam V The type of the value column + /// \tparam W The type of the weight column + /// \param[in] value The name of the column with the values to fill the statistics with. + /// \param[in] weight The name of the column with the weights to fill the statistics with. + /// \return the filled TStatistic object wrapped in a RResultPtr. 
/// /// ### Example usage: /// ~~~{.cpp} - /// auto h = std::make_shared>(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist(h, {"col0"}); + /// // Deduce column types (this invocation needs jitting internally) + /// auto stats0 = myDf.Stats("values", "weights"); + /// // Explicit column types + /// auto stats1 = myDf.Stats("values", "weights"); /// ~~~ - template - RResultPtr> - Hist(std::shared_ptr> h, const ColumnNames_t &columnList) + /// + template + RResultPtr Stats(std::string_view value, std::string_view weight) { - RDFInternal::WarnHist(); - - if (h->GetNDimensions() != columnList.size()) { - std::string msg = "Wrong number of columns for the passed histogram: "; - msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); + ColumnNames_t columns{std::string(value), std::string(weight)}; + constexpr auto vIsInferred = std::is_same::value; + constexpr auto wIsInferred = std::is_same::value; + const auto validColumnNames = GetValidatedColumnNames(2, columns); + // We have 3 cases: + // 1. Both types are inferred: we use Fill and let the jit kick in. + // 2. One of the two types is explicit and the other one is inferred: the case is not supported, and the error names the explicit column first. + // 3. Both types are explicit: we invoke the fully compiled Fill method. + if (vIsInferred && wIsInferred) { + return Fill(TStatistic(), validColumnNames); + } else if (vIsInferred != wIsInferred) { + std::string error("The "); + error += vIsInferred ? "weight " : "value "; + error += "column type is explicit, while the "; + error += vIsInferred ? "value " : "weight "; + error += " is specified to be inferred.
This case is not supported: please specify both types or none."; + throw std::runtime_error(error); + } else { + return Fill(TStatistic(), validColumnNames); } - - return CreateAction(columnList, h, h, fProxiedPtr, - columnList.size()); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill the provided RHistEngine with weights (*lazy action*). - /// \param[in] h The histogram that should be filled. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \param[in] wName The name of the column that will provide the weights. - /// \return the histogram wrapped in a RResultPtr. + /// \brief Return the minimum of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the minimum value of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// - /// This overload is not available for integral bin content types (see \ref RHistEngine::SupportsWeightedFilling). - /// - /// During execution of the computation graph, the passed histogram must only be accessed with methods that are - /// allowed during concurrent filling. 
- /// /// ### Example usage: /// ~~~{.cpp} - /// auto h = std::make_shared>(10, {5.0, 15.0}); - /// auto myHist = myDf.Hist(h, {"col0"}, "colW"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto minVal0 = myDf.Min("values"); + /// // Explicit column type + /// auto minVal1 = myDf.Min("values"); /// ~~~ - template - RResultPtr> - Hist(std::shared_ptr> h, const ColumnNames_t &columnList, - std::string_view wName) + /// + template + RResultPtr> Min(std::string_view columnName = "") { - static_assert(ROOT::Experimental::RHistEngine::SupportsWeightedFilling, - "weighted filling is not supported for integral bin content types"); - - RDFInternal::WarnHist(); - - if (h->GetNDimensions() != columnList.size()) { - std::string msg = "Wrong number of columns for the passed histogram: "; - msg += "expected " + std::to_string(h->GetNDimensions()) + ", got " + std::to_string(columnList.size()); - throw std::invalid_argument(msg); - } - - // Add the weight column to the list of argument columns to pass it through the infrastructure. - ColumnNames_t columnListWithWeights(columnList); - columnListWithWeights.push_back(std::string(wName)); - - return CreateAction( - columnListWithWeights, h, h, fProxiedPtr, columnListWithWeights.size()); + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + using RetType_t = RDFDetail::MinReturnType_t; + auto minV = std::make_shared(std::numeric_limits::max()); + return CreateAction(userColumns, minV, minV, fProxiedPtr); } -#endif //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a TGraph object (*lazy action*). - /// \tparam X The type of the column used to fill the x axis. - /// \tparam Y The type of the column used to fill the y axis. - /// \param[in] x The name of the column that will fill the x axis. - /// \param[in] y The name of the column that will fill the y axis. 
- /// \return the TGraph wrapped in a RResultPtr. + /// \brief Return the maximum of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the maximum value of the selected column wrapped in a RResultPtr. /// - /// Columns can be of a container type (e.g. std::vector), in which case the TGraph - /// is filled with each one of the elements of the container. - /// If Multithreading is enabled, the order in which points are inserted is undefined. - /// If the Graph has to be drawn, it is suggested to the user to sort it on the x before printing. - /// A name and a title to the TGraph is given based on the input column names. + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myGraph1 = myDf.Graph("xValues", "yValues"); - /// // Explicit column types - /// auto myGraph2 = myDf.Graph("xValues", "yValues"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto maxVal0 = myDf.Max("values"); + /// // Explicit column type + /// auto maxVal1 = myDf.Max("values"); /// ~~~ /// - /// \note Differently from other ROOT interfaces, the returned TGraph is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). 
- template - RResultPtr<::TGraph> Graph(std::string_view x = "", std::string_view y = "") + template + RResultPtr> Max(std::string_view columnName = "") { - auto graph = std::make_shared<::TGraph>(); - const std::vector columnViews = {x, y}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - - const auto validatedColumns = GetValidatedColumnNames(2, userColumns); - - // We build a default name and title based on the input columns - const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0]; - const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0]; - graph->SetNameTitle(g_name.c_str(), g_title.c_str()); - graph->GetXaxis()->SetTitle(validatedColumns[0].c_str()); - graph->GetYaxis()->SetTitle(validatedColumns[1].c_str()); - - return CreateAction(validatedColumns, graph, graph, fProxiedPtr); + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + using RetType_t = RDFDetail::MaxReturnType_t; + auto maxV = std::make_shared(std::numeric_limits::lowest()); + return CreateAction(userColumns, maxV, maxV, fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a TGraphAsymmErrors object (*lazy action*). - /// \param[in] x The name of the column that will fill the x axis. - /// \param[in] y The name of the column that will fill the y axis. - /// \param[in] exl The name of the column of X low errors - /// \param[in] exh The name of the column of X high errors - /// \param[in] eyl The name of the column of Y low errors - /// \param[in] eyh The name of the column of Y high errors - /// \return the TGraphAsymmErrors wrapped in a RResultPtr. + /// \brief Return the mean of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. 
+ /// \return the mean value of the selected column wrapped in a RResultPtr. /// - /// Columns can be of a container type (e.g. std::vector), in which case the graph - /// is filled with each one of the elements of the container. - /// If Multithreading is enabled, the order in which points are inserted is undefined. + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// Note that internally, the summations are executed with Kahan sums in double precision, irrespective + /// of the type of column that is read. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myGAE1 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); - /// // Explicit column types - /// using f = float - /// auto myGAE2 = myDf.GraphAsymmErrors("xValues", "yValues", "exl", "exh", "eyl", "eyh"); - /// ~~~ - /// - /// `GraphAsymmErrors` should also be used for the cases in which values associated only with - /// one of the axes have associated errors. For example, only `ey` exist and `ex` are equal to zero. 
- /// In such cases, user should do the following: - /// ~~~{.cpp} - /// // Create a column of zeros in RDataFrame - /// auto rdf_withzeros = rdf.Define("zero", "0"); - /// // or alternatively: - /// auto rdf_withzeros = rdf.Define("zero", []() -> double { return 0.;}); - /// // Create the graph with y errors only - /// auto rdf_errorsOnYOnly = rdf_withzeros.GraphAsymmErrors("xValues", "yValues", "zero", "zero", "eyl", "eyh"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto meanVal0 = myDf.Mean("values"); + /// // Explicit column type + /// auto meanVal1 = myDf.Mean("values"); /// ~~~ /// - /// \note Differently from other ROOT interfaces, the returned TGraphAsymmErrors is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). - template - RResultPtr<::TGraphAsymmErrors> - GraphAsymmErrors(std::string_view x = "", std::string_view y = "", std::string_view exl = "", - std::string_view exh = "", std::string_view eyl = "", std::string_view eyh = "") + template + RResultPtr Mean(std::string_view columnName = "") { - auto graph = std::make_shared<::TGraphAsymmErrors>(); - const std::vector columnViews = {x, y, exl, exh, eyl, eyh}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? 
ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - - const auto validatedColumns = GetValidatedColumnNames(6, userColumns); - - // We build a default name and title based on the input columns - const auto g_name = validatedColumns[1] + "_vs_" + validatedColumns[0]; - const auto g_title = validatedColumns[1] + " vs " + validatedColumns[0]; - graph->SetNameTitle(g_name.c_str(), g_title.c_str()); - graph->GetXaxis()->SetTitle(validatedColumns[0].c_str()); - graph->GetYaxis()->SetTitle(validatedColumns[1].c_str()); - - return CreateAction(validatedColumns, graph, - graph, fProxiedPtr); + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto meanV = std::make_shared(0); + return CreateAction(userColumns, meanV, meanV, fProxiedPtr); } //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional profile (*lazy action*). - /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present. - /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present. - /// \param[in] model The model to be considered to build the new return value. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \return the monodimensional profile wrapped in a RResultPtr. + /// \brief Return the unbiased standard deviation of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the standard deviation value of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. 
/// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. /// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues"); - /// // Explicit column types - /// auto myProf2 = myDf.Graph({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto stdDev0 = myDf.StdDev("values"); + /// // Explicit column type + /// auto stdDev1 = myDf.StdDev("values"); /// ~~~ /// - /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). - template - RResultPtr<::TProfile> - Profile1D(const TProfile1DModel &model, std::string_view v1Name = "", std::string_view v2Name = "") + template + RResultPtr StdDev(std::string_view columnName = "") { - std::shared_ptr<::TProfile> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetProfile(); - } - - if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) { - throw std::runtime_error("Profiles with no axes limits are not supported yet."); - } - const std::vector columnViews = {v1Name, v2Name}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); - } + const auto userColumns = columnName.empty() ? 
ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto stdDeviationV = std::make_shared(0); + return CreateAction(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr); + } + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional profile (*lazy action*). - /// \tparam V1 The type of the column the values of which are used to fill the profile. Inferred if not present. - /// \tparam V2 The type of the column the values of which are used to fill the profile. Inferred if not present. - /// \tparam W The type of the column the weights of which are used to fill the profile. Inferred if not present. - /// \param[in] model The model to be considered to build the new return value. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \param[in] wName The name of the column that will provide the weights. - /// \return the monodimensional profile wrapped in a RResultPtr. + /// \brief Return the sum of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column. + /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible. + /// \return the sum of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. /// /// This action is *lazy*: upon invocation of this method the calculation is /// booked but not executed. Also see RResultPtr. 
/// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myProf1 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, "xValues", "yValues", "weight"); - /// // Explicit column types - /// auto myProf2 = myDf.Profile1D({"profName", "profTitle", 64u, -4., 4.}, - /// "xValues", "yValues", "weight"); + /// // Deduce column type (this invocation needs jitting internally) + /// auto sum0 = myDf.Sum("values"); + /// // Explicit column type + /// auto sum1 = myDf.Sum("values"); /// ~~~ /// - /// See the first Profile1D() overload for more details. - template - RResultPtr<::TProfile> - Profile1D(const TProfile1DModel &model, std::string_view v1Name, std::string_view v2Name, std::string_view wName) - { - std::shared_ptr<::TProfile> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetProfile(); - } - - if (!RDFInternal::HistoUtils<::TProfile>::HasAxisLimits(*h)) { - throw std::runtime_error("Profile histograms with no axes limits are not supported yet."); - } - const std::vector columnViews = {v1Name, v2Name, wName}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a one-dimensional profile (*lazy action*). - /// See the first Profile1D() overload for more details. - template - RResultPtr<::TProfile> Profile1D(const TProfile1DModel &model) + template + RResultPtr> + Sum(std::string_view columnName = "", + const RDFDetail::SumReturnType_t &initValue = RDFDetail::SumReturnType_t{}) { - return Profile1D(model, "", "", ""); + const auto userColumns = columnName.empty() ? 
ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto sumV = std::make_shared>(initValue); + return CreateAction(userColumns, sumV, sumV, fProxiedPtr); } + // clang-format on //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a two-dimensional profile (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. - /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. - /// \param[in] model The returned profile will be constructed using this as a model. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \param[in] v3Name The name of the column that will fill the z axis. - /// \return the bidimensional profile wrapped in a RResultPtr. + /// \brief Gather filtering statistics. + /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr. /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. + /// Calling `Report` on the main `RDataFrame` object gathers stats for + /// all named filters in the call graph. Calling this method on a + /// stored chain state (i.e. a graph node different from the first) gathers + /// the stats for all named filters in the chain section between the original + /// `RDataFrame` and that node (included). Stats are gathered in the same + /// order as the named filters have been added to the graph. + /// A RResultPtr is returned to allow inspection of the + /// effects cuts had. + /// + /// This action is *lazy*: upon invocation of + /// this method the calculation is booked but not executed. See RResultPtr + /// documentation. 
/// /// ### Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, - /// "xValues", "yValues", "zValues"); - /// // Explicit column types - /// auto myProf2 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, - /// "xValues", "yValues", "zValues"); + /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2"); + /// auto cutReport = filtered.Report(); + /// cutReport->Print(); /// ~~~ /// - /// \note Differently from other ROOT interfaces, the returned profile is not associated to gDirectory - /// and the caller is responsible for its lifetime (in particular, a typical source of confusion is that - /// if result histograms go out of scope before the end of the program, ROOT might display a blank canvas). - template - RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name = "", - std::string_view v2Name = "", std::string_view v3Name = "") + RResultPtr Report() { - std::shared_ptr<::TProfile2D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetProfile(); - } + bool returnEmptyReport = false; + // if this is a RInterface on which `Define` has been called, users + // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which + // certainly does not contain named filters. + // The number 4 takes into account the implicit columns for entry and slot number + // and their aliases (2 + 2, i.e.
{r,t}dfentry_ and {r,t}dfslot_) + if (std::is_same::value && fColRegister.GenerateColumnNames().size() > 4) + returnEmptyReport = true; - if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) { - throw std::runtime_error("2D profiles with no axes limits are not supported yet."); - } - const std::vector columnViews = {v1Name, v2Name, v3Name}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); + auto rep = std::make_shared(); + using Helper_t = RDFInternal::ReportHelper; + using Action_t = RDFInternal::RAction; + + auto action = std::make_unique(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}), + fProxiedPtr, RDFInternal::RColumnRegister(fColRegister)); + + return MakeResultPtr(rep, *fLoopManager, std::move(action)); } + //////////////////////////////////////////////////////////////////////////// - /// \brief Fill and return a two-dimensional profile (*lazy action*). - /// \tparam V1 The type of the column used to fill the x axis of the histogram. Inferred if not present. - /// \tparam V2 The type of the column used to fill the y axis of the histogram. Inferred if not present. - /// \tparam V3 The type of the column used to fill the z axis of the histogram. Inferred if not present. - /// \tparam W The type of the column used for the weights of the histogram. Inferred if not present. - /// \param[in] model The returned histogram will be constructed using this as a model. - /// \param[in] v1Name The name of the column that will fill the x axis. - /// \param[in] v2Name The name of the column that will fill the y axis. - /// \param[in] v3Name The name of the column that will fill the z axis. - /// \param[in] wName The name of the column that will provide the weights. - /// \return the bidimensional profile wrapped in a RResultPtr. 
+ /// \brief Provides a representation of the columns in the dataset. + /// \tparam ColumnTypes variadic list of branch/column types. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. + /// This function returns a `RResultPtr` containing all the entries to be displayed, organized in a tabular + /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will + /// return a complete version through `RDisplay::AsString()`. /// - /// ### Example usage: + /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see + /// RResultPtr. 
+ /// + /// Example usage: /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto myProf1 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, - /// "xValues", "yValues", "zValues", "weight"); - /// // Explicit column types - /// auto myProf2 = myDf.Profile2D({"profName", "profTitle", 40, -4, 4, 40, -4, 4, 0, 20}, - /// "xValues", "yValues", "zValues", "weight"); + /// // Preparing the RResultPtr object with all columns and default number of entries + /// auto d1 = rdf.Display(""); + /// // Preparing the RResultPtr object with two columns and 128 entries + /// auto d2 = rdf.Display({"x", "y"}, 128); + /// // Printing the short representations, the event loop will run + /// d1->Print(); + /// d2->Print(); /// ~~~ + template + RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + CheckIMTDisabled("Display"); + auto newCols = columnList; + newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column + auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); + using displayHelperArgs_t = std::pair>; + // Need to add ULong64_t type corresponding to the first column rdfentry_ + return CreateAction( + std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. /// - /// See the first Profile2D() overload for more details.
- template - RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model, std::string_view v1Name, std::string_view v2Name, - std::string_view v3Name, std::string_view wName) + /// This overload automatically infers the column types. + /// See the previous overloads for further details. + /// + /// Invoked when no types are specified to Display + RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) { - std::shared_ptr<::TProfile2D> h(nullptr); - { - ROOT::Internal::RDF::RIgnoreErrorLevelRAII iel(kError); - h = model.GetProfile(); - } + CheckIMTDisabled("Display"); + auto newCols = columnList; + newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column + auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); + using displayHelperArgs_t = std::pair>; + return CreateAction( + std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr, + columnList.size() + 1); + } - if (!RDFInternal::HistoUtils<::TProfile2D>::HasAxisLimits(*h)) { - throw std::runtime_error("2D profiles with no axes limits are not supported yet."); - } - const std::vector columnViews = {v1Name, v2Name, v3Name, wName}; - const auto userColumns = RDFInternal::AtLeastOneEmptyString(columnViews) - ? ColumnNames_t() - : ColumnNames_t(columnViews.begin(), columnViews.end()); - return CreateAction(userColumns, h, h, fProxiedPtr); + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnNameRegexp A regular expression to select the columns. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// The existing columns are matched against the regular expression. 
If the string provided + /// is empty, all columns are selected. + /// See the previous overloads for further details. + RResultPtr + Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10) + { + const auto columnNames = GetColumnNames(); + const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display"); + return Display(selectedColumns, nRows, nMaxCollectionElements); } - /// \brief Fill and return a two-dimensional profile (*lazy action*). - /// See the first Profile2D() overload for more details. - template - RResultPtr<::TProfile2D> Profile2D(const TProfile2DModel &model) + //////////////////////////////////////////////////////////////////////////// + /// \brief Provides a representation of the columns in the dataset. + /// \param[in] columnList Names of the columns to be displayed. + /// \param[in] nRows Number of events for each column to be displayed. + /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. + /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// + /// See the previous overloads for further details. + RResultPtr + Display(std::initializer_list columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) { - return Profile2D(model, "", "", "", ""); + ColumnNames_t selectedColumns(columnList); + return Display(selectedColumns, nRows, nMaxCollectionElements); + } + + /// \} + // End of the doxygen group for actions + // ---------------------------------------------------------------------------------------- + + /// \name Immediate Actions + /// Immediate Actions eagerly start the event loop and produce a result. + /// \{ + + template + [[deprecated("Snapshot is not any more a template.
You can safely remove the template parameters.")]] + RResultPtr> + Snapshot(std::string_view treename, std::string_view filename, const ColumnNames_t &columnList, + const RSnapshotOptions &options = RSnapshotOptions()) + { + return Snapshot(treename, filename, columnList, options); } //////////////////////////////////////////////////////////////////////////// - /// \brief Return an object of type T on which `T::Fill` will be called once per event (*lazy action*). + /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. + /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnList The list of names of the columns/branches/fields to be written. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. + /// \return a `RDataFrame` that wraps the snapshotted dataset. /// - /// Type T must provide at least: - /// - a copy-constructor - /// - a `Fill` method that accepts as many arguments and with same types as the column names passed as columnList - /// (these types can also be passed as template parameters to this method) - /// - a `Merge` method with signature `Merge(TCollection *)` or `Merge(const std::vector&)` that merges the - /// objects passed as argument into the object on which `Merge` was called (an analogous of TH1::Merge). Note that - /// if the signature that takes a `TCollection*` is used, then T must inherit from TObject (to allow insertion in - /// the TCollection*). + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. /// - /// \tparam FirstColumn The first type of the column the values of which are used to fill the object. Inferred together with OtherColumns if not present. 
- /// \tparam OtherColumns A list of the other types of the columns the values of which are used to fill the object. - /// \tparam T The type of the object to fill. Automatically deduced. - /// \param[in] model The model to be considered to build the new return value. - /// \param[in] columnList A list containing the names of the columns that will be passed when calling `Fill` - /// \return the filled object wrapped in a RResultPtr. + /// Support for writing of nested branches/fields is limited (although RDataFrame is able to read them) and dot ('.') + /// characters in input column names will be replaced by underscores ('_') in the branches produced by Snapshot. + /// When writing a variable size array through Snapshot, it is required that the column indicating its size is also + /// written out and it appears before the array in the columnList. /// - /// The user gives up ownership of the model object. - /// The list of column names to be used for filling must always be specified. - /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. - /// Also see RResultPtr. + /// By default, in case of TTree, TChain or RNTuple inputs, Snapshot will try to write out all top-level branches. + /// For other types of inputs, all columns returned by GetColumnNames() will be written out. Systematic variations of + /// columns will be included if the corresponding flag is set in RSnapshotOptions. See \ref snapshot-with-variations + /// "Snapshot with Variations" for more details. If friend trees or chains are present, by default all friend + /// top-level branches that have names that do not collide with names of branches in the main TTree/TChain will be + /// written out. Since v6.24, Snapshot will also write out friend branches with the same names of branches in the + /// main TTree/TChain with names of the form + /// `_` in order to differentiate them from the branches in the main tree/chain. 
+ /// + /// ### Writing to a sub-directory + /// + /// Snapshot supports writing the TTree or RNTuple in a sub-directory inside the TFile. It is sufficient to specify + /// the directory path as part of the TTree or RNTuple name, e.g. `df.Snapshot("subdir/t", "f.root")` writes TTree + /// `t` in the sub-directory `subdir` of file `f.root` (creating file and sub-directory as needed). + /// + /// \attention In multi-thread runs (i.e. when EnableImplicitMT() has been called) threads will loop over clusters of + /// entries in an undefined order, so Snapshot will produce outputs in which (clusters of) entries will be shuffled + /// with respect to the input TTree. Using such "shuffled" TTrees as friends of the original trees would result in + /// wrong associations between entries in the main TTree and entries in the "shuffled" friend. Since v6.22, ROOT will + /// error out if such a "shuffled" TTree is used in a friendship. + /// + /// \note In case no events are written out (e.g. because no event passes all filters), Snapshot will still write the + /// requested output TTree or RNTuple to the file, with all the branches requested to preserve the dataset schema. + /// + /// \note Snapshot will refuse to process columns with names of the form `#columnname`. These are special columns + /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are + /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an + /// Alias(): `df.Alias("nbar", "#bar").Snapshot(..., {"nbar"})`. 
+ /// + /// ### Example invocations: /// - /// ### Example usage: /// ~~~{.cpp} - /// MyClass obj; - /// // Deduce column types (this invocation needs jitting internally, and in this case - /// // MyClass needs to be known to the interpreter) - /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"}); - /// // explicit column types - /// auto myFilledObj = myDf.Fill(obj, {"col0", "col1"}); + /// // No need to specify column types, they are automatically deduced thanks + /// // to information coming from the data source + /// df.Snapshot("outputTree", "outputFile.root", {"x", "y"}); /// ~~~ /// - template - RResultPtr> Fill(T &&model, const ColumnNames_t &columnList) + /// To book a Snapshot without triggering the event loop, one needs to set the appropriate flag in + /// `RSnapshotOptions`: + /// ~~~{.cpp} + /// RSnapshotOptions opts; + /// opts.fLazy = true; + /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts); + /// ~~~ + /// + /// To snapshot to the RNTuple data format, the `fOutputFormat` option in `RSnapshotOptions` needs to be set + /// accordingly: + /// ~~~{.cpp} + /// RSnapshotOptions opts; + /// opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + /// df.Snapshot("outputNTuple", "outputFile.root", {"x"}, opts); + /// ~~~ + /// + /// Snapshot systematic variations resulting from a Vary() call (see details \ref snapshot-with-variations "here"): + /// ~~~{.cpp} + /// RSnapshotOptions opts; + /// opts.fIncludeVariations = true; + /// df.Snapshot("outputTree", "outputFile.root", {"x"}, opts); + /// ~~~ + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + const ColumnNames_t &columnList, + const RSnapshotOptions &options = RSnapshotOptions()) { - auto h = std::make_shared>(std::forward(model)); - if (!RDFInternal::HistoUtils::HasAxisLimits(*h)) { - throw std::runtime_error("The absence of axes limits is not supported yet."); + // like columnList but with `#var` columns removed + auto colListNoPoundSizes = 
RDFInternal::FilterArraySizeColNames(columnList, "Snapshot"); + // like colListNoPoundSizes but with aliases resolved + auto colListNoAliases = GetValidatedColumnNames(colListNoPoundSizes.size(), colListNoPoundSizes); + RDFInternal::CheckForDuplicateSnapshotColumns(colListNoAliases); + // like colListNoAliases and colListNoPoundSizes but with missing size branches required by array branches added in the right positions + const auto pairOfColumnLists = + RDFInternal::AddSizeBranches(GetDataSource(), std::move(colListNoAliases), std::move(colListNoPoundSizes)); + const auto &colListNoAliasesWithSizeBranches = pairOfColumnLists.first; + const auto &colListWithAliasesAndSizeBranches = pairOfColumnLists.second; + + const auto fullTreeName = treename; + const auto parsedTreePath = RDFInternal::ParseTreePath(fullTreeName); + treename = parsedTreePath.fTreeName; + const auto &dirname = parsedTreePath.fDirName; + + ::TDirectory::TContext ctxt; + + RResultPtr> resPtr; + + auto retrieveTypeID = [](const std::string &colName, const std::string &colTypeName, + bool isRNTuple = false) -> const std::type_info * { + try { + return &ROOT::Internal::RDF::TypeName2TypeID(colTypeName); + } catch (const std::runtime_error &err) { + if (isRNTuple) + return &typeid(ROOT::Internal::RDF::UseNativeDataType); + + if (std::string(err.what()).find("Cannot extract type_info of type") != std::string::npos) { + // We could not find RTTI for this column, thus we cannot write it out at the moment. + std::string trueTypeName{colTypeName}; + if (colTypeName.rfind("CLING_UNKNOWN_TYPE", 0) == 0) + trueTypeName = colTypeName.substr(19); + std::string msg{"No runtime type information is available for column \"" + colName + + "\" with type name \"" + trueTypeName + + "\". Thus, it cannot be written to disk with Snapshot.
Make sure to generate and load " + "ROOT dictionaries for the type of this column."}; + + throw std::runtime_error(msg); + } else { + throw; + } + } + }; + + RDFInternal::CheckSnapshotOptionsFormatCompatibility(options); + + if (options.fOutputFormat == ESnapshotOutputFormat::kRNTuple) { + // The data source of the RNTuple resulting from the Snapshot action does not exist yet here, so we create one + // without a data source for now, and set it once the actual data source can be created (i.e., after + // writing the RNTuple). + auto newRDF = std::make_shared>(std::make_shared(colListNoPoundSizes)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, newRDF->GetLoopManager(), GetLoopManager(), true /* fToNTuple */, /*fIncludeVariations=*/false}); + + auto &&nColumns = colListNoAliasesWithSizeBranches.size(); + const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); + + const auto nSlots = fLoopManager->GetNSlots(); + std::vector colTypeIDs; + colTypeIDs.reserve(nColumns); + for (decltype(nColumns) i{}; i < nColumns; i++) { + const auto &colName = validColumnNames[i]; + const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName( + colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec); + const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName, /*isRNTuple*/ true); + colTypeIDs.push_back(colTypeID); + } + // Crucial e.g. 
if the column names do not correspond to already-available column readers created by the data + // source + CheckAndFillDSColumns(validColumnNames, colTypeIDs); + + auto action = + RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs); + resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action)); + } else { + if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS" && + options.fOutputFormat == ESnapshotOutputFormat::kDefault) { + Warning("Snapshot", + "The default Snapshot output data format is TTree, but the input data format is RNTuple. If you " + "want to Snapshot to RNTuple or suppress this warning, set the appropriate fOutputFormat option in " + "RSnapshotOptions. Note that this current default behaviour might change in the future."); + } + + // We create an RLoopManager without a data source. This needs to be initialised when the output TTree dataset + // has actually been created and written to TFile, i.e. at the end of the Snapshot execution. 
+ auto newRDF = std::make_shared>( + std::make_shared(colListNoAliasesWithSizeBranches)); + + auto snapHelperArgs = std::make_shared(RDFInternal::SnapshotHelperArgs{ + std::string(filename), std::string(dirname), std::string(treename), colListWithAliasesAndSizeBranches, + options, newRDF->GetLoopManager(), GetLoopManager(), false /* fToRNTuple */, options.fIncludeVariations}); + + auto &&nColumns = colListNoAliasesWithSizeBranches.size(); + const auto validColumnNames = GetValidatedColumnNames(nColumns, colListNoAliasesWithSizeBranches); + + const auto nSlots = fLoopManager->GetNSlots(); + std::vector colTypeIDs; + colTypeIDs.reserve(nColumns); + for (decltype(nColumns) i{}; i < nColumns; i++) { + const auto &colName = validColumnNames[i]; + const auto colTypeName = ROOT::Internal::RDF::ColumnName2ColumnTypeName( + colName, /*tree*/ nullptr, GetDataSource(), fColRegister.GetDefine(colName), options.fVector2RVec); + const std::type_info *colTypeID = retrieveTypeID(colName, colTypeName); + colTypeIDs.push_back(colTypeID); + } + // Crucial e.g. if the column names do not correspond to already-available column readers created by the data + // source + CheckAndFillDSColumns(validColumnNames, colTypeIDs); + + auto action = + RDFInternal::BuildAction(validColumnNames, snapHelperArgs, nSlots, fProxiedPtr, fColRegister, colTypeIDs); + resPtr = MakeResultPtr(newRDF, *GetLoopManager(), std::move(action)); } - return CreateAction(columnList, h, h, fProxiedPtr, - columnList.size()); + + if (!options.fLazy) + *resPtr; + return resPtr; } + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Return a TStatistic object, filled once per event (*lazy action*). - /// - /// \tparam V The type of the value column - /// \param[in] value The name of the column with the values to fill the statistics with. - /// \return the filled TStatistic object wrapped in a RResultPtr. 
+ /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. + /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. The presence of a '^' at the beginning and a '$' at the end of the string is implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple + /// \return a `RDataFrame` that wraps the snapshotted dataset. /// - /// ### Example usage: - /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto stats0 = myDf.Stats("values"); - /// // Explicit column type - /// auto stats1 = myDf.Stats("values"); - /// ~~~ + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. /// - template - RResultPtr Stats(std::string_view value = "") + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + std::string_view columnNameRegexp = "", + const RSnapshotOptions &options = RSnapshotOptions()) { - ColumnNames_t columns; - if (!value.empty()) { - columns.emplace_back(std::string(value)); + const auto definedColumns = fColRegister.GenerateColumnNames(); + + const auto dsColumns = GetDataSource() ?
ROOT::Internal::RDF::GetTopLevelFieldNames(*GetDataSource()) : ColumnNames_t{}; + // Ignore R_rdf_sizeof_* columns coming from datasources: we don't want to Snapshot those + ColumnNames_t dsColumnsWithoutSizeColumns; + std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns), + [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; }); + ColumnNames_t columnNames; + columnNames.reserve(definedColumns.size() + dsColumnsWithoutSizeColumns.size()); + columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end()); + columnNames.insert(columnNames.end(), dsColumnsWithoutSizeColumns.begin(), dsColumnsWithoutSizeColumns.end()); + + // The only way we can get duplicate entries is if a column coming from a tree or data-source is Redefine'd. + // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. + RDFInternal::RemoveDuplicates(columnNames); + + std::vector selectedColumns; + try { + selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); } - const auto validColumnNames = GetValidatedColumnNames(1, columns); - if (std::is_same::value) { - return Fill(TStatistic(), validColumnNames); - } else { - return Fill(TStatistic(), validColumnNames); + catch (const std::runtime_error &e){ + // No columns were found, try again but consider all input data source columns + if (auto ds = GetDataSource()) + selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot"); + else + throw e; } - } - //////////////////////////////////////////////////////////////////////////// - /// \brief Return a TStatistic object, filled once per event (*lazy action*). - /// - /// \tparam V The type of the value column - /// \tparam W The type of the weight column - /// \param[in] value The name of the column with the values to fill the statistics with. 
- /// \param[in] weight The name of the column with the weights to fill the statistics with. - /// \return the filled TStatistic object wrapped in a RResultPtr. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// // Deduce column types (this invocation needs jitting internally) - /// auto stats0 = myDf.Stats("values", "weights"); - /// // Explicit column types - /// auto stats1 = myDf.Stats("values", "weights"); - /// ~~~ - /// - template - RResultPtr Stats(std::string_view value, std::string_view weight) - { - ColumnNames_t columns{std::string(value), std::string(weight)}; - constexpr auto vIsInferred = std::is_same::value; - constexpr auto wIsInferred = std::is_same::value; - const auto validColumnNames = GetValidatedColumnNames(2, columns); - // We have 3 cases: - // 1. Both types are inferred: we use Fill and let the jit kick in. - // 2. One of the two types is explicit and the other one is inferred: the case is not supported. - // 3. Both types are explicit: we invoke the fully compiled Fill method. - if (vIsInferred && wIsInferred) { - return Fill(TStatistic(), validColumnNames); - } else if (vIsInferred != wIsInferred) { - std::string error("The "); - error += vIsInferred ? "value " : "weight "; - error += "column type is explicit, while the "; - error += vIsInferred ? "weight " : "value "; - error += " is specified to be inferred. This case is not supported: please specify both types or none."; - throw std::runtime_error(error); - } else { - return Fill(TStatistic(), validColumnNames); + if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") { + RDFInternal::RemoveRNTupleSubfields(selectedColumns); } - } - //////////////////////////////////////////////////////////////////////////// - /// \brief Return the minimum of processed column values (*lazy action*). - /// \tparam T The type of the branch/column. - /// \param[in] columnName The name of the branch/column to be treated. - /// \return the minimum value of the selected column wrapped in a RResultPtr. 
- /// - /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct - /// template specialization of this method. - /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto minVal0 = myDf.Min("values"); - /// // Explicit column type - /// auto minVal1 = myDf.Min("values"); - /// ~~~ - /// - template - RResultPtr> Min(std::string_view columnName = "") - { - const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); - using RetType_t = RDFDetail::MinReturnType_t; - auto minV = std::make_shared(std::numeric_limits::max()); - return CreateAction(userColumns, minV, minV, fProxiedPtr); + return Snapshot(treename, filename, selectedColumns, options); } + // clang-format on + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Return the maximum of processed column values (*lazy action*). - /// \tparam T The type of the branch/column. - /// \param[in] columnName The name of the branch/column to be treated. - /// \return the maximum value of the selected column wrapped in a RResultPtr. - /// - /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct - /// template specialization of this method. - /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. + /// \brief Save selected columns to disk, in a new TTree or RNTuple `treename` in file `filename`. 
+ /// \param[in] treename The name of the output TTree or RNTuple. + /// \param[in] filename The name of the output TFile. + /// \param[in] columnList The list of names of the columns/branches to be written. + /// \param[in] options RSnapshotOptions struct with extra options to pass to TFile and TTree/RNTuple. + /// \return a `RDataFrame` that wraps the snapshotted dataset. /// - /// ### Example usage: - /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto maxVal0 = myDf.Max("values"); - /// // Explicit column type - /// auto maxVal1 = myDf.Max("values"); - /// ~~~ + /// This function returns a `RDataFrame` built with the output TTree or RNTuple as a source. + /// The types of the columns are automatically inferred and do not need to be specified. /// - template - RResultPtr> Max(std::string_view columnName = "") + /// See Snapshot(std::string_view, std::string_view, const ColumnNames_t&, const RSnapshotOptions &) for a more complete description and example usages. + RResultPtr> Snapshot(std::string_view treename, std::string_view filename, + std::initializer_list columnList, + const RSnapshotOptions &options = RSnapshotOptions()) { - const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); - using RetType_t = RDFDetail::MaxReturnType_t; - auto maxV = std::make_shared(std::numeric_limits::lowest()); - return CreateAction(userColumns, maxV, maxV, fProxiedPtr); + ColumnNames_t selectedColumns(columnList); + return Snapshot(treename, filename, selectedColumns, options); } + // clang-format on //////////////////////////////////////////////////////////////////////////// - /// \brief Return the mean of processed column values (*lazy action*). - /// \tparam T The type of the branch/column. - /// \param[in] columnName The name of the branch/column to be treated. - /// \return the mean value of the selected column wrapped in a RResultPtr. 
- /// - /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct - /// template specialization of this method. - /// Note that internally, the summations are executed with Kahan sums in double precision, irrespective - /// of the type of column that is read. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto meanVal0 = myDf.Mean("values"); - /// // Explicit column type - /// auto meanVal1 = myDf.Mean("values"); - /// ~~~ + /// \brief Save selected columns in memory. + /// \tparam ColumnTypes variadic list of branch/column types. + /// \param[in] columnList columns to be cached in memory. + /// \return a `RDataFrame` that wraps the cached dataset. /// - template - RResultPtr Mean(std::string_view columnName = "") - { - const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); - auto meanV = std::make_shared(0); - return CreateAction(userColumns, meanV, meanV, fProxiedPtr); - } - - //////////////////////////////////////////////////////////////////////////// - /// \brief Return the unbiased standard deviation of processed column values (*lazy action*). - /// \tparam T The type of the branch/column. - /// \param[in] columnName The name of the branch/column to be treated. - /// \return the standard deviation value of the selected column wrapped in a RResultPtr. + /// This action returns a new `RDataFrame` object, completely detached from + /// the originating `RDataFrame`. The new dataframe only contains the cached + /// columns and stores their content in memory for fast, zero-copy subsequent access. /// - /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct - /// template specialization of this method. 
+ /// Use `Cache` if you know you will only need a subset of the (`Filter`ed) data that + /// fits in memory and that will be accessed many times. /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. + /// \note Cache will refuse to process columns with names of the form `#columnname`. These are special columns + /// made available by some data sources (e.g. RNTupleDS) that represent the size of column `columnname`, and are + /// not meant to be written out with that name (which is not a valid C++ variable name). Instead, go through an + /// Alias(): `df.Alias("nbar", "#bar").Cache(..., {"nbar"})`. /// /// ### Example usage: + /// + /// **Types and columns specified:** /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto stdDev0 = myDf.StdDev("values"); - /// // Explicit column type - /// auto stdDev1 = myDf.StdDev("values"); + /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"}); /// ~~~ /// - template - RResultPtr StdDev(std::string_view columnName = "") - { - const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); - auto stdDeviationV = std::make_shared(0); - return CreateAction(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr); - } - - // clang-format off - //////////////////////////////////////////////////////////////////////////// - /// \brief Return the sum of processed column values (*lazy action*). - /// \tparam T The type of the branch/column. - /// \param[in] columnName The name of the branch/column. - /// \param[in] initValue Optional initial value for the sum. If not present, the column values must be default-constructible. - /// \return the sum of the selected column wrapped in a RResultPtr. - /// - /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct - /// template specialization of this method. 
- /// If the type of the column is inferred, the return type is `double`, the type of the column otherwise. - /// - /// This action is *lazy*: upon invocation of this method the calculation is - /// booked but not executed. Also see RResultPtr. - /// - /// ### Example usage: + /// **Types inferred and columns specified (this invocation relies on jitting):** /// ~~~{.cpp} - /// // Deduce column type (this invocation needs jitting internally) - /// auto sum0 = myDf.Sum("values"); - /// // Explicit column type - /// auto sum1 = myDf.Sum("values"); + /// auto cache_some_cols_df = df.Cache({"col0", "col1", "col2"}); /// ~~~ /// - template - RResultPtr> - Sum(std::string_view columnName = "", - const RDFDetail::SumReturnType_t &initValue = RDFDetail::SumReturnType_t{}) + /// **Types inferred and columns selected with a regexp (this invocation relies on jitting):** + /// ~~~{.cpp} + /// auto cache_all_cols_df = df.Cache(myRegexp); + /// ~~~ + template + RInterface Cache(const ColumnNames_t &columnList) { - const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); - auto sumV = std::make_shared>(initValue); - return CreateAction(userColumns, sumV, sumV, fProxiedPtr); + auto staticSeq = std::make_index_sequence(); + return CacheImpl(columnList, staticSeq); } - // clang-format on //////////////////////////////////////////////////////////////////////////// - /// \brief Gather filtering statistics. - /// \return the resulting `RCutFlowReport` instance wrapped in a RResultPtr. - /// - /// Calling `Report` on the main `RDataFrame` object gathers stats for - /// all named filters in the call graph. Calling this method on a - /// stored chain state (i.e. a graph node different from the first) gathers - /// the stats for all named filters in the chain section between the original - /// `RDataFrame` and that node (included). Stats are gathered in the same - /// order as the named filters have been added to the graph. 
- /// A RResultPtr is returned to allow inspection of the - /// effects cuts had. - /// - /// This action is *lazy*: upon invocation of - /// this method the calculation is booked but not executed. See RResultPtr - /// documentation. - /// - /// ### Example usage: - /// ~~~{.cpp} - /// auto filtered = d.Filter(cut1, {"b1"}, "Cut1").Filter(cut2, {"b2"}, "Cut2"); - /// auto cutReport = filtered3.Report(); - /// cutReport->Print(); - /// ~~~ + /// \brief Save selected columns in memory. + /// \param[in] columnList columns to be cached in memory + /// \return a `RDataFrame` that wraps the cached dataset. /// - RResultPtr Report() + /// See the previous overloads for more information. + RInterface Cache(const ColumnNames_t &columnList) { - bool returnEmptyReport = false; - // if this is a RInterface on which `Define` has been called, users - // are calling `Report` on a chain of the form LoopManager->Define->Define->..., which - // certainly does not contain named filters. - // The number 4 takes into account the implicit columns for entry and slot number - // and their aliases (2 + 2, i.e. {r,t}dfentry_ and {r,t}dfslot_) - if (std::is_same::value && fColRegister.GenerateColumnNames().size() > 4) - returnEmptyReport = true; + // Early return: if the list of columns is empty, just return an empty RDF + // If we proceed, the jitted call will not compile! 
+ if (columnList.empty()) { + auto nEntries = *this->Count(); + RInterface emptyRDF(std::make_shared(nEntries)); + return emptyRDF; + } - auto rep = std::make_shared(); - using Helper_t = RDFInternal::ReportHelper; - using Action_t = RDFInternal::RAction; + std::stringstream cacheCall; + auto upcastNode = RDFInternal::UpcastNode(fProxiedPtr); + RInterface> upcastInterface(fProxiedPtr, *fLoopManager, + fColRegister); + // build a string equivalent to + // "(RInterface*)(this)->Cache(*(ColumnNames_t*)(&columnList))" + RInterface resRDF(std::make_shared(0)); + cacheCall << "*reinterpret_cast*>(" + << RDFInternal::PrettyPrintAddr(&resRDF) + << ") = reinterpret_cast*>(" + << RDFInternal::PrettyPrintAddr(&upcastInterface) << ")->Cache<"; - auto action = std::make_unique(Helper_t(rep, fProxiedPtr.get(), returnEmptyReport), ColumnNames_t({}), - fProxiedPtr, RDFInternal::RColumnRegister(fColRegister)); + const auto columnListWithoutSizeColumns = RDFInternal::FilterArraySizeColNames(columnList, "Cache"); - return MakeResultPtr(rep, *fLoopManager, std::move(action)); - } + const auto validColumnNames = + GetValidatedColumnNames(columnListWithoutSizeColumns.size(), columnListWithoutSizeColumns); + const auto colTypes = + GetValidatedArgTypes(validColumnNames, fColRegister, nullptr, GetDataSource(), "Cache", /*vector2RVec=*/false); + for (const auto &colType : colTypes) + cacheCall << colType << ", "; + if (!columnListWithoutSizeColumns.empty()) + cacheCall.seekp(-2, cacheCall.cur); // remove the last ", + cacheCall << ">(*reinterpret_cast*>(" // vector should be ColumnNames_t + << RDFInternal::PrettyPrintAddr(&columnListWithoutSizeColumns) << "));"; + + // book the code to jit with the RLoopManager and trigger the event loop + fLoopManager->ToJitExec(cacheCall.str()); + fLoopManager->Jit(); + return resRDF; + } //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. 
- /// \tparam ColumnTypes variadic list of branch/column types. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. - /// - /// This function returns a `RResultPtr` containing all the entries to be displayed, organized in a tabular - /// form. RDisplay will either print on the standard output a summarized version through `RDisplay::Print()` or will - /// return a complete version through `RDisplay::AsString()`. - /// - /// This action is *lazy*: upon invocation of this method the calculation is booked but not executed. Also see - /// RResultPtr. + /// \brief Save selected columns in memory. + /// \param[in] columnNameRegexp The regular expression to match the column names to be selected. A '^' at the beginning and a '$' at the end of the string are implicitly assumed if they are not specified. The dialect supported is PCRE via the TPRegexp class. An empty string signals the selection of all columns. + /// \return a `RDataFrame` that wraps the cached dataset. /// - /// Example usage: - /// ~~~{.cpp} - /// // Preparing the RResultPtr object with all columns and default number of entries - /// auto d1 = rdf.Display(""); - /// // Preparing the RResultPtr object with two columns and 128 entries - /// auto d2 = d.Display({"x", "y"}, 128); - /// // Printing the short representations, the event loop will run - /// d1->Print(); - /// d2->Print(); - /// ~~~ - template - RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// The existing columns are matched against the regular expression. If the string provided + /// is empty, all columns are selected. See the previous overloads for more information.
+ RInterface Cache(std::string_view columnNameRegexp = "") { - CheckIMTDisabled("Display"); - auto newCols = columnList; - newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column - auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); - using displayHelperArgs_t = std::pair>; - // Need to add ULong64_t type corresponding to the first column rdfentry_ - return CreateAction( - std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr); + const auto definedColumns = fColRegister.GenerateColumnNames(); + const auto dsColumns = GetDataSource() ? GetDataSource()->GetColumnNames() : ColumnNames_t{}; + // Collect the datasource columns without the R_rdf_sizeof_* entries. NOTE(review): this filtered list is not used below (the unfiltered dsColumns is inserted) -- confirm intent + ColumnNames_t dsColumnsWithoutSizeColumns; + std::copy_if(dsColumns.begin(), dsColumns.end(), std::back_inserter(dsColumnsWithoutSizeColumns), + [](const std::string &name) { return name.size() < 13 || name.substr(0, 13) != "R_rdf_sizeof_"; }); + ColumnNames_t columnNames; + columnNames.reserve(definedColumns.size() + dsColumns.size()); + columnNames.insert(columnNames.end(), definedColumns.begin(), definedColumns.end()); + columnNames.insert(columnNames.end(), dsColumns.begin(), dsColumns.end()); + const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Cache"); + return Cache(selectedColumns); } //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. - /// - /// This overload automatically infers the column types. - /// See the previous overloads for further details.
+ /// \brief Save selected columns in memory. + /// \param[in] columnList columns to be cached in memory. + /// \return a `RDataFrame` that wraps the cached dataset. /// - /// Invoked when no types are specified to Display - RResultPtr Display(const ColumnNames_t &columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// See the previous overloads for more information. + RInterface Cache(std::initializer_list columnList) { - CheckIMTDisabled("Display"); - auto newCols = columnList; - newCols.insert(newCols.begin(), "rdfentry_"); // Artificially insert first column - auto displayer = std::make_shared(newCols, GetColumnTypeNamesList(newCols), nMaxCollectionElements); - using displayHelperArgs_t = std::pair>; - return CreateAction( - std::move(newCols), displayer, std::make_shared(nRows, displayer), fProxiedPtr, - columnList.size() + 1); + ColumnNames_t selectedColumns(columnList); + return Cache(selectedColumns); } + + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnNameRegexp A regular expression to select the columns. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Maximum number of collection elements to display per row. - /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// \brief Execute a user-defined function on each entry (*instant action*). + /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations. + /// \param[in] columns Names of the columns/branches in input to the user function. /// - /// The existing columns are matched against the regular expression. If the string provided - /// is empty, all columns are selected. - /// See the previous overloads for further details. 
- RResultPtr - Display(std::string_view columnNameRegexp = "", size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// The callable `f` is invoked once per entry. This is an *instant action*: + /// upon invocation, an event loop as well as execution of all scheduled actions + /// is triggered. + /// Users are responsible for the thread-safety of this callable when executing + /// with implicit multi-threading enabled (i.e. ROOT::EnableImplicitMT). + /// + /// ### Example usage: + /// ~~~{.cpp} + /// myDf.Foreach([](int i){ std::cout << i << std::endl;}, {"myIntColumn"}); + /// ~~~ + // clang-format on + template + void Foreach(F f, const ColumnNames_t &columns = {}) { - const auto columnNames = GetColumnNames(); - const auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Display"); - return Display(selectedColumns, nRows, nMaxCollectionElements); + using arg_types = typename TTraits::CallableTraits::arg_types_nodecay; + using ret_type = typename TTraits::CallableTraits::ret_type; + ForeachSlot(RDFInternal::AddSlotParameter(f, arg_types()), columns); } + // clang-format off //////////////////////////////////////////////////////////////////////////// - /// \brief Provides a representation of the columns in the dataset. - /// \param[in] columnList Names of the columns to be displayed. - /// \param[in] nRows Number of events for each column to be displayed. - /// \param[in] nMaxCollectionElements Number of maximum elements in collection. - /// \return the `RDisplay` instance wrapped in a RResultPtr. + /// \brief Execute a user-defined function requiring a processing slot index on each entry (*instant action*). + /// \param[in] f Function, lambda expression, functor class or any other callable object performing user defined calculations. + /// \param[in] columns Names of the columns/branches in input to the user function. /// - /// See the previous overloads for further details. 
- RResultPtr - Display(std::initializer_list columnList, size_t nRows = 5, size_t nMaxCollectionElements = 10) + /// Same as `Foreach`, but the user-defined function takes an extra + /// `unsigned int` as its first parameter, the *processing slot index*. + /// This *slot index* will be assigned a different value, `0` to `poolSize - 1`, + /// for each thread of execution. + /// This is meant as a helper in writing thread-safe `Foreach` + /// actions when using `RDataFrame` after `ROOT::EnableImplicitMT()`. + /// The user-defined processing callable is able to follow different + /// *streams of processing* indexed by the first parameter. + /// `ForeachSlot` works just as well with single-thread execution: in that + /// case `slot` will always be `0`. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// myDf.ForeachSlot([](unsigned int s, int i){ std::cout << "Slot " << s << ": "<< i << std::endl;}, {"myIntColumn"}); + /// ~~~ + // clang-format on + template + void ForeachSlot(F f, const ColumnNames_t &columns = {}) { - ColumnNames_t selectedColumns(columnList); - return Display(selectedColumns, nRows, nMaxCollectionElements); + using ColTypes_t = TypeTraits::RemoveFirstParameter_t::arg_types>; + constexpr auto nColumns = ColTypes_t::list_size; + + const auto validColumnNames = GetValidatedColumnNames(nColumns, columns); + CheckAndFillDSColumns(validColumnNames, ColTypes_t()); + + using Helper_t = RDFInternal::ForeachSlotHelper; + using Action_t = RDFInternal::RAction; + + auto action = std::make_unique(Helper_t(std::move(f)), validColumnNames, fProxiedPtr, fColRegister); + + fLoopManager->Run(); } /// \} - // End of the doxygen group for actions + // End of doxygen group for immediate actions // ---------------------------------------------------------------------------------------- /// \brief Returns the names of the filters created.