Implemented get_top_n_[data|view]()

hosseinmoein · Jul 17, 2024 · cfe5d14 · cfe5d14
1 parent d4ece48
commit cfe5d14
Show file tree

Hide file tree

Showing 9 changed files with 461 additions and 25 deletions.
diff --git a/docs/HTML/get_top_n_data.html b/docs/HTML/get_top_n_data.html
diff --git a/docs/HTML/join_by_column.html b/docs/HTML/join_by_column.html
@@ -76,7 +76,7 @@
                join_policy jp) const;
         </B></PRE></font>
       </td>
-      <td WIDTH="33.3%">
+      <td>
         It joins the data between self (lhs) and rhs and returns the joined data in a StdDataFrame, based on specification in join_policy.<BR>
         The returned DataFrame is indexed by a sequence of unsigned long from 0 to N. The returned DataFrame will at least have two columns names lhs.INDEX and rhs.INDEX containing the lhs and rhs indices based on join policy.<BR>
         The following conditions must be met for this method<BR>
@@ -87,7 +87,7 @@
           <LI>In both lhs and rhs, columns with the same name must have the same type</LI>
         </OL>
       </td>
-      <td WIDTH="33.3%">
+      <td width="38%">
         <B>RHS_T</B>: Type of DataFrame rhs<BR>
         <B>T</B>: Type of the named column<BR>
         <B>Ts</B>: List all the types of all data columns. A type should be specified in the list only once.<BR>
@@ -105,18 +105,18 @@
 join_by_index(const RHS_T &amp;rhs, join_policy jp) const;
         </B></PRE></font>
       </td>
-      <td WIDTH="33.3%">
+      <td>
         It joins the data between self (lhs) and rhs and returns the joined data<BR>
         in a StdDataFrame, based on specification in join_policy.<BR>
         The following conditions must be met for this method<BR>
         to compile and work properly:<BR>
         <OL>
           <LI>I type must be the same between lhs and rhs.</LI>
           <LI>Ordering (&lt; &gt; != ==) must be well defined for type I</LI>
-          <LI>In both lhs and rhs, columns with the same name must have the same Type</LI>
+          <LI>In both lhs and rhs, columns with the same name must have the same type</LI>
         </OL>
       </td>
-      <td WIDTH="33.3%">
+      <td width="38%">
         <B>RHS_T</B>: Type of DataFrame rhs<BR>
         <B>Ts</B>: List all the types of all data columns. A type should be specified in the list only once.<BR>
         <B>rhs</B>: The rhs DataFrame<BR>

diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -2785,6 +2785,39 @@ class   DataFrame : public ThreadGranularity {
     [[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
     get_data(const StlVecType<const char *> &col_names) const;
 
+    // This returns a new DataFrame with the n top rows of the given column.
+    // The returned DataFrame rows will be in the same order as self.
+    //
+    // NOTE: Comparison operators (<, >, ==) must be well defined for type T.
+    //
+    // T:
+    //   Type of the named column
+    // Ts:
+    //   List all the types of all data columns. A type should be specified in
+    //   the list only once.
+    // col_name:
+    //   Name of the given column
+    // n:
+    //   Number of top rows
+    //
+    template<typename T, typename ... Ts>
+    [[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
+    get_top_n_data(const char *col_name, size_type n) const;
+
+    // Same as above but it returns a View with the n top rows of
+    // the given column.
+    //
+    template<typename T, typename ... Ts>
+    [[nodiscard]] PtrView
+    get_top_n_view(const char *col_name, size_type n);
+
+    // Same as above but it returns a const View with the n top rows of
+    // the given column.
+    //
+    template<typename T, typename ... Ts>
+    [[nodiscard]] ConstPtrView
+    get_top_n_view(const char *col_name, size_type n) const;
+
     // This returns a new DataFrame with the same index column as self and an
     // integer column with the same name for each column in self.
     // The integer columns in returned DataFrame show a duplication mask for

diff --git a/include/DataFrame/DataFrameStatsVisitors.h b/include/DataFrame/DataFrameStatsVisitors.h
@@ -814,7 +814,7 @@ struct  NExtremumVisitor  {
     operator() (K idx_begin, K /*idx_end*/, H column_begin, H column_end)  {
 
 #ifdef HMDF_SANITY_EXCEPTIONS
-        if (std::distance(column_begin, column_end) < n_)
+        if (size_type(std::distance(column_begin, column_end)) < n_)
             throw DataFrameError("NExtremumVisitor: column size must be >= N");
 #endif // HMDF_SANITY_EXCEPTIONS
 

diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc
@@ -2540,6 +2540,54 @@ get_view(const StlVecType<const char *> &col_names) const  {
 
 // ----------------------------------------------------------------------------
 
+// Returns a new DataFrame filled by top_n_common_(), which is driven by an
+// NLargestVisitor<T, I> constructed with n and run over the column called
+// name.
+//
+// T:
+//   Type of the column called name
+// Ts:
+//   Types forwarded to top_n_common_()
+// name:
+//   Name of the column the visitor runs over
+// n:
+//   Value the visitor is constructed with
+//
+template<typename I, typename H>
+template<typename T, typename ... Ts>
+DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
+get_top_n_data(const char *name, size_type n) const  {
+
+    using largest_t = NLargestVisitor<T, I>;
+    using frame_t = DataFrame<I, HeteroVector<align_value>>;
+
+    frame_t top_rows;
+
+    top_n_common_<T, largest_t, frame_t, Ts ...>
+        (name, largest_t { n }, top_rows);
+    return (top_rows);
+}
+
+// ----------------------------------------------------------------------------
+
+// Non-const flavor: returns a PtrView filled by top_n_common_(), which is
+// driven by an NLargestVisitor<T, I> constructed with n and run over the
+// column called name.
+//
+// T:
+//   Type of the column called name
+// Ts:
+//   Types forwarded to top_n_common_()
+//
+template<typename I, typename H>
+template<typename T, typename ... Ts>
+typename DataFrame<I, H>::PtrView DataFrame<I, H>::
+get_top_n_view(const char *name, size_type n)  {
+
+    using largest_t = NLargestVisitor<T, I>;
+    using view_t = PtrView;
+
+    view_t  top_rows;
+
+    top_n_common_<T, largest_t, view_t, Ts ...>
+        (name, largest_t { n }, top_rows);
+    return (top_rows);
+}
+
+// ----------------------------------------------------------------------------
+
+// Const flavor: returns a ConstPtrView filled by top_n_common_(), which is
+// driven by an NLargestVisitor<T, I> constructed with n and run over the
+// column called name.
+//
+// T:
+//   Type of the column called name
+// Ts:
+//   Types forwarded to top_n_common_()
+//
+template<typename I, typename H>
+template<typename T, typename ... Ts>
+typename DataFrame<I, H>::ConstPtrView DataFrame<I, H>::
+get_top_n_view(const char *name, size_type n) const  {
+
+    using largest_t = NLargestVisitor<T, I>;
+    using view_t = ConstPtrView;
+
+    view_t  top_rows;
+
+    top_n_common_<T, largest_t, view_t, Ts ...>
+        (name, largest_t { n }, top_rows);
+    return (top_rows);
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename I, typename H>
 template<hashable_equal ... Ts>
 DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::

diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc
@@ -735,9 +735,9 @@ operator() (T &vec)  {
 
     using VecType = typename std::remove_reference<T>::type;
     using ValueType = typename VecType::value_type;
-    using ViewType = typename DF::template ColumnVecType<ValueType>;
+    using ViewColType = typename DF::template ColumnVecType<ValueType>;
 
-    ViewType        new_col;
+    ViewColType     new_col;
     const size_type vec_size = vec.size();
 
     new_col.reserve(std::min(sel_indices.size(), vec_size));

diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h
@@ -463,7 +463,7 @@ fill_missing_lagrange_(ColumnVecType<T> &vec,
             vec[k] = y;
             count += 1;
         }
-	}
+    }
     return;
 }
 
@@ -602,7 +602,7 @@ join_helper_common_(
                  &rhs = std::as_const(rhs),
                  &joined_index_idx = std::as_const(joined_index_idx),
                  &result] () -> void  {
-                    index_join_functor_common_<res_t, RHS_T, Ts ...>    functor(
+                    index_join_functor_common_<res_t, RHS_T, Ts ...>   functor(
                         name.c_str(),
                         rhs,
                         joined_index_idx,
@@ -836,8 +836,8 @@ data_by_sel_common_(const StlVecType<size_type> &col_indices,
         for (const auto &[name, idx] : column_list_) [[likely]]  {
             sel_load_functor_<res_t, size_type, Ts ...> functor(name.c_str(),
                                                                 col_indices,
-                                                                 idx_s,
-                                                                 ret_df);
+                                                                idx_s,
+                                                                ret_df);
 
             data_[idx].change(functor);
         }
@@ -912,6 +912,62 @@ view_by_sel_common_(const StlVecType<size_type> &col_indices,
 
 // ----------------------------------------------------------------------------
 
+// Shared implementation behind get_top_n_data() and get_top_n_view().
+// It runs visitor over the column called col_name (or over the index when
+// col_name equals DF_INDEX_COL_NAME). It then loads into result the index
+// entries and the column rows at the positions the visitor reported.
+//
+// T:
+//   Type of the column called col_name
+// V:
+//   Type of the visitor. This code calls pre(), operator(), post(),
+//   sort_by_index_idx() and get_result() on it, and it reads index_idx from
+//   each item of the result
+// R:
+//   Type of result; either DataFrame<I, HeteroVector<align_value>> or a view
+// Ts:
+//   Types handed to the column loading functors
+// col_name:
+//   Name of the column (or DF_INDEX_COL_NAME) the visitor runs over
+// visitor:
+//   The visitor instance that selects the rows
+// result:
+//   Output; its indices_ is replaced and its columns are loaded
+//
+template<typename T, typename V, typename R, typename ... Ts>
+void top_n_common_(const char *col_name, V &&visitor, R &result) const  {
+
+    // True when rows are copied into a DataFrame. Otherwise result is treated
+    // as a view.
+    //
+    constexpr bool  by_value =
+        std::is_same_v<R, DataFrame<I, HeteroVector<align_value>>>;
+
+    const ColumnVecType<T>  *vec { nullptr };
+
+    // NOTE(review): The index is reinterpreted as a column of T without any
+    // check. This looks safe only when T is the index type -- confirm at the
+    // call sites.
+    //
+    if (! ::strcmp(col_name, DF_INDEX_COL_NAME))
+        vec = reinterpret_cast<const ColumnVecType<T> *>(&(get_index()));
+    else
+        vec = &(get_column<T>(col_name));
+
+    visitor.pre();
+    visitor(indices_.begin(), indices_.end(), vec->begin(), vec->end());
+    visitor.post();
+    visitor.sort_by_index_idx();
+
+    const auto                  &top_items = visitor.get_result();
+    typename R::IndexVecType    new_index;
+    StlVecType<size_type>       idxs;
+
+    new_index.reserve(top_items.size());
+    idxs.reserve(top_items.size());
+    for (const auto &item : top_items)  {
+        if constexpr (by_value)
+            new_index.push_back(indices_[item.index_idx]);
+        else  // Views store the address of the index entry in self
+            new_index.push_back(
+                &(const_cast<DataFrame *>(this)->indices_[item.index_idx]));
+        idxs.push_back(item.index_idx);
+    }
+    result.indices_ = std::move(new_index);
+
+    const SpinGuard guard(lock_);
+
+    // Load every column of self into result, restricted to the selected
+    // positions in idxs.
+    //
+    for (const auto &[name, idx] : column_list_) [[likely]]  {
+        if constexpr (by_value)  {
+            sel_load_functor_<R, size_type, Ts ...> functor(
+                name.c_str(), idxs, 0, result);
+
+            data_[idx].change(functor);
+        }
+        else  {  // Views
+            sel_load_view_functor_<size_type, R, Ts ...>    functor(
+                name.c_str(), idxs, 0, result);
+
+            data_[idx].change(functor);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename V, typename T>
 inline static void
 replace_vector_vals_(V &data_vec,

diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc
@@ -4475,6 +4475,97 @@ static void test_duplication_mask()  {
 
 // ----------------------------------------------------------------------------
 
+// Exercises get_top_n_data() and get_top_n_view() on both a DataFrame and a
+// view of it. The data results are checked against expected rows. The view
+// results are only written to std::cout.
+//
+static void test_get_top_n_data()  {
+
+    std::cout << "\nTesting get_top_n_data( ) ..." << std::endl;
+
+    StlVecType<unsigned long>   idx =
+        { 123450, 123451, 123452, 123453, 123454, 123455, 123456,
+          123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType<double>          d1 =
+        { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>          d2 =
+        { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>          d3 =
+        { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0, 10 };
+    StlVecType<int>             i1 = { 22, 23, 24, 25, 99 };
+    MyDataFrame                 df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1));
+
+    // Every col_1 value is below 100, so the view selects all rows of df
+    //
+    auto    lbd =
+        [](const unsigned long &, const double &val) -> bool {
+            return (val < 100.0);
+        };
+    auto    view =
+        df.get_view_by_sel<double, decltype(lbd), double, int, std::string>
+            ("col_1", lbd);
+
+    auto    res1 =
+        df.get_top_n_data<double, int, double, std::string>("col_3", 4);
+    auto    res2 =
+        view.get_top_n_data<double, int, double, std::string>("col_3", 4);
+    auto    res3 =
+        df.get_top_n_view<double, int, double, std::string>("col_3", 4);
+    auto    res4 =
+        view.get_top_n_view<double, int, double, std::string>("col_3", 4);
+
+    // The first template argument must be the type of the selected column.
+    // The index here holds unsigned long values, so that is the type used.
+    //
+    auto    res5 =
+        view.get_top_n_data<unsigned long, int, double, std::string>
+            (DF_INDEX_COL_NAME, 4);
+
+    {
+        StlVecType<unsigned long>   out_idx =
+            { 123453, 123454, 123456, 123462 };
+        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
+        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
+        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };
+
+        assert(res1.get_index() == out_idx);
+        assert(res1.get_column<double>("col_2") == out_col_2);
+        assert(res1.get_column<double>("col_3") == out_col_3);
+        assert(res1.get_column<int>("col_4") == out_col_4);
+    }
+    {
+        StlVecType<unsigned long>   out_idx =
+            { 123453, 123454, 123456, 123462 };
+        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
+        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
+        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };
+
+        assert(res2.get_index() == out_idx);
+        assert(res2.get_column<double>("col_2") == out_col_2);
+        assert(res2.get_column<double>("col_3") == out_col_3);
+        assert(res2.get_column<int>("col_4") == out_col_4);
+    }
+
+    res3.write<std::ostream, double, int, std::string>
+        (std::cout, io_format::csv);
+    std::cout << std::endl;
+
+    res4.write<std::ostream, double, int, std::string>
+        (std::cout, io_format::csv);
+    std::cout << std::endl;
+
+    {
+        StlVecType<unsigned long>   out_idx =
+            { 123460, 123461, 123462, 123466 };
+        StlVecType<double>          out_col_2 = { 30, 31, 32, 1.89 };
+        StlVecType<double>          out_col_3 = { 2.3, 0.34, 19, 10 };
+        StlVecType<int>             out_col_4 = { 0, 0, 0, 0 };
+
+        assert(res5.get_index() == out_idx);
+        assert(res5.get_column<double>("col_2") == out_col_2);
+        assert(res5.get_column<double>("col_3") == out_col_3);
+        assert(res5.get_column<int>("col_4") == out_col_4);
+    }
+}
+
+// -----------------------------------------------------------------------------
+
 int main(int, char *[]) {
 
     MyDataFrame::set_optimum_thread_level();
@@ -4563,6 +4654,7 @@ int main(int, char *[]) {
     test_change_freq_2();
     test_change_freq_3();
     test_duplication_mask();
+    test_get_top_n_data();
 
     return (0);
 }