diff --git a/docs/HTML/get_top_n_data.html b/docs/HTML/get_top_n_data.html new file mode 100644 index 00000000..442f01c1 --- /dev/null +++ b/docs/HTML/get_top_n_data.html @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Signature Description Parameters
+

+template<typename T, typename ... Ts>
+DataFrame
+get_top_n_data(const char *col_name, size_type n) const;
+        
+
+ This returns a new DataFrame containing the n rows that have the largest values in the given column. The returned DataFrame rows will be in the same order as self.

+ NOTE: Comparison operators (<, >, ==) must be well defined for type T.
+
+ T: Type of the named column
+ Ts: The list of types for all columns. A type should be specified only once
+ col_name: Name of the data column
+ n: Number of top rows
+
+

+template<typename T, typename ... Ts>
+PtrView
+get_top_n_view(const char *col_name, size_type n);
+        
+
+ This is identical to get_top_n_data() above, but:
+
    +
  1. The result is a view
  2. +
  3. Since the result is a view, you cannot call make_consistent() on the result.
  4. +
+ NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.
+
+ T: Type of the named column
+ Ts: The list of types for all columns. A type should be specified only once
+ col_name: Name of the data column
+ n: Number of top rows
+
+

+template<typename T, typename ... Ts>
+ConstPtrView
+get_top_n_view(const char *col_name, size_type n) const;
+        
+
+ Same as above view, but it returns a const view. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view. + + T: Type of the named column
+ Ts: The list of types for all columns. A type should be specified only once
+ col_name: Name of the data column
+ n: Number of top rows
+
+ +
static void test_get_top_n_data()  {
+
+    // Exercises get_top_n_data() and get_top_n_view() on a DataFrame and on
+    // a view of that DataFrame. The data-returning calls are checked with
+    // asserts; the view-returning calls are only written to std::cout.
+
+    std::cout << "\nTesting get_top_n_data( ) ..." << std::endl;
+
+    StlVecType<unsigned long>   idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
+    StlVecType<double>          d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    StlVecType<double>          d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
+    StlVecType<double>          d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0, 10 };
+    // col_4 is deliberately shorter than the index (5 values vs. 14 rows).
+    // The expected results below show 0 for rows past its original length.
+    StlVecType<int>             i1 = { 22, 23, 24, 25, 99 };
+    MyDataFrame                 df;
+
+    df.load_data(std::move(idx),
+                 std::make_pair("col_1", d1),
+                 std::make_pair("col_2", d2),
+                 std::make_pair("col_3", d3),
+                 std::make_pair("col_4", i1));
+
+    // Every col_1 value is below 100, so this selection keeps all rows; it
+    // exists only to obtain a view on which the same calls can be repeated.
+    auto    lbd = [](const unsigned long &, const double &val) -> bool {
+                      return (val < 100.0);
+                  };
+    auto    view = df.get_view_by_sel<double, decltype(lbd), double, int, std::string>("col_1", lbd);
+
+    // Top 4 rows ranked by col_3, from the DataFrame and from the view.
+    auto    res1 = df.get_top_n_data<double, int, double, std::string>("col_3", 4);
+    auto    res2 = view.get_top_n_data<double, int, double, std::string>("col_3", 4);
+    auto    res3 = df.get_top_n_view<double, int, double, std::string>("col_3", 4);
+    auto    res4 = view.get_top_n_view<double, int, double, std::string>("col_3", 4);
+    // Top 4 rows ranked by the index column. T is the type of the named
+    // column, so it must be the index element type (unsigned long here).
+    auto    res5 = view.get_top_n_data<unsigned long, int, double, std::string>(DF_INDEX_COL_NAME, 4);
+
+    // The four largest col_3 values are 21, 19, 19 and 18; the result keeps
+    // them in original row order rather than in ranked order.
+    {
+        StlVecType<unsigned long>   out_idx = { 123453, 123454, 123456, 123462 };
+        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
+        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
+        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };
+
+        assert(res1.get_index() == out_idx);
+        assert(res1.get_column<double>("col_2") == out_col_2);
+        assert(res1.get_column<double>("col_3") == out_col_3);
+        assert(res1.get_column<int>("col_4") == out_col_4);
+    }
+    // The same call made through the view must give the same rows.
+    {
+        StlVecType<unsigned long>   out_idx = { 123453, 123454, 123456, 123462 };
+        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
+        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
+        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };
+
+        assert(res2.get_index() == out_idx);
+        assert(res2.get_column<double>("col_2") == out_col_2);
+        assert(res2.get_column<double>("col_3") == out_col_3);
+        assert(res2.get_column<int>("col_4") == out_col_4);
+    }
+
+    // The view results are not asserted here; they are printed as CSV.
+    res3.write<std::ostream, double, int, std::string>(std::cout, io_format::csv);
+    std::cout << std::endl;
+
+    res4.write<std::ostream, double, int, std::string>(std::cout, io_format::csv);
+    std::cout << std::endl;
+
+    // Ranking by the index selects the four largest index values.
+    {
+        StlVecType<unsigned long>   out_idx = { 123460, 123461, 123462, 123466 };
+        StlVecType<double>          out_col_2 = { 30, 31, 32, 1.89 };
+        StlVecType<double>          out_col_3 = { 2.3, 0.34, 19, 10 };
+        StlVecType<int>             out_col_4 = { 0, 0, 0, 0 };
+
+        assert(res5.get_index() == out_idx);
+        assert(res5.get_column<double>("col_2") == out_col_2);
+        assert(res5.get_column<double>("col_3") == out_col_3);
+        assert(res5.get_column<int>("col_4") == out_col_4);
+    }
+}
+
+ +
C++ DataFrame + + + + + diff --git a/docs/HTML/join_by_column.html b/docs/HTML/join_by_column.html index b09611a8..96ea1a65 100644 --- a/docs/HTML/join_by_column.html +++ b/docs/HTML/join_by_column.html @@ -76,7 +76,7 @@ join_policy jp) const; - + It joins the data between self (lhs) and rhs and returns the joined data in a StdDataFrame, based on specification in join_policy.
The returned DataFrame is indexed by a sequence of unsigned long from 0 to N. The returned DataFrame will at least have two columns named lhs.INDEX and rhs.INDEX containing the lhs and rhs indices based on join policy.
The following conditions must be met for this method
@@ -87,7 +87,7 @@
  • In both lhs and rhs, columns with the same name must have the same type
  • - + RHS_T: Type of DataFrame rhs
    T: Type of the named column
    Ts: List all the types of all data columns. A type should be specified in the list only once.
    @@ -105,7 +105,7 @@ join_by_index(const RHS_T &rhs, join_policy jp) const; - + It joins the data between self (lhs) and rhs and returns the joined data
    in a StdDataFrame, based on specification in join_policy.
    The following conditions must be met for this method
    @@ -113,10 +113,10 @@
    1. I type must be the same between lhs and rhs.
    2. Ordering (< > != ==) must be well defined for type I
    3. -
    4. In both lhs and rhs, columns with the same name must have the same Type
    5. +
    6. In both lhs and rhs, columns with the same name must have the same type
    - + RHS_T: Type of DataFrame rhs
    Ts: List all the types of all data columns. A type should be specified in the list only once.
    rhs: The rhs DataFrame
    diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index cbe13582..1ccef727 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -2785,6 +2785,39 @@ class DataFrame : public ThreadGranularity { [[nodiscard]] DataFrame> get_data(const StlVecType &col_names) const; + // This returns a new DataFrame with the n top rows of the given column. + // The returned DataFrame rows will be in the same order as self. + // + // NOTE: Comparison operators (<, >, ==) must be well defined for type T. + // + // T: + // Type of the named column + // Ts: + // List all the types of all data columns. A type should be specified in + // the list only once. + // col_name: + // Name of the given column + // n: + // Number of top rows + // + template + [[nodiscard]] DataFrame> + get_top_n_data(const char *col_name, size_type n) const; + + // Same as above but it returns a View with the n top rows of + // the given column. + // + template + [[nodiscard]] PtrView + get_top_n_view(const char *col_name, size_type n); + + // Same as above but it returns a const View with the n top rows of + // the given column. + // + template + [[nodiscard]] ConstPtrView + get_top_n_view(const char *col_name, size_type n) const; + // This returns a new DataFrame with the same index column as self and an // integer column with the same name for each column in self. 
// The integer columns in returned DataFrame show a duplication mask for diff --git a/include/DataFrame/DataFrameStatsVisitors.h b/include/DataFrame/DataFrameStatsVisitors.h index ff83fd95..c592d622 100644 --- a/include/DataFrame/DataFrameStatsVisitors.h +++ b/include/DataFrame/DataFrameStatsVisitors.h @@ -814,7 +814,7 @@ struct NExtremumVisitor { operator() (K idx_begin, K /*idx_end*/, H column_begin, H column_end) { #ifdef HMDF_SANITY_EXCEPTIONS - if (std::distance(column_begin, column_end) < n_) + if (size_type(std::distance(column_begin, column_end)) < n_) throw DataFrameError("NExtremumVisitor: column size must be >= N"); #endif // HMDF_SANITY_EXCEPTIONS diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index dce457ea..885dd535 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -2540,6 +2540,54 @@ get_view(const StlVecType &col_names) const { // ---------------------------------------------------------------------------- +template +template +DataFrame> DataFrame:: +get_top_n_data(const char *name, size_type n) const { + + using res_t = DataFrame>; + using visitor_t = NLargestVisitor; + + res_t result; + + top_n_common_(name, visitor_t { n }, result); + return (result); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::PtrView DataFrame:: +get_top_n_view(const char *name, size_type n) { + + using res_t = PtrView; + using visitor_t = NLargestVisitor; + + res_t result; + + top_n_common_(name, visitor_t { n }, result); + return (result); +} + +// ---------------------------------------------------------------------------- + +template +template +typename DataFrame::ConstPtrView DataFrame:: +get_top_n_view(const char *name, size_type n) const { + + using res_t = ConstPtrView; + using visitor_t = NLargestVisitor; + + res_t result; + + top_n_common_(name, visitor_t { n }, 
result); + return (result); +} + +// ---------------------------------------------------------------------------- + template template DataFrame> DataFrame:: diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc index 292d45f4..82269775 100644 --- a/include/DataFrame/Internals/DataFrame_misc.tcc +++ b/include/DataFrame/Internals/DataFrame_misc.tcc @@ -735,9 +735,9 @@ operator() (T &vec) { using VecType = typename std::remove_reference::type; using ValueType = typename VecType::value_type; - using ViewType = typename DF::template ColumnVecType; + using ViewColType = typename DF::template ColumnVecType; - ViewType new_col; + ViewColType new_col; const size_type vec_size = vec.size(); new_col.reserve(std::min(sel_indices.size(), vec_size)); diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h index 67663d7b..aed48831 100644 --- a/include/DataFrame/Internals/DataFrame_private_decl.h +++ b/include/DataFrame/Internals/DataFrame_private_decl.h @@ -463,7 +463,7 @@ fill_missing_lagrange_(ColumnVecType &vec, vec[k] = y; count += 1; } - } + } return; } @@ -602,7 +602,7 @@ join_helper_common_( &rhs = std::as_const(rhs), &joined_index_idx = std::as_const(joined_index_idx), &result] () -> void { - index_join_functor_common_ functor( + index_join_functor_common_ functor( name.c_str(), rhs, joined_index_idx, @@ -836,8 +836,8 @@ data_by_sel_common_(const StlVecType &col_indices, for (const auto &[name, idx] : column_list_) [[likely]] { sel_load_functor_ functor(name.c_str(), col_indices, - idx_s, - ret_df); + idx_s, + ret_df); data_[idx].change(functor); } @@ -912,6 +912,62 @@ view_by_sel_common_(const StlVecType &col_indices, // ---------------------------------------------------------------------------- +template +void top_n_common_(const char *col_name, V &&visitor, R &result) const { + + using res_t = R; + + const ColumnVecType *vec { nullptr }; + + if (! 
::strcmp(col_name, DF_INDEX_COL_NAME)) + vec = (const ColumnVecType *) &(get_index()); + else + vec = (const ColumnVecType *) &(get_column(col_name)); + + visitor.pre(); + visitor(indices_.begin(), indices_.end(), vec->begin(), vec->end()); + visitor.post(); + visitor.sort_by_index_idx(); + + typename res_t::IndexVecType new_index; + StlVecType idxs; + + new_index.reserve(visitor.get_result().size()); + idxs.reserve(visitor.get_result().size()); + for (const auto &res : visitor.get_result()) { + if constexpr (std::is_same_v>>) + new_index.push_back(indices_[res.index_idx]); + else // Views + new_index.push_back( + &(const_cast(this)->indices_[res.index_idx])); + idxs.push_back(res.index_idx); + } + result.indices_ = std::move(new_index); + + const SpinGuard guard(lock_); + + if constexpr (std::is_same_v>>) { + for (const auto &[name, idx] : column_list_) [[likely]] { + sel_load_functor_ functor( + name.c_str(), idxs, 0, result); + + data_[idx].change(functor); + } + } + else { // Views + for (const auto &[name, idx] : column_list_) [[likely]] { + sel_load_view_functor_ functor( + name.c_str(), idxs, 0, result); + + data_[idx].change(functor); + } + } +} + +// ---------------------------------------------------------------------------- + template inline static void replace_vector_vals_(V &data_vec, diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc index 26ff5164..50ac80f6 100644 --- a/test/dataframe_tester_3.cc +++ b/test/dataframe_tester_3.cc @@ -4475,6 +4475,97 @@ static void test_duplication_mask() { // ---------------------------------------------------------------------------- +static void test_get_top_n_data() { + + std::cout << "\nTesting get_top_n_data( ) ..." 
<< std::endl; + + StlVecType idx = + { 123450, 123451, 123452, 123453, 123454, 123455, 123456, + 123457, 123458, 123459, 123460, 123461, 123462, 123466 }; + StlVecType d1 = + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }; + StlVecType d2 = + { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 }; + StlVecType d3 = + { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0, 10 }; + StlVecType i1 = { 22, 23, 24, 25, 99 }; + MyDataFrame df; + + df.load_data(std::move(idx), + std::make_pair("col_1", d1), + std::make_pair("col_2", d2), + std::make_pair("col_3", d3), + std::make_pair("col_4", i1)); + + auto lbd = + [](const unsigned long &, const double &val) -> bool { + return (val < 100.0); + }; + auto view = + df.get_view_by_sel + ("col_1", lbd); + + auto res1 = + df.get_top_n_data("col_3", 4); + auto res2 = + view.get_top_n_data("col_3", 4); + auto res3 = + df.get_top_n_view("col_3", 4); + auto res4 = + view.get_top_n_view("col_3", 4); + auto res5 = + view.get_top_n_data + (DF_INDEX_COL_NAME, 4); + + { + StlVecType out_idx = + { 123453, 123454, 123456, 123462 }; + StlVecType out_col_2 = { 11, 12, 14, 32 }; + StlVecType out_col_3 = { 18, 19, 21, 19 }; + StlVecType out_col_4 = { 25, 99, 0, 0 }; + + assert(res1.get_index() == out_idx); + assert(res1.get_column("col_2") == out_col_2); + assert(res1.get_column("col_3") == out_col_3); + assert(res1.get_column("col_4") == out_col_4); + } + { + StlVecType out_idx = + { 123453, 123454, 123456, 123462 }; + StlVecType out_col_2 = { 11, 12, 14, 32 }; + StlVecType out_col_3 = { 18, 19, 21, 19 }; + StlVecType out_col_4 = { 25, 99, 0, 0 }; + + assert(res2.get_index() == out_idx); + assert(res2.get_column("col_2") == out_col_2); + assert(res2.get_column("col_3") == out_col_3); + assert(res2.get_column("col_4") == out_col_4); + } + + res3.write + (std::cout, io_format::csv); + std::cout << std::endl; + + res4.write + (std::cout, io_format::csv); + std::cout << std::endl; + + { + StlVecType out_idx = + { 123460, 123461, 
123462, 123466 }; + StlVecType out_col_2 = { 30, 31, 32, 1.89 }; + StlVecType out_col_3 = { 2.3, 0.34, 19, 10 }; + StlVecType out_col_4 = { 0, 0, 0, 0 }; + + assert(res5.get_index() == out_idx); + assert(res5.get_column("col_2") == out_col_2); + assert(res5.get_column("col_3") == out_col_3); + assert(res5.get_column("col_4") == out_col_4); + } +} + +// ----------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -4563,6 +4654,7 @@ int main(int, char *[]) { test_change_freq_2(); test_change_freq_3(); test_duplication_mask(); + test_get_top_n_data(); return (0); } diff --git a/test/dataframe_tester_output.txt b/test/dataframe_tester_output.txt index 73e9945b..bed45779 100644 --- a/test/dataframe_tester_output.txt +++ b/test/dataframe_tester_output.txt @@ -714,18 +714,18 @@ col_2:1::8, col_3:1::15, col_str:1::11, col_4:1::22, -INDEX:1::123450, -col_1:1::1, -col_2:1::8, -col_3:1::15, -col_str:1::11, -col_4:1::22, -INDEX:1::123450, -col_1:1::1, -col_2:1::8, -col_3:1::15, -col_str:1::11, -col_4:1::22, +INDEX:1::123451, +col_1:1::2, +col_2:1::9, +col_3:1::16, +col_str:1::22, +col_4:1::23, +INDEX:1::123452, +col_1:1::3, +col_2:1::10, +col_3:1::17, +col_str:1::33, +col_4:1::24, Testing write(json) ... Writing in JSON: @@ -1993,6 +1993,20 @@ Testing change_freq_2( ) ... Testing change_freq_3( ) ... Testing duplication_mask( ) ... + +Testing get_top_n_data( ) ... +INDEX:4::123453,123454,123456,123462, +col_1:4::4,5,7,13, +col_2:4::11,12,14,32, +col_3:4::18,19,21,19, +col_4:4::25,99,0,0, + +INDEX:4::123453,123454,123456,123462, +col_1:4::4,5,7,13, +col_2:4::11,12,14,32, +col_3:4::18,19,21,19, +col_4:4::25,99,0,0, + Hello World! Str Column = A, B, C, D, E, F, G, H, I, J, There are 5031 IBM close prices @@ -2011,7 +2025,7 @@ INDEX:10:,string col:10:,Cool Column:10:,numbers:10: