Skip to content

Commit

Permalink
Implemented get_top_n_[data|view]()
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Jul 17, 2024
1 parent d4ece48 commit cfe5d14
Show file tree
Hide file tree
Showing 9 changed files with 461 additions and 25 deletions.
193 changes: 193 additions & 0 deletions docs/HTML/get_top_n_data.html

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions docs/HTML/join_by_column.html
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
join_policy jp) const;
</B></PRE></font>
</td>
<td WIDTH="33.3%">
<td>
It joins the data between self (lhs) and rhs and returns the joined data in a StdDataFrame, based on specification in join_policy.<BR>
The returned DataFrame is indexed by a sequence of unsigned long from 0 to N. The returned DataFrame will at least have two columns names lhs.INDEX and rhs.INDEX containing the lhs and rhs indices based on join policy.<BR>
The following conditions must be met for this method<BR>
Expand All @@ -87,7 +87,7 @@
<LI>In both lhs and rhs, columns with the same name must have the same type</LI>
</OL>
</td>
<td WIDTH="33.3%">
<td width="38%">
<B>RHS_T</B>: Type of DataFrame rhs<BR>
<B>T</B>: Type of the named column<BR>
<B>Ts</B>: List all the types of all data columns. A type should be specified in the list only once.<BR>
Expand All @@ -105,18 +105,18 @@
join_by_index(const RHS_T &amp;rhs, join_policy jp) const;
</B></PRE></font>
</td>
<td WIDTH="33.3%">
<td>
It joins the data between self (lhs) and rhs and returns the joined data<BR>
in a StdDataFrame, based on specification in join_policy.<BR>
The following conditions must be met for this method<BR>
to compile and work properly:<BR>
<OL>
<LI>I type must be the same between lhs and rhs.</LI>
<LI>Ordering (&lt; &gt; != ==) must be well defined for type I</LI>
<LI>In both lhs and rhs, columns with the same name must have the same Type</LI>
<LI>In both lhs and rhs, columns with the same name must have the same type</LI>
</OL>
</td>
<td WIDTH="33.3%">
<td width="38%">
<B>RHS_T</B>: Type of DataFrame rhs<BR>
<B>Ts</B>: List all the types of all data columns. A type should be specified in the list only once.<BR>
<B>rhs</B>: The rhs DataFrame<BR>
Expand Down
33 changes: 33 additions & 0 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -2785,6 +2785,39 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
get_data(const StlVecType<const char *> &col_names) const;

// This returns a new DataFrame with the n top rows of the given column,
// i.e. the rows holding the n largest values of that column.
// The returned DataFrame rows will be in the same order as self.
//
// NOTE: Comparison operators (<, >, ==) must be well defined for type T.
// NOTE(review): col_name may be DF_INDEX_COL_NAME to select on the index
//               column. In that case the index vector is cast to a column
//               of T, so T presumably must be the index type I -- confirm.
//
// T:
//   Type of the named column
// Ts:
//   List all the types of all data columns. A type should be specified in
//   the list only once.
// col_name:
//   Name of the given column
// n:
//   Number of top rows
//
template<typename T, typename ... Ts>
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
get_top_n_data(const char *col_name, size_type n) const;

// Same as get_top_n_data() but it returns a View with the n top rows of
// the given column, instead of a new DataFrame.
// The index of the returned View refers to the index entries of self
// rather than copying them.
//
// T:
//   Type of the named column
// Ts:
//   List all the types of all data columns. A type should be specified in
//   the list only once.
// col_name:
//   Name of the given column
// n:
//   Number of top rows
//
template<typename T, typename ... Ts>
[[nodiscard]] PtrView
get_top_n_view(const char *col_name, size_type n);

// Same as get_top_n_data() but it returns a const View with the n top rows
// of the given column, instead of a new DataFrame.
// This is the overload selected when self is const.
//
// T:
//   Type of the named column
// Ts:
//   List all the types of all data columns. A type should be specified in
//   the list only once.
// col_name:
//   Name of the given column
// n:
//   Number of top rows
//
template<typename T, typename ... Ts>
[[nodiscard]] ConstPtrView
get_top_n_view(const char *col_name, size_type n) const;

// This returns a new DataFrame with the same index column as self and an
// integer column with the same name for each column in self.
// The integer columns in returned DataFrame show a duplication mask for
Expand Down
2 changes: 1 addition & 1 deletion include/DataFrame/DataFrameStatsVisitors.h
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ struct NExtremumVisitor {
operator() (K idx_begin, K /*idx_end*/, H column_begin, H column_end) {

#ifdef HMDF_SANITY_EXCEPTIONS
if (std::distance(column_begin, column_end) < n_)
if (size_type(std::distance(column_begin, column_end)) < n_)
throw DataFrameError("NExtremumVisitor: column size must be >= N");
#endif // HMDF_SANITY_EXCEPTIONS

Expand Down
48 changes: 48 additions & 0 deletions include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -2540,6 +2540,54 @@ get_view(const StlVecType<const char *> &col_names) const {

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
get_top_n_data(const char *name, size_type n) const {

    // The result owns its data: the selected rows are copied into a fresh
    // DataFrame by the shared top_n_common_() helper.
    //
    using top_df_t = DataFrame<I, HeteroVector<align_value>>;
    using largest_t = NLargestVisitor<T, I>;

    top_df_t    top_rows;

    // The n-largest visitor decides which rows are kept
    //
    top_n_common_<T, largest_t, top_df_t, Ts ...>
        (name, largest_t { n }, top_rows);
    return (top_rows);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
typename DataFrame<I, H>::PtrView DataFrame<I, H>::
get_top_n_view(const char *name, size_type n) {

    // Non-const flavour: the selected rows are handed back as a PtrView
    // filled in by the shared top_n_common_() helper.
    //
    using top_view_t = PtrView;
    using largest_t = NLargestVisitor<T, I>;

    top_view_t  top_rows;

    // The n-largest visitor decides which rows are kept
    //
    top_n_common_<T, largest_t, top_view_t, Ts ...>
        (name, largest_t { n }, top_rows);
    return (top_rows);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
typename DataFrame<I, H>::ConstPtrView DataFrame<I, H>::
get_top_n_view(const char *name, size_type n) const {

    // Const flavour: the selected rows are handed back as a ConstPtrView
    // filled in by the shared top_n_common_() helper.
    //
    using top_view_t = ConstPtrView;
    using largest_t = NLargestVisitor<T, I>;

    top_view_t  top_rows;

    // The n-largest visitor decides which rows are kept
    //
    top_n_common_<T, largest_t, top_view_t, Ts ...>
        (name, largest_t { n }, top_rows);
    return (top_rows);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<hashable_equal ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
Expand Down
4 changes: 2 additions & 2 deletions include/DataFrame/Internals/DataFrame_misc.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -735,9 +735,9 @@ operator() (T &vec) {

using VecType = typename std::remove_reference<T>::type;
using ValueType = typename VecType::value_type;
using ViewType = typename DF::template ColumnVecType<ValueType>;
using ViewColType = typename DF::template ColumnVecType<ValueType>;

ViewType new_col;
ViewColType new_col;
const size_type vec_size = vec.size();

new_col.reserve(std::min(sel_indices.size(), vec_size));
Expand Down
64 changes: 60 additions & 4 deletions include/DataFrame/Internals/DataFrame_private_decl.h
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,7 @@ fill_missing_lagrange_(ColumnVecType<T> &vec,
vec[k] = y;
count += 1;
}
}
}
return;
}

Expand Down Expand Up @@ -602,7 +602,7 @@ join_helper_common_(
&rhs = std::as_const(rhs),
&joined_index_idx = std::as_const(joined_index_idx),
&result] () -> void {
index_join_functor_common_<res_t, RHS_T, Ts ...> functor(
index_join_functor_common_<res_t, RHS_T, Ts ...> functor(
name.c_str(),
rhs,
joined_index_idx,
Expand Down Expand Up @@ -836,8 +836,8 @@ data_by_sel_common_(const StlVecType<size_type> &col_indices,
for (const auto &[name, idx] : column_list_) [[likely]] {
sel_load_functor_<res_t, size_type, Ts ...> functor(name.c_str(),
col_indices,
idx_s,
ret_df);
idx_s,
ret_df);

data_[idx].change(functor);
}
Expand Down Expand Up @@ -912,6 +912,62 @@ view_by_sel_common_(const StlVecType<size_type> &col_indices,

// ----------------------------------------------------------------------------

template<typename T, typename V, typename R, typename ... Ts>
void top_n_common_(const char *col_name, V &&visitor, R &result) const {

using res_t = R;

const ColumnVecType<T> *vec { nullptr };

if (! ::strcmp(col_name, DF_INDEX_COL_NAME))
vec = (const ColumnVecType<T> *) &(get_index());
else
vec = (const ColumnVecType<T> *) &(get_column<T>(col_name));

visitor.pre();
visitor(indices_.begin(), indices_.end(), vec->begin(), vec->end());
visitor.post();
visitor.sort_by_index_idx();

typename res_t::IndexVecType new_index;
StlVecType<size_type> idxs;

new_index.reserve(visitor.get_result().size());
idxs.reserve(visitor.get_result().size());
for (const auto &res : visitor.get_result()) {
if constexpr (std::is_same_v<res_t,
DataFrame<I, HeteroVector<align_value>>>)
new_index.push_back(indices_[res.index_idx]);
else // Views
new_index.push_back(
&(const_cast<DataFrame *>(this)->indices_[res.index_idx]));
idxs.push_back(res.index_idx);
}
result.indices_ = std::move(new_index);

const SpinGuard guard(lock_);

if constexpr (std::is_same_v<res_t,
DataFrame<I, HeteroVector<align_value>>>) {
for (const auto &[name, idx] : column_list_) [[likely]] {
sel_load_functor_<res_t, size_type, Ts ...> functor(
name.c_str(), idxs, 0, result);

data_[idx].change(functor);
}
}
else { // Views
for (const auto &[name, idx] : column_list_) [[likely]] {
sel_load_view_functor_<size_type, res_t, Ts ...> functor(
name.c_str(), idxs, 0, result);

data_[idx].change(functor);
}
}
}

// ----------------------------------------------------------------------------

template<typename V, typename T>
inline static void
replace_vector_vals_(V &data_vec,
Expand Down
92 changes: 92 additions & 0 deletions test/dataframe_tester_3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4475,6 +4475,97 @@ static void test_duplication_mask() {

// ----------------------------------------------------------------------------

// Exercises get_top_n_data() and get_top_n_view() on both a DataFrame and a
// view of it. It selects on a data column (col_3) and on the index column.
//
static void test_get_top_n_data() {

    std::cout << "\nTesting get_top_n_data( ) ..." << std::endl;

    StlVecType<unsigned long>   idx =
        { 123450, 123451, 123452, 123453, 123454, 123455, 123456,
          123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    StlVecType<double>          d1 =
        { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    StlVecType<double>          d2 =
        { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    StlVecType<double>          d3 =
        { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0, 10 };
    StlVecType<int>             i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame                 df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    // Every col_1 value is below 100, so the view covers all rows of df
    //
    auto    lbd =
        [](const unsigned long &, const double &val) -> bool {
            return (val < 100.0);
        };
    auto    view =
        df.get_view_by_sel<double, decltype(lbd), double, int, std::string>
            ("col_1", lbd);

    auto    res1 =
        df.get_top_n_data<double, int, double, std::string>("col_3", 4);
    auto    res2 =
        view.get_top_n_data<double, int, double, std::string>("col_3", 4);
    auto    res3 =
        df.get_top_n_view<double, int, double, std::string>("col_3", 4);
    auto    res4 =
        view.get_top_n_view<double, int, double, std::string>("col_3", 4);

    // When selecting on the index column, T must be the index type
    // (unsigned long here). The index vector is reinterpreted as a column
    // of T, so a different width would read the index entries through the
    // wrong type.
    //
    auto    res5 =
        view.get_top_n_data<unsigned long, int, double, std::string>
            (DF_INDEX_COL_NAME, 4);

    // The 4 largest col_3 values are 21, 19, 19 and 18. The rows come back
    // in the original row order.
    //
    {
        StlVecType<unsigned long>   out_idx =
            { 123453, 123454, 123456, 123462 };
        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };

        assert(res1.get_index() == out_idx);
        assert(res1.get_column<double>("col_2") == out_col_2);
        assert(res1.get_column<double>("col_3") == out_col_3);
        assert(res1.get_column<int>("col_4") == out_col_4);
    }

    // Selecting through the view must give the same rows
    //
    {
        StlVecType<unsigned long>   out_idx =
            { 123453, 123454, 123456, 123462 };
        StlVecType<double>          out_col_2 = { 11, 12, 14, 32 };
        StlVecType<double>          out_col_3 = { 18, 19, 21, 19 };
        StlVecType<int>             out_col_4 = { 25, 99, 0, 0 };

        assert(res2.get_index() == out_idx);
        assert(res2.get_column<double>("col_2") == out_col_2);
        assert(res2.get_column<double>("col_3") == out_col_3);
        assert(res2.get_column<int>("col_4") == out_col_4);
    }

    // The view results are only printed, not asserted on
    //
    res3.write<std::ostream, double, int, std::string>
        (std::cout, io_format::csv);
    std::cout << std::endl;

    res4.write<std::ostream, double, int, std::string>
        (std::cout, io_format::csv);
    std::cout << std::endl;

    // Top 4 by index value: the last four rows
    //
    {
        StlVecType<unsigned long>   out_idx =
            { 123460, 123461, 123462, 123466 };
        StlVecType<double>          out_col_2 = { 30, 31, 32, 1.89 };
        StlVecType<double>          out_col_3 = { 2.3, 0.34, 19, 10 };
        StlVecType<int>             out_col_4 = { 0, 0, 0, 0 };

        assert(res5.get_index() == out_idx);
        assert(res5.get_column<double>("col_2") == out_col_2);
        assert(res5.get_column<double>("col_3") == out_col_3);
        assert(res5.get_column<int>("col_4") == out_col_4);
    }
}

// -----------------------------------------------------------------------------

int main(int, char *[]) {

MyDataFrame::set_optimum_thread_level();
Expand Down Expand Up @@ -4563,6 +4654,7 @@ int main(int, char *[]) {
test_change_freq_2();
test_change_freq_3();
test_duplication_mask();
test_get_top_n_data();

return (0);
}
Expand Down
Loading

0 comments on commit cfe5d14

Please sign in to comment.