Skip to content

Commit

Permalink
Implemented duplication_mask()
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Jul 11, 2024
1 parent 3506017 commit 306b192
Show file tree
Hide file tree
Showing 7 changed files with 446 additions and 25 deletions.
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,10 @@ <H2><font color="blue">API Reference with code samples</font></H2>
<td title="Drops missing values"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/drop_missing.html">drop_missing</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Returns a DataFrame with duplication masks for each column"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/duplication_mask.html">duplication_mask</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Returns true/false if empty"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/empty.html">empty</a>()</td>
</tr>
Expand Down
160 changes: 160 additions & 0 deletions docs/HTML/duplication_mask.html

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -2785,6 +2785,43 @@ class DataFrame : public ThreadGranularity {
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
get_data(const StlVecType<const char *> &col_names) const;

// This returns a new DataFrame with the same index column as self and an
// integer column with the same name for each column in self.
// The integer columns in the returned DataFrame show a duplication mask for
// each column in self.
// For example, if self has a column like:
//    ----------------------------------------
//    | aa | bb | cc | aa | dd | aa | bb | hh |
//    ----------------------------------------
// The returned DataFrame has a corresponding integer column with the
// same name.
// If binary is false:
//    --------------------------------
//    | 3 | 2 | 1 | 3 | 1 | 3 | 2 | 1 |
//    --------------------------------
// If binary is true:
//    --------------------------------
//    | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
//    --------------------------------
//
// NOTE: All column types must be hash-able and have == operator
//       well defined
//
// Ts:
//   List all the types of all data columns. A type should be specified in
//   the list only once.
// include_index:
//   If true, the index value of each row is considered together with the
//   column value to determine uniqueness, i.e. two items are duplicates
//   only if both their values and their index values are equal.
// binary:
//   If false, the returned integer columns contain the count of each item
//   in the original column. If true, the returned integer columns contain
//   0's and 1's: 1 if the original item has duplicates in its column and
//   0 if it does not.
//
template<hashable_equal ... Ts>
[[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
duplication_mask(bool include_index, bool binary = false) const;

// It behaves like get_data(), but it returns a View.
// A view is a DataFrame that is a reference to the original DataFrame.
// So if you modify anything in the view the original DataFrame will
Expand Down
24 changes: 23 additions & 1 deletion include/DataFrame/Internals/DataFrame_functors.h
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ struct change_freq_functor_ : DataVec::template visitor_base<Ts ...> {

inline change_freq_functor_(const char *n,
DataFrame &r,
const IndexVecType &oi)
const IndexVecType &oi)
: name (n), res(r), old_idx(oi) { }

const char *name;
Expand All @@ -752,6 +752,28 @@ struct change_freq_functor_ : DataVec::template visitor_base<Ts ...> {

// ----------------------------------------------------------------------------

// Visitor used by duplication_mask(). It is applied to one data column at a
// time and loads the resulting integer mask column into the result DataFrame.
//
template<typename ... Ts>
struct  dup_mask_functor_ : DataVec::template visitor_base<Ts ...>  {

    // Name under which the mask column is loaded into res
    const char          *name;
    // DataFrame receiving the mask column
    DataFrame           &res;
    // Index values paired with the column values when incl_idx is true
    const IndexVecType  &idx_vec;
    // If true, the index value takes part in the duplication test
    const bool          incl_idx;
    // If true, a 0/1 mask is produced instead of occurrence counts
    const bool          binary;

    dup_mask_functor_(const char *col_name,
                      DataFrame &result,
                      const IndexVecType &index,
                      bool with_index,
                      bool as_binary)
        : name (col_name),
          res(result),
          idx_vec(index),
          incl_idx(with_index),
          binary(as_binary)  {   }

    // Defined out of line; vec is the visited data column
    template<typename T>
    void operator() (const T &vec);
};

// ----------------------------------------------------------------------------

// Local Variables:
// mode:C++
// tab-width:4
Expand Down
88 changes: 64 additions & 24 deletions include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -362,35 +362,26 @@ DataFrame<I, H>::get_data_by_idx (Index2D<IndexType> range) const {
const auto thread_level =
(indices_.size() < ThreadPool::MUL_THR_THHOLD)
? 0L : get_thread_level();
auto lbd =
[b_dist, e_dist, &df, this]
(const auto &begin, const auto &end) -> void {
for (auto citer = begin; citer < end; ++citer) {
load_functor_<res_t, Ts ...> functor (
citer->first.c_str(), b_dist, e_dist, df);

if (thread_level > 2) {
auto lbd =
[b_dist, e_dist, &df, this]
(const auto &begin, const auto &end) -> void {
for (auto citer = begin; citer < end; ++citer) {
load_functor_<res_t , Ts ...> functor (
citer->first.c_str(), b_dist, e_dist, df);

this->data_[citer->second].change(functor);
}
};
this->data_[citer->second].change(functor);
}
};

auto futuers =
thr_pool_.parallel_loop(column_list_.begin(),
column_list_.end(),
std::move(lbd));
if (thread_level > 2) {
auto futuers = thr_pool_.parallel_loop(column_list_.begin(),
column_list_.end(),
std::move(lbd));

for (auto &fut : futuers) fut.get();
}
else {
for (const auto &[name, idx] : column_list_) [[likely]] {
load_functor_<res_t, Ts ...> functor (name.c_str(),
b_dist,
e_dist,
df);

data_[idx].change(functor);
}
lbd(column_list_.begin(), column_list_.end());
}
}

Expand Down Expand Up @@ -2549,6 +2540,55 @@ get_view(const StlVecType<const char *> &col_names) const {

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<hashable_equal ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
duplication_mask (bool include_index, bool binary) const {

    using res_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;

    res_t   new_df;

    // The result carries the same index values as self
    new_df.load_index(indices_.begin(), indices_.end());

    const SpinGuard guard(lock_);

    // Create every (empty) int column up front, one per column of self,
    // before any of the visitors below runs
    for (const auto &[col_name, col_slot] : column_list_) [[likely]]
        new_df.template create_column<int>(col_name.c_str(), false);

    const auto  thread_level =
        (indices_.size() < ThreadPool::MUL_THR_THHOLD)
            ? 0L : get_thread_level();

    // Visits the columns in [first, last) and loads each column's mask into
    // new_df under the same column name
    auto        mask_columns =
        [this, &new_df, include_index, binary]
        (const auto &first, const auto &last) -> void  {
            for (auto col = first; col < last; ++col)  {
                dup_mask_functor_<Ts ...>   visitor(col->first.c_str(),
                                                    new_df,
                                                    new_df.indices_,
                                                    include_index,
                                                    binary);

                this->data_[col->second].change(visitor);
            }
        };

    if (thread_level <= 2)  {
        // Process all columns on the calling thread
        mask_columns(column_list_.begin(), column_list_.end());
    }
    else  {
        // Hand the column range to the thread pool and wait for completion
        auto    futures =
            thr_pool_.parallel_loop(column_list_.begin(),
                                    column_list_.end(),
                                    std::move(mask_columns));

        for (auto &fut : futures)  fut.get();
    }

    return (new_df);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<typename T, typename ... Ts>
DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
Expand Down Expand Up @@ -2735,7 +2775,7 @@ change_freq(size_type new_freq,
"convert_freq(): "
"Index type of DateTime must have a valid time unit");
#endif // HMDF_SANITY_EXCEPTIONS
new_idx =
new_idx =
gen_datetime_index(
indices_.front().string_format(DT_FORMAT::DT_TM2).c_str(),
indices_.back().string_format(DT_FORMAT::DT_TM2).c_str(),
Expand Down
63 changes: 63 additions & 0 deletions include/DataFrame/Internals/DataFrame_misc.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,69 @@ operator() (const T &vec) {
false);
}


// ----------------------------------------------------------------------------

// Builds the duplication mask of one data column (vec) and loads it, as an
// int column called name, into res.
//
// Pass 1 counts how many times each key occurs in the column. A key is the
// column value paired with either the row's index value (incl_idx is true)
// or one shared dummy index (incl_idx is false), so that in the latter case
// only the column value matters.
// Pass 2 emits, for every row, either the count of its key (binary is false)
// or 0/1 for unique/duplicated (binary is true).
//
template<typename I, typename H>
template<typename ... Ts>
template<typename T>
void
DataFrame<I, H>::dup_mask_functor_<Ts ...>::
operator() (const T &vec)  {

    using VecType = typename std::remove_reference<T>::type;
    using ValueType = typename VecType::value_type;
    using NewVecType = ColumnVecType<int>;

    // Keys only hold references into vec / idx_vec / dummy_idx, all of which
    // outlive the table, so no value or index is ever copied
    using data_tuple = std::tuple<const ValueType &, const IndexType &>;
    using map_t = DFUnorderedMap<data_tuple, int, TupleHash>;

    const IndexType dummy_idx { };
    const auto      col_s = std::min(idx_vec.size(), vec.size());
    map_t           table;
    size_type       pos { 0 };

    table.reserve(col_s);
    for (const auto &val : vec)  {
        const data_tuple    key(val, incl_idx ? idx_vec[pos] : dummy_idx);

        // A new key starts at 0, so after the increment it holds the number
        // of occurrences seen so far
        table.emplace(key, 0).first->second += 1;
        ++pos;
    }

    NewVecType  new_vec;

    new_vec.reserve(col_s);
    pos = 0;
    for (const auto &val : vec)  {
        // The probe key is built from references, the same way as in pass 1.
        // Previously std::make_tuple() copied every value and index (e.g. a
        // std::string) just to look it up.
        const data_tuple    key(val, incl_idx ? idx_vec[pos] : dummy_idx);
        const auto          find_res = table.find(key);

        // NOTE(review): a miss should only be possible for values that do
        // not compare equal to themselves (e.g. floating-point NaN, assuming
        // the table compares keys with operator==). Such a value is treated
        // as unique rather than being skipped, so the mask always stays
        // aligned row by row with the original column.
        const int           count =
            (find_res != table.end()) ? find_res->second : 1;

        new_vec.push_back(binary ? (count == 1 ? 0 : 1) : count);
        ++pos;
    }

    res.template load_column<int>(name,
                                  std::move(new_vec),
                                  nan_policy::dont_pad_with_nans,
                                  false);
}

} // namespace hmdf

// ----------------------------------------------------------------------------
Expand Down
95 changes: 95 additions & 0 deletions test/dataframe_tester_3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4381,6 +4381,100 @@ static void test_change_freq_3() {

// ----------------------------------------------------------------------------

static void test_duplication_mask() {

std::cout << "\nTesting duplication_mask( ) ..." << std::endl;

MyDataFrame df;
StlVecType<unsigned long> idxvec =
{ 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL,
10UL, 13UL, 10UL, 15UL, 14UL };
StlVecType<double> dblvec =
{ 0.0, 15.0, 14.0, 2.0, 15.0, 12.0, 11.0, 8.0, 7.0, 11.0,
5.0, 11.0, 3.0, 9.0, 15.0 };
StlVecType<double> dblvec2 =
{ 100.0, 101.0, 102.0, 103.0, 101.0, 105.0, 106.55, 107.34, 1.8, 111.0,
112.0, 113.0, 114.0, 115.0, 116.0 };
StlVecType<int> intvec = { 1, 2, 3, 4, 2, 8, 6, 7, 1 };
StlVecType<std::string> strvec =
{ "zz", "hh", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj",
"kk", "ll", "mm", "ww", "oo" };

df.load_data(std::move(idxvec),
std::make_pair("dbl_col", dblvec),
std::make_pair("dbl_col_2", dblvec2),
std::make_pair("str_col", strvec));
df.load_column("int_col",
std::move(intvec),
nan_policy::dont_pad_with_nans);

const auto df2 = df.duplication_mask<double, int, std::string>(false);

{
StlVecType<unsigned long> res_idx =
{ 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
StlVecType<int> res_dbl_col =
{ 1, 3, 1, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 1, 3 };
StlVecType<int> res_dbl_col_2 =
{ 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
StlVecType<int> res_str_col =
{ 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1 };
StlVecType<int> res_int_col =
{ 2, 2, 1, 1, 2, 1, 1, 1, 2 };

assert(df2.get_index() == res_idx);
assert(df2.get_column<int>("dbl_col") == res_dbl_col);
assert(df2.get_column<int>("dbl_col_2") == res_dbl_col_2);
assert(df2.get_column<int>("str_col") == res_str_col);
assert(df2.get_column<int>("int_col") == res_int_col);
}

const auto df3 = df.duplication_mask<double, int, std::string>(true);

{
StlVecType<unsigned long> res_idx =
{ 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
StlVecType<int> res_dbl_col =
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
StlVecType<int> res_dbl_col_2 =
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
StlVecType<int> res_str_col =
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
StlVecType<int> res_int_col =
{ 1, 1, 1, 1, 1, 1, 1, 1, 1 };

assert(df3.get_index() == res_idx);
assert(df3.get_column<int>("dbl_col") == res_dbl_col);
assert(df3.get_column<int>("dbl_col_2") == res_dbl_col_2);
assert(df3.get_column<int>("str_col") == res_str_col);
assert(df3.get_column<int>("int_col") == res_int_col);
}

const auto df4 =
df.duplication_mask<double, int, std::string>(false, true);

{
StlVecType<unsigned long> res_idx =
{ 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
StlVecType<int> res_dbl_col =
{ 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1 };
StlVecType<int> res_dbl_col_2 =
{ 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
StlVecType<int> res_str_col =
{ 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0 };
StlVecType<int> res_int_col =
{ 1, 1, 0, 0, 1, 0, 0, 0, 1 };

assert(df4.get_index() == res_idx);
assert(df4.get_column<int>("dbl_col") == res_dbl_col);
assert(df4.get_column<int>("dbl_col_2") == res_dbl_col_2);
assert(df4.get_column<int>("str_col") == res_str_col);
assert(df4.get_column<int>("int_col") == res_int_col);
}
}

// ----------------------------------------------------------------------------

int main(int, char *[]) {

MyDataFrame::set_optimum_thread_level();
Expand Down Expand Up @@ -4468,6 +4562,7 @@ int main(int, char *[]) {
test_change_freq();
test_change_freq_2();
test_change_freq_3();
test_duplication_mask();

return (0);
}
Expand Down

0 comments on commit 306b192

Please sign in to comment.