Implemented duplication_mask()

hosseinmoein · Jul 11, 2024 · 306b192 · 306b192
1 parent 3506017
commit 306b192
Show file tree

Hide file tree

Showing 7 changed files with 446 additions and 25 deletions.
diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html
@@ -228,6 +228,10 @@ <H2><font color="blue">API Reference with code samples</font></H2>
       <td title="Drops missing values"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/drop_missing.html">drop_missing</a>()</td>
     </tr>
 
+    <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
+      <td title="Returns a DataFrame with duplication masks for each column"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/duplication_mask.html">duplication_mask</a>()</td>
+    </tr>
+
     <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
       <td title="Returns true/false if empty"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/empty.html">empty</a>()</td>
     </tr>

diff --git a/docs/HTML/duplication_mask.html b/docs/HTML/duplication_mask.html
diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -2785,6 +2785,43 @@ class   DataFrame : public ThreadGranularity {
     [[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
     get_data(const StlVecType<const char *> &col_names) const;
 
+    // This returns a new DataFrame with the same index column as self and an
+    // integer column with the same name for each column in self.
+    // The integer columns in returned DataFrame show a duplication mask for
+    // each column in self. A mask column has one entry for each item in the
+    // corresponding column of self, in the same order.
+    // For example, if self has a column like:
+    //     ----------------------------------------
+    //    |  aa | bb | cc | aa | dd | aa | bb | hh |
+    //     ----------------------------------------
+    // The returned DataFrame has a corresponding integer column with the
+    // same name
+    // If binary is false:
+    //     --------------------------------
+    //    |  3 | 2 | 1 | 3 | 1 | 3 | 2 | 1 |
+    //     --------------------------------
+    // If binary is true:
+    //     --------------------------------
+    //    |  1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
+    //     --------------------------------
+    //
+    // NOTE: All column types must be hash-able and have == operator
+    //       well defined. The same applies to the index type, if
+    //       include_index is true.
+    //
+    // Ts:
+    //   List all the types of all data columns. A type should be specified in
+    //   the list only once.
+    // include_index:
+    //   If true, it includes the index column to determine uniqueness. In
+    //   that case two items are duplicates only if both their values and
+    //   their index values at the same position are equal.
+    // binary:
+    //   If false, the returned integer columns contain the count of each item
+    //   in the original column. If true, the returned integer columns contain
+    //   0's and 1's depending on if original items in the column had
+    //   duplicates (1) or not (0).
+    //
+    template<hashable_equal ... Ts>
+    [[nodiscard]] DataFrame<I, HeteroVector<std::size_t(H::align_value)>>
+    duplication_mask(bool include_index, bool binary = false) const;
+
     // It behaves like get_data(), but it returns a View.
     // A view is a DataFrame that is a reference to the original DataFrame.
     // So if you modify anything in the view the original DataFrame will

diff --git a/include/DataFrame/Internals/DataFrame_functors.h b/include/DataFrame/Internals/DataFrame_functors.h
@@ -739,7 +739,7 @@ struct  change_freq_functor_ : DataVec::template visitor_base<Ts ...>  {
 
     inline change_freq_functor_(const char *n,
                                 DataFrame &r,
-								const IndexVecType &oi)
+                                const IndexVecType &oi)
         : name (n), res(r), old_idx(oi)  {   }
 
     const char          *name;
@@ -752,6 +752,28 @@ struct  change_freq_functor_ : DataVec::template visitor_base<Ts ...>  {
 
 // ----------------------------------------------------------------------------
 
+// Visitor used by duplication_mask(). It is applied to one data column at a
+// time and carries what is needed to produce that column's mask: the column
+// name, the DataFrame receiving the result, the index and the two mode flags.
+//
+template<typename ... Ts>
+struct  dup_mask_functor_ : DataVec::template visitor_base<Ts ...>  {
+
+    dup_mask_functor_(const char *col_name,
+                      DataFrame &result,
+                      const IndexVecType &index,
+                      bool include_index,
+                      bool binary_mask)
+        : name (col_name),
+          res(result),
+          idx_vec(index),
+          incl_idx(include_index),
+          binary(binary_mask)  {   }
+
+    // Name of the column being visited; also the name of the mask column
+    const char          *name;
+    // DataFrame the mask column is loaded into
+    DataFrame           &res;
+    // Index values, paired with the column values if incl_idx is true
+    const IndexVecType  &idx_vec;
+    // If true, a duplicate must match on both value and index
+    const bool          incl_idx;
+    // If true, emit 0/1 flags instead of occurrence counts
+    const bool          binary;
+
+    // Defined in DataFrame_misc.tcc
+    template<typename T>
+    void operator() (const T &vec);
+};
+
+// ----------------------------------------------------------------------------
+
 // Local Variables:
 // mode:C++
 // tab-width:4

diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc
@@ -362,35 +362,26 @@ DataFrame<I, H>::get_data_by_idx (Index2D<IndexType> range) const  {
         const auto  thread_level =
             (indices_.size() < ThreadPool::MUL_THR_THHOLD)
                 ? 0L : get_thread_level();
+        auto        lbd =
+            [b_dist, e_dist, &df, this]
+            (const auto &begin, const auto &end) -> void  {
+                for (auto citer = begin; citer < end; ++citer)  {
+                    load_functor_<res_t, Ts ...>  functor (
+                         citer->first.c_str(), b_dist, e_dist, df);
 
-        if (thread_level > 2)  {
-            auto    lbd =
-                [b_dist, e_dist, &df, this]
-                (const auto &begin, const auto &end) -> void  {
-                    for (auto citer = begin; citer < end; ++citer)  {
-                        load_functor_<res_t  , Ts ...>  functor (
-                             citer->first.c_str(), b_dist, e_dist, df);
-
-                        this->data_[citer->second].change(functor);
-                    }
-                };
+                    this->data_[citer->second].change(functor);
+                }
+            };
 
-            auto    futuers =
-                thr_pool_.parallel_loop(column_list_.begin(),
-                                        column_list_.end(),
-                                        std::move(lbd));
+        if (thread_level > 2)  {
+            auto    futuers = thr_pool_.parallel_loop(column_list_.begin(),
+                                                      column_list_.end(),
+                                                      std::move(lbd));
 
             for (auto &fut : futuers)  fut.get();
         }
         else  {
-            for (const auto &[name, idx] : column_list_) [[likely]]  {
-                load_functor_<res_t, Ts ...>    functor (name.c_str(),
-                                                         b_dist,
-                                                         e_dist,
-                                                         df);
-
-                data_[idx].change(functor);
-            }
+            lbd(column_list_.begin(), column_list_.end());
         }
     }
 
@@ -2549,6 +2540,55 @@ get_view(const StlVecType<const char *> &col_names) const  {
 
 // ----------------------------------------------------------------------------
 
+// Builds a DataFrame with a copy of self's index and, for every column in
+// self, an int column of the same name that is filled in by
+// dup_mask_functor_.
+//
+template<typename I, typename H>
+template<hashable_equal ... Ts>
+DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
+duplication_mask (bool include_index, bool binary) const  {
+
+    using res_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
+
+    res_t   result;
+
+    result.load_index(indices_.begin(), indices_.end());
+
+    const SpinGuard guard(lock_);
+
+    // Create an int column in the result for every column in self, before
+    // any mask is computed
+    for (const auto &col_entry : column_list_) [[likely]]
+        result.template create_column<int>(col_entry.first.c_str(), false);
+
+    const bool  below_threshold =
+        indices_.size() < ThreadPool::MUL_THR_THHOLD;
+    const auto  thread_level = below_threshold ? 0L : get_thread_level();
+
+    // Runs the mask visitor over the columns in [begin, end)
+    auto        mask_columns =
+        [&result, this, include_index, binary]
+        (const auto &begin, const auto &end) -> void  {
+            for (auto citer = begin; citer < end; ++citer)  {
+                dup_mask_functor_<Ts ...>   visitor(citer->first.c_str(),
+                                                    result,
+                                                    result.indices_,
+                                                    include_index,
+                                                    binary);
+
+                this->data_[citer->second].change(visitor);
+            }
+        };
+
+    if (thread_level <= 2)  {
+        // Single pass over all columns in the calling thread
+        mask_columns(column_list_.begin(), column_list_.end());
+    }
+    else  {
+        auto    futures = thr_pool_.parallel_loop(column_list_.begin(),
+                                                  column_list_.end(),
+                                                  std::move(mask_columns));
+
+        for (auto &fut : futures)  fut.get();
+    }
+
+    return (result);
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename I, typename H>
 template<typename T, typename ... Ts>
 DataFrame<I, HeteroVector<std::size_t(H::align_value)>> DataFrame<I, H>::
@@ -2735,7 +2775,7 @@ change_freq(size_type new_freq,
                 "convert_freq(): "
                 "Index type of DateTime must have a valid time unit");
 #endif // HMDF_SANITY_EXCEPTIONS
-        new_idx = 
+        new_idx =
             gen_datetime_index(
                 indices_.front().string_format(DT_FORMAT::DT_TM2).c_str(),
                 indices_.back().string_format(DT_FORMAT::DT_TM2).c_str(),

diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc
@@ -1066,6 +1066,69 @@ operator() (const T &vec)  {
                                         false);
 }
 
+
+// ----------------------------------------------------------------------------
+
+// Computes the duplication mask of one column and loads it into res as an
+// int column called name.
+//
+// The first pass counts the occurrences of every key, where a key is the
+// column value or, if incl_idx is true, the (value, index) pair at the same
+// position. The second pass emits one int per column item: the occurrence
+// count of its key or, if binary is true, 1 if the key occurred more than
+// once and 0 otherwise.
+//
+// Keys are tuples of references into vec, idx_vec and dummy_idx, all of
+// which outlive the table. Nothing is copied to build or to look up a key.
+//
+// NOTE(review): with incl_idx set, the index is read at every position of
+//               vec, so this assumes vec is never longer than idx_vec.
+//               Confirm that callers guarantee it.
+//
+template<typename I, typename H>
+template<typename ... Ts>
+template<typename T>
+void
+DataFrame<I, H>::dup_mask_functor_<Ts ...>::
+operator() (const T &vec)  {
+
+    using VecType = typename std::remove_reference<T>::type;
+    using ValueType = typename VecType::value_type;
+    using NewVecType = ColumnVecType<int>;
+
+    using data_tuple = std::tuple<const ValueType &, const IndexType &>;
+    using map_t = DFUnorderedMap<data_tuple, int, TupleHash>;
+
+    // Stands in for the index when it is not part of the key, so that all
+    // keys have an equal second element
+    const IndexType dummy_idx { };
+    const auto      col_s = std::min(idx_vec.size(), vec.size());
+    map_t           table;
+    size_type       i { 0 };
+
+    table.reserve(col_s);
+    for (const auto &val : vec)  {
+        const auto  insert_res =
+            table.emplace(
+                std::forward_as_tuple(val, incl_idx ? idx_vec[i++] : dummy_idx),
+                0);
+
+        insert_res.first->second += 1;
+    }
+
+    NewVecType  new_vec;
+
+    new_vec.reserve(col_s);
+    i = 0;
+    for (const auto &val : vec)  {
+        const auto  find_res =
+            table.find(std::forward_as_tuple(
+                val, incl_idx ? idx_vec[i++] : dummy_idx));
+
+        // A key that does not compare equal to itself (e.g. a NaN) is never
+        // found again. Such an item cannot have duplicates, so it is counted
+        // once instead of being skipped. Skipping it would leave the mask
+        // shorter than, and misaligned with, the original column.
+        const int   count =
+            (find_res != table.end()) ? find_res->second : 1;
+
+        new_vec.push_back(binary ? (count == 1 ? 0 : 1) : count);
+    }
+
+    res.template load_column<int>(name,
+                                  std::move(new_vec),
+                                  nan_policy::dont_pad_with_nans,
+                                  false);
+}
+
 } // namespace hmdf
 
 // ----------------------------------------------------------------------------

diff --git a/test/dataframe_tester_3.cc b/test/dataframe_tester_3.cc
@@ -4381,6 +4381,100 @@ static void test_change_freq_3()  {
 
 // ----------------------------------------------------------------------------
 
+// Exercises duplication_mask() on double, string and int columns in three
+// modes: counts without the index, counts with the index and binary flags
+// without the index.
+//
+static void test_duplication_mask()  {
+
+    std::cout << "\nTesting duplication_mask( ) ..." << std::endl;
+
+    MyDataFrame                df;
+    // The index repeats 10 (three times) and 12 (twice)
+    StlVecType<unsigned long>  idxvec =
+        { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL,
+          10UL, 13UL, 10UL, 15UL, 14UL };
+    StlVecType<double>         dblvec =
+        { 0.0, 15.0, 14.0, 2.0, 15.0, 12.0, 11.0, 8.0, 7.0, 11.0,
+          5.0, 11.0, 3.0, 9.0, 15.0 };
+    StlVecType<double>         dblvec2 =
+        { 100.0, 101.0, 102.0, 103.0, 101.0, 105.0, 106.55, 107.34, 1.8, 111.0,
+          112.0, 113.0, 114.0, 115.0, 116.0 };
+    // Only 9 items, against 15 items in the index
+    StlVecType<int>            intvec = { 1, 2, 3, 4, 2, 8, 6, 7, 1 };
+    StlVecType<std::string>    strvec =
+        { "zz", "hh", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj",
+          "kk", "ll", "mm", "ww", "oo" };
+
+    df.load_data(std::move(idxvec),
+                 std::make_pair("dbl_col", dblvec),
+                 std::make_pair("dbl_col_2", dblvec2),
+                 std::make_pair("str_col", strvec));
+    // The short column is loaded as is, without padding it to index length
+    df.load_column("int_col",
+                   std::move(intvec),
+                   nan_policy::dont_pad_with_nans);
+
+    // Counts, index not included: every item maps to the number of times
+    // its value occurs in its own column
+    const auto  df2 = df.duplication_mask<double, int, std::string>(false);
+
+    {
+        StlVecType<unsigned long>   res_idx =
+            { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
+        // 15.0 and 11.0 each occur three times in dbl_col
+        StlVecType<int>             res_dbl_col =
+            { 1, 3, 1, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 1, 3 };
+        StlVecType<int>             res_dbl_col_2 =
+            { 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+        StlVecType<int>             res_str_col =
+            { 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1 };
+        // The mask of the short column is expected to have 9 items too
+        StlVecType<int>             res_int_col =
+            { 2, 2, 1, 1, 2, 1, 1, 1, 2 };
+
+        assert(df2.get_index() == res_idx);
+        assert(df2.get_column<int>("dbl_col") == res_dbl_col);
+        assert(df2.get_column<int>("dbl_col_2") == res_dbl_col_2);
+        assert(df2.get_column<int>("str_col") == res_str_col);
+        assert(df2.get_column<int>("int_col") == res_int_col);
+    }
+
+    // Counts, index included: no (value, index) pair repeats in any column
+    // of this data set, so every expected count is 1
+    const auto  df3 = df.duplication_mask<double, int, std::string>(true);
+
+    {
+        StlVecType<unsigned long>   res_idx =
+            { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
+        StlVecType<int>             res_dbl_col =
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+        StlVecType<int>             res_dbl_col_2 =
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+        StlVecType<int>             res_str_col =
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+        StlVecType<int>             res_int_col =
+            { 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+
+        assert(df3.get_index() == res_idx);
+        assert(df3.get_column<int>("dbl_col") == res_dbl_col);
+        assert(df3.get_column<int>("dbl_col_2") == res_dbl_col_2);
+        assert(df3.get_column<int>("str_col") == res_str_col);
+        assert(df3.get_column<int>("int_col") == res_int_col);
+    }
+
+    // Binary flags, index not included: 1 where the value occurs more than
+    // once in its column, 0 otherwise
+    const auto  df4 =
+        df.duplication_mask<double, int, std::string>(false, true);
+
+    {
+        StlVecType<unsigned long>   res_idx =
+            { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 };
+        StlVecType<int>             res_dbl_col =
+            { 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1 };
+        StlVecType<int>             res_dbl_col_2 =
+            { 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+        StlVecType<int>             res_str_col =
+            { 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0 };
+        StlVecType<int>             res_int_col =
+            { 1, 1, 0, 0, 1, 0, 0, 0, 1 };
+
+        assert(df4.get_index() == res_idx);
+        assert(df4.get_column<int>("dbl_col") == res_dbl_col);
+        assert(df4.get_column<int>("dbl_col_2") == res_dbl_col_2);
+        assert(df4.get_column<int>("str_col") == res_str_col);
+        assert(df4.get_column<int>("int_col") == res_int_col);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
 int main(int, char *[]) {
 
     MyDataFrame::set_optimum_thread_level();
@@ -4468,6 +4562,7 @@ int main(int, char *[]) {
     test_change_freq();
     test_change_freq_2();
     test_change_freq_3();
+    test_duplication_mask();
 
     return (0);
 }