From 35060178058f33b25286de37753b05bfc509e4ff Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Mon, 8 Jul 2024 14:24:58 -0400 Subject: [PATCH] remove_duplicates() can now operates on the index column by itself --- docs/HTML/remove_duplicates.html | 166 ++++++++++-------- include/DataFrame/Internals/DataFrame_set.tcc | 35 ++-- test/dataframe_tester_2.cc | 32 +++- 3 files changed, 133 insertions(+), 100 deletions(-) diff --git a/docs/HTML/remove_duplicates.html b/docs/HTML/remove_duplicates.html index e1ab7204..eb0182ab 100644 --- a/docs/HTML/remove_duplicates.html +++ b/docs/HTML/remove_duplicates.html @@ -76,13 +76,14 @@ It removes duplicate rows and returns a new DataFrame. Duplication is determined by the given column. remove_dup_spec determines which of the duplicated rows to keep.
+ name can potentially be DF_INDEX_COL_NAME which would refer to the index column.

NOTE: The given column type must be hash-able and must have equality (==) operator well defined.
NOTE: If this is called from a View, the duplicates are removed from the view but not from the original DataFrame
T: Type of the named column
Ts: List all the types of all data columns. A type should be specified in the list only once.
- name: Name of the data column
+ name: Name of the data column or DF_INDEX_COL_NAME
include_index: If true, it includes the index column to determine uniqueness
rds: Determines which of the duplicated rows to keep
@@ -238,82 +239,93 @@ -
static void test_remove_duplicates()  {
-
-    std::cout << "\nTesting remove_duplicates( ) ..." << std::endl;
-
-    MyDataFrame df;
-
-    std::vector<unsigned long>  idxvec =
-        { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL };
-    std::vector<double>         dblvec =
-        { 0.0, 15.0, 14.0, 2.0, 15.0, 12.0, 11.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 9.0, 10.0};
-    std::vector<double>         dblvec2 =
-        { 100.0, 101.0, 102.0, 103.0, 101.0, 105.0, 106.55, 107.34, 1.8, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0};
-    std::vector<int>            intvec = { 1, 2, 3, 4, 2, 8, 6, 7, 11, 14, 9 };
-    std::vector<std::string>    strvec =
-        { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
-
-    df.load_data(std::move(idxvec),
-                 std::make_pair("dbl_col", dblvec),
-                 std::make_pair("dbl_col_2", dblvec2),
-                 std::make_pair("str_col", strvec));
-    df.load_column("int_col", std::move(intvec), nan_policy::dont_pad_with_nans);
-
-    auto    result1 =
-        df.remove_duplicates<double, int, double, std::string, int>
-        ("dbl_col", "int_col", false, remove_dup_spec::keep_first);
-    auto    result2 =
-        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
-        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_first);
-
-    std::vector<double>         actual_d {
-        100, 101, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
-    std::vector<std::string>    actual_s {
-        "zz", "bb", "cc", "ww", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
-
-    assert(result2.get_index().size() == 14);
-    assert(result2.get_column<double>("dbl_col_2") == actual_d);
-    assert(result2.get_column<std::string>("str_col") == actual_s);
-
-    auto    result3 =
-        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
-        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_last);
-
-    actual_d = std::vector<double> {
-        100, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
-    actual_s = std::vector<std::string> {
-        "zz", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
-    assert(result3.get_index().size() == 14);
-    assert(result3.get_column<double>("dbl_col_2") == actual_d);
-    assert(result3.get_column<std::string>("str_col") == actual_s);
-
-    auto    result4 =
-        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
-        ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_none);
-
-    actual_d = std::vector<double> {
-        100, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
-    actual_s = std::vector<std::string> {
-        "zz", "cc", "ww", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
-    assert(result4.get_index().size() == 13);
-    assert(result4.get_column<double>("dbl_col_2") == actual_d);
-    assert(result4.get_column<std::string>("str_col") == actual_s);
-
-    auto    result5 =
-        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
-        ("dbl_col", "dbl_col_2", "int_col", "str_col", true, remove_dup_spec::keep_none);
-
-    actual_d = std::vector<double> {
-        100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };
-    actual_s = std::vector<std::string> {
-        "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
-    assert(result5.get_index().size() == 15);
-    assert(result5.get_column<double>("dbl_col_2") == actual_d);
-    assert(result5.get_column<std::string>("str_col") == actual_s);
-}
-
- +
static void test_remove_duplicates()  {  // Exercises remove_duplicates() keyed on one, two and four data columns, on a view, and on the index column
+
+    std::cout << "\nTesting remove_duplicates( ) ..." << std::endl;
+
+    MyDataFrame df;
+
+    StlVecType<unsigned long>  idxvec = { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 12UL, 10UL, 13UL, 10UL, 15UL, 14UL };  // 15 values; 10 occurs at 0-based rows 3, 10, 12 and 12 at rows 7, 9
+    StlVecType<double>         dblvec = { 0.0, 15.0, 14.0, 2.0, 15.0, 12.0, 11.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 9.0, 10.0 };  // 15.0 occurs at rows 1 and 4
+    StlVecType<double>         dblvec2 = { 100.0, 101.0, 102.0, 103.0, 101.0, 105.0, 106.55, 107.34, 1.8, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0 };  // 101.0 occurs at rows 1 and 4
+    StlVecType<int>            intvec = { 1, 2, 3, 4, 2, 8, 6, 7, 11, 14, 9 };  // only 11 values; 2 occurs at rows 1 and 4
+    StlVecType<std::string>    strvec = { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // "bb" occurs at rows 1 and 4
+
+    df.load_data(std::move(idxvec),
+                 std::make_pair("dbl_col", dblvec),
+                 std::make_pair("dbl_col_2", dblvec2),
+                 std::make_pair("str_col", strvec));
+    df.load_column("int_col", std::move(intvec), nan_policy::dont_pad_with_nans);  // int_col carries 11 values versus 15 for the other columns
+
+    auto    vw = df.get_view<double, int, std::string>({ "dbl_col", "dbl_col_2", "str_col", "int_col" });  // view over the same four columns
+    auto    result1 = df.remove_duplicates<double, int, double, std::string, int>("dbl_col", "int_col", false, remove_dup_spec::keep_first);  // two-column overload; result1 is not asserted on below
+    auto    result_vw =
+        vw.remove_duplicates<double, double, int, std::string, double, std::string, int>
+            ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_first);  // same four-column call as result2, issued on the view
+    auto    result2 =
+        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
+            ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_first);  // rows 1 and 4 hold equal values in all four key columns
+
+    StlVecType<double>         actual_d { 100, 101, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };  // expected: dblvec2 without its row-4 entry
+    StlVecType<std::string>    actual_s { "zz", "bb", "cc", "ww", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // expected: strvec without its row-4 entry
+
+    assert(result2.get_index().size() == 14);
+    assert(result2.get_column<double>("dbl_col_2") == actual_d);
+    assert(result2.get_column<std::string>("str_col") == actual_s);
+
+    assert(result_vw.get_index().size() == 14);  // the view call is expected to produce the same row count as result2
+
+    auto    result3 =
+        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
+            ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_last);  // same keys, keep_last
+
+    actual_d = StlVecType<double> { 100, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };  // expected: the row-1 entry is gone, the row-4 entry (101) remains
+    actual_s = StlVecType<std::string> { "zz", "cc", "ww", "bb", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // expected: the row-1 entry is gone, the row-4 entry ("bb") remains
+    assert(result3.get_index().size() == 14);
+    assert(result3.get_column<double>("dbl_col_2") == actual_d);
+    assert(result3.get_column<std::string>("str_col") == actual_s);
+
+    auto    result4 =
+        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
+            ("dbl_col", "dbl_col_2", "int_col", "str_col", false, remove_dup_spec::keep_none);  // same keys, keep_none
+
+    actual_d = StlVecType<double> { 100, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };  // expected: both the row-1 and row-4 entries are gone
+    actual_s = StlVecType<std::string> { "zz", "cc", "ww", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // expected: neither "bb" entry remains
+    assert(result4.get_index().size() == 13);
+    assert(result4.get_column<double>("dbl_col_2") == actual_d);
+    assert(result4.get_column<std::string>("str_col") == actual_s);
+
+    auto    result5 =
+        df.remove_duplicates<double, double, int, std::string, double, std::string, int>
+            ("dbl_col", "dbl_col_2", "int_col", "str_col", true, remove_dup_spec::keep_none);  // include_index = true; the index values at rows 1 and 4 are 2 and 5
+
+    actual_d = StlVecType<double> { 100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };  // expected: identical to dblvec2, nothing removed
+    actual_s = StlVecType<std::string> { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh",  "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // expected: identical to strvec, nothing removed
+    assert(result5.get_index().size() == 15);
+    assert(result5.get_column<double>("dbl_col_2") == actual_d);
+    assert(result5.get_column<std::string>("str_col") == actual_s);
+
+    auto    result6 =
+        df.remove_duplicates<double, double, std::string, int> ("dbl_col", false, remove_dup_spec::keep_first);  // single-column overload keyed on dbl_col (15.0 at rows 1 and 4)
+
+    actual_d = StlVecType<double> { 100, 101, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 };  // expected: dblvec2 without its row-4 entry
+    actual_s = StlVecType<std::string> { "zz", "bb", "cc", "ww", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };  // expected: strvec without its row-4 entry
+    assert(result6.get_index().size() == 14);
+    assert(result6.get_column<double>("dbl_col_2") == actual_d);
+    assert(result6.get_column<std::string>("str_col") == actual_s);
+
+    auto        result7 =
+        df.remove_duplicates<unsigned long, double, std::string, int>
+            (DF_INDEX_COL_NAME, false, remove_dup_spec::keep_first);  // keyed on the index column itself; T is the index value type used by idxvec
+    const auto  actual_idx = StlVecType<unsigned long> { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 13UL, 15UL, 14UL };  // expected: first occurrence of each index value; rows 9, 10 and 12 are absent
+
+    actual_d = StlVecType<double> { 100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 113, 115, 116 };  // expected: dblvec2 without rows 9, 10 and 12
+    actual_s = StlVecType<std::string> { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "ll", "nn", "oo" };  // expected: strvec without rows 9, 10 and 12
+    assert(result7.get_index() == actual_idx);
+    assert(result7.get_column<double>("dbl_col_2") == actual_d);
+    assert(result7.get_column<std::string>("str_col") == actual_s);
+}
+

C++ DataFrame diff --git a/include/DataFrame/Internals/DataFrame_set.tcc b/include/DataFrame/Internals/DataFrame_set.tcc index ebbbddb8..9550716d 100644 --- a/include/DataFrame/Internals/DataFrame_set.tcc +++ b/include/DataFrame/Internals/DataFrame_set.tcc @@ -1053,7 +1053,7 @@ template void DataFrame::remove_data_by_sel (const char *name, F &sel_functor) { static_assert(std::is_base_of, H>::value || - std::is_base_of, H>::value, + std::is_base_of, H>::value, "Only a StdDataFrame or a PtrView can call " "remove_data_by_sel()"); @@ -1078,7 +1078,7 @@ void DataFrame:: remove_data_by_sel (const char *name1, const char *name2, F &sel_functor) { static_assert(std::is_base_of, H>::value || - std::is_base_of, H>::value, + std::is_base_of, H>::value, "Only a StdDataFrame or a PtrView can call " "remove_data_by_sel()"); @@ -1116,7 +1116,7 @@ remove_data_by_sel (const char *name1, F &sel_functor) { static_assert(std::is_base_of, H>::value || - std::is_base_of, H>::value, + std::is_base_of, H>::value, "Only a StdDataFrame or a PtrView can call " "remove_data_by_sel()"); @@ -1157,7 +1157,7 @@ remove_data_by_like (const char *name, char esc_char) { static_assert(std::is_base_of, H>::value || - std::is_base_of, H>::value, + std::is_base_of, H>::value, "Only a StdDataFrame or a PtrView can call " "remove_data_by_like()"); @@ -1201,7 +1201,7 @@ remove_data_by_like(const char *name1, char esc_char) { static_assert(std::is_base_of, H>::value || - std::is_base_of, H>::value, + std::is_base_of, H>::value, "Only a StdDataFrame or a PtrView can call " "remove_data_by_like()"); @@ -1255,22 +1255,29 @@ remove_duplicates (const char *name, using count_vec = StlVecType; using map_t = DFUnorderedMap; - const ColumnVecType &vec = get_column(name); - const auto &index = get_index(); - const size_type col_s = std::min(vec.size(), index.size()); - map_t row_table; - count_vec dummy_vec; - const IndexType dummy_idx { }; + const ColumnVecType *vec { nullptr }; + + if (! 
::strcmp(name, DF_INDEX_COL_NAME)) { + vec = (const ColumnVecType *) &(get_index()); + include_index = false; + } + else + vec = (const ColumnVecType *) &(get_column(name, false)); + + const auto &index = get_index(); + const size_type col_s = std::min(vec->size(), index.size()); + map_t row_table; + count_vec dummy_vec; + const IndexType dummy_idx { }; for (size_type i = 0; i < col_s; ++i) [[likely]] { const auto insert_res = row_table.emplace( - std::forward_as_tuple(vec[i], + std::forward_as_tuple((*vec)[i], include_index ? index[i] : dummy_idx), dummy_vec); - if (insert_res.second) - insert_res.first->second.reserve(8); + if (insert_res.second) insert_res.first->second.reserve(8); insert_res.first->second.push_back(i); } diff --git a/test/dataframe_tester_2.cc b/test/dataframe_tester_2.cc index 0011449b..e3e179cd 100644 --- a/test/dataframe_tester_2.cc +++ b/test/dataframe_tester_2.cc @@ -962,8 +962,8 @@ static void test_remove_duplicates() { auto result3 = df.remove_duplicates - ("dbl_col", "dbl_col_2", "int_col", "str_col", - false, remove_dup_spec::keep_last); + ("dbl_col", "dbl_col_2", "int_col", "str_col", + false, remove_dup_spec::keep_last); actual_d = StlVecType { 100, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, @@ -978,8 +978,8 @@ static void test_remove_duplicates() { auto result4 = df.remove_duplicates - ("dbl_col", "dbl_col_2", "int_col", "str_col", - false, remove_dup_spec::keep_none); + ("dbl_col", "dbl_col_2", "int_col", "str_col", + false, remove_dup_spec::keep_none); actual_d = StlVecType { 100, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, @@ -994,8 +994,8 @@ static void test_remove_duplicates() { auto result5 = df.remove_duplicates - ("dbl_col", "dbl_col_2", "int_col", "str_col", - true, remove_dup_spec::keep_none); + ("dbl_col", "dbl_col_2", "int_col", "str_col", + true, remove_dup_spec::keep_none); actual_d = StlVecType { 100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 111, 112, 113, @@ -1009,18 +1009,32 @@ static void 
test_remove_duplicates() { auto result6 = df.remove_duplicates - ("dbl_col", false, remove_dup_spec::keep_first); + ("dbl_col", false, remove_dup_spec::keep_first); actual_d = StlVecType { 100, 101, 102, 103, 105, 106.55, 107.34, 1.8, 111, 112, 113, 114, 115, 116 }; actual_s = StlVecType { "zz", "bb", "cc", "ww", "ff", "gg", "hh", "ii", "jj", "kk", "ll", - "mm", "nn", "oo" - }; + "mm", "nn", "oo" }; assert(result6.get_index().size() == 14); assert(result6.get_column("dbl_col_2") == actual_d); assert(result6.get_column("str_col") == actual_s); + + auto result7 = + df.remove_duplicates + (DF_INDEX_COL_NAME, false, remove_dup_spec::keep_first); + const auto actual_idx = StlVecType + { 1UL, 2UL, 3UL, 10UL, 5UL, 7UL, 8UL, 12UL, 9UL, 13UL, 15UL, 14UL }; + + actual_d = StlVecType + { 100, 101, 102, 103, 101, 105, 106.55, 107.34, 1.8, 113, 115, 116 }; + actual_s = StlVecType + { "zz", "bb", "cc", "ww", "bb", "ff", "gg", "hh", "ii", "ll", + "nn", "oo" }; + assert(result7.get_index() == actual_idx); + assert(result7.get_column("dbl_col_2") == actual_d); + assert(result7.get_column("str_col") == actual_s); } // -----------------------------------------------------------------------------