From 11f74de13bd63dee0eb4f239c2987704965b6191 Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 10:33:48 -0400
Subject: [PATCH 1/6] Rename temporary "labels" column to "__labels_tmp"
 to reduce the chance of name collisions

---
 nvtabular/ops/categorify.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
index 556e2a005a..f82341735c 100644
--- a/nvtabular/ops/categorify.py
+++ b/nvtabular/ops/categorify.py
@@ -1615,6 +1615,8 @@ def _encode(
     selection_r = ColumnSelector(name if isinstance(name, list) else [storage_name])
     list_col = is_list_col(selection_l, df)
 
+    tmp_label_column = "__labels_tmp"
+
     # Find number of oov buckets
     if buckets and storage_name in buckets:
         num_oov_buckets = buckets[storage_name]
@@ -1642,9 +1644,9 @@ def _encode(
                         cats_only=True,
                         reader=read_pq_func,
                     )
-                    if len(value) and value["labels"].iloc[0] < OOV_OFFSET + num_oov_buckets:
+                    if len(value) and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets:
                         # See: https://github.com/rapidsai/cudf/issues/12837
-                        value["labels"] += OOV_OFFSET + num_oov_buckets
+                        value[tmp_label_column] += OOV_OFFSET + num_oov_buckets
         else:
             value = read_pq_func(  # pylint: disable=unexpected-keyword-arg
                 path,
@@ -1652,7 +1654,7 @@ def _encode(
                 **({"split_row_groups": False} if split_out > 1 else {}),
             )
 
-            value.index = value.index.rename("labels")
+            value.index = value.index.rename(tmp_label_column)
             if split_out > 1:
                 value = value.reset_index(drop=False)
                 if type(df).__module__.split(".")[0] == "cudf":
@@ -1665,7 +1667,7 @@ def _encode(
                         part_size = file_frag.metadata.num_rows
                         ranges.append((size, size + part_size))
                         size += part_size
-                    value["labels"] = dd.from_map(lambda r: pd.RangeIndex(*r), ranges)
+                    value[tmp_label_column] = dd.from_map(lambda r: pd.RangeIndex(*r), ranges)
             else:
                 value.reset_index(drop=False, inplace=True)
 
@@ -1674,7 +1676,7 @@ def _encode(
         for c in selection_r.names:
             typ = df[selection_l.names[0]].dtype if len(selection_l.names) == 1 else df[c].dtype
             value[c] = nullable_series([None], df, typ)
-        value.index = value.index.rename("labels")
+        value.index = value.index.rename(tmp_label_column)
         value.reset_index(drop=False, inplace=True)
 
     use_collection = isinstance(value, DaskDataFrame)
@@ -1684,7 +1686,7 @@ def _encode(
         use_collection = False
 
     # Determine encoding offsets
-    null_encoding_offset = value["labels"].head(1).iloc[0] if single_table else NULL_OFFSET
+    null_encoding_offset = value[tmp_label_column].head(1).iloc[0] if single_table else NULL_OFFSET
     bucket_encoding_offset = null_encoding_offset + 1  # 2 (if not single_table)
     distinct_encoding_offset = bucket_encoding_offset + num_oov_buckets
 
@@ -1727,7 +1729,7 @@ def _encode(
                             left_on=selection_l.names,
                             right_on=selection_r.names,
                             how="left",
-                        ).dropna(subset=["labels"])
+                        ).dropna(subset=[tmp_label_column])
                         for part in value.partitions
                     ],
                     ignore_index=False,
@@ -1741,11 +1743,11 @@ def _encode(
             if len(merged_df) < len(codes):
                 # Missing nulls
                 labels = df._constructor_sliced(indistinct)
-                labels.iloc[merged_df["order"]] = merged_df["labels"]
+                labels.iloc[merged_df["order"]] = merged_df[tmp_label_column]
                 labels = labels.values
             else:
-                merged_df["labels"].fillna(df._constructor_sliced(indistinct), inplace=True)
-                labels = merged_df["labels"].values
+                merged_df[tmp_label_column].fillna(df._constructor_sliced(indistinct), inplace=True)
+                labels = merged_df[tmp_label_column].values
         else:
             # no hashing
             if use_collection:
@@ -1757,7 +1759,7 @@ def _encode(
                             left_on=selection_l.names,
                             right_on=selection_r.names,
                             how="left",
-                        ).dropna(subset=["labels"])
+                        ).dropna(subset=[tmp_label_column)
                         for part in value.partitions
                     ],
                     ignore_index=True,
@@ -1768,16 +1770,16 @@ def _encode(
                         np.full(
                             len(codes),
                             indistinct,
-                            like=merged_df["labels"].values,
+                            like=merged_df[tmp_column_name].values,
                         ),
                     )
-                    labels.iloc[merged_df["order"]] = merged_df["labels"]
+                    labels.iloc[merged_df["order"]] = merged_df[tmp_column_name]
                 else:
-                    labels = merged_df.sort_values("order")["labels"].reset_index(drop=True)
+                    labels = merged_df.sort_values("order")[tmp_column_name].reset_index(drop=True)
             else:
                 labels = codes.merge(
                     value, left_on=selection_l.names, right_on=selection_r.names, how="left"
-                ).sort_values("order")["labels"]
+                ).sort_values("order")[tmp_column_name]
             labels.fillna(indistinct, inplace=True)
             labels = labels.values
     else:

From 84ae407cea97ac1acc901aa929a2834c368de1b2 Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 10:39:33 -0400
Subject: [PATCH 2/6] Fix missing closing bracket in dropna(subset=[...])

---
 nvtabular/ops/categorify.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
index f82341735c..b3d0933137 100644
--- a/nvtabular/ops/categorify.py
+++ b/nvtabular/ops/categorify.py
@@ -1759,7 +1759,7 @@ def _encode(
                             left_on=selection_l.names,
                             right_on=selection_r.names,
                             how="left",
-                        ).dropna(subset=[tmp_label_column)
+                        ).dropna(subset=[tmp_label_column])
                         for part in value.partitions
                     ],
                     ignore_index=True,

From 5d8d4606a523095e49ed4e8b04d205620f530fd6 Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 10:46:19 -0400
Subject: [PATCH 3/6] Replace mistyped tmp_column_name with tmp_label_column
 and wrap the long OOV-offset condition

---
 nvtabular/ops/categorify.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/nvtabular/ops/categorify.py b/nvtabular/ops/categorify.py
index b3d0933137..ba63b72778 100644
--- a/nvtabular/ops/categorify.py
+++ b/nvtabular/ops/categorify.py
@@ -1644,7 +1644,10 @@ def _encode(
                         cats_only=True,
                         reader=read_pq_func,
                     )
-                    if len(value) and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets:
+                    if (
+                        len(value)
+                        and value[tmp_label_column].iloc[0] < OOV_OFFSET + num_oov_buckets
+                    ):
                         # See: https://github.com/rapidsai/cudf/issues/12837
                         value[tmp_label_column] += OOV_OFFSET + num_oov_buckets
         else:
@@ -1770,16 +1773,16 @@ def _encode(
                         np.full(
                             len(codes),
                             indistinct,
-                            like=merged_df[tmp_column_name].values,
+                            like=merged_df[tmp_label_column].values,
                         ),
                     )
-                    labels.iloc[merged_df["order"]] = merged_df[tmp_column_name]
+                    labels.iloc[merged_df["order"]] = merged_df[tmp_label_column]
                 else:
-                    labels = merged_df.sort_values("order")[tmp_column_name].reset_index(drop=True)
+                    labels = merged_df.sort_values("order")[tmp_label_column].reset_index(drop=True)
             else:
                 labels = codes.merge(
                     value, left_on=selection_l.names, right_on=selection_r.names, how="left"
-                ).sort_values("order")[tmp_column_name]
+                ).sort_values("order")[tmp_label_column]
             labels.fillna(indistinct, inplace=True)
             labels = labels.values
     else:

From 485081ebb2cc9b19566897c772f2276803cd5ace Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 11:53:57 -0400
Subject: [PATCH 4/6] update col name in transform path

---
 cpp/nvtabular/inference/categorify.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc
index e9b50c0cdd..88b93da323 100644
--- a/cpp/nvtabular/inference/categorify.cc
+++ b/cpp/nvtabular/inference/categorify.cc
@@ -38,7 +38,7 @@ namespace nvtabular
         py::object pandas = py::module_::import("pandas");
         py::object df = pandas.attr("read_parquet")(filename);
         py::object isnull = pandas.attr("isnull");
-        py::array values = df[column_name.c_str()].attr("values");
+        py::array values = df[column_name.c_str()].attr("__values_tmp");
         auto dtype = values.dtype();
 
         if ((dtype.kind() == 'O') || (dtype.kind() == 'U'))

From 5d1f5386119371f02387ab93455c41ea328d5dea Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 12:01:26 -0400
Subject: [PATCH 5/6] Use "__labels_tmp" instead of "__values_tmp" in C++
 categorify

---
 cpp/nvtabular/inference/categorify.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc
index 88b93da323..4fada831a6 100644
--- a/cpp/nvtabular/inference/categorify.cc
+++ b/cpp/nvtabular/inference/categorify.cc
@@ -38,7 +38,7 @@ namespace nvtabular
         py::object pandas = py::module_::import("pandas");
         py::object df = pandas.attr("read_parquet")(filename);
         py::object isnull = pandas.attr("isnull");
-        py::array values = df[column_name.c_str()].attr("__values_tmp");
+        py::array values = df[column_name.c_str()].attr("__labels_tmp");
         auto dtype = values.dtype();
 
         if ((dtype.kind() == 'O') || (dtype.kind() == 'U'))

From b5a45f6490993024134e471a892059e7fd50352c Mon Sep 17 00:00:00 2001
From: Adam Laiacano <alaiacano@nvidia.com>
Date: Wed, 21 Jun 2023 12:01:51 -0400
Subject: [PATCH 6/6] Revert attribute-name change in C++ categorify
 (restore .attr("values"))

---
 cpp/nvtabular/inference/categorify.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc
index 4fada831a6..e9b50c0cdd 100644
--- a/cpp/nvtabular/inference/categorify.cc
+++ b/cpp/nvtabular/inference/categorify.cc
@@ -38,7 +38,7 @@ namespace nvtabular
         py::object pandas = py::module_::import("pandas");
         py::object df = pandas.attr("read_parquet")(filename);
         py::object isnull = pandas.attr("isnull");
-        py::array values = df[column_name.c_str()].attr("__labels_tmp");
+        py::array values = df[column_name.c_str()].attr("values");
         auto dtype = values.dtype();
 
         if ((dtype.kind() == 'O') || (dtype.kind() == 'U'))