Fix use of row UDFs at intermediate query stages #409

Merged
Changes from 4 commits

3 changes: 1 addition & 2 deletions dask_sql/context.py
@@ -964,8 +964,7 @@ def _register_callable(
        schema = self.schema[schema_name]

        if not aggregation:
-            f = UDF(f, row_udf, return_type)
-
+            f = UDF(f, row_udf, parameters, return_type)
        lower_name = name.lower()
        if lower_name in schema.functions:
            if replace:
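For context, `parameters` is the list of `(name, type)` pairs that `Context.register_function` receives for the UDF's arguments; this change forwards it into `UDF` so a row UDF knows the parameter names it was registered with. A minimal sketch of that call path, with a made-up table, column, and function name:

```python
import dask.dataframe as dd
import numpy as np
import pandas as pd

from dask_sql import Context

def add_one(row):
    # Row UDF: fields are addressed by the registered parameter names
    return row["x"] + 1

c = Context()
c.create_table("my_table", dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1))

# parameters=[("x", np.int64)] is what flows into UDF(f, row_udf, parameters, return_type)
c.register_function(add_one, "add_one", [("x", np.int64)], np.int64, row_udf=True)

print(c.sql("SELECT ADD_ONE(a) FROM my_table").compute())
```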
10 changes: 7 additions & 3 deletions dask_sql/datacontainer.py
@@ -183,7 +183,7 @@ def assign(self) -> dd.DataFrame:


class UDF:
-    def __init__(self, func, row_udf: bool, return_type=None):
+    def __init__(self, func, row_udf: bool, params, return_type=None):
        """
        Helper class that handles different types of UDFs and manages
        how they should be mapped to dask operations. Two versions of
@@ -196,6 +196,8 @@ def __init__(self, func, row_udf: bool, return_type=None):
        self.row_udf = row_udf
        self.func = func

+        self.names = [param[0] for param in params]
+
        if return_type is None:
            # These UDFs go through apply and without providing
            # a return type, dask will attempt to guess it, and
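The truncated comment above refers to dask's metadata inference: when a row UDF is registered without a return type, `apply` has to guess the output dtype by running the function on sample data. A standalone sketch of the difference (plain dask, not dask-sql code; names are illustrative):

```python
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

# Without meta, dask evaluates the function on dummy data to infer the output dtype
guessed = ddf.apply(lambda row: row["a"] * 2, axis=1)

# With an explicit meta (what UDF derives from return_type), no guessing is needed
explicit = ddf.apply(lambda row: row["a"] * 2, axis=1, meta=("a", "int64"))

print(explicit.compute())
```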
@@ -212,9 +214,11 @@ def __call__(self, *args, **kwargs):
                    column_args.append(operand)
                else:
                    scalar_args.append(operand)
+
            df = column_args[0].to_frame()
-            for col in column_args[1:]:
-                df[col.name] = col
+            for name, col in zip(self.names, column_args):

Review comment from a collaborator (marked as resolved):

If we pass the first parameter column name to to_frame, we lose a layer off the resulting HLG and don't have to deal with a superfluous column:

            df = column_args[0].to_frame(self.names[0])
            for name, col in zip(self.names[1:], column_args[1:]):

+                df[name] = col
+
            result = df.apply(
                self.func, axis=1, args=tuple(scalar_args), meta=self.meta
            ).astype(self.meta[1])
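Why the rename matters, as a standalone sketch (plain dask, not dask-sql internals; the "$0"/"$1" names only stand in for whatever an intermediate query stage produces): at intermediate stages the incoming Series no longer carry the user's column names, so the old `df[col.name] = col` pattern builds a frame the UDF cannot index into, while mapping the positional arguments onto `self.names` keeps `row["x"]` and `row["y"]` resolvable.

```python
import dask.dataframe as dd
import pandas as pd

def f(row):
    return row["x"] + row["y"]

# Series handed to the UDF at an intermediate stage may carry generated names
s0 = dd.from_pandas(pd.Series([0, 1, 2], name="$0"), npartitions=1)
s1 = dd.from_pandas(pd.Series([3, 4, 5], name="$1"), npartitions=1)

names = ["x", "y"]  # parameter names the UDF was registered with

df = s0.to_frame()
for name, col in zip(names, [s0, s1]):
    df[name] = col  # "x" and "y" now exist regardless of the incoming names

result = df.apply(f, axis=1, meta=("result", "int64"))
print(result.compute())  # 3, 5, 7
```

The reviewer's suggestion amounts to `df = column_args[0].to_frame(self.names[0])` followed by zipping from the second argument onward, which drops the leftover "$0" column and one extra graph layer.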
15 changes: 15 additions & 0 deletions tests/integration/fixtures.py
@@ -37,6 +37,19 @@ def df_simple():
    return pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.2, 3.3]})


+@pytest.fixture()
+def df_wide():
+    return pd.DataFrame(
+        {
+            "a": [0, 1, 2],
+            "b": [3, 4, 5],
+            "c": [6, 7, 8],
+            "d": [9, 10, 11],
+            "e": [12, 13, 14],
+        }
+    )
+
+
@pytest.fixture()
def df():
    np.random.seed(42)
@@ -126,6 +139,7 @@ def gpu_datetime_table(datetime_table):
@pytest.fixture()
def c(
    df_simple,
+    df_wide,
    df,
    user_table_1,
    user_table_2,
@@ -142,6 +156,7 @@
):
    dfs = {
        "df_simple": df_simple,
+        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
24 changes: 23 additions & 1 deletion tests/integration/test_function.py
@@ -1,9 +1,10 @@
+import itertools
import operator

import dask.dataframe as dd
import numpy as np
import pytest
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal


def test_custom_function(c, df):
@@ -40,6 +41,27 @@ def f(row):
    assert_frame_equal(return_df.reset_index(drop=True), df[["a"]] ** 2)


@pytest.mark.parametrize("colnames", list(itertools.combinations(["a", "b", "c"], 2)))
def test_custom_function_any_colnames(colnames, df_wide, c):
# a third column is needed

def f(row):
return row["x"] + row["y"]

colname_x, colname_y = colnames
c.register_function(
f, "f", [("x", np.int64), ("y", np.int64)], np.int64, row_udf=True
)

return_df = c.sql(f"SELECT F({colname_x},{colname_y}) FROM df_wide")

return_df = return_df.compute()
expect = df_wide[colname_x] + df_wide[colname_y]
got = return_df[return_df.columns[0]]

assert_series_equal(expect, got, check_names=False)
charlesbluca marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize(
"retty",
[None, np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.bool_],
Expand Down
2 changes: 2 additions & 0 deletions tests/integration/test_show.py
@@ -35,6 +35,7 @@ def test_tables(c):
            "Table": [
                "df",
                "df_simple",
+                "df_wide",
                "user_table_1",
                "user_table_2",
                "long_table",
@@ -47,6 +48,7 @@
            else [
                "df",
                "df_simple",
+                "df_wide",
                "user_table_1",
                "user_table_2",
                "long_table",