dask-contrib · charlesbluca · Jul 26, 2022 · Jul 22, 2022 · Jul 22, 2022 · Jul 22, 2022
@@ -898,6 +898,7 @@ def _register_callable(
         row_udf: bool = False,
     ):
         """Helper function to do the function or aggregation registration"""
+
         schema_name = schema_name or self.schema_name
         schema = self.schema[schema_name]
 

@@ -4,6 +4,8 @@
 import dask.dataframe as dd
 import pandas as pd
 
+from dask_sql.mappings import python_to_sql_type
+
 ColumnType = Union[str, int]
 
 FunctionDescription = namedtuple(
@@ -198,11 +200,10 @@ def __init__(self, func, row_udf: bool, params, return_type=None):
 
         self.names = [param[0] for param in params]
 
-        if return_type is None:
-            # These UDFs go through apply and without providing
-            # a return type, dask will attempt to guess it, and
-            # dask might be wrong.
-            raise ValueError("Return type must be provided")
+        # validate UDF metadata: run every parameter type and the return type through python_to_sql_type
+        for dt in (*(param[1] for param in params), return_type):
+            _ = python_to_sql_type(dt)
 sql_return_type = python_to_sql_type(function_description.return_type) 
 sql_param_type = python_to_sql_type(param_type) 
 sql_return_type = python_to_sql_type(function_description.return_type) 
 sql_param_type = python_to_sql_type(param_type) 
+
         self.meta = (None, return_type)
 
     def __call__(self, *args, **kwargs):
@@ -218,7 +219,6 @@ def __call__(self, *args, **kwargs):
             df = column_args[0].to_frame(self.names[0])
             for name, col in zip(self.names[1:], column_args[1:]):
                 df[name] = col
-
             result = df.apply(
                 self.func, axis=1, args=tuple(scalar_args), meta=self.meta
             ).astype(self.meta[1])

@@ -88,6 +88,11 @@
 def python_to_sql_type(python_type):
     """Mapping between python and SQL types."""
 
+    if python_type in (int, float):
+        python_type = np.dtype(python_type)
+    elif python_type is str:
+        python_type = np.dtype("object")
+
     if isinstance(python_type, np.dtype):
         python_type = python_type.type
 

@@ -52,17 +52,12 @@ def f(row):
 
 @pytest.mark.parametrize(
     "retty",
-    [None, np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.bool_],
+    [np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.bool_],
 )
 def test_custom_function_row_return_types(c, df, retty):
     def f(row):
         return row["x"] ** 2
 
-    if retty is None:
-        with pytest.raises(ValueError):
-            c.register_function(f, "f", [("x", np.float64)], retty, row_udf=True)
-        return
-
     c.register_function(f, "f", [("x", np.float64)], retty, row_udf=True)
 
     return_df = c.sql("SELECT F(a) AS a FROM df")
@@ -199,3 +194,17 @@ def f(x):
         c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)
 
     c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64, replace=True)
+
+
+@pytest.mark.parametrize("dtype", [np.timedelta64, None, "a string"])
+def test_unsupported_dtype(c, dtype):
+    def f(x):
+        return x**2
+
+    # test that an invalid return type raises
+    with pytest.raises(NotImplementedError):
+        c.register_function(f, "f", [("x", np.int64)], dtype)
+
+    # test that an invalid param type raises
+    with pytest.raises(NotImplementedError):
+        c.register_function(f, "f", [("x", dtype)], np.int64)