From c61dbae07fcc0c1631a629470e91cbc3adc9b326 Mon Sep 17 00:00:00 2001 From: Yijun Xie Date: Tue, 1 Nov 2022 01:42:42 +0000 Subject: [PATCH] SNOW-583979: Improve UDF and table merge docstring (#595) --- src/snowflake/snowpark/stored_procedure.py | 5 ++++- src/snowflake/snowpark/table.py | 8 ++++---- src/snowflake/snowpark/udf.py | 15 +++++++++++++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/snowflake/snowpark/stored_procedure.py b/src/snowflake/snowpark/stored_procedure.py index dfe4f75ae70..fd7fba4c424 100644 --- a/src/snowflake/snowpark/stored_procedure.py +++ b/src/snowflake/snowpark/stored_procedure.py @@ -108,7 +108,10 @@ class StoredProcedureRegistration: function on the Snowflake server during stored procedure creation. During the serialization, the global variables used in the Python function will be serialized into the bytecode, but only the name of the module object or any objects from a module that are used in the - Python function will be serialized. During the deserialization, Python will look up the + Python function will be serialized. If the size of the serialized bytecode is over 8K bytes, it will be uploaded to a stage location as a Python file. + If it's under 8K, it will be added to the `Stored Procedure in-line code <https://docs.snowflake.com/en/sql-reference/sql/create-procedure.html>`__. + + During the deserialization, Python will look up the corresponding modules and objects by names. Details could be found in :class:`snowflake.snowpark.udf.UDFRegistration`. diff --git a/src/snowflake/snowpark/table.py b/src/snowflake/snowpark/table.py index c9630c46c94..4899392b165 100644 --- a/src/snowflake/snowpark/table.py +++ b/src/snowflake/snowpark/table.py @@ -82,15 +82,15 @@ def update(self, assignments: Dict[str, ColumnOrLiteral]) -> "WhenMatchedClause" >>> # if its key is equal to the key of any row in target. >>> # For all such rows, update its value to the value of the >>> # corresponding row in source. 
- >>> from snowflake.snowpark.functions import when_matched + >>> from snowflake.snowpark.functions import when_matched, lit >>> target_df = session.create_dataframe([(10, "old"), (10, "too_old"), (11, "old")], schema=["key", "value"]) >>> target_df.write.save_as_table("my_table", mode="overwrite", table_type="temporary") >>> target = session.table("my_table") >>> source = session.create_dataframe([(10, "new")], schema=["key", "value"]) - >>> target.merge(source, target["key"] == source["key"], [when_matched().update({"value": source["value"]})]) - MergeResult(rows_inserted=0, rows_updated=2, rows_deleted=0) + >>> target.merge(source, (target["key"] == source["key"]) & (target["value"] == lit("too_old")), [when_matched().update({"value": source["value"]})]) + MergeResult(rows_inserted=0, rows_updated=1, rows_deleted=0) >>> target.collect() # the value in the table is updated - [Row(KEY=10, VALUE='new'), Row(KEY=10, VALUE='new'), Row(KEY=11, VALUE='old')] + [Row(KEY=10, VALUE='old'), Row(KEY=10, VALUE='new'), Row(KEY=11, VALUE='old')] Note: An exception will be raised if this method or :meth:`WhenMatchedClause.delete` diff --git a/src/snowflake/snowpark/udf.py b/src/snowflake/snowpark/udf.py index 0dc28b696e4..b3b4281009f 100644 --- a/src/snowflake/snowpark/udf.py +++ b/src/snowflake/snowpark/udf.py @@ -117,6 +117,16 @@ class UDFRegistration: permanently. The methods that register a UDF return a :class:`UserDefinedFunction` object, which you can also use in :class:`~snowflake.snowpark.Column` expressions. + Note: + Before creating a UDF, think about whether you want to create a vectorized UDF (also referred to as `Python UDF Batch API`) or a regular UDF. + The advantages of a vectorized UDF are: + + - The potential for better performance if your Python code operates efficiently on batches of rows. + - Less transformation logic is required if you are calling into libraries that operate on Pandas DataFrames or Pandas arrays. 
+ + Refer to `Python UDF Batch API <https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html>`__ for more details. + The following text explains how to create a regular UDF and a vectorized UDF by using the Snowpark Python APIs. + There are two ways to register a UDF with Snowpark: - Use :func:`~snowflake.snowpark.functions.udf` or :meth:`register`. By pointing to a @@ -125,8 +135,9 @@ class UDFRegistration: function on the Snowflake server during UDF creation. During the serialization, the global variables used in the Python function will be serialized into the bytecode, but only the name of the module object or any objects from a module that are used in the - Python function will be serialized. During the deserialization, Python will look up the - corresponding modules and objects by names. For example:: + Python function will be serialized. If the size of the serialized bytecode is over 8K bytes, it will be uploaded to a stage location as a Python file. + If it's under 8K, it will be added to the `UDF in-line code <https://docs.snowflake.com/en/sql-reference/sql/create-function.html>`__. + During the deserialization, Python will look up the corresponding modules and objects by names. For example:: >>> import numpy >>> from resources.test_udf_dir.test_udf_file import mod5