From b58989f9cc037cca6c5562027c7efe34f83d4664 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 16 May 2022 19:45:57 -0400 Subject: [PATCH] Add documentation for FugueSQL integrations (#523) * Add documentation for FugueSQL integrations * Minor nitpick around autodoc obj -> class --- dask_sql/integrations/fugue.py | 36 +++++++++++++++------------- docs/source/api.rst | 5 +++- docs/source/fugue.rst | 44 ++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + 4 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 docs/source/fugue.rst diff --git a/dask_sql/integrations/fugue.py b/dask_sql/integrations/fugue.py index c9e0a076f..ce685a1ee 100644 --- a/dask_sql/integrations/fugue.py +++ b/dask_sql/integrations/fugue.py @@ -73,15 +73,15 @@ def fsql_dask( register: bool = False, fugue_conf: Any = None, ) -> Dict[str, dd.DataFrame]: - """Fugue SQL utility function that can consume Context directly. Fugue SQL is a language + """FugueSQL utility function that can consume Context directly. FugueSQL is a language extending standard SQL. It makes SQL eligible to describe end to end workflows. It also enables you to invoke python extensions in the SQL like language. For more, please read - `Fugue SQl Tutorial `_ + `FugueSQL Tutorial `_ Args: - sql: (:obj:`str`): Fugue SQL statement + sql (:obj:`str`): Fugue SQL statement ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None register (:obj:`bool`): Whether to register named steps back to the context (if provided), defaults to False @@ -89,26 +89,30 @@ def fsql_dask( Example: .. code-block:: python - # schema: * - def median(df:pd.DataFrame) -> pd.DataFrame: + + # define a custom prepartition function for FugueSQL + def median(df: pd.DataFrame) -> pd.DataFrame: df["y"] = df["y"].median() return df.head(1) - # Create a context with tables df1, df2 + # create a context with some tables c = Context() ... 
- result = fsql_dask(''' - j = SELECT df1.*, df2.x - FROM df1 INNER JOIN df2 ON df1.key = df2.key - PERSIST # using persist because j will be used twice - TAKE 5 ROWS PREPARTITION BY x PRESORT key - PRINT - TRANSFORM j PREPARTITION BY x USING median - PRINT - ''', c, register=True) + + # run a FugueSQL query using the context as input + query = ''' + j = SELECT df1.*, df2.x + FROM df1 INNER JOIN df2 ON df1.key = df2.key + PERSIST + TAKE 5 ROWS PREPARTITION BY x PRESORT key + PRINT + TRANSFORM j PREPARTITION BY x USING median + PRINT + ''' + result = fsql_dask(query, c, register=True) + assert "j" in result assert "j" in c.tables - """ _global, _local = get_caller_global_local_vars() diff --git a/docs/source/api.rst b/docs/source/api.rst index 29c4f5632..cb5407419 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -11,4 +11,7 @@ API Documentation .. autofunction:: dask_sql.cmd_loop -.. autofunction:: dask_sql.integrations.fugue.fsql +.. autoclass:: dask_sql.integrations.fugue.DaskSQLExecutionEngine + :members: + +.. autofunction:: dask_sql.integrations.fugue.fsql_dask diff --git a/docs/source/fugue.rst b/docs/source/fugue.rst new file mode 100644 index 000000000..264d19fcd --- /dev/null +++ b/docs/source/fugue.rst @@ -0,0 +1,44 @@ +FugueSQL Integrations +===================== + +`FugueSQL `_ is a related project that aims to provide a unified SQL interface for a variety of different computing frameworks, including Dask. +While it offers a SQL engine with a larger set of supported commands, this comes at the cost of slower performance when using Dask in comparison to dask-sql. +In order to offer a "best of both worlds" solution, dask-sql includes several options to integrate with FugueSQL, using dask-sql's faster implementation of SQL commands when possible and falling back on FugueSQL when necessary.
+ +dask-sql as a FugueSQL engine +----------------------------- + +FugueSQL users unfamiliar with dask-sql can take advantage of its functionality with minimal code changes by passing :class:`dask_sql.integrations.fugue.DaskSQLExecutionEngine` into the ``FugueSQLWorkflow`` being used to execute commands. +For more information and sample usage, see `Fugue — dask-sql as a FugueSQL engine `_. + +Using FugueSQL on an existing ``Context`` +----------------------------------------- + +dask-sql users attempting to expand their SQL querying options for an existing ``Context`` can use :func:`dask_sql.integrations.fugue.fsql_dask`, which executes the provided query with FugueSQL, taking the tables within the provided context as input. +The results of this query can then optionally be registered to the context: + +.. code-block:: python + + # define a custom prepartition function for FugueSQL + def median(df: pd.DataFrame) -> pd.DataFrame: + df["y"] = df["y"].median() + return df.head(1) + + # create a context with some tables + c = Context() + ... + + # run a FugueSQL query using the context as input + query = """ + j = SELECT df1.*, df2.x + FROM df1 INNER JOIN df2 ON df1.key = df2.key + PERSIST + TAKE 5 ROWS PREPARTITION BY x PRESORT key + PRINT + TRANSFORM j PREPARTITION BY x USING median + PRINT + """ + result = fsql_dask(query, c, register=True) # results aren't registered by default + + assert "j" in result # returns a dict of resulting tables + assert "j" in c.tables # results are also registered to the context diff --git a/docs/source/index.rst b/docs/source/index.rst index 8ebb80150..8a9accc99 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -98,6 +98,7 @@ For this example, we use some data loaded from disk and query it with a SQL comm api server cmd + fugue how_does_it_work configuration