From 7bf73dec648841a8a569be856fcf54b9ce357ca4 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Wed, 16 Oct 2024 12:37:21 +0200 Subject: [PATCH] Issue #604/#644 move ProcessBasedJobCreator example to more extensive doc page --- CHANGELOG.md | 4 +- docs/cookbook/job_manager.rst | 102 +++++++++++++++++++++++++++++++++ docs/rst-cheatsheet.rst | 15 ++++- openeo/extra/job_management.py | 53 +++++------------ 4 files changed, 131 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 873bd90fb..805bfb369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,8 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `MultiBackendJobManager`: Added `initialize_from_df(df)` (to `CsvJobDatabase` and `ParquetJobDatabase`) to initialize (and persist) the job database from a given DataFrame. Also added `create_job_db()` factory to easily create a job database from a given dataframe and its type guessed from filename extension. ([#635](https://github.com/Open-EO/openeo-python-client/issues/635)) -- `MultiBackendJobManager.run_jobs()` now returns a dictionary with counters/stats about various events during the job run ([#645](https://github.com/Open-EO/openeo-python-client/issues/645)) -- Added `ProcessBasedJobCreator` to be used as `start_job` callable with `MultiBackendJobManager` to create multiple jobs from a single parameterized process (e.g. a UDP or remote process definition) ([#604](https://github.com/Open-EO/openeo-python-client/issues/604)) +- `MultiBackendJobManager.run_jobs()` now returns a dictionary with counters/stats about various events during the full run of the job manager ([#645](https://github.com/Open-EO/openeo-python-client/issues/645)) +- Added (experimental) `ProcessBasedJobCreator` to be used as `start_job` callable with `MultiBackendJobManager` to create multiple jobs from a single parameterized process (e.g. 
a UDP or remote process definition) ([#604](https://github.com/Open-EO/openeo-python-client/issues/604)) ### Changed diff --git a/docs/cookbook/job_manager.rst b/docs/cookbook/job_manager.rst index c505e44e5..b5219dc72 100644 --- a/docs/cookbook/job_manager.rst +++ b/docs/cookbook/job_manager.rst @@ -2,6 +2,9 @@ Multi Backend Job Manager ==================================== +API +=== + .. warning:: This is a new experimental API, subject to change. @@ -15,6 +18,105 @@ Multi Backend Job Manager .. autoclass:: openeo.extra.job_management.ParquetJobDatabase + .. autoclass:: openeo.extra.job_management.ProcessBasedJobCreator :members: :special-members: __call__ + + +.. _job-management-with-process-based-job-creator: + +Job creation based on parameterized processes +=============================================== + +The openEO API supports parameterized processes out of the box, +which allows working with flexible, reusable openEO building blocks +in the form of :ref:`user-defined processes ` +or `remote openEO process definitions `_. +This can also be leveraged for job creation in the context of the +:py:class:`~openeo.extra.job_management.MultiBackendJobManager`: +define a "template" job as a parameterized process +and let the job manager fill in the parameters +from a given dataframe. + +The :py:class:`~openeo.extra.job_management.ProcessBasedJobCreator` helper class +allows doing exactly that. +Given a reference to a parameterized process, +such as a user-defined process or remote process definition, +it can be used directly as ``start_job`` callable to +:py:meth:`~openeo.extra.job_management.MultiBackendJobManager.run_jobs` +which will fill in the process parameters from the dataframe. + +Basic :py:class:`~openeo.extra.job_management.ProcessBasedJobCreator` example +----------------------------------------------------------------------------- + +Basic usage example with a remote process definition: + +.. 
code-block:: python + :linenos: + :caption: Basic :py:class:`~openeo.extra.job_management.ProcessBasedJobCreator` example snippet + :emphasize-lines: 10-15, 28 + + from openeo.extra.job_management import ( + MultiBackendJobManager, + create_job_db, + ProcessBasedJobCreator, + ) + + # Job creator, based on a parameterized openEO process + # (specified by the remote process definition at given URL) + # which has parameters "start_date" and "bands" for example. + job_starter = ProcessBasedJobCreator( + namespace="https://example.com/my_process.json", + parameter_defaults={ + "bands": ["B02", "B03"], + }, + ) + + # Initialize job database from a dataframe, + # with desired parameter values to fill in. + df = pd.DataFrame({ + "start_date": ["2021-01-01", "2021-02-01", "2021-03-01"], + }) + job_db = create_job_db("jobs.csv").initialize_from_df(df) + + # Create and run job manager, + # which will start a job for each of the `start_date` values in the dataframe + # and use the default band list ["B02", "B03"] for the "bands" parameter. + job_manager = MultiBackendJobManager(...) + job_manager.run_jobs(job_db=job_db, start_job=job_starter) + +In this example, a :py:class:`ProcessBasedJobCreator` is instantiated +based on a remote process definition, +which has parameters ``start_date`` and ``bands``. +When passed to :py:meth:`~openeo.extra.job_management.MultiBackendJobManager.run_jobs`, +a job for each row in the dataframe will be created, +with parameter values based on matching columns in the dataframe: + +- the ``start_date`` parameter will be filled in + with the values from the "start_date" column of the dataframe, +- the ``bands`` parameter has no corresponding column in the dataframe, + and will get its value from the default specified in the ``parameter_defaults`` argument. 
+ + +:py:class:`~openeo.extra.job_management.ProcessBasedJobCreator` with geometry handling +--------------------------------------------------------------------------------------------- + +Apart from the intuitive name-based parameter-column linking, +:py:class:`~openeo.extra.job_management.ProcessBasedJobCreator` +also automatically links: + +- a process parameter that accepts inline GeoJSON geometries/features + (which practically means it has a schema like ``{"type": "object", "subtype": "geojson"}``, + as produced by :py:meth:`Parameter.geojson `). +- with the geometry column in a `GeoPandas `_ dataframe. + +even if the name of the parameter does not exactly match +the name of the GeoPandas geometry column (``geometry`` by default). +This automatic linking is only done if there is only one +GeoJSON parameter and one geometry column in the dataframe. + + +.. admonition:: to do + + Add example with geometry handling. diff --git a/docs/rst-cheatsheet.rst b/docs/rst-cheatsheet.rst index d1bd37360..c02e4a15d 100644 --- a/docs/rst-cheatsheet.rst +++ b/docs/rst-cheatsheet.rst @@ -50,6 +50,15 @@ More explicit code block with language hint (and no need for double colon) >>> 3 + 5 8 +Code block with additional features (line numbers, caption, highlighted lines, +for more see https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block) + +.. 
code-block:: python + :linenos: + :caption: how to say hello + :emphasize-lines: 1 + + print("hello world") References: @@ -60,4 +69,8 @@ References: - refer to the reference with:: - :ref:`target` + :ref:`target` or :ref:`custom text ` + +- inline URL references:: + + `Python `_ diff --git a/openeo/extra/job_management.py b/openeo/extra/job_management.py index 09dc5bdef..d7f370291 100644 --- a/openeo/extra/job_management.py +++ b/openeo/extra/job_management.py @@ -948,47 +948,13 @@ class ProcessBasedJobCreator: for each row of the dataframe managed by the :py:class:`MultiBackendJobManager` by filling in the process parameters with corresponding row values. - Usage example with a remote process definition: + .. seealso:: + See :ref:`job-management-with-process-based-job-creator` + for more information and examples. - .. code-block:: python - - from openeo.extra.job_management import ( - MultiBackendJobManager, - create_job_db, - ProcessBasedJobCreator, - ) - - # Job creator, based on a parameterized openEO process - # (specified by the remote process definition at given URL) - # which has, say, parameters "start_date" and "bands" for example. - job_starter = ProcessBasedJobCreator( - namespace="https://example.com/my_process.json", - parameter_defaults={ - # Default value for the "bands" parameter - # (to be used when not available in the dataframe) - "bands": ["B02", "B03"], - }, - ) - - # Initialize job database from a dataframe, - # with desired parameter values to fill in. - df = pd.DataFrame({ - "start_date": ["2021-01-01", "2021-02-01", "2021-03-01"], - ... - }) - job_db = create_job_db("jobs.csv").initialize_from_df(df) - - # Create and run job manager - job_manager = MultiBackendJobManager(...) - job_manager.run_jobs(job_db=job_db, start_job=job_starter) - - The factory will take care of filling in the process parameters - based on matching column names in the dataframe from the job database - (like "start_date" in the example above). 
- - This intuitive name-based matching should cover most use cases, - but for some more advanced use cases, there are additional options - to provide overrides and fallbacks: + Process parameters are linked to dataframe columns by name. + While this intuitive name-based matching should cover most use cases, + there are additional options for overrides or fallbacks: - When provided, ``parameter_column_map`` will be consulted for resolving a process parameter name (key in the dictionary) @@ -1010,6 +976,7 @@ class ProcessBasedJobCreator: - Finally if no (default) value can be determined and the parameter is not flagged as optional, an error will be raised. + :param process_id: (optional) openEO process identifier. Can be omitted when working with a remote process definition that is fully defined with a URL in the ``namespace`` parameter. @@ -1024,6 +991,12 @@ class ProcessBasedJobCreator: to dataframe column names as value. .. versionadded:: 0.33.0 + + .. warning:: + This is an experimental API subject to change, + and we greatly welcome + `feedback and suggestions for improvement `_. + """ def __init__( self,