diff --git a/docs/conf.py b/docs/conf.py index a087c28a4a5..b9f793df2b6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,7 +10,7 @@ import modin project = u"Modin" -copyright = u"2018, Modin" +copyright = u"2018-2020, Modin" author = u"Modin contributors" # The short X.Y version @@ -28,7 +28,17 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + 'sphinx.ext.todo', + 'sphinx.ext.mathjax', + 'sphinx.ext.githubpages', + 'sphinx.ext.graphviz', + 'sphinxcontrib.plantuml', + "sphinx_issues", +] + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -67,7 +77,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "nature" +html_theme = "sphinx_rtd_theme" html_logo = "img/MODIN_ver2.png" @@ -75,7 +85,7 @@ # further. For a list of options available for each theme, see the # documentation. # -html_theme_options = {"sidebarwidth": 270} +html_theme_options = {"sidebarwidth": 270, 'collapse_navigation': False} # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -88,3 +98,5 @@ html_sidebars = { "**": ["globaltoc.html", "relations.html", "sourcelink.html", "searchbox.html"] } + +issues_github_path = "modin-project/modin" \ No newline at end of file diff --git a/docs/developer/architecture.rst b/docs/developer/architecture.rst new file mode 100644 index 00000000000..934e9781053 --- /dev/null +++ b/docs/developer/architecture.rst @@ -0,0 +1,260 @@ +System Architecture +=================== + +In this section, we will lay out the overall system architecture for +Modin, as well as go into detail about the component design, implementation and +other important details. 
This document also contains important reference +information for those interested in contributing new functionality, bugfixes +and enhancements. + +High-Level Architectural View +----------------------------- +The diagram below outlines the general layered view of the components of Modin +with a short description of each major section of the documentation following. + + +.. image:: /img/modin_architecture.png +   :align: center + +Modin is logically separated into different layers that represent the hierarchy of a +typical Database Management System. Abstracting out each component allows us to +individually optimize and swap out components without affecting the rest of the system. +We can implement, for example, new compute kernels that are optimized for a certain type +of data and can simply plug it in to the existing infrastructure by implementing a small +interface. It can still be distributed by our choice of compute engine with the +logic internally. + +System View +--------------------------- +If we look at the overall class structure of the Modin system from the very top, it will +look something like this: + +.. image:: /img/10000_meter.png +   :align: center + +The user - Data Scientist - interacts with the Modin system by sending interactive or +batch commands through the API, and Modin executes them using various backend execution +engines: Ray, Dask and MPI are currently supported. + +Subsystem/Container View +------------------------ +If we drill down to the next level of detail, we will see that inside Modin the layered +architecture is implemented using several interacting components: + +.. image:: /img/component_view.png +   :align: center + +For simplicity, the other backend systems - Dask and MPI - are omitted and only the Ray backend is shown. + +* Dataframe subsystem is the backbone of the dataframe holding and query compilation.
It is responsible for + dispatching the ingress/egress to the appropriate module, getting the Pandas API and calling the query + compiler to convert calls to the internal intermediate Dataframe Algebra. +* Data Ingress/Egress Module works in conjunction with the Dataframe and Partitions subsystem to read data + split into partitions and send data into the appropriate node for storing. +* Query Planner is a subsystem that translates the Pandas API to intermediate Dataframe Algebra representation + DAG and performs an initial set of optimizations. +* Query Executor is responsible for getting the Dataframe Algebra DAG, performing further optimizations based + on a selected backend execution subsystem and mapping or compiling the Dataframe Algebra DAG to an actual + execution sequence. +* Backends module is responsible for mapping the abstract operation to an actual executor call, e.g. Pandas, + PyArrow, custom backend. +* Orchestration subsystem is responsible for spawning and controlling the actual execution environment for the + selected backend. It spawns the actual nodes, fires up the execution environment, e.g. Ray, monitors the state + of executors and provides telemetry. + +Component View +-------------- +Coming soon... + +Module/Class View +----------------- +Coming soon... + +DataFrame Partitioning +---------------------- + +The Modin DataFrame architecture follows in the footsteps of modern architectures for +database and high performance matrix systems. We chose a partitioning schema that +partitions along both columns and rows because it gives Modin flexibility and +scalability in both the number of columns and the number of rows supported. The +following figure illustrates this concept. + +.. image:: /img/block_partitions_diagram.png +   :align: center + +Currently, each partition's memory format is a `pandas DataFrame`_. In the future, we will +support additional in-memory formats for the backend, namely `Arrow tables`_.
+ +Index +""""" + +We currently use the ``pandas.Index`` object for both indexing columns and rows. In the +future, we will implement a distributed, pandas-compatible Index object in order to remove +this scaling limitation from the system. It does not start to become a problem until you +are operating on more than tens of billions of columns or rows, so most workloads will +not be affected by this scalability limit. **Important note**: If you are using the +default index (``pandas.RangeIndex``) there is a fixed memory overhead (~200 bytes) and +there will be no scalability issues with the index. + + +API +""" + +The API is the outer-most layer that faces users. The majority of our current effort is +spent implementing the components of the pandas API. We have implemented a toy example +for a sqlite API as a proof of concept, but this isn't ready for usage/testing. There +are also plans to expose the Modin DataFrame API as a reduced API set that encompasses +the entire pandas/dataframe API. + +Query Compiler +"""""""""""""" + +The Query Compiler receives queries from the pandas API layer. The API layer's +responsibility is to ensure clean input to the Query Compiler. The Query Compiler must +have knowledge of the compute kernels/in-memory format of the data in order to +efficiently compile the queries. + +The Query Compiler is responsible for sending the compiled query to the Modin DataFrame. +In this design, the Query Compiler does not have information about where or when the +query will be executed, and gives the control of the partition layout to the Modin +DataFrame. + +In the interest of reducing the pandas API, the Query Compiler layer closely follows the +pandas API, but cuts out a large majority of the repetition. + +Modin DataFrame +""""""""""""""" + +At this layer, operations can be performed lazily. Currently, Modin executes most +operations eagerly in an attempt to behave as pandas does. Some operations, e.g.
+``transpose`` are expensive and create full copies of the data in-memory. In these +cases, we can wait until another operation triggers computation. In the future, we plan +to add additional query planning and laziness to Modin to ensure that queries are +performed efficiently. + +The structure of the Modin DataFrame is extensible, such that any operation that could +be better optimized for a given backend can be overridden and optimized in that way. + +This layer has a significantly reduced API from the QueryCompiler and the user-facing +API. Each of these APIs represents a single way of performing a given operation or +behavior. Some of these are expanded for convenience/understanding. The API abstractions +are as follows: + +Modin DataFrame API +''''''''''''''''''' + +* ``mask``: Indexing/masking/selecting on the data (by label or by integer index). +* ``copy``: Create a copy of the data. +* ``mapreduce``: Reduce the dimension of the data. +* ``foldreduce``: Reduce the dimension of the data, but entire column/row information is needed. +* ``map``: Perform a map. +* ``fold``: Perform a fold. +* ``apply_``: Apply a function that may or may not change the shape of the data. + + * ``full_axis``: Apply a function requires knowledge of the entire axis. + * ``full_axis_select_indices``: Apply a function performed on a subset of the data that requires knowledge of the entire axis. + * ``select_indices``: Apply a function to a subset of the data. This is mainly used for indexing. + +* ``binary_op``: Perform a function between two dataframes. +* ``concat``: Append one or more dataframes to either axis of this dataframe. +* ``transpose``: Swap the axes (columns become rows, rows become columns). +* ``groupby``: + + * ``groupby_reduce``: Perform a reduction on each group. + * ``groupby_apply``: Apply a function to each group. + +* take functions + * ``head``: Take the first ``n`` rows. + * ``tail``: Take the last ``n`` rows. + * ``front``: Take the first ``n`` columns. 
+ * ``back``: Take the last ``n`` columns. + +* import/export functions + * ``from_pandas``: Convert a pandas dataframe to a Modin dataframe. + * ``to_pandas``: Convert a Modin dataframe to a pandas dataframe. + * ``to_numpy``: Convert a Modin dataframe to a numpy array. + +More documentation can be found internally in the code_. This API is not complete, but +represents an overwhelming majority of operations and behaviors. + +This API can be implemented by other distributed/parallel DataFrame libraries and +plugged in to Modin as well. Create an issue_ or discuss on our Discourse_ for more +information! + +The Modin DataFrame is responsible for the data layout and shuffling, partitioning, +and serializing the tasks that get sent to each partition. Other implementations of the +Modin DataFrame interface will have to handle these as well. + +Execution Engine/Framework +"""""""""""""""""""""""""" + +This layer is what Modin uses to perform computation on a partition of the data. The +Modin DataFrame is designed to work with `task parallel`_ frameworks, but with some +effort, a data parallel framework is possible. + +Internal abstractions +""""""""""""""""""""" + +These abstractions are not included in the above architecture, but are important to the +internals of Modin. + +Partition Manager +''''''''''''''''' + +The Partition Manager can change the size and shape of the partitions based on the type +of operation. For example, certain operations are complex and require access to an +entire column or row. The Partition Manager can convert the block partitions to row +partitions or column partitions. This gives Modin the flexibility to perform operations +that are difficult in row-only or column-only partitioning schemas. + +Another important component of the Partition Manager is the serialization and shipment +of compiled queries to the Partitions. 
It maintains metadata for the length and width of +each partition, so when operations only need to operate on or extract a subset of the +data, it can ship those queries directly to the correct partition. This is particularly +important for some operations in pandas which can accept different arguments and +operations for different columns, e.g. ``fillna`` with a dictionary. + +This abstraction separates the actual data movement and function application from the +DataFrame layer to keep the DataFrame API small and separately optimize the data +movement and metadata management. + +Partition +''''''''' + +Partitions are responsible for managing a subset of the DataFrame. As is mentioned +above, the DataFrame is partitioned both row and column-wise. This gives Modin +scalability in both directions and flexibility in data layout. There are a number of +optimizations in Modin that are implemented in the partitions. Partitions are specific +to the execution framework and in-memory format of the data. This allows Modin to +exploit potential optimizations across both of these. These optimizations are explained +further on the pages specific to the execution framework. + +Supported Execution Frameworks and Memory Formats +""""""""""""""""""""""""""""""""""""""""""""""""" + +This is the list of execution frameworks and memory formats supported in Modin. If you +would like to contribute a new execution framework or memory format, please see the +documentation page on Contributing_. + +- `Pandas on Ray`_ + - Uses the Ray_ execution framework. + - The compute kernel/in-memory format is a pandas DataFrame. +- `Pandas on Dask`_ + - Uses the `Dask Futures`_ execution framework. + - The compute kernel/in-memory format is a pandas DataFrame. +- `Pyarrow on Ray`_ (experimental) + - Uses the Ray_ execution framework. + - The compute kernel/in-memory format is a pyarrow Table. + +.. 
_pandas Dataframe: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html +.. _Arrow tables: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html +.. _Ray: https://github.com/ray-project/ray +.. _code: https://github.com/modin-project/modin/blob/master/modin/engines/base/frame/data.py +.. _Contributing: contributing.html +.. _Pandas on Ray: UsingPandasonRay/optimizations.html +.. _Pandas on Dask: UsingPandasonDask/optimizations.html +.. _Dask Futures: https://docs.dask.org/en/latest/futures.html +.. _issue: https://github.com/modin-project/modin/issues +.. _Discourse: https://discuss.modin.org +.. _task parallel: https://en.wikipedia.org/wiki/Task_parallelism +.. _Pyarrow on Ray: UsingPyarrowonRay/index.html diff --git a/docs/developer/contributing.rst b/docs/developer/contributing.rst new file mode 100644 index 00000000000..257e6ffb9d5 --- /dev/null +++ b/docs/developer/contributing.rst @@ -0,0 +1,164 @@ +Contributing +============ + +Getting Started +--------------- + +If you're interested in getting involved in the development of Modin, but aren't sure +where to start, take a look at the issues tagged `Good first issue`_ or Documentation_. +These are issues that would be good for getting familiar with the codebase and better +understanding some of the more complex components of the architecture. There is +documentation here about the architecture_ that you will want to review in order to get +started. + +Also, feel free to join the discussions on the `developer mailing list`_. + +Certificate of Origin +--------------------- + +To keep a clear track of who did what, we use a `sign-off` procedure (same requirements +for using the signed-off-by process as the Linux kernel has +https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html) on patches or pull +requests that are being sent.
The sign-off is a simple line at the end of the explanation +for the patch, which certifies that you wrote it or otherwise have the right to pass it +on as an open-source patch. The rules are pretty simple: if you can certify the below: + +CERTIFICATE OF ORIGIN V 1.1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +"By making a contribution to this project, I certify that: + +1.) The contribution was created in whole or in part by me and I have the right to +submit it under the open source license indicated in the file; or +2.) The contribution is based upon previous work that, to the best of my knowledge, is +covered under an appropriate open source license and I have the right under that license +to submit that work with modifications, whether created in whole or in part by me, under +the same open source license (unless I am permitted to submit under a different +license), as indicated in the file; or +3.) The contribution was provided directly to me by some other person who certified (1), +(2) or (3) and I have not modified it. +4.) I understand and agree that this project and the contribution are public and that a +record of the contribution (including all personal information I submit with it, +including my sign-off) is maintained indefinitely and may be redistributed consistent +with this project or the open source license(s) involved." + + +.. code-block:: bash + +   This is my commit message + +   Signed-off-by: Awesome Developer + + + +Code without a proper signoff cannot be merged into the +master branch. Note: You must use your real name (sorry, no pseudonyms or anonymous +contributions). + +The text can either be manually added to your commit body, or you can add either ``-s`` +or ``--signoff`` to your usual ``git commit`` commands: + + + +.. code-block:: bash + +   git commit --signoff +   git commit -s + +This will use your default git configuration which is found in .git/config. To change +this, you can use the following commands: + +..
code-block:: bash + +   git config --global user.name "Awesome Developer" +   git config --global user.email "awesome.developer@example.org" + +If you have authored a commit that is missing the signed-off-by line, you can amend your +commits and push them to GitHub. + +.. code-block:: bash + +   git commit --amend --signoff + +If you've pushed your changes to GitHub already you'll need to force push your branch +after this with ``git push -f``. + +Development Dependencies +------------------------ + +We recommend doing development in a virtualenv or conda environment, though this decision +is ultimately yours. You will want to run the following in order to install all of the required +dependencies for running the tests and formatting the code: + +.. code-block:: bash + +   pip install -r requirements.txt + +For development under Windows, dependencies can be found in the 'env_windows.yml' file. + +Code Formatting and Lint +------------------------ + +We use black_ for code formatting. Before you submit a pull request, please make sure +that you run the following from the project root: + +.. code-block:: bash + +   black modin/ + +We also use flake8_ to check linting errors. Running the following from the project root +will ensure that it passes the lint checks on Travis: + +.. code-block:: bash + +   flake8 . + +We test that this has been run on our `Travis CI`_ test suite. If you do this and find +that the tests are still failing, try updating your version of black and flake8. + +Adding a test +------------- + +If you find yourself fixing a bug or adding a new feature, don't forget to add a test to +the test suite to verify its correctness! More on testing and the layout of the tests +can be found in our testing_ documentation. We ask that you follow the existing +structure of the tests for ease of maintenance. + +Running the tests +----------------- + +To run the entire test suite, run the following from the project root: + +..
code-block:: bash + + pytest modin/pandas/test + +The test suite is very large, and may take a long time if you run every test. If you've +only modified a small amount of code, it may be sufficient to run a single test or some +subset of the test suite. In order to run a specific test run: + +.. code-block:: bash + + pytest modin/pandas/test::test_new_functionality + +The entire test suite is automatically run for each pull request. + +Contributing a new execution framework or in-memory format +---------------------------------------------------------- + +If you are interested in contributing support for a new execution framework or in-memory +format, please make sure you understand the architecture_ of Modin. + +The best place to start the discussion for adding a new execution framework or in-memory +format is the `developer mailing list`_. + +More docs on this coming soon... + +.. _Good first issue: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue+%3Abeginner%3A%22 +.. _Documentation: https://github.com/modin-project/modin/issues?q=is%3Aissue+is%3Aopen+label%3A%22documentation+%3Abookmark_tabs%3A%22 +.. _architecture: architecture.html +.. _internal methods: +.. _black: https://github.com/ambv/black +.. _flake8: http://flake8.pycqa.org/en/latest/ +.. _Travis CI: https://travis-ci.org/ +.. _testing: +.. 
_developer mailing list: https://groups.google.com/forum/#!forum/modin-dev diff --git a/docs/img/10000_meter.png b/docs/img/10000_meter.png new file mode 100644 index 00000000000..213e0c9cc17 Binary files /dev/null and b/docs/img/10000_meter.png differ diff --git a/docs/img/component_view.png b/docs/img/component_view.png new file mode 100644 index 00000000000..16181443de6 Binary files /dev/null and b/docs/img/component_view.png differ diff --git a/docs/img/query_planner.png b/docs/img/query_planner.png new file mode 100644 index 00000000000..1158320ac53 Binary files /dev/null and b/docs/img/query_planner.png differ diff --git a/docs/index.rst b/docs/index.rst index 78c63a3fff0..2f8e80822a5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -137,6 +137,12 @@ nature, you get a fast DataFrame at 1MB and 1TB+. supported_apis/utilities_supported supported_apis/io_supported +.. toctree:: + :caption: Developer Documentation + + developer/contributing + developer/architecture + .. toctree:: :caption: Engines, Backends, and APIs diff --git a/docs/supported_apis/index.rst b/docs/supported_apis/index.rst index 4e08ce6781c..055119a148c 100644 --- a/docs/supported_apis/index.rst +++ b/docs/supported_apis/index.rst @@ -20,7 +20,7 @@ The remaining unimplemented methods default to pandas. This allows users to cont using Modin even though their workloads contain functions not yet implemented in Modin. Here is a diagram of how we convert to pandas and perform the operation: -.. image:: img/convert_to_pandas.png +.. image:: /img/convert_to_pandas.png :align: center We first convert to a pandas DataFrame, then perform the operation. There is a