Merge branch 'master' into Mypy-Algebra
yashgosa authored Oct 31, 2022
2 parents 561fecc + 6f0ff79 commit 797a888
Showing 92 changed files with 8,548 additions and 716 deletions.
4 changes: 2 additions & 2 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -8,11 +8,11 @@ if you have questions about contributing.

<!-- Please give a short brief about these changes. -->

- [ ] commit message follows format outlined [here](https://modin.readthedocs.io/en/latest/development/contributing.html#commit-message-formatting)
- [x] first commit message and PR title follow format outlined [here](https://modin.readthedocs.io/en/latest/development/contributing.html#commit-message-formatting)
> **_NOTE:_** If you edit the PR title to match this format, you need to add another commit (even if it's empty) or amend your last commit for the CI job that checks the PR title to pick up the new PR title.
- [ ] passes `flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py`
- [ ] passes `black --check modin/ asv_bench/benchmarks scripts/doc_checker.py`
- [ ] signed commit with `git commit -s` <!-- you can amend your commit with a signature via `git commit --amend -s` -->
- [ ] Resolves #? <!-- issue must be created for each patch -->
- [ ] tests added and passing
- [ ] module layout described at `docs/development/architecture.rst` is up-to-date <!-- if you have added, renamed or removed files or directories please update the documentation accordingly -->
- [ ] added (Issue Number: PR title (PR Number)) and github username to release notes for next major release <!-- e.g. DOCS-#4077: Add release notes template to docs folder (#4078) -->
22 changes: 6 additions & 16 deletions .github/workflows/ci-required.yml
@@ -1,21 +1,11 @@
name: ci-required
on: pull_request
jobs:
lint-commit:
name: lint (commit)
check-pr-title:
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- uses: actions/setup-node@v1
with:
node-version: "12.x"
- run: npm install --save-dev @commitlint/{config-conventional,cli} commitlint-plugin-jira-rules commitlint-config-jira
- name: Add dependencies for commitlint action
run: echo "NODE_PATH=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_ENV
- run: git remote add upstream https://github.com/modin-project/modin.git
- run: git fetch upstream
- run: npx commitlint --from upstream/master --to $(git log upstream/master..HEAD --pretty=format:"%h" | tail -1) --verbose
- uses: Slashgear/action-check-pr-title@v4.3.0
with:
# NOTE: If you change the allowed prefixes here, update
# the documentation about them in /docs/development/contributing.rst
regexp: '^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:'
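The same check can be run locally with a few lines of Python mirroring the regexp above (the helper name and the sample titles are illustrative, not part of the workflow):

import re

# Same pattern the check-pr-title job applies to the PR title.
PR_TITLE_RE = re.compile(r"^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:")

def title_ok(title: str) -> bool:
    return PR_TITLE_RE.match(title) is not None

assert title_ok("PERF-#4182: Speed up the TimeMerge benchmark")
assert not title_ok("Fix a bug")  # missing the PREFIX-#issue: part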
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -429,6 +429,15 @@ jobs:
- run: pytest modin/pandas/test/test_io.py::TestCsv --verbose
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/hdk
- run: python examples/docker/modin-hdk/census-hdk.py examples/data/census_1k.csv -no-ml
- run: python examples/docker/modin-hdk/nyc-taxi-hdk.py examples/data/nyc-taxi_1k.csv
- run: |
python examples/docker/modin-hdk/plasticc-hdk.py \
examples/data/plasticc_training_set_1k.csv \
examples/data/plasticc_test_set_1k.csv \
examples/data/plasticc_training_set_metadata_1k.csv \
examples/data/plasticc_test_set_metadata_1k.csv \
-no-ml
- uses: codecov/codecov-action@v2

test-asv-benchmarks:
118 changes: 117 additions & 1 deletion asv_bench/benchmarks/benchmarks.py
@@ -20,6 +20,7 @@
# measurements

import numpy as np
import pandas._testing as tm

from .utils import (
generate_dataframe,
@@ -122,12 +123,56 @@ def time_join(self, shapes, how, sort):
execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


class TimeJoinStringIndex:
param_names = ["shapes", "sort"]
params = [
get_benchmark_shapes("TimeJoinStringIndex"),
[True, False],
]

def setup(self, shapes, sort):
assert shapes[0] % 100 == 0, "implementation restriction"
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(shapes[0] // 100).values
codes1 = np.arange(10).repeat(shapes[0] // 100)
codes2 = np.tile(np.arange(shapes[0] // 100), 10)
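# codes pair each of the 10 outer labels with every inner label, so index2 has shapes[0] // 10 entries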
index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
self.df_multi = IMPL.DataFrame(
np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
)

self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
# just to keep source shape
self.df = self.df.drop(columns=self.df.columns[-2:])
self.df["key1"] = self.key1
self.df["key2"] = self.key2
execute(self.df)

self.df_key1 = IMPL.DataFrame(
np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
)
self.df_key2 = IMPL.DataFrame(
np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
)

def time_join_dataframe_index_multi(self, shapes, sort):
execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort))

def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
execute(self.df.join(self.df_key2, on="key2", sort=sort))

def time_join_dataframe_index_single_key_small(self, shapes, sort):
execute(self.df.join(self.df_key1, on="key1", sort=sort))


class TimeMerge:
param_names = ["shapes", "how", "sort"]
params = [
get_benchmark_shapes("TimeMerge"),
["left", "inner"],
[False],
[True, False],
]

def setup(self, shapes, how, sort):
@@ -142,6 +187,54 @@ def time_merge(self, shapes, how, sort):
)
)

def time_merge_default(self, shapes, how, sort):
execute(IMPL.merge(self.df1, self.df2, how=how, sort=sort))

def time_merge_dataframe_empty_right(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1, self.df2.iloc[:0], how=how, sort=sort))

def time_merge_dataframe_empty_left(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1.iloc[:0], self.df2, how=how, sort=sort))


class TimeMergeCategoricals:
param_names = ["shapes", "data_type"]
params = [
get_benchmark_shapes("MergeCategoricals"),
["object", "category"],
]

def setup(self, shapes, data_type):
assert len(shapes) == 2
assert shapes[1] == 2
size = (shapes[0],)
self.left = IMPL.DataFrame(
{
"X": np.random.choice(range(0, 10), size=size),
"Y": np.random.choice(["one", "two", "three"], size=size),
}
)

self.right = IMPL.DataFrame(
{
"X": np.random.choice(range(0, 10), size=size),
"Z": np.random.choice(["jjj", "kkk", "sss"], size=size),
}
)

if data_type == "category":
self.left = self.left.assign(Y=self.left["Y"].astype("category"))
execute(self.left)
self.right = self.right.assign(Z=self.right["Z"].astype("category"))
execute(self.right)

def time_merge_categoricals(self, shapes, data_type):
execute(IMPL.merge(self.left, self.right, on="X"))


class TimeConcat:
param_names = ["shapes", "how", "axis", "ignore_index"]
@@ -198,6 +291,26 @@ def time_binary_op(self, shapes, binary_op, axis):
execute(self.op(self.df2, axis=axis))


class TimeBinaryOpSeries:
param_names = ["shapes", "binary_op"]
params = [
get_benchmark_shapes("TimeBinaryOpSeries"),
["mul"],
]

def setup(self, shapes, binary_op):
df1 = generate_dataframe("int", *shapes[0], RAND_LOW, RAND_HIGH)
df2 = generate_dataframe("int", *shapes[1], RAND_LOW, RAND_HIGH)
self.series1 = df1[df1.columns[0]]
self.series2 = df2[df2.columns[0]]
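# the two frames may have different row counts (e.g. [[500_000, 1], [1_000_000, 1]]), so the op also exercises index alignment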
self.op = getattr(self.series1, binary_op)
execute(self.series1)
execute(self.series2)

def time_binary_op_series(self, shapes, binary_op):
execute(self.op(self.series2))


class BaseTimeSetItem:
param_names = ["shape", "item_length", "loc", "is_equal_indices"]

@@ -699,3 +812,6 @@ def time_columns(self, shape):

def time_index(self, shape):
return self.df.index


from .utils import setup # noqa: E402, F401
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/utils/__init__.py
@@ -32,6 +32,7 @@
random_booleans,
translator_groupby_ngroups,
trigger_import,
setup,
)

__all__ = [
@@ -54,4 +55,5 @@
"random_booleans",
"translator_groupby_ngroups",
"trigger_import",
"setup",
]
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/utils/common.py
@@ -585,3 +585,10 @@ def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
df.to_parquet(test_filenames[shape_id], index=False)

return test_filenames


def setup(*args, **kwargs): # noqa: GL08
# This function just needs to be imported into each benchmark file to
# set the random seed before each benchmark; ASV runs it automatically.
# https://asv.readthedocs.io/en/latest/writing_benchmarks.html
np.random.seed(42)
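A minimal sketch of the intended usage (module and class names here are hypothetical): ASV treats a module-level `setup` function as the default setup for every benchmark in that module, so importing it is enough.

# hypothetical_benchmarks.py -- illustration only, not part of this commit
import numpy as np

from .utils import setup  # noqa: F401  # ASV calls this before each benchmark

class TimeExample:
    def time_sum(self):
        # the seed was already fixed to 42 by `setup`, so runs are reproducible
        np.random.rand(1_000).sum()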
20 changes: 20 additions & 0 deletions asv_bench/benchmarks/utils/data_shapes.py
@@ -43,6 +43,13 @@
"big": [[100_000, 1]],
"small": [[10_000, 1]],
}
BINARY_OP_SERIES_DATA_SIZE = {
"big": [
[[500_000, 1], [1_000_000, 1]],
[[500_000, 1], [500_000, 1]],
],
"small": [[[5_000, 1], [10_000, 1]]],
}


HDK_BINARY_OP_DATA_SIZE = {
Expand Down Expand Up @@ -118,6 +125,13 @@
"TimeFillnaSeries",
],
),
(
BINARY_OP_SERIES_DATA_SIZE[ASV_DATASET_SIZE],
[
# Pandas storage format benchmarks
"TimeBinaryOpSeries",
],
),
]

_DEFAULT_HDK_CONFIG_T = [
@@ -149,6 +163,12 @@
),
]
DEFAULT_CONFIG = {}
DEFAULT_CONFIG["MergeCategoricals"] = (
[[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
)
DEFAULT_CONFIG["TimeJoinStringIndex"] = (
[[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]]
)
for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T):
for _shape, _names in config:
DEFAULT_CONFIG.update({_name: _shape for _name in _names})
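For illustration, with ASV_DATASET_SIZE == "small" the loop above plus the explicit entries leave DEFAULT_CONFIG with values such as (hypothetical session; values taken from the constants in this file):

>>> DEFAULT_CONFIG["MergeCategoricals"]
[[1000, 2]]
>>> DEFAULT_CONFIG["TimeJoinStringIndex"]
[[1000, 4]]
>>> DEFAULT_CONFIG["TimeBinaryOpSeries"]
[[[5000, 1], [10000, 1]]]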
Binary file removed c323f7fe385011ed849300155de07645.db
1 change: 1 addition & 0 deletions codecov.yml
@@ -0,0 +1 @@
comment: false
10 changes: 0 additions & 10 deletions commitlint.config.js

This file was deleted.

14 changes: 9 additions & 5 deletions docs/development/contributing.rst
@@ -86,8 +86,8 @@ after this with ``git push -f``.

Commit Message formatting
-------------------------
To ensure that all commit messages in the master branch follow a specific format, we
enforce that all commit messages must follow the following format:
We request that your first commit follow a particular format, and we
**require** that your PR title follow the format. The format is:

.. code-block:: bash
@@ -109,6 +109,11 @@ because it links commits to their issues.

The commit message should follow a colon (:) and be descriptive and succinct.

A Modin CI job on GitHub will enforce that your pull request title follows the
format we suggest. Note that if you update the PR title, you have to push
another commit (even if it's empty) or amend your last commit for the job to
pick up the new PR title. Re-running the job in GitHub Actions won't work.
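A sketch of that workaround (the issue number in the message is a placeholder):

.. code-block:: bash

   # either push an empty commit so CI re-reads the PR title...
   git commit --allow-empty -m "FIX-#0000: retrigger the title check"
   git push
   # ...or amend the last commit and force-push
   git commit --amend --no-edit
   git push -f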

General Rules for committers
----------------------------

@@ -192,9 +197,8 @@ To build the documentation, please follow the steps below from the project root:

.. code-block:: bash
cd docs
pip install -r requirements-doc.txt
sphinx-build -b html . build
pip install -r docs/requirements-doc.txt
sphinx-build -b html docs docs/build
To visualize the documentation locally, run the following from the `build` folder:
