Merge branch 'dev' into solve-ignore-empty

samukweku authored Nov 28, 2022
2 parents 08fe78c + 68b8bb0 commit a29a463
Showing 30 changed files with 441 additions and 127 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.23.1
current_version = 0.24.0
commit = True
tag = True

4 changes: 3 additions & 1 deletion .devcontainer/devcontainer.json
@@ -33,7 +33,9 @@
"ms-python.python",
"ms-python.vscode-pylance",
"ms-vsliveshare.vsliveshare-pack",
"arcticicestudio.nord-visual-studio-code"
"arcticicestudio.nord-visual-studio-code",
"ms-vsliveshare.vsliveshare",
"ms-vsliveshare.vsliveshare-audio"
],
// Use 'forwardPorts' to make a list of ports inside the container available locally.
"forwardPorts": [
16 changes: 16 additions & 0 deletions .github/workflows/auto-update.yml
@@ -0,0 +1,16 @@
# This workflow automatically updates PR branches with latest changes on target branch.
# See: https://github.com/marketplace/actions/auto-update
name: autoupdate
on:
# This will trigger on all pushes to all branches.
push:
branches: [dev]
jobs:
autoupdate:
name: autoupdate
runs-on: ubuntu-20.04
steps:
- uses: docker://chinthakagodawita/autoupdate-action:v1
env:
GITHUB_TOKEN: "${{ secrets.GHPAGES_TOKEN }}"
PR_READY_STATE: "ready_for_review"
12 changes: 7 additions & 5 deletions .github/workflows/tests.yml
@@ -37,15 +37,17 @@ jobs:
auto-update-conda: true
miniforge-variant: Mambaforge
channels: conda-forge
activate-environment: pyjanitor-dev
environment-file: environment-dev.yml
use-mamba: true

- name: Install pyjanitor
run: python -m pip install -e .

- name: Run docstrings tests
run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml --doctest-only janitor

- name: Run unit tests
run: |
conda activate pyjanitor-dev
python -m pip install -e .
pytest -m "${{ matrix.test-subset }}"
run: pytest -v -r a -n auto --color=yes --durations=0 --cov=janitor --cov-append --cov-report term-missing --cov-report xml tests -m "${{ matrix.test-subset }}"

# https://github.com/codecov/codecov-action
- name: Upload code coverage
6 changes: 4 additions & 2 deletions AUTHORS.md
@@ -108,6 +108,8 @@ Contributors
- [@gahjelle](https://github.com/gahjelle) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Agahjelle)
- [@ethompsy](https://github.com/ethompsy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aethompsy)
- [@apatao](https://github.com/apatao) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aapatao)
- [@OdinTech3](https://github.com/OdinTech3) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1094)
- [@OdinTech3](https://github.com/OdinTech3) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%OdinTech3)
- [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%asmirnov69)
- [@xujiboy](https://github.com/xujiboy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%xujiboy)
- [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley)
- [@Fu-Jie](https://github.com/Fu-Jie) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pulls?q=is%3Aclosed+mentions%3AFu-Jie)
- [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues/1059)
14 changes: 12 additions & 2 deletions CHANGELOG.md
@@ -2,6 +2,12 @@

## [Unreleased]

- [INF] Replace `pytest.ini` file with `pyproject.toml` file. PR #1204 @Zeroto521
- [INF] Extract docstrings tests from all tests. PR #1205 @Zeroto521
- [BUG] address the `TypeError` when importing v0.24.0 (issue #1201 @xujiboy and @joranbeasley)

## [v0.24.0] - 2022-11-12

- [ENH] Add lazy imports to speed up the time taken to load pyjanitor (part 2)
- [DOC] Updated developer guide docs.
- [ENH] Allow column selection/renaming within conditional_join. Issue #1102. Also allow first or last match. Issue #1020 @samukweku.
@@ -29,10 +35,12 @@
- [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
- [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
- [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [TST] Compat with macos and window, to fix `FailedHealthCheck` Issue #1181 @Zeroto521
- [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
- [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521
- [TST] Fix failure for test/timeseries/test_fill_missing_timestamp. Issue #1184 @samukweku
- [BUG] Import `DataDescription` to fix: `AttributeError: 'DataFrame' object has no attribute 'data_description'`. PR #1191 @Zeroto521

## [v0.23.1] - 2022-05-03

@@ -320,7 +328,9 @@ We thank all contributors
who have helped make `pyjanitor`
the package that it is today.

[Unreleased]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.23.1...HEAD
[Unreleased]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.24.0...HEAD

[v0.24.0]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.23.1...v0.24.0

[v0.23.1]: https://github.com/pyjanitor-devs/pyjanitor/compare/v0.22.0...v0.23.1

1 change: 1 addition & 0 deletions environment-dev.yml
@@ -44,6 +44,7 @@ dependencies:
- pytest
- pytest-cov
- pytest-xdist
- pytest-doctestplus
- python-language-server
- rdkit=2021.09.3
- recommonmark
2 changes: 1 addition & 1 deletion janitor/__init__.py
@@ -29,4 +29,4 @@ def get_features_targets(*args, **kwargs):
return _get_features_targets(*args, **kwargs)


__version__ = "0.23.1"
__version__ = "0.24.0"
18 changes: 2 additions & 16 deletions janitor/accessors/__init__.py
@@ -1,17 +1,3 @@
"""Miscellaneous mathematical operators.
"""Miscellaneous mathematical operators."""

Lazy loading used here to speed up imports.
"""

import warnings
from typing import Tuple


import lazy_loader as lazy

scipy_special = lazy.load("scipy.special")
ss = lazy.load("scipy.stats")
pf = lazy.load("pandas_flavor")
pd = lazy.load("pandas")
np = lazy.load("numpy")
pdtypes = lazy.load("pandas.api.types")
from janitor.accessors.data_description import DataDescription # noqa: F401
2 changes: 1 addition & 1 deletion janitor/functions/__init__.py
@@ -75,4 +75,4 @@
from .transform_columns import transform_column, transform_columns
from .truncate_datetime import truncate_datetime_dataframe
from .update_where import update_where
from .utils import patterns, unionize_dataframe_categories
from .utils import patterns, unionize_dataframe_categories, DropLabel
7 changes: 5 additions & 2 deletions janitor/functions/_numba.py
@@ -163,7 +163,7 @@ def _numba_pair_le_lt(df: pd.DataFrame, right: pd.DataFrame, pair: list):
# 6 has no match in pair2 of value_2A/2B, so we discard
# our final matching indices for the left and right pairs
#########################################################
# left_index right_indes
# left_index right_index
# 0 7
# 4 5
# 5 1
@@ -261,6 +261,9 @@ def _realign(indices, regions):
# this function ensures the regions are properly aligned
arr1, arr2 = indices
region1, region2 = regions
# arr2 is used as the reference point
# because we are certain that at the very least
# it has the same items as arr1, but not more
indexer = pd.Index(arr2).get_indexer(arr1)
mask = indexer == -1
if mask.any():
@@ -724,7 +727,7 @@ def _get_regions(
# are present ---> l1 < r1 & l2 > r2
# For two non equi conditions, the matches are where
# the regions from group A (l1 < r1)
# are also lower than the regions from group B (l2 > r2)
# are also lower than the regions from group B (l2 < r2)
# This implementation is based on the algorithm outlined here:
# https://www.scitepress.org/papers/2018/68268/68268.pdf
indices = _search_indices(left_c, right_c, strict, op_code)
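
The `_realign` hunk above leans on `pandas.Index.get_indexer` to line up the two index arrays, marking entries of `arr1` that have no counterpart in `arr2` with `-1`. A minimal standalone illustration of that behaviour (toy data, not part of this commit):

```python
import numpy as np
import pandas as pd

arr1 = np.array([2, 5, 7, 9])
arr2 = np.array([9, 2, 5])  # reference array; 7 has no counterpart here

# for each value in arr1, the position of that value in arr2, or -1 if absent
indexer = pd.Index(arr2).get_indexer(arr1)
print(indexer)        # [ 1  2 -1  0]

mask = indexer == -1  # True where arr1 has no match in arr2
print(arr1[~mask])    # [2 5 9] -> only the aligned entries survive
```
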
5 changes: 5 additions & 0 deletions janitor/functions/case_when.py
@@ -90,6 +90,11 @@ def case_when(
else:
default
```
!!! abstract "Version Changed"
- 0.24.0
- Added `default` parameter.
:param df: A pandas DataFrame.
:param args: Variable argument of conditions and expected values.
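
A quick, hedged sketch of the `default` argument documented above; the alternating condition/value call style follows the existing docstring, while the data and column names here are invented:

```python
import pandas as pd
import janitor  # noqa: F401  (registers the DataFrame method)

df = pd.DataFrame({"a": [0, 1, 2, 3]})

out = df.case_when(
    df.a == 0, "zero",   # condition, value
    df.a == 1, "one",    # condition, value
    default="other",     # used wherever no condition matched (new in 0.24.0)
    column_name="label",
)
```
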
63 changes: 40 additions & 23 deletions janitor/functions/conditional_join.py
@@ -1,7 +1,7 @@
from __future__ import annotations
import operator
from enum import Enum
from typing import Union, Any, Optional, Hashable, Literal

import numpy as np
import pandas as pd
import pandas_flavor as pf
@@ -115,6 +115,12 @@ def conditional_join(
3 4 3 5
4 4 3 6
!!! abstract "Version Changed"
- 0.24.0
- Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
:param df: A pandas DataFrame.
:param right: Named Series or DataFrame to join to.
@@ -145,7 +151,7 @@
:param use_numba: Use numba, if installed, to accelerate the computation.
Applicable only to strictly non-equi joins. Default is `False`.
:returns: A pandas DataFrame of the two merged Pandas objects.
"""
""" # noqa: E501

return _conditional_join_compute(
df,
@@ -348,9 +354,10 @@ def _conditional_join_type_check(
f"'{right_column.name}' has {right_column.dtype} type."
)

if (op in less_than_join_types.union(greater_than_join_types)) & (
(is_string_dtype(left_column) | is_categorical_dtype(left_column))
):
number_or_date = is_numeric_dtype(left_column) or is_datetime64_dtype(
left_column
)
if (op != _JoinOperator.STRICTLY_EQUAL.value) & (not number_or_date):
raise ValueError(
"non-equi joins are supported "
"only for datetime and numeric dtypes. "
@@ -484,12 +491,12 @@ def _less_than_indices(
if left.min() > right.max():
return None

any_nulls = pd.isna(left)
any_nulls = left.isna()
if any_nulls.all():
return None
if any_nulls.any():
left = left[~any_nulls]
any_nulls = pd.isna(right)
any_nulls = right.isna()
if any_nulls.all():
return None
if any_nulls.any():
@@ -591,12 +598,12 @@ def _greater_than_indices(
if left.max() < right.min():
return None

any_nulls = pd.isna(left)
any_nulls = left.isna()
if any_nulls.all():
return None
if any_nulls.any():
left = left[~any_nulls]
any_nulls = pd.isna(right)
any_nulls = right.isna()
if any_nulls.all():
return None
if any_nulls.any():
@@ -1123,10 +1130,10 @@ def _range_indices(
# get rid of any nulls
# this is helpful as we can convert extension arrays to numpy arrays safely
# and simplify the search logic below
any_nulls = pd.isna(df[left_on])
any_nulls = df[left_on].isna()
if any_nulls.any():
left_c = left_c[~any_nulls]
any_nulls = pd.isna(right[right_on])
any_nulls = right[right_on].isna()
if any_nulls.any():
right_c = right_c[~any_nulls]

@@ -1154,16 +1161,26 @@
right_c = right_c._values
left_c, right_c = _convert_to_numpy_array(left_c, right_c)
op = operator_map[op]
pos = np.empty(left_c.size, dtype=np.intp)

# better served in a compiled environment
# where we can break early
# parallelise the operation, as well as
# avoid the restrictive fixed size approach of numpy
# which isnt particularly helpful in a for loop
for ind in range(left_c.size):
out = op(left_c[ind], right_c)
pos[ind] = np.argmax(out)
pos = np.copy(search_indices)
counter = np.arange(left_c.size)

# better than np.outer memory wise?
# using this for loop instead of np.outer
# allows us to break early and reduce the
# number of cartesian checks
# since as we iterate, we reduce the size of left_c
# speed wise, np.outer will be faster
# alternatively, the user can just use the numba option
# for more performance
for ind in range(right_c.size):
if not counter.size:
break
keep_rows = op(left_c, right_c[ind])
if not keep_rows.any():
continue
pos[counter[keep_rows]] = ind
counter = counter[~keep_rows]
left_c = left_c[~keep_rows]

# no point searching within (a, b)
# if a == b
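
The comments in the hunk above argue for a plain loop over `right_c` (rather than `np.outer`) so the scan can stop as soon as every left row has found a match. A self-contained toy version of that early-exit pattern, assuming the right-hand values are sorted ascending (illustration only, not part of this diff):

```python
import operator
import numpy as np

left = np.array([3, 8, 1, 6])
right = np.array([2, 5, 7, 9])           # assumed sorted ascending
op = operator.lt                         # e.g. a "<" join condition

pos = np.full(left.size, right.size, dtype=np.intp)  # right.size means "no match"
counter = np.arange(left.size)           # rows of `left` still unmatched

for ind in range(right.size):
    if not counter.size:                 # every left row already matched -> stop early
        break
    keep_rows = op(left, right[ind])
    if not keep_rows.any():
        continue
    pos[counter[keep_rows]] = ind        # first right position satisfying op
    counter = counter[~keep_rows]        # shrink the remaining search set
    left = left[~keep_rows]

print(pos)  # [1 3 0 2] -> index of the first right value each left value is "<" than
```
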
@@ -1255,10 +1272,10 @@ def _create_frame(
"""
Create final dataframe
"""
if df_columns:
if df_columns is not None:
df = _cond_join_select_columns(df_columns, df)

if right_columns:
if right_columns is not None:
right = _cond_join_select_columns(right_columns, right)

if set(df.columns).intersection(right.columns):
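
A hedged usage sketch of the parameters the docstring note above introduces (`df_columns`, `right_columns`, `keep`, `use_numba`); the tuple form of the join conditions follows the existing docstring, while the data and column names are invented:

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame({"value_1": [2, 5, 7]})
right = pd.DataFrame({"lower": [1, 4], "upper": [6, 9]})

out = df.conditional_join(
    right,
    ("value_1", "lower", ">"),                     # df.value_1 > right.lower
    ("value_1", "upper", "<"),                     # df.value_1 < right.upper
    how="inner",
    df_columns="value_1",                          # select columns from df
    right_columns={"lower": "lo", "upper": "hi"},  # select and rename from right
    keep="first",                                  # keep only the first match per row
    use_numba=False,                               # optional numba acceleration
)
```
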
16 changes: 15 additions & 1 deletion janitor/functions/pivot.py
@@ -220,6 +220,13 @@ def pivot_longer(
7 Austin Texas Watermelon 99 None NaN
8 Hoover Alabama Watermelon 43 None NaN
!!! abstract "Version Changed"
- 0.24.0
- Added `dropna` parameter.
:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
Should be either a single column name, or a list/tuple of
@@ -1259,6 +1266,13 @@ def pivot_wider(
0 5.5 20 25 30 37
1 6.1 22 18 19 29
!!! abstract "Version Changed"
- 0.24.0
- Added `reset_index`, `names_expand` and `index_expand` parameters.
:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
It should be either a single column name, or a list of column names.
@@ -1293,7 +1307,7 @@
Applies only if `index` is a categorical column. Default is `False`.
:returns: A pandas DataFrame that has been unpivoted from long to wide
form.
"""
""" # noqa: E501

df = df.copy()

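
A short, hedged illustration of the `dropna` flag noted above for `pivot_longer` (data and column names invented; the rest of the call follows the existing docstring; the exact null-handling semantics are assumed from the changelog entry):

```python
import pandas as pd
import janitor  # noqa: F401

df = pd.DataFrame(
    {"City": ["Houston", "Austin"], "Mango": [4, None], "Banana": [10, 8]}
)

out = df.pivot_longer(
    index="City",
    names_to="fruit",
    values_to="count",
    dropna=True,   # new in 0.24.0: drop rows whose melted value is NaN
)
```
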