diff --git a/.github/test_conda_env.yml b/.github/test_conda_env.yml
index 651ab90..d3455b3 100644
--- a/.github/test_conda_env.yml
+++ b/.github/test_conda_env.yml
@@ -1,4 +1,4 @@
-name: test
+name: dbscan1d
 channels:
   - conda-forge
   - defaults
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..01807ae
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,30 @@
+# Lint the code using the defined pre-commit hooks
+name: LintCode
+on: [push]
+
+jobs:
+  lint_code:
+    runs-on: ubuntu-latest
+
+    # only run if CI isn't turned off
+    if: github.event_name == 'push' || !contains(github.event.pull_request.labels.*.name, 'no_ci')
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: "get tags"
+        run: |
+          git fetch --tags --force # Retrieve annotated tags.
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: install linting packages
+        run: uv tool install pre-commit
+
+      - name: run all precommits
+        run: uv run pre-commit run --all-files
diff --git a/.github/workflows/on_master_commits.yml b/.github/workflows/on_master_commits.yml
deleted file mode 100644
index daab2bc..0000000
--- a/.github/workflows/on_master_commits.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-name: calculate coverage
-on:
-  push:
-    branches:
-      - master
-
-jobs:
-  # Calculates new coverage for the base branch
-  calc_coverage:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v1
-
-      - name: Setup conda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniconda-version: 'latest'
-          python-version: "3.10"
-          activate-environment: test
-          environment-file: .github/test_conda_env.yml
-          condarc-file: .github/test_condarc.yml
-
-      - name: install
-        shell: bash -l {0}
-        run: |
-          pip install -e .
-
-      - name: run test suite
-        shell: bash -l {0}
-        run: |
-          pytest -s --cov dbscan1d --cov-report=xml
-
-      - name: upload coverage
-        uses: codecov/codecov-action@v1
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          file: ./coverage.xml
-          flags: unittests
-          name: codecov-umbrella
-          fail_ci_if_error: true
diff --git a/.github/workflows/release_published.yml b/.github/workflows/release_published.yml
index c9d4bf6..1f760f9 100644
--- a/.github/workflows/release_published.yml
+++ b/.github/workflows/release_published.yml
@@ -9,27 +9,25 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v4

-      - name: Setup conda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniconda-version: 'latest'
-          python-version: "3.11"
-          activate-environment: test
-          environment-file: .github/test_conda_env.yml
-          condarc-file: .github/test_condarc.yml
-
-      - name: install
-        shell: bash -l {0}
+      - name: "get tags"
         run: |
-          pip install -e .[dev]
+          git fetch --tags --force # Retrieve annotated tags.
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: Set up Python
+        run: uv python install 3.12
+
+      - name: Install the project
+        run: uv sync --all-extras --dev

       - name: create dists
         shell: bash -l {0}
         run: |
-          python -m pip install build
-          python -m build
+          uv build

       - name: publish package
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
index 749288a..a8a3d51 100644
--- a/.github/workflows/runtests.yml
+++ b/.github/workflows/runtests.yml
@@ -1,58 +1,75 @@
-name: validate
-on: [push]
+# Run the full test suite with uv and all optional deps.
+name: TestCode
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+    paths:
+      - 'pyproject.toml'
+      - '**.py'
+      - '.github/workflows/*.yml'

-jobs:
-  # Simply applies flake8 to code using pre-commit
-  lint_code:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v1
-
-      - name: Setup conda
-        uses: s-weigand/setup-conda@v1
-        with:
-          python-version: "3.10"
-      - name: install linting packages
-        run: pip install pre-commit
-
-      - name: run all precommits
-        run: pre-commit run --all
+# Cancel previous runs when this one starts.
+concurrency:
+  group: TestCode-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true

+jobs:
   # Runs the tests on combinations of the supported python/os matrix.
   test_code:
+
+    timeout-minutes: 25
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
+
+    # only run if CI isn't turned off
+    if: github.event_name == 'push' || !contains(github.event.pull_request.labels.*.name, 'no_ci')

     steps:
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v4

-      - name: Setup conda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          miniconda-version: 'latest'
-          python-version: ${{ matrix.python-version }}
-          activate-environment: test
-          environment-file: .github/test_conda_env.yml
-          condarc-file: .github/test_condarc.yml
-
-      - name: install
-        shell: bash -l {0}
+      - name: "get tags"
         run: |
-          pip install -e .
+          git fetch --tags --force # Retrieve annotated tags.
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}

+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      # Print out the package info for current environment
       - name: print package info
-        shell: bash -l {0}
+        shell: bash -el {0}
         run: |
-          conda info -a
-          conda list
+          uv pip list

-      # Runs test suite and calculates coverage
-      - name: run test suite
-        shell: bash -l {0}
-        run: |
-          pytest tests
+      - name: Run tests
+        # Run pytest with coverage enabled.
+        run: uv run pytest -s --cov dbscan1d --cov-append --cov-report=xml
+
+      # Upload coverage files
+      - uses: codecov/codecov-action@v4
+        with:
+          fail_ci_if_error: false
+          files: ./coverage.xml
+          flags: unittests
+          name: PR_tests
+          token: ${{ secrets.CODECOV_TOKEN }}
+
+
+# This is a very useful step for debugging, it allows you to ssh into the CI
+# machine (https://github.com/marketplace/actions/debugging-with-tmate).
+#
+#- name: Setup tmate session
+#  uses: mxschmitt/action-tmate@v3
diff --git a/.gitignore b/.gitignore
index 2a4c87a..3385cf2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,3 +88,6 @@ docs/quickref/stubs

 # mypy
 .mypy_cache
+
+# uv
+uv.lock
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 01a7efb..a12d5c6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,38 +1,27 @@
+exclude: scripts/
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v2.3.0
   hooks:
   - id: check-yaml
   - id: end-of-file-fixer
-  - id: trailing-whitespace
+  - id: check-merge-conflict
   - id: mixed-line-ending
     args: ['--fix=lf']
-- repo: https://github.com/psf/black
-  rev: 22.6.0
+
+  # Ruff is a replacement for flake8 and many other linters (much faster too)
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.6.1
   hooks:
-  - id: black
-- repo: https://github.com/PyCQA/flake8
-  rev: 3.8.3
+  - id: ruff
+    args: ["--fix"]
+  # Run the formatter.
+  - id: ruff-format
+
+  # ensures __future__ import annotations at top of files which require it
+  # for the typing features they are using.
+- repo: https://github.com/frostming/fix-future-annotations
+  rev: 0.5.0
   hooks:
-  - id: flake8
-    additional_dependencies:
-      - flake8-black
-      - flake8-breakpoint
-      - flake8-docstrings
-- repo: https://github.com/pycqa/isort
-  rev: 5.12.0
-  hooks:
-  - id: isort
-    name: isort (python)
-    args: ["--profile", "black"]
-  - id: isort
-    name: isort (cython)
-    types: [cython]
-  - id: isort
-    name: isort (pyi)
-    types: [pyi]
-- repo: https://github.com/kynan/nbstripout
-  rev: 0.3.9
-  hooks:
-  - id: nbstripout
-    files: ".ipynb"
+  - id: fix-future-annotations
diff --git a/README.md b/README.md
index 0cab4cd..f6067bc 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ It only requires numpy.
 ## Quickstart

 dbscan1d is designed to be interchangeable with sklearn's implementation in almost
-all cases. The exception is that the `weights` parameter is not yet supported.
+all cases.

 ```python
 from sklearn.datasets import make_blobs
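The README hunk above drops the caveat about `weights` because this PR adds `sample_weight` support. As a reviewer aid (not part of the diff), here is a minimal sketch of the interchangeability claim; the parameter values are illustrative, and the equivalence tests later in this diff are what actually pin the behavior down:

```python
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

from dbscan1d import DBSCAN1D

# make_blobs returns 2D data; dbscan1d operates on flat 1D arrays.
X, _ = make_blobs(100, centers=2, n_features=1, random_state=13)

labels_1d = DBSCAN1D(eps=0.5, min_samples=4).fit_predict(X.flatten())
labels_sk = DBSCAN(eps=0.5, min_samples=4).fit_predict(X)  # sklearn wants 2D

# The TestSKleanEquivalent suite below asserts this kind of agreement.
assert np.array_equal(labels_1d, labels_sk)
```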
diff --git a/pyproject.toml b/pyproject.toml
index b8af7b2..2b0f785 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ authors = [
 description = "An efficient implementation of the DBSCAN algorithm for 1D arrays."
 readme = "README.md"
 license = { file="LICENSE" }
-requires-python = ">=3.8"
+requires-python = ">=3.10"  # matches the CI matrix and ruff target-version below
 classifiers = [
     "Development Status :: 4 - Beta",
     "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
@@ -46,14 +46,14 @@ keywords = ["geophysics", "distributed-acoustic-sensing"]

 # --- Dependencies

 dependencies = [
-    "numpy >= 1.13.0",
+    "numpy >= 1.25.0",
 ]

 [project.optional-dependencies]
 test = [
     "pytest",
-    "pre-commit",
+    "pytest-cov",
     "scikit-learn",
 ]
 dev = ["dbscan1d[test]"]
@@ -64,3 +64,76 @@ dev = ["dbscan1d[test]"]
 "Bug Tracker" = "https://github.com/d-chambers/dbscan1d/issues"
 "Documentation" = "https://github.com/d-chambers/dbscan1d"
 "Homepage" = "https://github.com/d-chambers/dbscan1d"
+
+# --- formatting
+
+[tool.ruff]
+
+line-length = 88
+
+# enable certain types of linting
+lint.select = [
+    "E",
+    "F",
+    "UP",
+    "RUF",
+    "I001",
+    "D",
+    "FA",
+    "T",
+    "N",
+    "NPY",
+    "NPY201",
+]
+
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+    "__init__.py"
+]
+
+# lowest python version supported
+target-version = "py310"
+
+lint.fixable = ["ALL"]
+
+# List of codes to ignore
+lint.ignore = ["D105", "D107", "D401", "D205", "D200", "D400", "N803", "N806"]
+
+[tool.ruff.lint.mccabe]
+# Unlike Flake8, default to a complexity level of 10.
+max-complexity = 10
+
+# config for docstring parsing
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    # Ignore hdf5 warnings from pytables, See pytables #1035
+    'ignore::Warning:tables:'
+]
+
+[tool.ruff.format]
+# Use `\n` line endings for all files
+line-ending = "lf"
diff --git a/scripts/profile_dbscan1d.ipynb b/scripts/profile_dbscan1d.ipynb
index ba04bd2..97d614d 100644
--- a/scripts/profile_dbscan1d.ipynb
+++ b/scripts/profile_dbscan1d.ipynb
@@ -18,13 +18,20 @@
     "\n",
     "import matplotlib.pyplot as plt\n",
     "import pandas as pd\n",
-    "\n",
-    "from dbscan1d import DBSCAN1D\n",
     "from sklearn.cluster import DBSCAN\n",
     "from sklearn.datasets import make_blobs\n",
     "\n",
+    "from dbscan1d import DBSCAN1D\n",
     "\n",
-    "n_points = [10, 100, 1_000, 10_000, 20_000, 30_000, 40_000,]\n",
+    "n_points = [\n",
+    "    10,\n",
+    "    100,\n",
+    "    1_000,\n",
+    "    10_000,\n",
+    "    20_000,\n",
+    "    30_000,\n",
+    "    40_000,\n",
+    "]\n",
     "centers = 2"
    ]
   },
@@ -46,8 +53,8 @@
    "outputs": [],
    "source": [
     "# Profile\n",
-    "db1 = DBSCAN1D(.5, 4)\n",
-    "db2 = DBSCAN(.5, 4)"
+    "db1 = DBSCAN1D(0.5, 4)\n",
+    "db2 = DBSCAN(0.5, 4)"
    ]
   },
@@ -57,16 +64,16 @@
    "outputs": [],
    "source": [
     "# profile each stream type with each function\n",
-    "df = pd.DataFrame(columns=['dbscan', 'dbscan1d'], index=n_points)\n",
+    "df = pd.DataFrame(columns=[\"dbscan\", \"dbscan1d\"], index=n_points)\n",
     "for n_point in n_points:\n",
-    "    print(f'on {n_point}')\n",
+    "    print(f\"on {n_point}\")\n",
     "    X = create_blobs(n_point, centers)\n",
-    "    print('starting dbscan1d')\n",
+    "    print(\"starting dbscan1d\")\n",
     "    ti1 = %timeit -o db1.fit_predict(X)\n",
-    "    df.loc[n_point, 'dbscan1d'] = ti1.best\n",
-    "    print('starting dbscan')\n",
+    "    df.loc[n_point, \"dbscan1d\"] = ti1.best\n",
+    "    print(\"starting dbscan\")\n",
     "    ti2 = %timeit -o db2.fit_predict(X)\n",
-    "    df.loc[n_point, 'dbscan'] = ti2.best\n",
+    "    df.loc[n_point, \"dbscan\"] = ti2.best\n",
     "    print()\n",
     "    print()"
    ]
@@ -89,20 +96,20 @@
    },
    "outputs": [],
    "source": [
-    "out_path = Path(__file__).parent / 'profile_results.png'\n",
+    "out_path = Path.cwd() / \"profile_results.png\"\n",
     "\n",
     "x = df.index.values\n",
-    "plt.loglog(x, df['dbscan'].values, label='dbscan', color='r')\n",
-    "plt.loglog(x, df['dbscan1d'].values, label='dbscan1d', color='b')\n",
+    "plt.loglog(x, df[\"dbscan\"].values, label=\"dbscan\", color=\"r\")\n",
+    "plt.loglog(x, df[\"dbscan1d\"].values, label=\"dbscan1d\", color=\"b\")\n",
     "\n",
-    "plt.xlabel('number of points')\n",
-    "plt.ylabel('run time (s)')\n",
+    "plt.xlabel(\"number of points\")\n",
+    "plt.ylabel(\"run time (s)\")\n",
     "\n",
     "plt.legend()\n",
     "\n",
     "plt.savefig(out_path)\n",
     "\n",
-    "plt.show()\n"
+    "plt.show()"
    ]
   }
  ],
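Note: the notebook hunk above also replaces `Path(__file__)` with `Path.cwd()`, since `__file__` is undefined inside a notebook kernel and would raise a `NameError`.

Before the core.py hunk, a note on the annotation change it contains: the PEP 604 unions below are only cheap and portable together with `from __future__ import annotations`, which the fix-future-annotations hook added earlier in this diff enforces. A minimal sketch using a hypothetical class, not taken from the package:

```python
from __future__ import annotations

import numpy as np


class FittedState:
    """Hypothetical class mirroring the annotation style used in DBSCAN1D."""

    # With the future import, annotations are stored as strings and never
    # evaluated at runtime, so `np.ndarray | None` is safe even where the
    # `X | Y` union syntax is unavailable at runtime.
    labels_: np.ndarray | None = None


# The annotation survives as the literal source text, not an evaluated object.
assert FittedState.__annotations__["labels_"] == "np.ndarray | None"
```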
""" -from typing import Optional + +from __future__ import annotations import numpy as np @@ -17,9 +18,9 @@ class DBSCAN1D: """ # params that change upon fit/training - core_sample_indices_: Optional[np.ndarray] = None - components_: Optional[np.ndarray] = None - labels_: Optional[np.ndarray] = None + core_sample_indices_: np.ndarray | None = None + components_: np.ndarray | None = None + labels_: np.ndarray | None = None def __init__(self, eps: float = 0.5, min_samples: int = 5, metric="euclidean"): self.eps = eps diff --git a/src/dbscan1d/version.py b/src/dbscan1d/version.py index 9826595..b1023cd 100644 --- a/src/dbscan1d/version.py +++ b/src/dbscan1d/version.py @@ -1,8 +1,9 @@ """Module for reporting the version of dbscan1d.""" + from importlib.metadata import PackageNotFoundError, version try: __version__ = version("dbscan1d") # package is not installed -except PackageNotFoundError: # NOQA - __version__ = "0.0.0" # NOQA +except PackageNotFoundError: + __version__ = "0.0.0" diff --git a/tests/test_dbscan1d.py b/tests/test_dbscan1d.py index 98e63da..f15474a 100644 --- a/tests/test_dbscan1d.py +++ b/tests/test_dbscan1d.py @@ -3,6 +3,7 @@ Requires sklearn. """ + import copy from itertools import product from pathlib import Path @@ -122,7 +123,8 @@ def generate_test_data(num_points, centers=None): num_points, n_features=1, centers=centers, random_state=13 ) X = blobs.flatten() - np.random.shuffle(X) + rng = np.random.default_rng() + rng.shuffle(X) return X, blob_labels @@ -139,18 +141,18 @@ class TestSKleanEquivalent: # define a small range of dbscan input params over which tests will # be parametrized - eps_values = [0.0001, 0.1, 0.5, 1, 2] - min_samples_values = [1, 2, 5, 15] - db_params = list(product(eps_values, min_samples_values)) + eps_values = (0.0001, 0.1, 0.5, 1, 2) + min_samples_values = (1, 2, 5, 15) + db_params = tuple(product(eps_values, min_samples_values)) - centers = [ + centers = ( np.array([0, 5, 10]), np.arange(10), np.array([1, 2, 3, 4, 5, 10]), np.array([1, 1.1, 1.2, 1.3, 1.4, 1.5]), 2, 7, - ] + ) @pytest.fixture(scope="class", params=centers) def blobs(self, request): @@ -233,16 +235,16 @@ def test_issue_7(self, issue_7_array): # also test indices of core points assert np.all(dbs_1.core_sample_indices_ == dbs_2.core_sample_indices_) - -def test_sample_weights() -> None: - x = np.asarray([0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 1.1, 1.2]) - labels = DBSCAN1D(eps=0.5, min_samples=3).fit_predict(x) - assert np.all(labels[:9] == 0) - assert np.all(labels[-2:] == -1) - - weight = np.ones_like(x) - weight[-1] = 1.2 - weight[-2] = 1.8 - labels = DBSCAN1D(eps=0.5, min_samples=3).fit_predict(x, sample_weight=weight) - assert np.all(labels[:9] == 0) - assert np.all(labels[-2:] == 1) + def test_sample_weights(self) -> None: + """Test case for sample weights .""" + x = np.asarray([0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 1.1, 1.2]) + labels = DBSCAN1D(eps=0.5, min_samples=3).fit_predict(x) + assert np.all(labels[:9] == 0) + assert np.all(labels[-2:] == -1) + + weight = np.ones_like(x) + weight[-1] = 1.2 + weight[-2] = 1.8 + labels = DBSCAN1D(eps=0.5, min_samples=3).fit_predict(x, sample_weight=weight) + assert np.all(labels[:9] == 0) + assert np.all(labels[-2:] == 1)