Skip to content

Commit

Permalink
[pandas-vet] series constant series (astral-sh#5802)
Browse files Browse the repository at this point in the history
## Summary

Implementation for astral-sh#5588

Q1: are there any additional semantic helpers that could be used to
guard this rule? Which existing rules should be similar in that respect?
Can we at least check if `pandas` is imported (any pointers welcome)?
Currently, the rule flags:
```python
data = {"a": "b"}
data.nunique() == 1
```

Q2: Any pointers on naming of the rule and selection of the code? It was
proposed, but not replied to/implemented in the upstream. `pandas` did
accept a PR to update their cookbook to reflect this rule though.

## Test Plan

TODO:
- [X] Checking for ecosystem CI results
- [x] Test on selected [real-world
cases](https://github.com/search?q=%22nunique%28%29+%3D%3D+1%22+language%3APython+&type=code)
  - [x] https://github.com/sdv-dev/SDMetrics
  - [x] https://github.com/google-research/robustness_metrics
  - [x] https://github.com/soft-matter/trackpy
  - [x] https://github.com/microsoft/FLAML/
- [ ] Add guarded test cases
  • Loading branch information
sbrugman authored and evanrittenhouse committed Jul 19, 2023
1 parent 14d242f commit 8f8d4b5
Show file tree
Hide file tree
Showing 8 changed files with 287 additions and 0 deletions.
27 changes: 27 additions & 0 deletions crates/ruff/resources/test/fixtures/pandas_vet/PD101.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd


data = pd.Series(range(1000))

# PD101
data.nunique() <= 1
data.nunique(dropna=True) <= 1
data.nunique(dropna=False) <= 1
data.nunique() == 1
data.nunique(dropna=True) == 1
data.nunique(dropna=False) == 1
data.nunique() != 1
data.nunique(dropna=True) != 1
data.nunique(dropna=False) != 1
data.nunique() > 1
data.dropna().nunique() == 1
data[data.notnull()].nunique() == 1

# No violation of this rule
data.nunique() == 0 # empty
data.nunique() >= 1 # not-empty
data.nunique() < 1 # empty
data.nunique() == 2 # not constant
data.unique() == 1 # not `nunique`

{"hello": "world"}.nunique() == 1 # no pd.Series
9 changes: 9 additions & 0 deletions crates/ruff/src/checkers/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3285,6 +3285,15 @@ where
if self.enabled(Rule::YodaConditions) {
flake8_simplify::rules::yoda_conditions(self, expr, left, ops, comparators);
}
if self.enabled(Rule::PandasNuniqueConstantSeriesCheck) {
pandas_vet::rules::nunique_constant_series_check(
self,
expr,
left,
ops,
comparators,
);
}
}
Expr::Constant(ast::ExprConstant {
value: Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. },
Expand Down
1 change: 1 addition & 0 deletions crates/ruff/src/codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
(PandasVet, "012") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotReadTable),
(PandasVet, "013") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotStack),
(PandasVet, "015") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfPdMerge),
(PandasVet, "101") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasNuniqueConstantSeriesCheck),
(PandasVet, "901") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasDfVariableName),

// flake8-errmsg
Expand Down
1 change: 1 addition & 0 deletions crates/ruff/src/rules/pandas_vet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ mod tests {
Path::new("pandas_use_of_dot_read_table.py")
)]
#[test_case(Rule::PandasUseOfInplaceArgument, Path::new("PD002.py"))]
#[test_case(Rule::PandasNuniqueConstantSeriesCheck, Path::new("PD101.py"))]
fn paths(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy());
let diagnostics = test_path(
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff/src/rules/pandas_vet/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ pub(crate) use assignment_to_df::*;
pub(crate) use attr::*;
pub(crate) use call::*;
pub(crate) use inplace_argument::*;
pub(crate) use nunique_constant_series_check::*;
pub(crate) use pd_merge::*;
pub(crate) use read_table::*;
pub(crate) use subscript::*;
Expand All @@ -10,6 +11,7 @@ pub(crate) mod assignment_to_df;
pub(crate) mod attr;
pub(crate) mod call;
pub(crate) mod inplace_argument;
pub(crate) mod nunique_constant_series_check;
pub(crate) mod pd_merge;
pub(crate) mod read_table;
pub(crate) mod subscript;
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
use num_traits::One;
use rustpython_parser::ast::{self, CmpOp, Constant, Expr, Ranged};

use ruff_diagnostics::Diagnostic;
use ruff_diagnostics::Violation;
use ruff_macros::{derive_message_formats, violation};

use crate::checkers::ast::Checker;
use crate::rules::pandas_vet::helpers::{test_expression, Resolution};

/// ## What it does
/// Check for uses of `.nunique()` to check if a Pandas Series is constant
/// (i.e., contains only one unique value).
///
/// ## Why is this bad?
/// `.nunique()` is computationally inefficient for checking if a Series is
/// constant.
///
/// Consider, for example, a Series of length `n` that consists of increasing
/// integer values (e.g., 1, 2, 3, 4). The `.nunique()` method will iterate
/// over the entire Series to count the number of unique values. But in this
/// case, we can detect that the Series is non-constant after visiting the
/// first two values, which are non-equal.
///
/// In general, `.nunique()` requires iterating over the entire Series, while a
/// more efficient approach allows short-circuiting the operation as soon as a
/// non-equal value is found.
///
/// Instead of calling `.nunique()`, convert the Series to a NumPy array, and
/// check if all values in the array are equal to the first observed value.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// data = pd.Series(range(1000))
/// if data.nunique() <= 1:
/// print("Series is constant")
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// data = pd.Series(range(1000))
/// array = data.to_numpy()
/// if array.shape[0] == 0 or (array[0] == array).all():
/// print("Series is constant")
/// ```
///
/// ## References
/// - [Pandas Cookbook: "Constant Series"](https://pandas.pydata.org/docs/user_guide/cookbook.html#constant-series)
/// - [Pandas documentation: `nunique`](https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html)
#[violation]
pub struct PandasNuniqueConstantSeriesCheck;

impl Violation for PandasNuniqueConstantSeriesCheck {
#[derive_message_formats]
fn message(&self) -> String {
format!("Using `series.nunique()` for checking that a series is constant is inefficient")
}
}

/// PD101
pub(crate) fn nunique_constant_series_check(
checker: &mut Checker,
expr: &Expr,
left: &Expr,
ops: &[CmpOp],
comparators: &[Expr],
) {
let ([op], [right]) = (ops, comparators) else {
return;
};

// Operators may be ==, !=, <=, >.
if !matches!(op, CmpOp::Eq | CmpOp::NotEq | CmpOp::LtE | CmpOp::Gt,) {
return;
}

// Right should be the integer 1.
if !is_constant_one(right) {
return;
}

// Check if call is `.nuniuqe()`.
let Expr::Call(ast::ExprCall { func, .. }) = left else {
return;
};

let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() else {
return;
};

if attr.as_str() != "nunique" {
return;
}

// Avoid flagging on non-Series (e.g., `{"a": 1}.at[0]`).
if !matches!(
test_expression(value, checker.semantic()),
Resolution::RelevantLocal
) {
return;
}

checker.diagnostics.push(Diagnostic::new(
PandasNuniqueConstantSeriesCheck,
expr.range(),
));
}

/// Return `true` if an [`Expr`] is a constant `1`.
fn is_constant_one(expr: &Expr) -> bool {
match expr {
Expr::Constant(constant) => match &constant.value {
Constant::Int(int) => int.is_one(),
_ => false,
},
_ => false,
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
---
source: crates/ruff/src/rules/pandas_vet/mod.rs
---
PD101.py:7:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
6 | # PD101
7 | data.nunique() <= 1
| ^^^^^^^^^^^^^^^^^^^ PD101
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
|

PD101.py:8:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
6 | # PD101
7 | data.nunique() <= 1
8 | data.nunique(dropna=True) <= 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
|

PD101.py:9:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
7 | data.nunique() <= 1
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
|

PD101.py:10:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
| ^^^^^^^^^^^^^^^^^^^ PD101
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
|

PD101.py:11:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
|

PD101.py:12:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
|

PD101.py:13:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
| ^^^^^^^^^^^^^^^^^^^ PD101
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
|

PD101.py:14:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
|

PD101.py:15:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
|

PD101.py:16:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
| ^^^^^^^^^^^^^^^^^^ PD101
17 | data.dropna().nunique() == 1
18 | data[data.notnull()].nunique() == 1
|

PD101.py:17:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
18 | data[data.notnull()].nunique() == 1
|

PD101.py:18:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient
|
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
18 | data[data.notnull()].nunique() == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101
19 |
20 | # No violation of this rule
|


3 changes: 3 additions & 0 deletions ruff.schema.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 8f8d4b5

Please sign in to comment.