diff --git a/crates/ruff/resources/test/fixtures/pandas_vet/PD101.py b/crates/ruff/resources/test/fixtures/pandas_vet/PD101.py new file mode 100644 index 0000000000000..157c5e47a956a --- /dev/null +++ b/crates/ruff/resources/test/fixtures/pandas_vet/PD101.py @@ -0,0 +1,27 @@ +import pandas as pd + + +data = pd.Series(range(1000)) + +# PD101 +data.nunique() <= 1 +data.nunique(dropna=True) <= 1 +data.nunique(dropna=False) <= 1 +data.nunique() == 1 +data.nunique(dropna=True) == 1 +data.nunique(dropna=False) == 1 +data.nunique() != 1 +data.nunique(dropna=True) != 1 +data.nunique(dropna=False) != 1 +data.nunique() > 1 +data.dropna().nunique() == 1 +data[data.notnull()].nunique() == 1 + +# No violation of this rule +data.nunique() == 0 # empty +data.nunique() >= 1 # not-empty +data.nunique() < 1 # empty +data.nunique() == 2 # not constant +data.unique() == 1 # not `nunique` + +{"hello": "world"}.nunique() == 1 # no pd.Series diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs index 3c6e1b2b9e0d0..a168fcd2483bf 100644 --- a/crates/ruff/src/checkers/ast/mod.rs +++ b/crates/ruff/src/checkers/ast/mod.rs @@ -3285,6 +3285,15 @@ where if self.enabled(Rule::YodaConditions) { flake8_simplify::rules::yoda_conditions(self, expr, left, ops, comparators); } + if self.enabled(Rule::PandasNuniqueConstantSeriesCheck) { + pandas_vet::rules::nunique_constant_series_check( + self, + expr, + left, + ops, + comparators, + ); + } } Expr::Constant(ast::ExprConstant { value: Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. }, diff --git a/crates/ruff/src/codes.rs b/crates/ruff/src/codes.rs index 3a2d13a41e1d6..b77dccb198269 100644 --- a/crates/ruff/src/codes.rs +++ b/crates/ruff/src/codes.rs @@ -604,6 +604,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> { (PandasVet, "012") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotReadTable), (PandasVet, "013") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotStack), (PandasVet, "015") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfPdMerge), + (PandasVet, "101") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasNuniqueConstantSeriesCheck), (PandasVet, "901") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasDfVariableName), // flake8-errmsg diff --git a/crates/ruff/src/rules/pandas_vet/mod.rs b/crates/ruff/src/rules/pandas_vet/mod.rs index 6319a0f5cbd05..b57f0836dfe6f 100644 --- a/crates/ruff/src/rules/pandas_vet/mod.rs +++ b/crates/ruff/src/rules/pandas_vet/mod.rs @@ -344,6 +344,7 @@ mod tests { Path::new("pandas_use_of_dot_read_table.py") )] #[test_case(Rule::PandasUseOfInplaceArgument, Path::new("PD002.py"))] + #[test_case(Rule::PandasNuniqueConstantSeriesCheck, Path::new("PD101.py"))] fn paths(rule_code: Rule, path: &Path) -> Result<()> { let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy()); let diagnostics = test_path( diff --git a/crates/ruff/src/rules/pandas_vet/rules/mod.rs b/crates/ruff/src/rules/pandas_vet/rules/mod.rs index 0a52e1878324a..7a137babb3bb5 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/mod.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/mod.rs @@ -2,6 +2,7 @@ pub(crate) use assignment_to_df::*; pub(crate) use attr::*; pub(crate) use call::*; pub(crate) use inplace_argument::*; +pub(crate) use nunique_constant_series_check::*; pub(crate) use pd_merge::*; pub(crate) use read_table::*; pub(crate) use subscript::*; @@ -10,6 +11,7 @@ pub(crate) mod assignment_to_df; pub(crate) mod attr; pub(crate) mod call; pub(crate) mod inplace_argument; +pub(crate) mod nunique_constant_series_check; pub(crate) mod pd_merge; pub(crate) mod read_table; pub(crate) mod subscript; diff --git a/crates/ruff/src/rules/pandas_vet/rules/nunique_constant_series_check.rs b/crates/ruff/src/rules/pandas_vet/rules/nunique_constant_series_check.rs new file mode 100644 index 0000000000000..9edffcf25ee8c --- /dev/null +++ b/crates/ruff/src/rules/pandas_vet/rules/nunique_constant_series_check.rs @@ -0,0 +1,122 @@ +use num_traits::One; +use rustpython_parser::ast::{self, CmpOp, Constant, Expr, Ranged}; + +use ruff_diagnostics::Diagnostic; +use ruff_diagnostics::Violation; +use ruff_macros::{derive_message_formats, violation}; + +use crate::checkers::ast::Checker; +use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; + +/// ## What it does +/// Check for uses of `.nunique()` to check if a Pandas Series is constant +/// (i.e., contains only one unique value). +/// +/// ## Why is this bad? +/// `.nunique()` is computationally inefficient for checking if a Series is +/// constant. +/// +/// Consider, for example, a Series of length `n` that consists of increasing +/// integer values (e.g., 1, 2, 3, 4). The `.nunique()` method will iterate +/// over the entire Series to count the number of unique values. But in this +/// case, we can detect that the Series is non-constant after visiting the +/// first two values, which are non-equal. +/// +/// In general, `.nunique()` requires iterating over the entire Series, while a +/// more efficient approach allows short-circuiting the operation as soon as a +/// non-equal value is found. +/// +/// Instead of calling `.nunique()`, convert the Series to a NumPy array, and +/// check if all values in the array are equal to the first observed value. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// data = pd.Series(range(1000)) +/// if data.nunique() <= 1: +/// print("Series is constant") +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// data = pd.Series(range(1000)) +/// array = data.to_numpy() +/// if array.shape[0] == 0 or (array[0] == array).all(): +/// print("Series is constant") +/// ``` +/// +/// ## References +/// - [Pandas Cookbook: "Constant Series"](https://pandas.pydata.org/docs/user_guide/cookbook.html#constant-series) +/// - [Pandas documentation: `nunique`](https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html) +#[violation] +pub struct PandasNuniqueConstantSeriesCheck; + +impl Violation for PandasNuniqueConstantSeriesCheck { + #[derive_message_formats] + fn message(&self) -> String { + format!("Using `series.nunique()` for checking that a series is constant is inefficient") + } +} + +/// PD101 +pub(crate) fn nunique_constant_series_check( + checker: &mut Checker, + expr: &Expr, + left: &Expr, + ops: &[CmpOp], + comparators: &[Expr], +) { + let ([op], [right]) = (ops, comparators) else { + return; + }; + + // Operators may be ==, !=, <=, >. + if !matches!(op, CmpOp::Eq | CmpOp::NotEq | CmpOp::LtE | CmpOp::Gt,) { + return; + } + + // Right should be the integer 1. + if !is_constant_one(right) { + return; + } + + // Check if call is `.nuniuqe()`. + let Expr::Call(ast::ExprCall { func, .. }) = left else { + return; + }; + + let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() else { + return; + }; + + if attr.as_str() != "nunique" { + return; + } + + // Avoid flagging on non-Series (e.g., `{"a": 1}.at[0]`). + if !matches!( + test_expression(value, checker.semantic()), + Resolution::RelevantLocal + ) { + return; + } + + checker.diagnostics.push(Diagnostic::new( + PandasNuniqueConstantSeriesCheck, + expr.range(), + )); +} + +/// Return `true` if an [`Expr`] is a constant `1`. +fn is_constant_one(expr: &Expr) -> bool { + match expr { + Expr::Constant(constant) => match &constant.value { + Constant::Int(int) => int.is_one(), + _ => false, + }, + _ => false, + } +} diff --git a/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD101_PD101.py.snap b/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD101_PD101.py.snap new file mode 100644 index 0000000000000..ddfdd060ba947 --- /dev/null +++ b/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD101_PD101.py.snap @@ -0,0 +1,122 @@ +--- +source: crates/ruff/src/rules/pandas_vet/mod.rs +--- +PD101.py:7:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +6 | # PD101 +7 | data.nunique() <= 1 + | ^^^^^^^^^^^^^^^^^^^ PD101 +8 | data.nunique(dropna=True) <= 1 +9 | data.nunique(dropna=False) <= 1 + | + +PD101.py:8:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | + 6 | # PD101 + 7 | data.nunique() <= 1 + 8 | data.nunique(dropna=True) <= 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 + | + +PD101.py:9:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | + 7 | data.nunique() <= 1 + 8 | data.nunique(dropna=True) <= 1 + 9 | data.nunique(dropna=False) <= 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 + | + +PD101.py:10:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | + 8 | data.nunique(dropna=True) <= 1 + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^ PD101 +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 + | + +PD101.py:11:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 + | + +PD101.py:12:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 + | + +PD101.py:13:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 + | ^^^^^^^^^^^^^^^^^^^ PD101 +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 + | + +PD101.py:14:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 + | + +PD101.py:15:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 + | + +PD101.py:16:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 + | ^^^^^^^^^^^^^^^^^^ PD101 +17 | data.dropna().nunique() == 1 +18 | data[data.notnull()].nunique() == 1 + | + +PD101.py:17:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +18 | data[data.notnull()].nunique() == 1 + | + +PD101.py:18:1: PD101 Using `series.nunique()` for checking that a series is constant is inefficient + | +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 +18 | data[data.notnull()].nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD101 +19 | +20 | # No violation of this rule + | + + diff --git a/ruff.schema.json b/ruff.schema.json index 353136f6944ff..8732c73890891 100644 --- a/ruff.schema.json +++ b/ruff.schema.json @@ -2090,6 +2090,9 @@ "PD012", "PD013", "PD015", + "PD1", + "PD10", + "PD101", "PD9", "PD90", "PD901",