-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
wip: initial version of pandas-vet constant series check
- Loading branch information
Showing
8 changed files
with
273 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import pandas as pd | ||
|
||
|
||
data = pd.Series(range(1000)) | ||
|
||
# PD801 | ||
data.nunique() <= 1 | ||
data.nunique(dropna=True) <= 1 | ||
data.nunique(dropna=False) <= 1 | ||
data.nunique() == 1 | ||
data.nunique(dropna=True) == 1 | ||
data.nunique(dropna=False) == 1 | ||
data.nunique() != 1 | ||
data.nunique(dropna=True) != 1 | ||
data.nunique(dropna=False) != 1 | ||
data.nunique() > 1 | ||
data.dropna().nunique() == 1 | ||
data[data.notnull()].nunique() == 1 | ||
|
||
# No violation of this rule | ||
data.nunique() == 0 # empty | ||
data.nunique() >= 1 # not-empty | ||
data.nunique() < 1 # empty | ||
data.nunique() == 2 # not constant | ||
data.unique() == 1 # not `nunique` | ||
|
||
{"hello": "world"}.nunique() == 1 # no pd.Series |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
crates/ruff/src/rules/pandas_vet/rules/pandas_nunique_constant_series_check.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
use num_traits::One; | ||
use ruff_diagnostics::Violation; | ||
use ruff_macros::{derive_message_formats, violation}; | ||
use rustpython_parser::ast::{self, CmpOp, Constant, Expr, Ranged}; | ||
|
||
use crate::checkers::ast::Checker; | ||
use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; | ||
use ruff_diagnostics::Diagnostic; | ||
|
||
/// ## What it does | ||
/// Checks for uses of `.nunique()` to determine whether a Pandas Series is constant. | ||
/// | ||
/// ## Why is this bad? | ||
/// Let's take the example of a series of increasing integers (1, 2, 3, 4) of length `n`. | ||
/// While walking through the series, we already know upon observing the second value that | ||
/// the series is not constant. However, `.nunique()` keeps counting until the end of | ||
/// the series before returning the result. This is computationally inefficient. | ||
/// | ||
/// ## Example | ||
/// ```python | ||
/// import pandas as pd | ||
/// | ||
/// data = pd.Series(range(1000)) | ||
/// if data.nunique() <= 1: | ||
///     print("Series is constant") | ||
/// ``` | ||
/// | ||
/// Use instead: | ||
/// ```python | ||
/// import pandas as pd | ||
/// | ||
/// data = pd.Series(range(1000)) | ||
/// v = data.to_numpy() | ||
/// if v.shape[0] == 0 or (v[0] == v).all(): | ||
///     print("Series is constant") | ||
/// ``` | ||
/// | ||
/// The [Pandas Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#constant-series) provides additional examples for the case where the Series contains missing values. | ||
/// | ||
/// ## References | ||
/// - [Pandas documentation: `nunique`](https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html) | ||
#[violation] | ||
pub struct PandasNuniqueConstantSeriesCheck; | ||
|
||
// Supplies the message text reported for this violation (PD801). | ||
impl Violation for PandasNuniqueConstantSeriesCheck { | ||
#[derive_message_formats] | ||
fn message(&self) -> String { | ||
// The message takes no arguments; it is the same for every flagged comparison. | ||
format!("Using `series.nunique()` for checking that a series is constant is inefficient") | ||
} | ||
} | ||
|
||
/// Return `true` if an [`Expr`] is the integer constant `1`. | ||
/// | ||
/// Only an `Expr::Constant` holding a `Constant::Int` whose value is one matches; | ||
/// every other expression, including non-integer constants, yields `false`. | ||
fn is_constant_one(expr: &Expr) -> bool { | ||
match expr { | ||
Expr::Constant(constant) => match &constant.value { | ||
Constant::Int(int) => int.is_one(), | ||
_ => false, | ||
}, | ||
_ => false, | ||
} | ||
} | ||
|
||
/// PD801 | ||
/// | ||
/// Flags a comparison of the form `<value>.nunique(...) <op> 1`, where `<op>` is one of | ||
/// `==`, `!=`, `<=` or `>`, and `<value>` is resolved by `test_expression` as | ||
/// `Resolution::RelevantLocal`. | ||
/// | ||
/// - `expr`: the whole comparison expression; its range is used for the diagnostic. | ||
/// - `left`: the left-hand operand of the comparison. | ||
/// - `ops` / `comparators`: the comparison operators and right-hand operands; only | ||
///   comparisons with exactly one of each are considered. | ||
pub(crate) fn pandas_nunique_constant_series_check( | ||
checker: &mut Checker, | ||
expr: &Expr, | ||
left: &Expr, | ||
ops: &[CmpOp], | ||
comparators: &[Expr], | ||
) { | ||
// Only handle a single `left <op> right` comparison; anything else is skipped. | ||
let ([op], [right]) = (ops, comparators) else { | ||
return; | ||
}; | ||
|
||
// Operators may be ==, !=, <=, > | ||
if !matches!(op, CmpOp::Eq | CmpOp::NotEq | CmpOp::LtE | CmpOp::Gt,) { | ||
return; | ||
} | ||
|
||
// Right should be the integer 1 | ||
if !is_constant_one(right) { | ||
return; | ||
} | ||
|
||
// Check if call is `.nunique()` | ||
let Expr::Call(ast::ExprCall {func, .. }) = left else { | ||
return; | ||
}; | ||
|
||
// The callee must be an attribute access (`<value>.<attr>`). | ||
let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() else { | ||
return; | ||
}; | ||
|
||
if attr.as_str() != "nunique" { | ||
return; | ||
} | ||
|
||
// Avoid flagging on non-Series (e.g., `{"a": 1}.nunique()`). | ||
if !matches!( | ||
test_expression(value, checker.semantic()), | ||
Resolution::RelevantLocal | ||
) { | ||
return; | ||
} | ||
|
||
// Report the violation over the entire comparison expression. | ||
checker.diagnostics.push(Diagnostic::new( | ||
PandasNuniqueConstantSeriesCheck, | ||
expr.range(), | ||
)); | ||
} |
120 changes: 120 additions & 0 deletions
120
...s/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD801_PD801.py.snap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
--- | ||
source: crates/ruff/src/rules/pandas_vet/mod.rs | ||
--- | ||
PD801.py:7:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
6 | # PD801 | ||
7 | data.nunique() <= 1 | ||
| ^^^^^^^^^^^^^^^^^^^ PD801 | ||
8 | data.nunique(dropna=True) <= 1 | ||
9 | data.nunique(dropna=False) <= 1 | ||
| | ||
|
||
PD801.py:8:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
6 | # PD801 | ||
7 | data.nunique() <= 1 | ||
8 | data.nunique(dropna=True) <= 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
9 | data.nunique(dropna=False) <= 1 | ||
10 | data.nunique() == 1 | ||
| | ||
|
||
PD801.py:9:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
7 | data.nunique() <= 1 | ||
8 | data.nunique(dropna=True) <= 1 | ||
9 | data.nunique(dropna=False) <= 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
10 | data.nunique() == 1 | ||
11 | data.nunique(dropna=True) == 1 | ||
| | ||
|
||
PD801.py:10:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
8 | data.nunique(dropna=True) <= 1 | ||
9 | data.nunique(dropna=False) <= 1 | ||
10 | data.nunique() == 1 | ||
| ^^^^^^^^^^^^^^^^^^^ PD801 | ||
11 | data.nunique(dropna=True) == 1 | ||
12 | data.nunique(dropna=False) == 1 | ||
| | ||
|
||
PD801.py:11:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
9 | data.nunique(dropna=False) <= 1 | ||
10 | data.nunique() == 1 | ||
11 | data.nunique(dropna=True) == 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
12 | data.nunique(dropna=False) == 1 | ||
13 | data.nunique() != 1 | ||
| | ||
|
||
PD801.py:12:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
10 | data.nunique() == 1 | ||
11 | data.nunique(dropna=True) == 1 | ||
12 | data.nunique(dropna=False) == 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
13 | data.nunique() != 1 | ||
14 | data.nunique(dropna=True) != 1 | ||
| | ||
|
||
PD801.py:13:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
11 | data.nunique(dropna=True) == 1 | ||
12 | data.nunique(dropna=False) == 1 | ||
13 | data.nunique() != 1 | ||
| ^^^^^^^^^^^^^^^^^^^ PD801 | ||
14 | data.nunique(dropna=True) != 1 | ||
15 | data.nunique(dropna=False) != 1 | ||
| | ||
|
||
PD801.py:14:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
12 | data.nunique(dropna=False) == 1 | ||
13 | data.nunique() != 1 | ||
14 | data.nunique(dropna=True) != 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
15 | data.nunique(dropna=False) != 1 | ||
16 | data.nunique() > 1 | ||
| | ||
|
||
PD801.py:15:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
13 | data.nunique() != 1 | ||
14 | data.nunique(dropna=True) != 1 | ||
15 | data.nunique(dropna=False) != 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
16 | data.nunique() > 1 | ||
17 | data.dropna().nunique() == 1 | ||
| | ||
|
||
PD801.py:16:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
14 | data.nunique(dropna=True) != 1 | ||
15 | data.nunique(dropna=False) != 1 | ||
16 | data.nunique() > 1 | ||
| ^^^^^^^^^^^^^^^^^^ PD801 | ||
17 | data.dropna().nunique() == 1 | ||
18 | data[data.notnull()].nunique() == 1 | ||
| | ||
|
||
PD801.py:17:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
15 | data.nunique(dropna=False) != 1 | ||
16 | data.nunique() > 1 | ||
17 | data.dropna().nunique() == 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
18 | data[data.notnull()].nunique() == 1 | ||
| | ||
|
||
PD801.py:18:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient | ||
| | ||
16 | data.nunique() > 1 | ||
17 | data.dropna().nunique() == 1 | ||
18 | data[data.notnull()].nunique() == 1 | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 | ||
19 | | ||
20 | # No violation of this rule | ||
| |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.