
466 add feature importance cli #468

Merged: 11 commits, Dec 4, 2024
34 changes: 34 additions & 0 deletions eis_toolkit/cli.py
@@ -871,6 +871,40 @@ def local_morans_i_cli(
     typer.echo(f"Local Moran's I completed, output vector saved to {output_vector}.")
 
 
+# FEATURE IMPORTANCE
+@app.command()
+def feature_importance_cli(
+    model_file: INPUT_FILE_OPTION,
+    input_rasters: INPUT_FILES_ARGUMENT,
+    target_labels: INPUT_FILE_OPTION,
+    n_repeats: int = 10,
+    random_state: Optional[int] = None,
+):
+    """Evaluate the feature importance of a sklearn classifier or regressor."""
+    from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance
+    from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml
+
+    typer.echo("Progress: 10%")
+
+    model = load_model(model_file)
+    typer.echo("Progress: 20%")
+
+    X, y, _, _ = prepare_data_for_ml(input_rasters, target_labels)
+    typer.echo("Progress: 30%")
+
+    feature_names = [raster.name for raster in input_rasters]
+    typer.echo("Progress: 40%")
+
+    feature_importance, _ = evaluate_feature_importance(model, X, y, feature_names, n_repeats, random_state)
+    typer.echo("Progress: 80%")
+
+    results = dict(zip(feature_importance["Feature"], feature_importance["Importance"]))
+    json_str = json.dumps(results)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Results: {json_str}")
+
+
 # --- RASTER PROCESSING ---
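As a sanity check, the new command can be exercised end-to-end with Typer's test runner. A minimal sketch, assuming Typer's default snake_case-to-kebab-case command naming and that the rasters are positional arguments as the signature above suggests; all file paths are hypothetical placeholders:

    from typer.testing import CliRunner

    from eis_toolkit.cli import app

    runner = CliRunner()
    result = runner.invoke(
        app,
        [
            "feature-importance-cli",
            "rasters/band_1.tif",  # hypothetical input rasters (positional)
            "rasters/band_2.tif",
            "--model-file", "model.joblib",  # hypothetical fitted model file
            "--target-labels", "labels.tif",
            "--n-repeats", "10",
        ],
    )
    print(result.stdout)  # progress lines, then "Results: {...}" as JSON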
42 changes: 26 additions & 16 deletions eis_toolkit/exploratory_analyses/feature_importance.py
@@ -5,28 +5,32 @@
 from beartype.typing import Optional, Sequence
 from sklearn.inspection import permutation_importance
 
-from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException
+from eis_toolkit.exceptions import (
+    InvalidDatasetException,
+    InvalidParameterValueException,
+    NonMatchingParameterLengthsException,
+)
 
 
 @beartype
 def evaluate_feature_importance(
     model: sklearn.base.BaseEstimator,
-    x_test: np.ndarray,
-    y_test: np.ndarray,
+    X: np.ndarray,
+    y: np.ndarray,
     feature_names: Sequence[str],
-    n_repeats: int = 50,
+    n_repeats: int = 10,
     random_state: Optional[int] = None,
 ) -> tuple[pd.DataFrame, dict]:
     """
-    Evaluate the feature importance of a sklearn classifier or regressor.
+    Evaluate the feature importance of a Sklearn classifier or regressor.
 
     Args:
         model: A trained and fitted Sklearn model.
-        x_test: Testing feature data (X data need to be normalized / standardized).
-        y_test: Testing label data.
-        feature_names: Names of the feature columns.
-        n_repeats: Number of iteration used when calculate feature importance. Defaults to 50.
-        random_state: random state for repeatability of results. Optional parameter.
+        X: Feature data.
+        y: Target labels.
+        feature_names: Names of features in X.
+        n_repeats: Number of iterations used when calculating feature importance. Defaults to 10.
+        random_state: Seed for random number generation. Defaults to None.
 
     Returns:
         A dataframe containing features and their importance.
@@ -37,18 +41,24 @@ def evaluate_feature_importance(
         InvalidParameterValueException: Value for 'n_repeats' is not at least one.
     """
 
-    if x_test.size == 0:
-        raise InvalidDatasetException("Array 'x_test' is empty.")
+    if X.size == 0:
+        raise InvalidDatasetException("Feature matrix X is empty.")
 
-    if y_test.size == 0:
-        raise InvalidDatasetException("Array 'y_test' is empty.")
+    if y.size == 0:
+        raise InvalidDatasetException("Target labels y are empty.")
 
     if n_repeats < 1:
         raise InvalidParameterValueException("Value for 'n_repeats' is less than one.")
 
-    result = permutation_importance(model, x_test, y_test.ravel(), n_repeats=n_repeats, random_state=random_state)
+    if len(X) != len(y):
+        raise NonMatchingParameterLengthsException("Feature matrix X and target labels y must have the same length.")
 
-    feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean * 100})
+    if len(feature_names) != X.shape[1]:
+        raise InvalidParameterValueException("Number of feature names must match the number of input features.")
+
+    result = permutation_importance(model, X, y.ravel(), n_repeats=n_repeats, random_state=random_state)
+
+    feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean})
 
     feature_importance = feature_importance.sort_values(by="Importance", ascending=False)
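For context, permutation_importance shuffles one feature column at a time and reports the resulting drop in the model's score, so with the * 100 scaling removed the importances are now raw mean score drops. A minimal sketch of calling the revised function on synthetic data; the dataset, classifier choice, and feature names are illustrative only:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance

    rng = np.random.default_rng(seed=0)
    X = rng.normal(size=(200, 3))  # 200 samples, 3 features
    y = (X[:, 0] + 0.1 * rng.normal(size=200) > 0).astype(int)  # label driven mostly by the first feature

    model = LogisticRegression().fit(X, y)
    importances, raw_results = evaluate_feature_importance(
        model, X, y, feature_names=["a", "b", "c"], n_repeats=10, random_state=0
    )
    print(importances)  # DataFrame sorted by "Importance"; "a" should rank first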
42 changes: 29 additions & 13 deletions tests/exploratory_analyses/feature_importance_test.py
@@ -4,7 +4,11 @@
 from sklearn.neural_network import MLPClassifier
 from sklearn.preprocessing import StandardScaler
 
-from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException
+from eis_toolkit.exceptions import (
+    InvalidDatasetException,
+    InvalidParameterValueException,
+    NonMatchingParameterLengthsException,
+)
 from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance
 
 feature_names = [
@@ -42,43 +46,55 @@ def test_empty_data():
     empty_data = np.array([])
     empty_labels = np.array([])
     with pytest.raises(InvalidDatasetException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=empty_data, y_test=labels, feature_names=feature_names
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=empty_data, y=labels, feature_names=feature_names)
 
     with pytest.raises(InvalidDatasetException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=data, y_test=empty_labels, feature_names=feature_names
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=empty_labels, feature_names=feature_names)
 
 
 def test_invalid_n_repeats():
     """Test that invalid value for 'n_repeats' raises exception."""
     with pytest.raises(InvalidParameterValueException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=data, y_test=labels, feature_names=feature_names, n_repeats=0
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=0)
 
 
 def test_model_output():
     """Test that function output is as expected."""
     classifier.fit(data, labels.ravel())
     feature_importance, importance_results = evaluate_feature_importance(
-        model=classifier, x_test=data, y_test=labels, feature_names=feature_names, random_state=0
+        model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=50, random_state=0
     )
 
     np.testing.assert_almost_equal(
         feature_importance.loc[feature_importance["Feature"] == "EM_ratio", "Importance"].values[0],
-        desired=12.923077,
+        desired=0.129231,
         decimal=6,
     )
     np.testing.assert_almost_equal(
         feature_importance.loc[feature_importance["Feature"] == "EM_Qd", "Importance"].values[0],
-        desired=4.461538,
+        desired=0.044615,
         decimal=6,
     )
     np.testing.assert_equal(len(feature_importance), desired=len(feature_names))
     np.testing.assert_equal(
         tuple(importance_results.keys()),
         desired=("importances_mean", "importances_std", "importances"),
     )
+
+
+def test_invalid_input_lengths():
+    """Test that non-matching X and y lengths raise an exception."""
+    labels = np.random.randint(2, size=12)
+    with pytest.raises(NonMatchingParameterLengthsException):
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names)
+
+
+def test_invalid_number_of_feature_names():
+    """Test that invalid number of feature names raises an exception."""
+    with pytest.raises(InvalidParameterValueException):
+        _, _ = evaluate_feature_importance(
+            model=classifier,
+            X=data,
+            y=labels,
+            feature_names=["a", "b", "c"],
+        )
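The updated expectations in test_model_output (0.129231 instead of 12.923077) follow from dropping the * 100 scaling: an importance is now the raw mean drop in model score. For intuition, a hand-rolled sketch of the idea behind permutation importance, with a single repeat on illustrative synthetic data; the toolkit itself delegates to sklearn.inspection.permutation_importance:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(seed=0)
    X = rng.normal(size=(200, 3))
    y = (X[:, 0] > 0).astype(int)  # only the first feature is informative
    model = LogisticRegression().fit(X, y)

    baseline = model.score(X, y)
    for j, name in enumerate(["a", "b", "c"]):
        X_shuffled = X.copy()
        rng.shuffle(X_shuffled[:, j])  # break the link between feature j and y
        print(f"{name}: importance ~ {baseline - model.score(X_shuffled, y):.4f}")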