diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index d29d0eac..16929af4 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -871,6 +871,40 @@ def local_morans_i_cli(
     typer.echo(f"Local Moran's I completed, output vector saved to {output_vector}.")
 
 
+# FEATURE IMPORTANCE
+@app.command()
+def feature_importance_cli(
+    model_file: INPUT_FILE_OPTION,
+    input_rasters: INPUT_FILES_ARGUMENT,
+    target_labels: INPUT_FILE_OPTION,
+    n_repeats: int = 10,
+    random_state: Optional[int] = None,
+):
+    """Evaluate the feature importance of a sklearn classifier or regressor."""
+    from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance
+    from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml
+
+    typer.echo("Progress: 10%")
+
+    model = load_model(model_file)
+    typer.echo("Progress: 20%")
+
+    X, y, _, _ = prepare_data_for_ml(input_rasters, target_labels)
+    typer.echo("Progress: 30%")
+
+    feature_names = [raster.name for raster in input_rasters]
+    typer.echo("Progress: 40%")
+
+    feature_importance, _ = evaluate_feature_importance(model, X, y, feature_names, n_repeats, random_state)
+    typer.echo("Progress: 80%")
+
+    results = dict(zip(feature_importance["Feature"], feature_importance["Importance"]))
+    json_str = json.dumps(results)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Results: {json_str}")
+
+
 # --- RASTER PROCESSING ---
 
 
diff --git a/eis_toolkit/exploratory_analyses/feature_importance.py b/eis_toolkit/exploratory_analyses/feature_importance.py
index e0f7baae..18197ffd 100644
--- a/eis_toolkit/exploratory_analyses/feature_importance.py
+++ b/eis_toolkit/exploratory_analyses/feature_importance.py
@@ -5,28 +5,32 @@
 from beartype.typing import Optional, Sequence
 from sklearn.inspection import permutation_importance
 
-from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException
+from eis_toolkit.exceptions import (
+    InvalidDatasetException,
+    InvalidParameterValueException,
+    NonMatchingParameterLengthsException,
+)
 
 
 @beartype
 def evaluate_feature_importance(
     model: sklearn.base.BaseEstimator,
-    x_test: np.ndarray,
-    y_test: np.ndarray,
+    X: np.ndarray,
+    y: np.ndarray,
     feature_names: Sequence[str],
-    n_repeats: int = 50,
+    n_repeats: int = 10,
     random_state: Optional[int] = None,
 ) -> tuple[pd.DataFrame, dict]:
     """
-    Evaluate the feature importance of a sklearn classifier or regressor.
+    Evaluate the feature importance of a Sklearn classifier or regressor.
 
     Args:
         model: A trained and fitted Sklearn model.
-        x_test: Testing feature data (X data need to be normalized / standardized).
-        y_test: Testing label data.
-        feature_names: Names of the feature columns.
-        n_repeats: Number of iteration used when calculate feature importance. Defaults to 50.
-        random_state: random state for repeatability of results. Optional parameter.
+        X: Feature data.
+        y: Target labels.
+        feature_names: Names of features in X.
+        n_repeats: Number of iterations used when calculating feature importance. Defaults to 10.
+        random_state: Seed for random number generation. Defaults to None.
 
     Returns:
         A dataframe containing features and their importance.
@@ -37,18 +41,24 @@ def evaluate_feature_importance(
         InvalidParameterValueException: Value for 'n_repeats' is not at least one.
""" - if x_test.size == 0: - raise InvalidDatasetException("Array 'x_test' is empty.") + if X.size == 0: + raise InvalidDatasetException("Feature matrix X is empty.") - if y_test.size == 0: - raise InvalidDatasetException("Array 'y_test' is empty.") + if y.size == 0: + raise InvalidDatasetException("Target labels y is empty.") if n_repeats < 1: raise InvalidParameterValueException("Value for 'n_repeats' is less than one.") - result = permutation_importance(model, x_test, y_test.ravel(), n_repeats=n_repeats, random_state=random_state) + if len(X) != len(y): + raise NonMatchingParameterLengthsException("Feature matrix X and target labels y must have the same length.") - feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean * 100}) + if len(feature_names) != X.shape[1]: + raise InvalidParameterValueException("Number of feature names must match the number of input features.") + + result = permutation_importance(model, X, y.ravel(), n_repeats=n_repeats, random_state=random_state) + + feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean}) feature_importance = feature_importance.sort_values(by="Importance", ascending=False) diff --git a/tests/exploratory_analyses/feature_importance_test.py b/tests/exploratory_analyses/feature_importance_test.py index a5d81470..a90d8197 100644 --- a/tests/exploratory_analyses/feature_importance_test.py +++ b/tests/exploratory_analyses/feature_importance_test.py @@ -4,7 +4,11 @@ from sklearn.neural_network import MLPClassifier from sklearn.preprocessing import StandardScaler -from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException +from eis_toolkit.exceptions import ( + InvalidDatasetException, + InvalidParameterValueException, + NonMatchingParameterLengthsException, +) from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance feature_names = [ @@ -42,39 +46,33 @@ def test_empty_data(): empty_data = np.array([]) empty_labels = np.array([]) with pytest.raises(InvalidDatasetException): - _, _ = evaluate_feature_importance( - model=classifier, x_test=empty_data, y_test=labels, feature_names=feature_names - ) + _, _ = evaluate_feature_importance(model=classifier, X=empty_data, y=labels, feature_names=feature_names) with pytest.raises(InvalidDatasetException): - _, _ = evaluate_feature_importance( - model=classifier, x_test=data, y_test=empty_labels, feature_names=feature_names - ) + _, _ = evaluate_feature_importance(model=classifier, X=data, y=empty_labels, feature_names=feature_names) def test_invalid_n_repeats(): """Test that invalid value for 'n_repeats' raises exception.""" with pytest.raises(InvalidParameterValueException): - _, _ = evaluate_feature_importance( - model=classifier, x_test=data, y_test=labels, feature_names=feature_names, n_repeats=0 - ) + _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=0) def test_model_output(): """Test that function output is as expected.""" classifier.fit(data, labels.ravel()) feature_importance, importance_results = evaluate_feature_importance( - model=classifier, x_test=data, y_test=labels, feature_names=feature_names, random_state=0 + model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=50, random_state=0 ) np.testing.assert_almost_equal( feature_importance.loc[feature_importance["Feature"] == "EM_ratio", "Importance"].values[0], - desired=12.923077, + desired=0.129231, decimal=6, ) 
     np.testing.assert_almost_equal(
         feature_importance.loc[feature_importance["Feature"] == "EM_Qd", "Importance"].values[0],
-        desired=4.461538,
+        desired=0.044615,
         decimal=6,
     )
     np.testing.assert_equal(len(feature_importance), desired=len(feature_names))
@@ -82,3 +80,21 @@ def test_model_output():
         tuple(importance_results.keys()),
         desired=("importances_mean", "importances_std", "importances"),
     )
+
+
+def test_invalid_input_lengths():
+    """Test that non-matching X and y lengths raise an exception."""
+    labels = np.random.randint(2, size=12)
+    with pytest.raises(NonMatchingParameterLengthsException):
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names)
+
+
+def test_invalid_number_of_feature_names():
+    """Test that invalid number of feature names raises an exception."""
+    with pytest.raises(InvalidParameterValueException):
+        _, _ = evaluate_feature_importance(
+            model=classifier,
+            X=data,
+            y=labels,
+            feature_names=["a", "b", "c"],
+        )
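
Reviewer note: below is a minimal, illustrative sketch of how the renamed arguments and new defaults of evaluate_feature_importance could be exercised after this change. The make_classification data, the generic feature names, and the RandomForestClassifier are assumptions made only for the example; they are not part of this PR.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance

    # Small synthetic classification problem with one name per feature column (illustrative only).
    feature_names = ["feat_1", "feat_2", "feat_3", "feat_4"]
    X, y = make_classification(n_samples=200, n_features=len(feature_names), random_state=0)

    # Any fitted sklearn classifier or regressor is accepted; a random forest is used here as an example.
    model = RandomForestClassifier(random_state=0).fit(X, y)

    # New defaults: n_repeats=10; importances are reported on sklearn's own scale (no longer multiplied by 100).
    feature_importance, importance_results = evaluate_feature_importance(
        model, X, y, feature_names, n_repeats=10, random_state=0
    )

    print(feature_importance)  # DataFrame with "Feature" / "Importance" columns, sorted descending
    print(importance_results["importances_mean"])  # raw sklearn permutation_importance output

The same call path backs the new feature_importance_cli command, which loads the model and input rasters via load_model and prepare_data_for_ml and prints the per-feature importances as JSON.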