Commit

Merge branch 'main' into 756-add-notebook-for-eulaw
cwmeijer committed Oct 2, 2024
2 parents a165038 + 0beee8b commit 2381edd
Showing 24 changed files with 2,112 additions and 1,271 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/build.yml
@@ -55,8 +55,8 @@ jobs:
- name: Run unit tests
run: python -m pytest -v

- name: Verify that we can build the package
run: python setup.py sdist bdist_wheel
#- name: Verify that we can build the package
# run: python setup.py sdist bdist_wheel

test_downloader:
name: Test file downloader
@@ -73,7 +73,8 @@ jobs:

test_dashboard:
name: Test dashboard
if: github.event.pull_request.draft == false
if: always()
#github.event.pull_request.draft == false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
1 change: 1 addition & 0 deletions dianna/cli.py
@@ -21,6 +21,7 @@ def dashboard():
*('--theme.primaryColor', '7030a0'),
*('--theme.secondaryBackgroundColor', 'e4f3f9'),
*('--browser.gatherUsageStats', 'false'),
*('--client.showSidebarNavigation', 'false'),
*args,
]

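For orientation, a minimal sketch of how a Streamlit launcher can assemble these flags into an argument list; the `Home.py` path and the subprocess invocation are assumptions for illustration, not taken from this commit. Only the flag values mirror the diff.

```python
# Minimal sketch, not the dianna CLI itself: the Home.py path and the subprocess
# call are assumptions; only the flag values mirror the diff above.
import sys
from subprocess import run

extra_args = []  # placeholder for user-supplied arguments

streamlit_args = [
    sys.executable, '-m', 'streamlit', 'run', 'Home.py',
    *('--theme.primaryColor', '7030a0'),
    *('--theme.secondaryBackgroundColor', 'e4f3f9'),
    *('--browser.gatherUsageStats', 'false'),
    *('--client.showSidebarNavigation', 'false'),  # flag added in this commit
    *extra_args,
]
run(streamlit_args, check=True)
```

Passing config options as `--section.option value` pairs is the standard way to configure `streamlit run` from the command line; the new `client.showSidebarNavigation` flag disables Streamlit's automatic sidebar page list.
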
8 changes: 0 additions & 8 deletions dianna/dashboard/Home.py
@@ -46,14 +46,6 @@
with and for (academic) researchers and research software engineers working on machine
learning projects.
### Pages
- <a href="/Images" target="_parent">Image data</a>
- <a href="/Tabular" target="_parent">Tabular data</a>
- <a href="/Text" target="_parent">Text data</a>
- <a href="/Time_series" target="_parent">Time series data</a>
### More information
- [Source code](https://github.com/dianna-ai/dianna)
39 changes: 39 additions & 0 deletions dianna/dashboard/_model_utils.py
@@ -2,6 +2,7 @@
import numpy as np
import onnx
import pandas as pd
from sklearn.model_selection import train_test_split


def load_data(file):
@@ -42,3 +43,41 @@ def load_labels(file):

def load_training_data(file):
return np.float32(np.load(file, allow_pickle=False))


def load_sunshine(file):
"""Tabular sunshine example.
    Load the csv file into a pandas DataFrame and split the data into train and test sets.
"""
data = load_data(file)

# Drop unused columns
X_data = data.drop(columns=['DATE', 'MONTH', 'Index'])[:-1]
y_data = data.loc[1:]["BASEL_sunshine"]

# Split the data
X_train, X_holdout, _, y_holdout = train_test_split(X_data, y_data, test_size=0.3, random_state=0)
_, X_test, _, _ = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=0)
X_test = X_test.reset_index(drop=True)
X_test.insert(0, 'Index', X_test.index)

return X_train.to_numpy(dtype=np.float32), X_test

def load_penguins(penguins):
"""Prep the data for the penguin model example as per ntoebook."""
# Remove categorial columns and NaN values
penguins_filtered = penguins.drop(columns=['island', 'sex']).dropna()


# Extract inputs and target
input_features = penguins_filtered.drop(columns=['species'])
target = pd.get_dummies(penguins_filtered['species'])

X_train, X_test, _, _ = train_test_split(input_features, target, test_size=0.2,
random_state=0, shuffle=True, stratify=target)

X_test = X_test.reset_index(drop=True)
X_test.insert(0, 'Index', X_test.index)

return X_train.to_numpy(dtype=np.float32), X_test
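
As a rough usage sketch (not part of the commit), the two new loaders could be exercised like this; the file names, the penguins source, and the assumption that `load_data` accepts a plain file path are illustrative only.

```python
# Illustrative sketch only: file names and the penguins source are placeholders,
# and load_data() is assumed to accept a plain file path.
import pandas as pd
from _model_utils import load_penguins, load_sunshine

# Sunshine regression example: returns a float32 training array for the explainer
# and a test DataFrame with an 'Index' column for display in the dashboard.
X_train, X_test = load_sunshine('weather_prediction_dataset.csv')
print(X_train.dtype, X_train.shape)
print(X_test.columns[:3].tolist())  # ['Index', ...]

# Penguins classification example: expects a raw palmerpenguins-style DataFrame.
penguins = pd.read_csv('penguins.csv')
X_train_p, X_test_p = load_penguins(penguins)
print(X_train_p.shape, X_test_p.shape)
```

Both loaders fix `random_state=0`, presumably so the test rows shown in the dashboard line up with the split used when the ONNX models were trained.
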
55 changes: 41 additions & 14 deletions dianna/dashboard/_models_tabular.py
@@ -1,33 +1,55 @@
import tempfile
import numpy as np
import onnxruntime as ort
import streamlit as st
from dianna import explain_tabular
from dianna.utils.onnx_runner import SimpleModelRunner


@st.cache_data
def predict(*, model, tabular_input):
model_runner = SimpleModelRunner(model)
predictions = model_runner(tabular_input.reshape(1,-1).astype(np.float32))
return predictions
# Make sure that tabular input is provided as float32
sess = ort.InferenceSession(model)
input_name = sess.get_inputs()[0].name
output_name = sess.get_outputs()[0].name

onnx_input = {input_name: tabular_input.astype(np.float32)}
pred_onnx = sess.run([output_name], onnx_input)[0]

return pred_onnx


@st.cache_data
def _run_rise_tabular(_model, table, training_data, **kwargs):
def _run_rise_tabular(_model, table, training_data, _feature_names, **kwargs):
# convert streamlit kwarg requirement back to dianna kwarg requirement
if "_preprocess_function" in kwargs:
kwargs["preprocess_function"] = kwargs["_preprocess_function"]
del kwargs["_preprocess_function"]

def run_model(tabular_input):
return predict(model=_model, tabular_input=tabular_input)

relevances = explain_tabular(
_model,
run_model,
table,
method='RISE',
training_data=training_data,
feature_names=_feature_names,
**kwargs,
)
return relevances


@st.cache_data
def _run_lime_tabular(_model, table, training_data, _feature_names, **kwargs):
# convert streamlit kwarg requirement back to dianna kwarg requirement
if "_preprocess_function" in kwargs:
kwargs["preprocess_function"] = kwargs["_preprocess_function"]
del kwargs["_preprocess_function"]

def run_model(tabular_input):
return predict(model=_model, tabular_input=tabular_input)

relevances = explain_tabular(
_model,
run_model,
table,
method='LIME',
training_data=training_data,
@@ -37,17 +59,22 @@ def _run_lime_tabular(_model, table, training_data, _feature_names, **kwargs):
return relevances

@st.cache_data
def _run_kernelshap_tabular(model, table, training_data, **kwargs):
def _run_kernelshap_tabular(model, table, training_data, _feature_names, **kwargs):
# Kernelshap interface is different. Write model to temporary file.
with tempfile.NamedTemporaryFile() as f:
f.write(model)
f.flush()
relevances = explain_tabular(f.name,
if "_preprocess_function" in kwargs:
kwargs["preprocess_function"] = kwargs["_preprocess_function"]
del kwargs["_preprocess_function"]

def run_model(tabular_input):
return predict(model=model, tabular_input=tabular_input)

relevances = explain_tabular(run_model,
table,
method='KernelSHAP',
training_data=training_data,
feature_names=_feature_names,
**kwargs)
return relevances[0]
return np.array(relevances)


explain_tabular_dispatcher = {
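
A standalone sketch of the new onnxruntime-based prediction path, for reference; the model path and input shape are placeholders, not values from this repository.

```python
# Standalone sketch of the onnxruntime-based predict(); the model path and the
# input shape are placeholders and not taken from this repository.
import numpy as np
import onnxruntime as ort


def predict_sketch(model_bytes: bytes, tabular_input: np.ndarray) -> np.ndarray:
    """Run an in-memory ONNX model on float32 tabular input, as the dashboard does."""
    sess = ort.InferenceSession(model_bytes)
    input_name = sess.get_inputs()[0].name
    output_name = sess.get_outputs()[0].name
    return sess.run([output_name], {input_name: tabular_input.astype(np.float32)})[0]


with open('some_tabular_model.onnx', 'rb') as f:  # placeholder path
    model_bytes = f.read()

row = np.random.rand(1, 10).astype(np.float32)  # shape must match the model's input
print(predict_sketch(model_bytes, row))
```

Building an `ort.InferenceSession` directly (rather than going through `SimpleModelRunner`, or writing the model to a temporary file for KernelSHAP) lets all three explainer wrappers share the same `run_model` closure and pass `_feature_names` through uniformly.
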
16 changes: 8 additions & 8 deletions dianna/dashboard/_movie_model.py
@@ -1,22 +1,22 @@
import os
import numpy as np
import pandas as pd
from _shared import data_directory
from scipy.special import expit as sigmoid
from torchtext.vocab import Vectors
from dianna import utils
from dianna.utils.tokenizers import SpacyTokenizer


class MovieReviewsModelRunner:
"""Creates runner for movie review model."""

def __init__(self, model, word_vectors=None, max_filter_size=5):
def __init__(self, model, word_vector_path=None, max_filter_size=5):
"""Initializes the class."""
if word_vectors is None:
word_vectors = data_directory / 'movie_reviews_word_vectors.txt'
if word_vector_path is None:
word_vector_path = data_directory / 'movie_reviews_word_vectors.txt'

self.run_model = utils.get_function(model)
self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))
self.keys = list(
pd.read_csv(word_vector_path, header=None, delimiter=' ')[0])
self.max_filter_size = max_filter_size
self.tokenizer = SpacyTokenizer()

@@ -35,8 +35,8 @@ def __call__(self, sentences):

# numericalize the tokens
tokens_numerical = [
self.vocab.stoi[token]
if token in self.vocab.stoi else self.vocab.stoi['<unk>']
self.keys.index(token)
if token in self.keys else self.keys.index('<unk>')
for token in tokens
]

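To illustrate the switch from `torchtext.vocab.Vectors` to a plain pandas read, here is a hedged sketch of the new numericalization step; the vocabulary file name and the example tokens are placeholders.

```python
# Sketch of the new pandas-based token lookup; file name and tokens are placeholders.
# The word-vector file is assumed to be space-separated with the token in column 0.
import pandas as pd

keys = list(pd.read_csv('movie_reviews_word_vectors.txt', header=None, delimiter=' ')[0])

tokens = ['a', 'great', 'movie', 'frobnicate']  # 'frobnicate' falls back to '<unk>'
tokens_numerical = [
    keys.index(token) if token in keys else keys.index('<unk>')
    for token in tokens
]
print(tokens_numerical)
```

This drops the torchtext dependency at the cost of an O(n) `list.index` lookup per token; for the small movie-review vocabulary that trade-off is cheap, and a dict mapping token to position would restore constant-time lookups if it ever matters.
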
27 changes: 22 additions & 5 deletions dianna/dashboard/_shared.py
@@ -74,13 +74,25 @@ def _methods_checkboxes(*, choices: Sequence, key):

def _get_params(method: str, key):
if method == 'RISE':
n_masks = 1000
fr = 8
pkeep = 0.1
if 'FRB' in key:
n_masks = 5000
fr = 16
elif 'Tabular' in key:
pkeep = 0.5
elif 'Weather' in key:
n_masks = 10000
elif 'Digits' in key:
n_masks = 5000
return {
'n_masks':
st.number_input('Number of masks', value=1000, key=f'{key}_{method}_nmasks'),
st.number_input('Number of masks', value=n_masks, key=f'{key}_{method}_nmasks'),
'feature_res':
st.number_input('Feature resolution', value=6, key=f'{key}_{method}_fr'),
st.number_input('Feature resolution', value=fr, key=f'{key}_{method}_fr'),
'p_keep':
st.number_input('Probability to be kept unmasked', value=0.1, key=f'{key}_{method}_pkeep'),
st.number_input('Probability to be kept unmasked', value=pkeep, key=f'{key}_{method}_pkeep'),
}

elif method == 'KernelSHAP':
@@ -97,9 +109,14 @@ def _get_params(method: str, key):
}

elif method == 'LIME':
return {
'random_state': st.number_input('Random state', value=2, key=f'{key}_{method}_rs'),
if 'Tabular' in key:
return {
'random_state': st.number_input('Random state', value=0, key=f'{key}_{method}_rs'),
}
else:
return {
'random_state': st.number_input('Random state', value=2, key=f'{key}_{method}_rs'),
}

else:
raise ValueError(f'No such method: {method}')
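
The RISE defaults above key off substrings of the widget key; the following sketch restates that mapping outside of Streamlit so it is easy to inspect. The example keys are placeholders, not necessarily the keys used by the dashboard pages.

```python
# Restates the per-example RISE defaults from _get_params() without Streamlit;
# the example keys below are placeholders, not necessarily real dashboard keys.
def rise_defaults(key: str) -> dict:
    n_masks, feature_res, p_keep = 1000, 8, 0.1
    if 'FRB' in key:
        n_masks, feature_res = 5000, 16
    elif 'Tabular' in key:
        p_keep = 0.5
    elif 'Weather' in key:
        n_masks = 10000
    elif 'Digits' in key:
        n_masks = 5000
    return {'n_masks': n_masks, 'feature_res': feature_res, 'p_keep': p_keep}


for key in ('FRB_Image_cb', 'Digits_Image_cb', 'Sunshine_Tabular_cb', 'Text_cb'):
    print(key, rise_defaults(key))
```

In the dashboard itself these values still appear as editable `st.number_input` widgets, so users can override the defaults per method and per page key.
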
Binary file removed dianna/dashboard/dashboard-screenshot.png
6 changes: 5 additions & 1 deletion dianna/dashboard/pages/Images.py
@@ -41,6 +41,8 @@
image_model_file = download('mnist_model_tf.onnx', 'model')
image_label_file = download('labels_mnist.txt', 'label')

imagekey = 'Digits_Image_cb'

st.markdown(
"""
This example demonstrates the use of DIANNA on a pretrained binary
@@ -71,6 +73,8 @@
image_label_file = st.sidebar.file_uploader('Select labels',
type='txt')

imagekey = 'Image_cb'

if input_type is None:
st.info('Select which input type to use in the left panel to continue')
st.stop()
@@ -93,7 +97,7 @@

with st.container(border=True):
prediction_placeholder = st.empty()
methods, method_params = _methods_checkboxes(choices=choices, key='Image_cb')
methods, method_params = _methods_checkboxes(choices=choices, key=imagekey)

with st.spinner('Predicting class'):
predictions = predict(model=model, image=image)
