v.1.3.0 #53

Open · wants to merge 18 commits into base: master
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -45,14 +45,14 @@ jobs:
python -m pip install --upgrade pip
pip install rdkit==2024.3.1
conda install -y -c conda-forge openbabel
- conda install -y -c conda-forge xtb=6.6.1
+ conda install -y -c conda-forge xtb=6.7.1
conda install -y -c conda-forge glib
pip install weasyprint
conda install -y -c conda-forge gtk3
conda install -y -c conda-forge pango
pip install aqme==1.7.0
pip install .
- pip install scikit-learn-intelex==2024.4.0
+ pip install scikit-learn-intelex==2025.0.1
# uninstall robert to avoid codecov issues (only running from the robert folder)
pip uninstall -y robert
# install and run pytest
2 changes: 1 addition & 1 deletion Examples/CSV_workflow/ROBERT_full_workflow.ipynb
@@ -14,7 +14,7 @@
"metadata": {},
"outputs": [],
"source": [
"command_line = \"python -m robert --ignore \\\"['Name']\\\" --y Target_values --csv_name Robert_example.csv --csv_test Robert_example_test.csv --names Name\"\n",
"command_line = \"python -m robert --y Target_values --csv_name Robert_example.csv --csv_test Robert_example_test.csv --names Name\"\n",
"\n",
"def run_command(cmd):\n",
" from subprocess import Popen, PIPE\n",
4 changes: 2 additions & 2 deletions README.md
@@ -19,7 +19,7 @@ Don't miss out the latest hands-on tutorials from our [YouTube channel](https://
2. Activate conda environment: `conda activate robert`
3. Install ROBERT using pip: `pip install robert`
4. Install libraries for the PDF report `conda install -y -c conda-forge glib gtk3 pango mscorefonts`
- 5. (Only for compatible devices) Install Intelex accelerator: `pip install scikit-learn-intelex`
+ 5. (Only for compatible devices) Install Intelex accelerator: `pip install scikit-learn-intelex==2025.0.1` (see the usage sketch below)
* Inexperienced users should visit the *Users with no Python experience* section in [Read the Docs](https://robert.readthedocs.io).
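
Once the accelerator is installed, it is normally enabled by patching scikit-learn before any estimators are imported. A minimal usage sketch based on the standard scikit-learn-intelex API (independent of ROBERT itself):

```python
# the patch must run before importing scikit-learn estimators
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.ensemble import RandomForestRegressor  # now Intel-accelerated where supported
model = RandomForestRegressor(random_state=0)
```
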
## Update the program
1. Update to the latest version: `pip install robert --upgrade`
@@ -53,7 +53,7 @@ We really THANK all the testers for their feedback and for participating in the

## How to cite ROBERT
If you use any of the ROBERT modules, please include this citation:
- * Dalmau, D.; Alegre Requena, J. V. ROBERT: Bridging the Gap between Machine Learning and Chemistry. *Wiley Interdiscip. Rev. Comput. Mol. Sci.* **2024**, *accepted*. DOI: 10.1002/WCMS.1733.
+ * Dalmau, D.; Alegre Requena, J. V. ROBERT: Bridging the Gap between Machine Learning and Chemistry. *Wiley Interdiscip. Rev. Comput. Mol. Sci.* **2024**, *14*, e1733.

If you use the AQME module, please include this citation:
* Alegre-Requena et al., AQME: Automated Quantum Mechanical Environments for Researchers and Educators. *Wiley Interdiscip. Rev. Comput. Mol. Sci.* **2023**, *13*, e1663.
24 changes: 24 additions & 0 deletions docs/Misc/versions.rst
@@ -4,6 +4,30 @@
Versions
========

+ Version 1.3.1 [`url <https://github.com/jvalegre/robert/releases/tag/1.3.1>`__]
+ - Fixed a one-hot encoding bug in the one-hot test
+ - Added the option to disable the automatic standardization of descriptors (--std False)
+ - Changed CV_test so that it standardizes the descriptors within each fold (see the sketch below)
+ - Fixed a bug with the sklearn-intelex accelerator
+ - Fixed a threading bug with matplotlib in SHAP
+ - Sorted the training points in all split methods so that GENERATE models match PREDICT/VERIFY
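
The per-fold standardization mentioned above follows the usual scikit-learn pattern: the scaler is re-fit on the training portion of every fold, so the validation fold never leaks into the scaling statistics. A minimal sketch of the idea (not ROBERT's actual code):

.. code-block:: python

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # toy data standing in for a curated descriptor matrix
    rng = np.random.default_rng(0)
    X = rng.normal(size=(60, 5))
    y = X @ rng.normal(size=5) + rng.normal(scale=0.1, size=60)

    # the pipeline re-fits StandardScaler inside every CV fold
    model = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=0))
    scores = cross_val_score(model, X, y,
                             cv=KFold(n_splits=5, shuffle=True, random_state=0))
    print(round(scores.mean(), 2))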

+ Version 1.3.0 [`url <https://github.com/jvalegre/robert/releases/tag/1.3.0>`__]
+ - Fixed a bug in the KNN imputer (it was incorrectly placing values in the target variable)
+ - Added a new way of splitting data (stratified) to ensure that validation points are taken from across the whole range of target values (see the first sketch below)
+ - Fixed a bug to support spaces in descriptor names
+ - Changed the way the best model is selected (now using a combined error metric, not only the validation error)
+ - Fixed a bug in GENERATE when plotting the models' heatmap if a model had infinite values
+ - Auto_test is now run by default if the database has more than 100 datapoints
+ - The 90% training size is disabled for datasets with fewer than 100 datapoints, and the 80% size for fewer than 50 datapoints
+ - Changed model parameters to avoid overfitting in small datasets
+ - Fixed a bug where ROBERT was not reading some CSV files correctly when saved as UTF-8
+ - Fixed a bug in the report module when Target_values contained spaces
+ - MVL is replaced with AdaB when ROBERT automatically assigns classification problems
+ - Added automatic checks to ensure compatible classification problems
+ - The ROBERT score is printed in the section title of the report to save space
+ - k-means clustering is applied individually to the different target values in classification problems, giving a more balanced training selection (see the second sketch below)
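
The stratified split introduced in 1.3.0 can be illustrated by binning the continuous target and stratifying on the bins, so that the validation points cover the whole range of y. A minimal sketch (not ROBERT's exact implementation):

.. code-block:: python

    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    y = pd.Series(rng.uniform(0, 10, size=100), name="Target_values")

    # quantile bins turn the continuous target into strata
    bins = pd.qcut(y, q=5, labels=False)
    train_idx, valid_idx = train_test_split(y.index, test_size=0.2,
                                            stratify=bins, random_state=0)
    print(sorted(y[valid_idx].round(1)))  # validation spans the full 0-10 range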
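
Similarly, the per-class k-means selection can be sketched as clustering each class separately and keeping the points closest to the centroids, so every class contributes the same number of representative training points (again an illustration, not the actual ROBERT code):

.. code-block:: python

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances_argmin_min

    rng = np.random.default_rng(0)
    X = rng.normal(size=(40, 3))
    y = np.array([0] * 20 + [1] * 20)

    train_idx = []
    for label in np.unique(y):
        idx = np.where(y == label)[0]
        # cluster one class at a time and keep the sample nearest each centroid
        km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X[idx])
        closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, X[idx])
        train_idx.extend(idx[closest])
    print(sorted(train_idx))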

Version 1.2.1 [`url <https://github.com/jvalegre/robert/releases/tag/1.2.1>`__]
- The NN solver is now set to 'lbfgs' by default in the MLPRegressor to work with small datasets
- Thres_x is now set to 0.7 by default in the CURATE module
11 changes: 7 additions & 4 deletions docs/README.rst
@@ -20,6 +20,9 @@
.. |ReadtheDocs| image:: https://img.shields.io/readthedocs/robert?label=Read%20the%20Docs&logo=readthedocs
:target: https://robert.readthedocs.io
:alt: Documentation Status

+ .. |PyPI| image:: https://img.shields.io/pypi/v/robert
+ :target: https://pypi.org/project/robert/

|CircleCI|
|Codecov|
@@ -111,7 +114,7 @@ it also works for newer Python versions (i.e., 3.11 and 3.12):
.. code-block:: shell

pip install robert
- pip install scikit-learn-intelex
+ pip install scikit-learn-intelex==2025.0.1

**3.** Install GLib, GTK3, pango and mscorefonts to avoid errors when creating the PDF report:

@@ -129,7 +132,7 @@

conda install -c conda-forge robert
pip install robert --upgrade
- pip install scikit-learn-intelex
+ pip install scikit-learn-intelex==2025.0.1
conda install -y -c conda-forge glib gtk3 pango mscorefonts

.. installation-end
@@ -167,7 +170,7 @@ you can install `Miniconda with Python 3 <https://docs.conda.io/projects/minicon
**5.** Install ROBERT as defined in the "Installation" section (:code:`pip install robert`).


- **6.** Install the intelex code accelerator (only if your system is compatible with intelex) (:code:`pip install scikit-learn-intelex`).
+ **6.** Install the intelex code accelerator (only if your system is compatible with intelex) (:code:`pip install scikit-learn-intelex==2025.0.1`).


**7.** Install GLib, GTK3, pango and mscorefonts to avoid errors when creating the PDF report (:code:`conda install -y -c conda-forge glib gtk3 pango mscorefonts`).
@@ -303,7 +306,7 @@ Reference

If you use any of the ROBERT modules, please include this citation:

- * Dalmau, D.; Alegre Requena, J. V. ROBERT: Bridging the Gap between Machine Learning and Chemistry. *Wiley Interdiscip. Rev. Comput. Mol. Sci.* **2024**, *accepted*. DOI: 10.1002/WCMS.1733.
+ * Dalmau, D.; Alegre Requena, J. V. ROBERT: Bridging the Gap between Machine Learning and Chemistry. *Wiley Interdiscip. Rev. Comput. Mol. Sci.* **2024**, *14*, e1733.

If you use the AQME module, please include this citation:

6 changes: 6 additions & 0 deletions robert/aqme.py
@@ -99,6 +99,12 @@ def run_csearch_qdescp(self,csv_target,aqme_test=False):

# Load database
csv_df = load_database(self,csv_target,job_type)
+ # avoid running calculations when names contain special characters (e.g., *)
+ for name_csv_indiv in csv_df['code_name']:
+ if '*' in f'{name_csv_indiv}':
+ self.args.log.write(f"\nx WARNING! The names provided in the CSV contain * (i.e. {name_csv_indiv}). Please remove all the * characters.")
+ self.args.log.finalize()
+ sys.exit()

# find if there is more than one SMILES column in the CSV file
for column in csv_df.columns:
3 changes: 2 additions & 1 deletion robert/argument_parser.py
@@ -28,6 +28,7 @@
"ignore" : [],
"categorical" : "onehot",
"corr_filter" : True,
"std" : True,
"desc_thres" : 25,
"thres_y" : 0.001,
"thres_x" : 0.7,
@@ -37,7 +38,7 @@
"auto_kn" : True,
"auto_type": True,
"filter_train" : True,
"split" : "RND",
"split" : "stratified",
"model" : ['RF','GB','NN','MVL'],
"eval_model" : 'MVL',
"custom_params" : None,
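
For reference, the new `std` default shown above can be disabled from the command line. A hypothetical invocation combining flags that appear elsewhere in this PR (the `--std False` option is documented in the 1.3.1 changelog above):

.. code-block:: shell

    python -m robert --csv_name Robert_example.csv --y Target_values --names Name --std False
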
171 changes: 6 additions & 165 deletions robert/curate.py
@@ -53,11 +53,8 @@
import time
import os
import pandas as pd
- from scipy import stats
- from robert.utils import load_variables, finish_print, load_database, pearson_map, check_clas_problem
- from sklearn.feature_selection import RFECV
- from sklearn.model_selection import KFold
- from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
+ from robert.utils import (load_variables, finish_print, load_database, pearson_map,
+ check_clas_problem, categorical_transform, correlation_filter)


class curate:
@@ -80,20 +77,19 @@ def __init__(self, **kwargs):
# load database, discard user-defined descriptors and perform data checks
csv_df = load_database(self,self.args.csv_name,"curate")

- # changes type to classification if there are only two different y values
- if self.args.type.lower() == 'reg' and self.args.auto_type:
- self = check_clas_problem(self,csv_df)
+ # adjust options of classification problems and detect whether the right type of problem was used
+ self = check_clas_problem(self,csv_df)

if not self.args.evaluate:
# transform categorical descriptors
- csv_df = self.categorical_transform(csv_df,'curate')
+ csv_df = categorical_transform(self,csv_df,'curate')

# apply duplicate filters (i.e., duplication of datapoints or descriptors)
csv_df = self.dup_filter(csv_df)

# apply the correlation filters and returns the database without correlated descriptors
if self.args.corr_filter:
- csv_df = self.correlation_filter(csv_df)
+ csv_df = correlation_filter(self,csv_df)

# create Pearson heatmap
_ = pearson_map(self,csv_df,'curate')
@@ -105,53 +101,6 @@
_ = finish_print(self,start_time,'CURATE')


def categorical_transform(self,csv_df,module):
# converts all columns with strings into categorical values (one-hot encoding
# by default; can be set to numerical 1,2,3... with categorical = 'numbers').
# Troubleshooting! For one-hot encoding, don't use variable names that are
# also column headers! e.g., the descriptor "C_atom" contains C2 as a value,
# but C2 is already the header of a different column in the database. The same applies
# to multiple columns containing the same variable names.

if module.lower() == 'curate':
txt_categor = f'\no Analyzing categorical variables'

descriptors_to_drop, categorical_vars, new_categor_desc = [],[],[]
for column in csv_df.columns:
if column not in self.args.ignore and column != self.args.y:
if(csv_df[column].dtype == 'object'):
descriptors_to_drop.append(column)
categorical_vars.append(column)
if self.args.categorical.lower() == 'numbers':
csv_df[column] = csv_df[column].astype('category')
csv_df[column] = csv_df[column].cat.codes
else:
_ = csv_df[column].unique() # is this necessary?
categor_descs = pd.get_dummies(csv_df[column])
csv_df = csv_df.drop(column, axis=1)
csv_df = pd.concat([csv_df, categor_descs], axis=1)
for desc in categor_descs:
new_categor_desc.append(desc)

if module.lower() == 'curate':
if len(categorical_vars) == 0:
txt_categor += f'\n - No categorical variables were found'
else:
if self.args.categorical.lower() == 'numbers':
txt_categor += f'\n A total of {len(categorical_vars)} categorical variables were converted using the {self.args.categorical} mode in the categorical option:\n'
txt_categor += '\n'.join(f' - {var}' for var in categorical_vars)
else:
txt_categor += f'\n A total of {len(categorical_vars)} categorical variables were converted using the {self.args.categorical} mode in the categorical option'
txt_categor += f'\n Initial descriptors:\n'
txt_categor += '\n'.join(f' - {var}' for var in categorical_vars)
txt_categor += f'\n Generated descriptors:\n'
txt_categor += '\n'.join(f' - {var}' for var in new_categor_desc)

self.args.log.write(f'{txt_categor}')

return csv_df


def dup_filter(self,csv_df_dup):
'''
Removes duplicated datapoints and descriptors
@@ -179,114 +128,6 @@ def dup_filter(self,csv_df_dup):
return csv_df_dup


def correlation_filter(self, csv_df):
"""
Discards a) correlated variables and b) variables that do not correlate with the y values
(based on R**2 values), and c) reduces the number of descriptors to one third of the datapoints using RFECV.
"""

txt_corr = ''

# loosen correlation filters if there are too few descriptors
n_descps = len(csv_df.columns)-len(self.args.ignore)-1 # all columns - ignored - y
if self.args.desc_thres and n_descps*self.args.desc_thres < len(csv_df[self.args.y]):
self.args.thres_x = 0.95
self.args.thres_y = 0.0001
txt_corr += f'\nx WARNING! The number of descriptors ({n_descps}) is {self.args.desc_thres} times lower than the number of datapoints ({len(csv_df[self.args.y])}), so the correlation filters are loosened to thres_x = 0.95 and thres_y = 0.0001! Default thresholds (0.9 and 0.001) can be used with "--desc_thres False"'

txt_corr += f'\no Correlation filter activated with these thresholds: thres_x = {self.args.thres_x}, thres_y = {self.args.thres_y}'

descriptors_drop = []
txt_corr += f'\n Excluded descriptors:'
for i,column in enumerate(csv_df.columns):
if column not in descriptors_drop and column not in self.args.ignore and column != self.args.y:
# finds the descriptors with low correlation to the response values
try:
res_y = stats.linregress(csv_df[column],csv_df[self.args.y])
rsquared_y = res_y.rvalue**2
if rsquared_y < self.args.thres_y:
descriptors_drop.append(column)
txt_corr += f'\n - {column}: R**2 = {round(rsquared_y,2)} with the {self.args.y} values'
except ValueError: # this avoids X descriptors where the majority of the values are the same
descriptors_drop.append(column)
txt_corr += f'\n - {column}: error in R**2 with the {self.args.y} values (are all the values the same?)'

# finds correlated descriptors
if column != csv_df.columns[-1] and column not in descriptors_drop:
for j,column2 in enumerate(csv_df.columns):
if j > i and column2 not in self.args.ignore and column not in descriptors_drop and column2 not in descriptors_drop and column2 != self.args.y:
res_x = stats.linregress(csv_df[column],csv_df[column2])
rsquared_x = res_x.rvalue**2
if rsquared_x > self.args.thres_x:
# discard the column with less correlation with the y values
res_xy = stats.linregress(csv_df[column2],csv_df[self.args.y])
rsquared_y2 = res_xy.rvalue**2
if rsquared_y >= rsquared_y2:
descriptors_drop.append(column2)
txt_corr += f'\n - {column2}: R**2 = {round(rsquared_x,2)} with {column}'
else:
descriptors_drop.append(column)
txt_corr += f'\n - {column}: R**2 = {round(rsquared_x,2)} with {column2}'

# drop descriptors that did not pass the filters
csv_df_filtered = csv_df.drop(descriptors_drop, axis=1)

# Check if descriptors are more than one third of datapoints
n_descps = len(csv_df_filtered.columns)-len(self.args.ignore)-1 # all columns - ignored - y
if len(csv_df[self.args.y]) > 50 and self.args.auto_test ==True:
datapoints = len(csv_df[self.args.y])*0.9
else:
datapoints = len(csv_df[self.args.y])
if n_descps > datapoints / 3:
num_descriptors = int(datapoints / 3)
# Avoid situations where the number of descriptors is equal to the number of datapoints/3
if len(csv_df[self.args.y]) / 3 == num_descriptors:
num_descriptors -= 1
# Use RFECV with a simple RandomForestRegressor to select the most important descriptors
if self.args.type.lower() == 'reg':
estimator = RandomForestRegressor(random_state=0, n_estimators=30, max_depth=10, n_jobs=None)
elif self.args.type.lower() == 'clas':
estimator = RandomForestClassifier(random_state=0, n_estimators=30, max_depth=10, n_jobs=None)
if self.args.kfold == 'auto':
# LOOCV for relatively small datasets (less than 50 datapoints)
if len(csv_df[self.args.y]) < 50:
n_splits = len(csv_df[self.args.y])
cv_type = 'LOOCV'
# k-fold CV with the same training/validation proportion used for fitting the model, using 5 splits
else:
n_splits = 5
cv_type = '5-fold CV'
else:
n_splits = self.args.kfold
cv_type = f'{n_splits}-fold CV'
txt_corr += f'\n\no Descriptors reduced to one third of datapoints using RFECV with {cv_type}: {num_descriptors} descriptors remaining'

selector = RFECV(estimator, min_features_to_select=num_descriptors, cv=KFold(n_splits=n_splits, shuffle=True, random_state=0), n_jobs=None)
X = csv_df_filtered.drop([self.args.y] + self.args.ignore, axis=1)
y = csv_df_filtered[self.args.y]
# Convert column names to strings to avoid any issues
X.columns = X.columns.astype(str)
selector.fit(X, y)
# Sort the descriptors by their importance scores
descriptors_importances = list(zip(X.columns, selector.estimator_.feature_importances_))
sorted_descriptors = sorted(descriptors_importances, key=lambda x: x[1], reverse=True)
selected_descriptors = [descriptor for descriptor, _ in sorted_descriptors[:num_descriptors]]
# Find the descriptors to drop
descriptors_drop = [descriptor for descriptor in csv_df_filtered.columns if descriptor not in selected_descriptors and descriptor not in self.args.ignore and descriptor != self.args.y]
csv_df_filtered = csv_df_filtered.drop(descriptors_drop, axis=1)

if len(descriptors_drop) == 0:
txt_corr += f'\n - No descriptors were removed'

self.args.log.write(txt_corr)

txt_csv = f'\no {len(csv_df_filtered.columns)} columns remaining after applying duplicate, correlation filters and RFECV:\n'
txt_csv += '\n'.join(f' - {var}' for var in csv_df_filtered.columns)
self.args.log.write(txt_csv)

return csv_df_filtered


def save_curate(self,csv_df):
'''
Saves the curated database and options used in CURATE