Merge pull request #149 from EhsanGharibNezhad/DataProcessor

Data processor
EhsanGharibNezhad · Nov 13, 2023 · 01ab8f3 · 01ab8f3
2 parents f83fbfd + 92d2757
commit 01ab8f3
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 297 deletions.
diff --git a/TelescopeML/DataMaster.py b/TelescopeML/DataMaster.py
@@ -44,57 +44,44 @@
 
 class DataProcessor:
     """
-    Perform various tasks to process the datasets including:
-    - Prepare input and output variables
-    - Split train, validation, test sets
-    - Scale/Normalize the data
+    Perform various tasks to process the datasets, including:
+
+    - Prepare inputs and outputs
+    - Split the dataset into training, validation, and test sets
+    - Scale/normalize the data
     - Visualize the data
-    - feature engineering.
+    - Conduct feature engineering
 
     Parameters
     ----------
-    # trained_model : object
-    #     Trained ML model (optional).
-    # trained_model_history : dict
-    #     History dict from the trained model (optional).
-    feature_values : array
+    feature_values : np.ndarray
         Flux arrays (input data).
-    feature_names : list
+    feature_names : List[str]
         Names of the wavelengths, in microns.
-    target_values : array
+    target_values : np.ndarray
         Target variable array (e.g., Temperature, Gravity, Carbon_to_Oxygen, Metallicity).
     target_name : str
         Name of the target variable.
     is_hyperparam_tuned : str
         Indicates whether hyperparameters are tuned or not ('yes' or 'no').
-    param_grid : dict
+    param_grid : dict, optional
         ML hyperparameters to be tuned (used if is_hyperparam_tuned = 'yes').
-    spectral_resolution : int
+    spectral_resolution : int, optional
         Resolution of the synthetic spectra used to generate the dataset.
     feature_improvement_method : str
-        Indicates the method used for feature improvement ('no', 'pca', 'RFE').
+        Method used for feature improvement ('no', 'pca', 'RFE').
     augmentation_method : str
-        Indicates if augmented dataset is used ('no' or method name).
-    ml_model : object
+        Indicates if an augmented dataset is used ('no' or method name).
+    ml_model : BaseEstimator, optional
         ML model object from sklearn package.
-    ml_model_str : str
+    ml_model_str : str, optional
         Name of the ML model.
-
-    Notes
-    ------
-    This class can be instintiated and utilized with or without the trained ML model. In case of having the trained model,
-    you should have the following parameters:
-        - trained_model
-        - trained_model_history
-
-
+    ml_method : str, optional
+        Machine learning method ('regression' or 'classification').
 
     """
-
     def __init__(
             self,
-            # trained_model: Union[None, BaseEstimator] = None,
-            # trained_model_history: Union[None, Dict] = None,
             feature_values: Union[np.ndarray] = None,
             feature_names: Union[List[str]] = None,
             target_values: Union[np.ndarray] = None,
@@ -109,8 +96,6 @@ def __init__(
             ml_method: str = 'regression',
     ):
 
-        # self.trained_model = trained_model
-        # self.trained_model_history = trained_model_history
         self.feature_values = feature_values
         self.feature_names = feature_names
         self.target_values = target_values
@@ -446,14 +431,10 @@ def standardize_X_column_wise(self,
         X_test = self.X_test if X_test is None else X_test
 
         scaler_X = StandardScaler()
-        # if X_train == None:
+
         self.X_train_standardized_columnwise = scaler_X.fit_transform(X_train)
         self.X_val_standardized_columnwise = scaler_X.transform(X_val)
         self.X_test_standardized_columnwise = scaler_X.transform(X_test)
-        # elif X_train:
-        # X_train_standardized_columnwise = scaler_X.fit_transform(X_train)
-        # X_val_standardized_columnwise = scaler_X.transform(X_val)
-        # X_test_standardized_columnwise = scaler_X.transform(X_test)
 
         self.standardize_X_ColumnWise = scaler_X
 

diff --git a/TelescopeML/DeepTrainer.py b/TelescopeML/DeepTrainer.py
@@ -11,8 +11,6 @@
 
 from typing import Union
 
-# from typing import List, Union, Dict
-# from sklearn.base import BaseEstimator
 
 # ******* Data Visualization Libraries ****************************
 

diff --git a/TelescopeML/IO_utils.py b/TelescopeML/IO_utils.py
@@ -6,17 +6,7 @@
 
 __reference_data__ = os.getenv("TelescopeML_reference_data")
 # print(__reference_data__)
-#
-# if __reference_data__ is None:
-#     raise Exception('\n'
-#                        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
-#                        "TelescopeML Error Message: \n\n"
-#                        "You need to define the path to your reference data.\n"
-#                        "Check out this tutorial: https://ehsangharibnezhad.github.io/TelescopeML/installation.html\n"
-#                        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-#                     )
-# else:
-#     pass
+
 
 
 class LoadSave:
@@ -53,11 +43,7 @@ def create_generic_path(self, output_indicator):
         """
         file_name = f'{output_indicator}__{self.ml_model_str}__' \
                     f'{self.ml_method}'
-        # file_name = f'{indicator}__{self.ml_model_str}' \
-        #             f'__Is_feature_improved_{self.is_feature_improved}' \
-        #             f'__Is_augmented_{self.is_augmented}__' \
-        #             f'Is_tuned__{self.is_tuned}__' \
-        #             f'{self.ml_method}'
+
         generic_path = os.path.join(self.base_path, file_name)
         return generic_path
 
@@ -130,134 +116,3 @@ def load_or_dump_trained_model_CNN(
 
         return loaded_model, history
 
-# class LoadSave:
-#     """
-#     Load and Save trained operators, models, and datasets
-#     """
-#
-#     def __init__(self,
-#                  ml_model_str,
-#                  ml_method,
-#                  is_feature_improved,
-#                  is_augmented,
-#                  is_tuned,
-#                  ):
-#         self.ml_model_str = ml_model_str
-#         self.ml_method = ml_method
-#         self.is_feature_improved = is_feature_improved
-#         self.is_augmented = is_augmented
-#         self.is_tuned = is_tuned
-#         self.reference_data = __reference_data__
-#
-#     def create_generic_path(self, indicator):
-#         """
-#         Create the generic path for saving or loading the trained model
-#
-#         Inputs:
-#         -------
-#         - indicator (str): Indicator for the model type
-#
-#         Returns:
-#         --------
-#         - generic_path (str): The generic path for saving or loading the model
-#         """
-#         file_name = f'{indicator}__{self.ml_model_str}' \
-#                     f'__Is_feature_improved_{self.is_feature_improved}' \
-#                     f'__Is_augmented_{self.is_augmented}__' \
-#                     f'Is_tuned__{self.is_tuned}__' \
-#                     f'{self.ml_method}'
-#         generic_path = os.path.join(self.reference_data, file_name)
-#         return generic_path
-#
-#     def load_or_dump_trained_object(self,
-#                                     trained_object,
-#                                     indicator,
-#                                     load_or_dump='dump'):
-#         """
-#         Load or save the trained object
-#
-#         Inputs:
-#         -------
-#         - trained_object : The object to be saved or loaded
-#         - indicator (str): Indicator for the type of trained object
-#         - load_or_dump (str): 'dump' or 'load'
-#         """
-#         generic_path = self.create_generic_path(indicator)
-#
-#         if load_or_dump == 'dump':
-#             with open(generic_path, 'wb') as file:
-#                 pk.dump(trained_object, file)
-#         elif load_or_dump == 'load':
-#             with open(generic_path, 'rb') as file:
-#                 return pk.load(file)
-#
-#
-# def load_or_dump_trained_model_CNN(
-#                                    trained_model = None,
-#                                    indicator='TrainedCNN',
-#                                    load_or_dump='dump'):
-#     """
-#     Load or save the trained CNN model
-#
-#     Inputs:
-#     -------
-#     - trained_model : The trained CNN model
-#     - indicator (str): Indicator for the type of trained model
-#     - load_or_dump (str): 'dump' or 'load'
-#     """
-#     # json_path = self.create_generic_path(f'{indicator}_json')
-#     # weights_path = self.create_generic_path(f'{indicator}_weights')
-#
-#     if load_or_dump == 'dump':
-#         trained_model.trained_model.save(
-#             os.path.join(__reference_data__,
-#                          'trained_ML_models/trained_CNN_architecture_'+indicator+'.h5'
-#                          ))
-#
-#         trained_model.trained_model.save_weights(
-#             os.path.join(__reference_data__,
-#                          'trained_ML_models/trained_CNN_weights_'+indicator+'.h5',
-#                          ))
-#
-#
-#         with open(
-#                 os.path.join(__reference_data__,
-#                              'trained_ML_models/trained_CNN_history_'+indicator+'.h5',
-#                              ),
-#             'wb') as file:
-#
-#         # with open(
-#         #         f'../outputs/trained_models/trained_CNN_history_{indicator}.pkl',
-#         #         'wb') as file:
-#             pk.dump(trained_model.history.history, file)
-#
-#     elif load_or_dump == 'load':
-#
-#         loaded_model = load_model(
-#             os.path.join(__reference_data__,
-#                          'trained_ML_models/trained_CNN_architecture_'+indicator+'.h5' ))
-#
-#         loaded_model.load_weights(
-#             os.path.join(__reference_data__,
-#                          'trained_ML_models/trained_CNN_weights_'+indicator+'.h5' ))
-#
-#         # loaded_model = load_model(
-#         #     f'../outputs/trained_models/trained_CNN_architecture_{indicator}.h5')
-#         # loaded_model.load_weights(
-#         #     f'../outputs/trained_models/trained_CNN_weights_{indicator}.h5')
-#
-#         # Loading the saved history object
-#
-#         with open(
-#                 os.path.join(__reference_data__,
-#                              'trained_ML_models/trained_CNN_weights_'+indicator+'.pkl'
-#                              ),
-#                 'rb') as file:
-#                 history = pk.load(file)
-#
-#         # with open(
-#         #         f'../outputs/trained_models/trained_CNN_history_{indicator}.pkl',
-#         #         'rb') as file:
-#         #         history = pk.load(file)
-#
-#         return loaded_model, history
diff --git a/TelescopeML/Predictor.py b/TelescopeML/Predictor.py
@@ -45,23 +45,7 @@
 
 
 __reference_data__ = os.getenv("TelescopeML_reference_data")
-print(__reference_data__)
-
-# if __reference_data__ is None:
-#     raise Exception('\n'
-#                        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
-#                        "TelescopeML Error Message: \n\n"
-#                        "You need to define the path to your reference data.\n"
-#                        "Check out this tutorial: https://ehsangharibnezhad.github.io/TelescopeML/installation.html\n"
-#                        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-#                     )
-# else:
-#     pass
-
-
-# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-# Data Visualizing libararies
-# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+# print(__reference_data__)
 
 
 # ===============================================================================
@@ -230,10 +214,6 @@ def ProcessObservationalDataset(self,
                  Fnu_errors = Fnu_obs_err,
                  bd_literature_dic = bd_literature_dic)
 
-        # self.obs_data_df['Fnu_obs'] = self.Fnu_obs
-        # self.obs_data_df['Fnu_obs_err'] = self.Fnu_obs_err
-        # self.obs_data_df['Fnu_obs_absolute'] = self.Fnu_obs_absolute
-        # self.obs_data_df['Fnu_obs_absolute_err'] = self.Fnu_obs_absolute_err
 
         return Fnu_obs , Fnu_obs_err, Fnu_obs_absolute, Fnu_obs_absolute_err
 
@@ -463,14 +443,6 @@ def Process_Observational_Dataset(self,
             print_results_fun(targets=self.targets_single_spectrum_dic,
                               print_title='Predicted Targets from the Signle Observational Spectrum:')
 
-        # if __plot_predicted_vs_observed__:
-        #     plot_predicted_vs_observed(
-        #         training_datasets=self.training_dataset_df,
-        #         wl=self.wl,
-        #         predicted_targets_dic=self.targets_single_spectrum_dic,
-        #         object_name=self.object_name,
-        #         Fnu_obs_absolute_intd_df=self.Fnu_obs_absolute_intd_df,
-        #     )
 
     def predict_from_random_spectra(
             self,
@@ -555,21 +527,17 @@ def predict_from_random_spectra(
             df_MinMax_obs = pd.DataFrame(
                 (Fnu_obs_absolute_intd_df_min, Fnu_obs_absolute_intd_df_max)
             ).T
-            # print('Bug check1 -- df_MinMax_obs:', df_MinMax_obs)
+
             XminXmax_Stand = self.trained_X_ColWise_MinMax[0].transform(df_MinMax_obs.values)
             # XminXmax_Stand = self.trained_data_processor.normalize_X_ColumnWise.transform(df_MinMax_obs.values)
 
-            # print('Bug check2 -- XminXmax_Stand:', XminXmax_Stand)
-
 
             bd_mean = Fnu_obs_absolute_intd_df.mean(axis=1)[0]
             bd_std = Fnu_obs_absolute_intd_df.std(axis=1)[0]
 
-            # print('Bug check3 -- bd_mean, bd_std:', bd_mean, bd_std)
 
             # X_Scaled = (Fnu_obs_absolute_intd_df.div((self.bd_literature_dic['bd_radius_Rjup'])**2).values[0] - bd_mean) / bd_std
             X_Scaled = (Fnu_obs_absolute_intd_df.values[0] - bd_mean) / bd_std
-            # print('Bug check4 -- X_Scaled:', X_Scaled)
 
             y_pred_train = np.array(
                 self.trained_ML_model.predict(
@@ -595,14 +563,9 @@ def predict_from_random_spectra(
             # self.filtered_df4 = filtered_df4
             # print(filtered_df4.iloc[0,0:-5].values)
 
-            # if __print_results__: FINDME
-
             spectra_list_pre.append(filtered_df4.iloc[:, 0:-5].div((self.bd_literature_dic['bd_radius_Rjup'])**2).values.flatten())
             # spectra_list_pre.append(filtered_df4.iloc[:, 0:-5].values.flatten())
-            # print('Bug check5 -- spectra_list_pre:', spectra_list_pre)
 
-        # print('*'*10+'  Filtered and Interpolated training data based on the ML predicted parameters  '+'*'*10)
-        # print(spectra_list_pre)
 
         self.spectra_list_obs = spectra_list_obs
         self.spectra_list_pre = spectra_list_pre
@@ -672,15 +635,7 @@ def predict_from_random_spectra(
             boxplot_hist(self.df_random_pred['c_o'], x_label=r'C/O', xy_loc=[0.05, 0.98])
             boxplot_hist(self.df_random_pred['met'], x_label=r'[M/H]', xy_loc=[0.05, 0.98])
 
-        # if __plot_predicted_vs_observed__:
-        #     plot_predicted_vs_observed(
-        #         training_datasets=self.training_dataset_df,
-        #         wl=self.wl_synthetic,
-        #         predicted_targets_dic=self.dic_random_pred_mean,
-        #         object_name=self.object_name,
-        #         Fnu_obs_absolute_intd_df=self.Fnu_obs_absolute_intd_df,
-        #         __print_results__=False,
-        #     )
+
 
         if __plot_pred_vs_obs_errorbar__:
             plot_pred_vs_obs_errorbar(