Skip to content

Commit

Permalink
Merge pull request #149 from EhsanGharibNezhad/DataProcessor
Browse files Browse the repository at this point in the history
Data processor
  • Loading branch information
EhsanGharibNezhad authored Nov 13, 2023
2 parents f83fbfd + 92d2757 commit 01ab8f3
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 297 deletions.
55 changes: 18 additions & 37 deletions TelescopeML/DataMaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,57 +44,44 @@

class DataProcessor:
"""
Perform various tasks to process the datasets including:
- Prepare input and output variables
- Split train, validation, test sets
- Scale/Normalize the data
Perform various tasks to process the datasets, including:
- Prepare inputs and outputs
- Split the dataset into training, validation, and test sets
- Scale/normalize the data
- Visualize the data
- feature engineering.
- Conduct feature engineering
Parameters
----------
# trained_model : object
# Trained ML model (optional).
# trained_model_history : dict
# History dict from the trained model (optional).
feature_values : array
feature_values : np.ndarray
Flux arrays (input data).
feature_names : list
feature_names : List[str]
Name of wavelength in micron.
target_values : array
target_values : np.ndarray
Target variable array (e.g., Temperature, Gravity, Carbon_to_Oxygen, Metallicity).
target_name : str
Name of the target variable.
is_hyperparam_tuned : str
Indicates whether hyperparameters are tuned or not ('yes' or 'no').
param_grid : dict
param_grid : dict, optional
ML hyperparameters to be tuned (used if is_hyperparam_tuned = 'yes').
spectral_resolution : int
spectral_resolution : int, optional
Resolution of the synthetic spectra used to generate the dataset.
feature_improvement_method : str
Indicates the method used for feature improvement ('no', 'pca', 'RFE').
Method used for feature improvement ('no', 'pca', 'RFE').
augmentation_method : str
Indicates if augmented dataset is used ('no' or method name).
ml_model : object
Indicates if an augmented dataset is used ('no' or method name).
ml_model : BaseEstimator, optional
ML model object from sklearn package.
ml_model_str : str
ml_model_str : str, optional
Name of the ML model.
Notes
------
This class can be instantiated and utilized with or without the trained ML model. In case of having the trained model,
you should have the following parameters:
- trained_model
- trained_model_history
ml_method : str, optional
Machine learning method ('regression' or 'classification').
"""

def __init__(
self,
# trained_model: Union[None, BaseEstimator] = None,
# trained_model_history: Union[None, Dict] = None,
feature_values: Union[np.ndarray] = None,
feature_names: Union[List[str]] = None,
target_values: Union[np.ndarray] = None,
Expand All @@ -109,8 +96,6 @@ def __init__(
ml_method: str = 'regression',
):

# self.trained_model = trained_model
# self.trained_model_history = trained_model_history
self.feature_values = feature_values
self.feature_names = feature_names
self.target_values = target_values
Expand Down Expand Up @@ -446,14 +431,10 @@ def standardize_X_column_wise(self,
X_test = self.X_test if X_test is None else X_test

scaler_X = StandardScaler()
# if X_train == None:

self.X_train_standardized_columnwise = scaler_X.fit_transform(X_train)
self.X_val_standardized_columnwise = scaler_X.transform(X_val)
self.X_test_standardized_columnwise = scaler_X.transform(X_test)
# elif X_train:
# X_train_standardized_columnwise = scaler_X.fit_transform(X_train)
# X_val_standardized_columnwise = scaler_X.transform(X_val)
# X_test_standardized_columnwise = scaler_X.transform(X_test)

self.standardize_X_ColumnWise = scaler_X

Expand Down
2 changes: 0 additions & 2 deletions TelescopeML/DeepTrainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@

from typing import Union

# from typing import List, Union, Dict
# from sklearn.base import BaseEstimator

# ******* Data Visualization Libraries ****************************

Expand Down
149 changes: 2 additions & 147 deletions TelescopeML/IO_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,7 @@

__reference_data__ = os.getenv("TelescopeML_reference_data")
# print(__reference_data__)
#
# if __reference_data__ is None:
# raise Exception('\n'
# "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
# "TelescopeML Error Message: \n\n"
# "You need to define the path to your reference data.\n"
# "Check out this tutorial: https://ehsangharibnezhad.github.io/TelescopeML/installation.html\n"
# "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
# )
# else:
# pass



class LoadSave:
Expand Down Expand Up @@ -53,11 +43,7 @@ def create_generic_path(self, output_indicator):
"""
file_name = f'{output_indicator}__{self.ml_model_str}__' \
f'{self.ml_method}'
# file_name = f'{indicator}__{self.ml_model_str}' \
# f'__Is_feature_improved_{self.is_feature_improved}' \
# f'__Is_augmented_{self.is_augmented}__' \
# f'Is_tuned__{self.is_tuned}__' \
# f'{self.ml_method}'

generic_path = os.path.join(self.base_path, file_name)
return generic_path

Expand Down Expand Up @@ -130,134 +116,3 @@ def load_or_dump_trained_model_CNN(

return loaded_model, history

# class LoadSave:
# """
# Load and Save trained operators, models, and datasets
# """
#
# def __init__(self,
# ml_model_str,
# ml_method,
# is_feature_improved,
# is_augmented,
# is_tuned,
# ):
# self.ml_model_str = ml_model_str
# self.ml_method = ml_method
# self.is_feature_improved = is_feature_improved
# self.is_augmented = is_augmented
# self.is_tuned = is_tuned
# self.reference_data = __reference_data__
#
# def create_generic_path(self, indicator):
# """
# Create the generic path for saving or loading the trained model
#
# Inputs:
# -------
# - indicator (str): Indicator for the model type
#
# Returns:
# --------
# - generic_path (str): The generic path for saving or loading the model
# """
# file_name = f'{indicator}__{self.ml_model_str}' \
# f'__Is_feature_improved_{self.is_feature_improved}' \
# f'__Is_augmented_{self.is_augmented}__' \
# f'Is_tuned__{self.is_tuned}__' \
# f'{self.ml_method}'
# generic_path = os.path.join(self.reference_data, file_name)
# return generic_path
#
# def load_or_dump_trained_object(self,
# trained_object,
# indicator,
# load_or_dump='dump'):
# """
# Load or save the trained object
#
# Inputs:
# -------
# - trained_object : The object to be saved or loaded
# - indicator (str): Indicator for the type of trained object
# - load_or_dump (str): 'dump' or 'load'
# """
# generic_path = self.create_generic_path(indicator)
#
# if load_or_dump == 'dump':
# with open(generic_path, 'wb') as file:
# pk.dump(trained_object, file)
# elif load_or_dump == 'load':
# with open(generic_path, 'rb') as file:
# return pk.load(file)
#
#
# def load_or_dump_trained_model_CNN(
# trained_model = None,
# indicator='TrainedCNN',
# load_or_dump='dump'):
# """
# Load or save the trained CNN model
#
# Inputs:
# -------
# - trained_model : The trained CNN model
# - indicator (str): Indicator for the type of trained model
# - load_or_dump (str): 'dump' or 'load'
# """
# # json_path = self.create_generic_path(f'{indicator}_json')
# # weights_path = self.create_generic_path(f'{indicator}_weights')
#
# if load_or_dump == 'dump':
# trained_model.trained_model.save(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_architecture_'+indicator+'.h5'
# ))
#
# trained_model.trained_model.save_weights(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_weights_'+indicator+'.h5',
# ))
#
#
# with open(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_history_'+indicator+'.h5',
# ),
# 'wb') as file:
#
# # with open(
# # f'../outputs/trained_models/trained_CNN_history_{indicator}.pkl',
# # 'wb') as file:
# pk.dump(trained_model.history.history, file)
#
# elif load_or_dump == 'load':
#
# loaded_model = load_model(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_architecture_'+indicator+'.h5' ))
#
# loaded_model.load_weights(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_weights_'+indicator+'.h5' ))
#
# # loaded_model = load_model(
# # f'../outputs/trained_models/trained_CNN_architecture_{indicator}.h5')
# # loaded_model.load_weights(
# # f'../outputs/trained_models/trained_CNN_weights_{indicator}.h5')
#
# # Loading the saved history object
#
# with open(
# os.path.join(__reference_data__,
# 'trained_ML_models/trained_CNN_weights_'+indicator+'.pkl'
# ),
# 'rb') as file:
# history = pk.load(file)
#
# # with open(
# # f'../outputs/trained_models/trained_CNN_history_{indicator}.pkl',
# # 'rb') as file:
# # history = pk.load(file)
#
# return loaded_model, history
51 changes: 3 additions & 48 deletions TelescopeML/Predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,7 @@


__reference_data__ = os.getenv("TelescopeML_reference_data")
print(__reference_data__)

# if __reference_data__ is None:
# raise Exception('\n'
# "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
# "TelescopeML Error Message: \n\n"
# "You need to define the path to your reference data.\n"
# "Check out this tutorial: https://ehsangharibnezhad.github.io/TelescopeML/installation.html\n"
# "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
# )
# else:
# pass


# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# Data Visualizing libraries
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# print(__reference_data__)


# ===============================================================================
Expand Down Expand Up @@ -230,10 +214,6 @@ def ProcessObservationalDataset(self,
Fnu_errors = Fnu_obs_err,
bd_literature_dic = bd_literature_dic)

# self.obs_data_df['Fnu_obs'] = self.Fnu_obs
# self.obs_data_df['Fnu_obs_err'] = self.Fnu_obs_err
# self.obs_data_df['Fnu_obs_absolute'] = self.Fnu_obs_absolute
# self.obs_data_df['Fnu_obs_absolute_err'] = self.Fnu_obs_absolute_err

return Fnu_obs , Fnu_obs_err, Fnu_obs_absolute, Fnu_obs_absolute_err

Expand Down Expand Up @@ -463,14 +443,6 @@ def Process_Observational_Dataset(self,
print_results_fun(targets=self.targets_single_spectrum_dic,
print_title='Predicted Targets from the Signle Observational Spectrum:')

# if __plot_predicted_vs_observed__:
# plot_predicted_vs_observed(
# training_datasets=self.training_dataset_df,
# wl=self.wl,
# predicted_targets_dic=self.targets_single_spectrum_dic,
# object_name=self.object_name,
# Fnu_obs_absolute_intd_df=self.Fnu_obs_absolute_intd_df,
# )

def predict_from_random_spectra(
self,
Expand Down Expand Up @@ -555,21 +527,17 @@ def predict_from_random_spectra(
df_MinMax_obs = pd.DataFrame(
(Fnu_obs_absolute_intd_df_min, Fnu_obs_absolute_intd_df_max)
).T
# print('Bug check1 -- df_MinMax_obs:', df_MinMax_obs)

XminXmax_Stand = self.trained_X_ColWise_MinMax[0].transform(df_MinMax_obs.values)
# XminXmax_Stand = self.trained_data_processor.normalize_X_ColumnWise.transform(df_MinMax_obs.values)

# print('Bug check2 -- XminXmax_Stand:', XminXmax_Stand)


bd_mean = Fnu_obs_absolute_intd_df.mean(axis=1)[0]
bd_std = Fnu_obs_absolute_intd_df.std(axis=1)[0]

# print('Bug check3 -- bd_mean, bd_std:', bd_mean, bd_std)

# X_Scaled = (Fnu_obs_absolute_intd_df.div((self.bd_literature_dic['bd_radius_Rjup'])**2).values[0] - bd_mean) / bd_std
X_Scaled = (Fnu_obs_absolute_intd_df.values[0] - bd_mean) / bd_std
# print('Bug check4 -- X_Scaled:', X_Scaled)

y_pred_train = np.array(
self.trained_ML_model.predict(
Expand All @@ -595,14 +563,9 @@ def predict_from_random_spectra(
# self.filtered_df4 = filtered_df4
# print(filtered_df4.iloc[0,0:-5].values)

# if __print_results__: FINDME

spectra_list_pre.append(filtered_df4.iloc[:, 0:-5].div((self.bd_literature_dic['bd_radius_Rjup'])**2).values.flatten())
# spectra_list_pre.append(filtered_df4.iloc[:, 0:-5].values.flatten())
# print('Bug check5 -- spectra_list_pre:', spectra_list_pre)

# print('*'*10+' Filtered and Interpolated training data based on the ML predicted parameters '+'*'*10)
# print(spectra_list_pre)

self.spectra_list_obs = spectra_list_obs
self.spectra_list_pre = spectra_list_pre
Expand Down Expand Up @@ -672,15 +635,7 @@ def predict_from_random_spectra(
boxplot_hist(self.df_random_pred['c_o'], x_label=r'C/O', xy_loc=[0.05, 0.98])
boxplot_hist(self.df_random_pred['met'], x_label=r'[M/H]', xy_loc=[0.05, 0.98])

# if __plot_predicted_vs_observed__:
# plot_predicted_vs_observed(
# training_datasets=self.training_dataset_df,
# wl=self.wl_synthetic,
# predicted_targets_dic=self.dic_random_pred_mean,
# object_name=self.object_name,
# Fnu_obs_absolute_intd_df=self.Fnu_obs_absolute_intd_df,
# __print_results__=False,
# )


if __plot_pred_vs_obs_errorbar__:
plot_pred_vs_obs_errorbar(
Expand Down
Loading

0 comments on commit 01ab8f3

Please sign in to comment.