diff --git a/dora_exp_pipeline/example_config/dora_astronomy_des.yml b/dora_exp_pipeline/example_config/dora_astronomy_des.yml index 6ac50f5..89da630 100644 --- a/dora_exp_pipeline/example_config/dora_astronomy_des.yml +++ b/dora_exp_pipeline/example_config/dora_astronomy_des.yml @@ -1,12 +1,12 @@ # Data loading module parameters data_loader: { - name: 'Catalog', + name: 'FeatureVector', params: {} } data_to_fit: '/home/urebbapr/research/dora/sample_data/astronomy_des/Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors.h5' data_to_score: '/home/urebbapr/research/dora/sample_data/astronomy_des/Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors.h5' zscore_normalization: False -out_dir: '/home/urebbapr/research/dora/local/runs/dataloader_test' +out_dir: 'exp/astrophysics_des/results_Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors' # Feature extraction module features: { @@ -16,27 +16,37 @@ features: { } # Outlier detection module -top_n: 10 + +# Outlier detection module +top_n: None outlier_detection: { iforest: { - n_trees: 100 + n_trees: 100, + fit_single_trees: False, }, pca: { k: 3 }, +# demud: { +# k: 3 +# }, + pae: { + latent_dim: 3 + }, rx: {}, negative_sampling: { - percent_increase: 20 + percent_increase: 20 }, random: {} } # Results organization module results: { - kmeans: { - n_clusters: 5 - } - som: { - n_clusters: 5 - } +# comparison_plot: { +# 'validation_dir': '/proj/des/products/find_discarded_objectids_in_dec2019_data.labels.trunc1000.csv' +# }, + save_scores: {}, + # histogram: { + # 'bins': 25 + # } } diff --git a/dora_exp_pipeline/iforest_outlier_detection.py b/dora_exp_pipeline/iforest_outlier_detection.py index 4c6e41f..863aa32 100644 --- a/dora_exp_pipeline/iforest_outlier_detection.py +++ b/dora_exp_pipeline/iforest_outlier_detection.py @@ -9,11 +9,16 @@ def __init__(self): super(IForestOutlierDetection, self).__init__('iforest') def _rank_internal(self, data_to_fit, data_to_score, data_to_score_ids, - top_n, 
def single_tree_ISO(train, test, n_trees, seed):
    """Average the scores of ``n_trees`` separately seeded one-tree runs.

    ``train_and_run_ISO`` is called ``n_trees`` times, each time requesting a
    single tree and passing a fresh integer seed drawn from a
    ``numpy.random.RandomState`` initialised with ``seed``. The per-run
    scores are stored column-wise and then averaged for every row of
    ``test``.

    Parameters
    ----------
    train : array-like
        Data forwarded unchanged to ``train_and_run_ISO`` as its first
        argument.
    test : array-like exposing ``shape``
        Data to score; ``test.shape[0]`` fixes the number of returned scores.
    n_trees : int
        Number of one-tree runs to average. Must be at least 1.
    seed : object
        Value handed to ``numpy.random.RandomState`` to generate the per-run
        seeds (integers in ``[0, 1000000)``).

    Returns
    -------
    numpy.ndarray
        One mean score per row of ``test``.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1.
    """
    # With zero runs the mean below would be taken over an empty axis and
    # silently yield NaN for every row. Negative counts already failed with a
    # ValueError when allocating the score matrix, so reject both up front
    # with a clear message.
    if n_trees < 1:
        raise ValueError(
            'n_trees must be at least 1, got {}'.format(n_trees))

    random_state = np.random.RandomState(seed)
    scores = np.empty((test.shape[0], n_trees))

    for i in range(n_trees):
        # NOTE(review): presumably each call fits a fresh one-tree model on
        # `train` and scores `test` -- confirm against train_and_run_ISO.
        scores[:, i] = train_and_run_ISO(train, test, 1,
                                         random_state.randint(0, 1000000))

    return np.mean(scores, axis=1)
+++ b/exp/astrophysics_des/dora_astronomy_des_trunc1000000.yml @@ -19,7 +19,8 @@ features: { top_n: None outlier_detection: { iforest: { - n_trees: 100 + n_trees: 100, + fit_single_trees: False, }, pca: { k: 3