Skip to content

Commit

Permalink
Merge pull request #73 from nasaharvest/issue7-averaged-isolation-trees
Browse files Browse the repository at this point in the history
Averaged isolation trees - Closes Issue 7 - take two
  • Loading branch information
urebbapr authored Oct 15, 2021
2 parents 90759bd + f88f334 commit 420b59f
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 16 deletions.
32 changes: 21 additions & 11 deletions dora_exp_pipeline/example_config/dora_astronomy_des.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Data loading module parameters
data_loader: {
name: 'Catalog',
name: 'FeatureVector',
params: {}
}
data_to_fit: '/home/urebbapr/research/dora/sample_data/astronomy_des/Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors.h5'
data_to_score: '/home/urebbapr/research/dora/sample_data/astronomy_des/Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors.h5'
zscore_normalization: False
out_dir: '/home/urebbapr/research/dora/local/runs/dataloader_test'
out_dir: 'exp/astrophysics_des/results_Y3_mastercat_12_3_19_SOMv0.21_indexselect_p0.0001_lups_colors'

# Feature extraction module
features: {
Expand All @@ -16,27 +16,37 @@ features: {
}

# Outlier detection module
top_n: 10

# Outlier detection module
top_n: None
outlier_detection: {
iforest: {
n_trees: 100
n_trees: 100,
fit_single_trees: False,
},
pca: {
k: 3
},
# demud: {
# k: 3
# },
pae: {
latent_dim: 3
},
rx: {},
negative_sampling: {
percent_increase: 20
percent_increase: 20
},
random: {}
}

# Results organization module
results: {
kmeans: {
n_clusters: 5
}
som: {
n_clusters: 5
}
# comparison_plot: {
# 'validation_dir': '/proj/des/products/find_discarded_objectids_in_dec2019_data.labels.trunc1000.csv'
# },
save_scores: {},
# histogram: {
# 'bins': 25
# }
}
21 changes: 19 additions & 2 deletions dora_exp_pipeline/iforest_outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,16 @@ def __init__(self):
super(IForestOutlierDetection, self).__init__('iforest')

def _rank_internal(self, data_to_fit, data_to_score, data_to_score_ids,
top_n, seed, n_trees):
top_n, seed, n_trees, fit_single_trees):
if data_to_fit is None:
data_to_fit = deepcopy(data_to_score)

scores = train_and_run_ISO(data_to_fit, data_to_score, n_trees, seed)
if not fit_single_trees:
scores = train_and_run_ISO(data_to_fit, data_to_score,
n_trees, seed)
else:
scores = single_tree_ISO(data_to_fit, data_to_score,
n_trees, seed)
selection_indices = np.argsort(scores)

results = dict()
Expand All @@ -28,6 +33,18 @@ def _rank_internal(self, data_to_fit, data_to_score, data_to_score_ids,
return results


def single_tree_ISO(train, test, n_trees, seed):
    """Average the scores of ``n_trees`` independently fitted single trees.

    For each of the ``n_trees`` rounds a fresh integer seed is drawn from a
    generator initialised with ``seed``, and ``train_and_run_ISO`` is called
    with ``n_trees=1`` and that seed. Each call's result fills one column of
    a ``(test.shape[0], n_trees)`` array; the row-wise mean of that array is
    returned.

    Parameters
    ----------
    train : data forwarded unchanged to ``train_and_run_ISO`` for fitting.
    test : data forwarded unchanged to ``train_and_run_ISO`` for scoring;
        must expose ``shape[0]`` (the number of rows in the output).
    n_trees : number of single-tree runs to average over.
    seed : seed for the generator that produces the per-run seeds.

    Returns
    -------
    One averaged score per row of ``test``.
    """
    # A single generator feeds every run, so the whole sequence of per-tree
    # seeds is reproducible from ``seed`` alone.
    seed_source = np.random.RandomState(seed)

    n_samples = test.shape[0]
    per_tree_scores = np.empty((n_samples, n_trees))

    for tree_idx in range(n_trees):
        tree_seed = seed_source.randint(0, 1000000)
        # NOTE(review): assumes train_and_run_ISO returns one score per test
        # row (it is assigned to a whole column) -- confirm against its body.
        per_tree_scores[:, tree_idx] = train_and_run_ISO(
            train, test, 1, tree_seed)

    return per_tree_scores.mean(axis=1)


def train_and_run_ISO(train, test, n_trees, seed):
random_state = np.random.RandomState(seed)

Expand Down
3 changes: 2 additions & 1 deletion exp/astrophysics_des/dora_astronomy_des_trunc1000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ features: {
top_n: None
outlier_detection: {
iforest: {
n_trees: 100
n_trees: 100,
fit_single_trees: False,
},
pca: {
k: 3
Expand Down
3 changes: 2 additions & 1 deletion exp/astrophysics_des/dora_astronomy_des_trunc100000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ features: {
top_n: None
outlier_detection: {
iforest: {
n_trees: 100
n_trees: 100,
fit_single_trees: False,
},
pca: {
k: 3
Expand Down
3 changes: 2 additions & 1 deletion exp/astrophysics_des/dora_astronomy_des_trunc1000000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ features: {
top_n: None
outlier_detection: {
iforest: {
n_trees: 100
n_trees: 100,
fit_single_trees: False,
},
pca: {
k: 3
Expand Down

0 comments on commit 420b59f

Please sign in to comment.