diff --git a/docs/conf.py b/docs/conf.py
index 96dadbbf..a58b2fd6 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -17,9 +17,9 @@
 # -- Project information -----------------------------------------------------
 
-project = 'Pippin'
-copyright = '2022, Samuel Hinton, Patrick Armstrong, Dillon Brout, et. al.'
-author = 'Samuel Hinton, Patrick Armstrong, Dillon Brout, et. al.'
+project = "Pippin"
+copyright = "2022, Samuel Hinton, Patrick Armstrong, Dillon Brout, et al."
+author = "Samuel Hinton, Patrick Armstrong, Dillon Brout, et al."
 
 # -- General configuration ---------------------------------------------------
 
@@ -30,10 +30,10 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx_rtd_theme',
-    'sphinx_rtd_dark_mode',
-    'myst_parser',
-    'sphinxcontrib.youtube',
+    "sphinx_rtd_theme",
+    "sphinx_rtd_dark_mode",
+    "myst_parser",
+    "sphinxcontrib.youtube",
 ]
 
 myst_enable_extensions = [
@@ -46,12 +46,12 @@
 }
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
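# Editor's sketch, not part of the patch: with this standard sphinx-quickstart
# layout, the docs can be built through Sphinx's Python entry point. The
# "docs" and "docs/_build/html" paths are assumptions about this repo's tree.
from sphinx.cmd.build import build_main

build_main(["-b", "html", "docs", "docs/_build/html"])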
-html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/pippin/aggregator.py b/pippin/aggregator.py index 274d2148..44887d37 100644 --- a/pippin/aggregator.py +++ b/pippin/aggregator.py @@ -19,7 +19,7 @@ class Aggregator(Task): - """ Merge fitres files and aggregator output + """Merge fitres files and aggregator output CONFIGURATION: ============== @@ -54,8 +54,12 @@ def __init__(self, name, output_dir, config, dependencies, options, recal_aggtas super().__init__(name, output_dir, config=config, dependencies=dependencies) self.passed = False self.classifiers = [d for d in dependencies if isinstance(d, Classifier)] - self.lcfit_deps = [d for c in self.classifiers for d in c.get_fit_dependency(output=False)] - self.lcfit_names = list(set([l.output["name"] for l in self.lcfit_deps if l is not None])) + self.lcfit_deps = [ + d for c in self.classifiers for d in c.get_fit_dependency(output=False) + ] + self.lcfit_names = list( + set([l.output["name"] for l in self.lcfit_deps if l is not None]) + ) self.output["lcfit_names"] = self.lcfit_names if not self.lcfit_names: self.logger.debug("No jobs depend on the LCFIT, so adding a dummy one") @@ -66,9 +70,21 @@ def __init__(self, name, output_dir, config, dependencies, options, recal_aggtas self.recal_aggtask = recal_aggtask self.num_versions = len(self.sim_task.output["sim_folders"]) - self.output_dfs = [os.path.join(self.output_dir, f"merged_{i}.csv") for i in range(self.num_versions)] - self.output_dfs_key = [[os.path.join(self.output_dir, f"merged_{l}_{i}.key") for l in self.lcfit_names] for i in range(self.num_versions)] - self.output_cals = [os.path.join(self.output_dir, f"calibration_{i}.csv") for i in range(self.num_versions)] + self.output_dfs = [ + os.path.join(self.output_dir, f"merged_{i}.csv") + for i in range(self.num_versions) + ] + self.output_dfs_key = [ + [ + os.path.join(self.output_dir, f"merged_{l}_{i}.key") + for l in self.lcfit_names + ] + for i in range(self.num_versions) + ] + self.output_cals = [ + os.path.join(self.output_dir, f"calibration_{i}.csv") + for i in range(self.num_versions) + ] self.id = "CID" self.type_name = "SNTYPE" @@ -81,16 +97,22 @@ def __init__(self, name, output_dir, config, dependencies, options, recal_aggtas self.output["calibration_files"] = self.output_cals self.output["empty_agg"] = False if isinstance(self.plot, bool): - self.python_file = os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py" + self.python_file = ( + os.path.dirname(inspect.stack()[0][1]) + "/external/aggregator_plot.py" + ) else: self.python_file = self.plot self.python_file = get_output_loc(self.python_file) if not os.path.exists(self.python_file): - Task.fail_config(f"Attempting to find python file {self.python_file} but it's not there!") + Task.fail_config( + f"Attempting to find python file {self.python_file} but it's not there!" 
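# Editor's sketch, not part of the patch: the MERGE_CLASSIFIERS handling just
# below forces every merged probability column to carry the PROB_ prefix. A
# standalone restatement of that rule, with toy column names:
def _normalise_prob_column(name):
    # Mirrors the prob_col_name[:5] != "PROB_" check in Aggregator.__init__.
    return name if name.startswith("PROB_") else "PROB_" + name

assert _normalise_prob_column("SNN_ZPHOT") == "PROB_SNN_ZPHOT"
assert _normalise_prob_column("PROB_SNN_ZPHOT") == "PROB_SNN_ZPHOT"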
+ ) merge_classifiers = self.config.get("MERGE_CLASSIFIERS") - self.classifier_merge = {c.output['name']: c.get_prob_column_name() for c in self.classifiers} + self.classifier_merge = { + c.output["name"]: c.get_prob_column_name() for c in self.classifiers + } if merge_classifiers is not None: self.classifier_merge = dict() for c in self.classifiers: @@ -102,33 +124,42 @@ def __init__(self, name, output_dir, config, dependencies, options, recal_aggtas if match: continue else: - if m in c.output['name']: + if m in c.output["name"]: match = True if match: if prob_col_name[:5] != "PROB_": prob_col_name = "PROB_" + prob_col_name prob_col.append(prob_col_name) if len(prob_col) == 1: - self.classifier_merge[c.output['name']] = prob_col[0] + self.classifier_merge[c.output["name"]] = prob_col[0] else: if len(prob_col) == 0: - self.classifier_merge[c.output['name']] = c.get_prob_column_name() + self.classifier_merge[ + c.output["name"] + ] = c.get_prob_column_name() else: - Task.fail_config(f"Classifier task {c.output['name']} matched multiple MERGE_CLASSIFIERS keys: {prob_col}. Please provide more specific keys") + Task.fail_config( + f"Classifier task {c.output['name']} matched multiple MERGE_CLASSIFIERS keys: {prob_col}. Please provide more specific keys" + ) self.logger.debug(f"Classifier merge = {self.classifier_merge}") self.output["classifier_merge"] = self.classifier_merge - def _check_completion(self, squeue): if not self.passed: - self.logger.debug("Task not reporting passed, might be external. Checking done file.") + self.logger.debug( + "Task not reporting passed, might be external. Checking done file." + ) if os.path.exists(self.done_file): self.logger.debug("Done file exists, loading contents") with open(self.done_file) as f: self.passed = "SUCCESS" in f.read() - self.logger.debug(f"After reading done file, passed set to {self.passed}") + self.logger.debug( + f"After reading done file, passed set to {self.passed}" + ) else: - self.logger.warning(f"Task has not set passed and has no done file at {self.done_file}, returning failure") + self.logger.warning( + f"Task has not set passed and has no done file at {self.done_file}, returning failure" + ) return Task.FINISHED_SUCCESS if self.passed else Task.FINISHED_FAILURE def get_underlying_sim_task(self): @@ -142,7 +173,9 @@ def get_underlying_sim_task(self): for task in check + self.dependencies: if isinstance(task, SNANASimulation) or isinstance(task, DataPrep): return task - self.logger.error(f"Unable to find a simulation or data dependency for aggregator {self.name}") + self.logger.error( + f"Unable to find a simulation or data dependency for aggregator {self.name}" + ) return None def load_prediction_file(self, filename): @@ -152,16 +185,19 @@ def load_prediction_file(self, filename): df = pd.read_csv(filename, comment="#", delim_whitespace=True) if "VARNAMES:" in df.columns: df = df.drop(columns="VARNAMES:") - remove_columns = [c for i, c in enumerate(df.columns) if i != 0 and "PROB_" not in c] + remove_columns = [ + c for i, c in enumerate(df.columns) if i != 0 and "PROB_" not in c + ] df = df.drop(columns=remove_columns) return df def save_calibration_curve(self, df, output_name): - self.logger.debug("Creating calibration curves") # First let us define some prob bins - bins = np.linspace(-1, 2, 61) # Yes, outside normal range, so if we smooth it we dont screw things up with edge effects + bins = np.linspace( + -1, 2, 61 + ) # Yes, outside normal range, so if we smooth it we dont screw things up with edge effects bc = 0.5 * 
(bins[:-1] + bins[1:]) mask = (bc >= 0) & (bc <= 1) bc3 = bc[mask] # Dont bother saving out the negative probs @@ -189,15 +225,21 @@ def save_calibration_curve(self, df, output_name): combined_mask = truth_mask & data_mask if combined_mask.sum() < 100: if combined_mask.sum() == 0: - self.logger.warning("There are no events which have both a prob and a known Ia/CC flag") + self.logger.warning( + "There are no events which have both a prob and a known Ia/CC flag" + ) else: - self.logger.warning("There are too few events with both a prob and known Ia/CC flag") + self.logger.warning( + "There are too few events with both a prob and known Ia/CC flag" + ) continue data2 = data[combined_mask] truth2 = truth[combined_mask].astype(float) - actual_prob, _, _ = binned_statistic(data2, truth2, bins=bins, statistic="mean") + actual_prob, _, _ = binned_statistic( + data2, truth2, bins=bins, statistic="mean" + ) m = np.isfinite(actual_prob) # All the -1 to 0 and 1 to 2 probs will be NaN # Sets a 1:1 line outside of 0 to 1 @@ -225,27 +267,39 @@ def recalibrate(self, df): self.logger.debug(f"Recalibrating column {c}") data = df[c] if c not in curves: - self.logger.warning(f"Classifier {c} cannot be recalibrated. If this is because its FITPROB or another fake classifier, all good.") + self.logger.warning( + f"Classifier {c} cannot be recalibrated. If this is because its FITPROB or another fake classifier, all good." + ) recalibrated = data else: recalibrated = interp1d(curves["bins"], curves[c])(data) df[c.replace("PROB_", "CPROB_")] = recalibrated - self.logger.debug("Returning recalibrated curves. They start with CPROB_, instead of PROB_") + self.logger.debug( + "Returning recalibrated curves. They start with CPROB_, instead of PROB_" + ) return df def load_calibration_curve(self): path = self.recal_aggtask.output["calibration_files"] if len(path) > 1: - self.logger.warning(f"Warning, found multiple calibration files, only using first one: {path}") - assert len(path) != 0, f"No calibration files found for agg task {self.recal_aggtask}" + self.logger.warning( + f"Warning, found multiple calibration files, only using first one: {path}" + ) + assert ( + len(path) != 0 + ), f"No calibration files found for agg task {self.recal_aggtask}" path = path[0] df = pd.read_csv(path) self.logger.debug(f"Reading calibration curves from {path}") return df - def _run(self,): - new_hash = self.get_hash_from_string(self.name + str(self.include_type) + str(self.plot)) + def _run( + self, + ): + new_hash = self.get_hash_from_string( + self.name + str(self.include_type) + str(self.plot) + ) if self._check_regenerate(new_hash): shutil.rmtree(self.output_dir, ignore_errors=True) mkdirs(self.output_dir) @@ -255,7 +309,9 @@ def _run(self,): relevant_classifiers = [c for c in self.classifiers if c.index == index] self.logger.debug(f"relevant_classifiers: {relevant_classifiers}") - prediction_files = [d.output["predictions_filename"] for d in relevant_classifiers] + prediction_files = [ + d.output["predictions_filename"] for d in relevant_classifiers + ] lcfits = [d.get_fit_dependency() for d in relevant_classifiers] self.logger.debug(f"lcfits: {lcfits}") @@ -266,60 +322,102 @@ def _run(self,): need_to_rename = len(colnames) != len(set(colnames)) rename_ind = [] if need_to_rename: - self.logger.info("Detected duplicate probability column names, will need to rename them") - for (i, n) in enumerate(colnames): - if len([j for j in range(len(colnames)) if colnames[j] == n]) > 1: + self.logger.info( + "Detected duplicate probability 
column names, will need to rename them" + ) + for i, n in enumerate(colnames): + if ( + len([j for j in range(len(colnames)) if colnames[j] == n]) + > 1 + ): rename_ind.append(i) - for i, (f, d, l) in enumerate(zip(prediction_files, relevant_classifiers, lcfits)): + for i, (f, d, l) in enumerate( + zip(prediction_files, relevant_classifiers, lcfits) + ): self.logger.debug(f"l: {l}") dataframe = self.load_prediction_file(f) - dataframe = dataframe.rename(columns={d.get_prob_column_name(): self.classifier_merge[d.name]}) - dataframe = dataframe.rename(columns={dataframe.columns[0]: self.id}) + dataframe = dataframe.rename( + columns={ + d.get_prob_column_name(): self.classifier_merge[d.name] + } + ) + dataframe = dataframe.rename( + columns={dataframe.columns[0]: self.id} + ) dataframe[self.id] = dataframe[self.id].apply(str) dataframe[self.id] = dataframe[self.id].str.strip() - if need_to_rename and (l is not None or l != []) and i in rename_ind: - lcname = ensure_list(l)[0]["name"] - self.logger.debug(f"Renaming column {self.classifier_merge[d.name]} to include LCFIT name {lcname}") - dataframe = dataframe.rename(columns={self.classifier_merge[d.name]: self.classifier_merge[d.name] + "_RENAMED_" + lcname}) + if ( + need_to_rename + and (l is not None or l != []) + and i in rename_ind + ): + lcname = ensure_list(l)[0]["name"] + self.logger.debug( + f"Renaming column {self.classifier_merge[d.name]} to include LCFIT name {lcname}" + ) + dataframe = dataframe.rename( + columns={ + self.classifier_merge[d.name]: self.classifier_merge[ + d.name + ] + + "_RENAMED_" + + lcname + } + ) self.logger.debug(f"Merging on column {self.id} for file {f}") if df is None: df = dataframe else: df = pd.merge(df, dataframe, on=self.id, how="outer") - self.logger.info(f"Finding original types, size of prediction df is {df.shape if df is not None else 'None'}") + self.logger.info( + f"Finding original types, size of prediction df is {df.shape if df is not None else 'None'}" + ) s = self.get_underlying_sim_task() type_df = None phot_dir = s.output["photometry_dirs"][index] - headers = [os.path.join(phot_dir, a) for a in os.listdir(phot_dir) if "HEAD" in a] + headers = [ + os.path.join(phot_dir, a) + for a in os.listdir(phot_dir) + if "HEAD" in a + ] if len(headers) == 0: - self.logger.warning(f"No HEAD fits files found in {phot_dir}, manually running grep command!") + self.logger.warning( + f"No HEAD fits files found in {phot_dir}, manually running grep command!" 
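# Editor's sketch with toy frames, not part of the patch: as in the merge loop
# above, each classifier's predictions are joined on the shared CID column with
# an outer merge, so an event missing from one classifier keeps a NaN
# probability instead of being dropped from the combined table.
import pandas as pd

left = pd.DataFrame({"CID": ["1", "2"], "PROB_A": [0.9, 0.2]})
right = pd.DataFrame({"CID": ["2", "3"], "PROB_B": [0.7, 0.4]})
combined = pd.merge(left, right, on="CID", how="outer")
# combined has rows for CIDs 1, 2 and 3, with NaN where a classifier had no entry.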
+ ) cmd = "grep --exclude-dir=* TYPE *.dat | awk -F ':' '{print $1 $3}'" self.logger.debug(f"Running command {cmd} in dir {phot_dir}") - process = subprocess.run(cmd, capture_output=True, cwd=phot_dir, shell=True) + process = subprocess.run( + cmd, capture_output=True, cwd=phot_dir, shell=True + ) output = process.stdout.decode("ascii").split("\n") output = [x for x in output if x] cmd = "zgrep TYPE *.dat.gz | awk -F ':' '{print $1 $3}'" self.logger.debug(f"Running command {cmd} in dir {phot_dir}") - process = subprocess.run(cmd, capture_output=True, cwd=phot_dir, shell=True) + process = subprocess.run( + cmd, capture_output=True, cwd=phot_dir, shell=True + ) output2 = process.stdout.decode("ascii").split("\n") output += [x for x in output2 if x] cmd = "zgrep TYPE *.txt | awk -F ':' '{print $1 $3}'" self.logger.debug(f"Running command {cmd} in dir {phot_dir}") - process = subprocess.run(cmd, capture_output=True, cwd=phot_dir, shell=True) + process = subprocess.run( + cmd, capture_output=True, cwd=phot_dir, shell=True + ) output3 = process.stdout.decode("ascii").split("\n") output += [x for x in output3 if x] - if len(output) == 0: snid = [] else: - if "_" in output[0]: #check if photometry is in filename - snid = [x.split()[0].split("_")[1].split(".")[0] for x in output] + if "_" in output[0]: # check if photometry is in filename + snid = [ + x.split()[0].split("_")[1].split(".")[0] for x in output + ] snid = [x[1:] if x.startswith("0") else x for x in snid] else: snid = [x.split()[0].split(".")[0] for x in output] @@ -336,43 +434,65 @@ def _run(self,): snid = np.array(data.field("SNID")) sntype = np.array(data.field("SNTYPE")).astype(np.int64) # self.logger.debug(f"Photometry has fields {hdul[1].columns.names}") - dataframe = pd.DataFrame({self.id: snid, self.type_name: sntype}) - dataframe[self.id] = dataframe[self.id].astype(str).str.strip() + dataframe = pd.DataFrame( + {self.id: snid, self.type_name: sntype} + ) + dataframe[self.id] = ( + dataframe[self.id].astype(str).str.strip() + ) if type_df is None: type_df = dataframe else: type_df = pd.concat([type_df, dataframe]) type_df.drop_duplicates(subset=self.id, inplace=True) - self.logger.debug(f"Photometric types are {type_df['SNTYPE'].unique()}") + self.logger.debug( + f"Photometric types are {type_df['SNTYPE'].unique()}" + ) if type_df is not None: if df is None: self.logger.debug("No original df found, only saving types") df = type_df else: - self.logger.debug(f"Merging types of shape {type_df.shape} into df {df.shape}") + self.logger.debug( + f"Merging types of shape {type_df.shape} into df {df.shape}" + ) df = pd.merge(df, type_df, on=self.id, how="left") - self.logger.debug(f"Final dataframe from file ingestion has shape {df.shape}") + self.logger.debug( + f"Final dataframe from file ingestion has shape {df.shape}" + ) types = self.get_underlying_sim_task().output["types_dict"] has_nonia = len(types.get("NONIA", [])) > 0 has_ia = len(types.get("IA", [])) > 0 self.logger.debug(f"Input types are {types}") - ia = df["SNTYPE"].apply(lambda y: 1.0 if y in types["IA"] else (0.0 if y in types["NONIA"] else np.nan)) + ia = df["SNTYPE"].apply( + lambda y: 1.0 + if y in types["IA"] + else (0.0 if y in types["NONIA"] else np.nan) + ) df["IA"] = ia num_ia = (ia == 1.0).sum() num_cc = (ia == 0.0).sum() num_nan = ia.isnull().sum() - self.logger.info(f"Truth type has {num_ia} Ias, {num_cc} CCs and {num_nan} unknowns") + self.logger.info( + f"Truth type has {num_ia} Ias, {num_cc} CCs and {num_nan} unknowns" + ) - sorted_columns = [self.id, 
"SNTYPE", "IA"] + sorted([c for c in df.columns if c.startswith("PROB_")]) + sorted_columns = [self.id, "SNTYPE", "IA"] + sorted( + [c for c in df.columns if c.startswith("PROB_")] + ) df = df.reindex(sorted_columns, axis=1) - self.logger.info(f"Merged into dataframe of {df.shape[0]} rows, with columns {list(df.columns)}") + self.logger.info( + f"Merged into dataframe of {df.shape[0]} rows, with columns {list(df.columns)}" + ) if df.shape[0] == 0: - self.logger.warning("Oh no, dataframe doesnt have any rows. What is going on? What strange data format is this?") + self.logger.warning( + "Oh no, dataframe doesnt have any rows. What is going on? What strange data format is this?" + ) self.output["empty_agg"] = True if has_nonia and has_ia: @@ -384,14 +504,18 @@ def _run(self,): for l in self.lcfit_names: self.save_key_format(df, index, l) - self.logger.debug(f"Saving merged dataframe to {self.output_dfs[index]}") + self.logger.debug( + f"Saving merged dataframe to {self.output_dfs[index]}" + ) self.save_new_hash(new_hash) if self.plot: if index == 0 or self.plot_all: return_good = self._plot(index) if not return_good: - self.logger.error("Plotting did not work correctly! Attempting to continue anyway.") + self.logger.error( + "Plotting did not work correctly! Attempting to continue anyway." + ) else: self.logger.debug("Plot not set, skipping plotting section") @@ -415,7 +539,9 @@ def _run(self,): return True def save_key_format(self, df, index, lcfitname): - lc_index = 0 if len(self.lcfit_names) == 1 else self.lcfit_names.index(lcfitname) + lc_index = ( + 0 if len(self.lcfit_names) == 1 else self.lcfit_names.index(lcfitname) + ) if "IA" in df.columns: df = df.drop(columns=[self.type_name, "IA"]) cols_to_rename = [c for c in df.columns if "_RENAMED_" in c] @@ -424,17 +550,36 @@ def save_key_format(self, df, index, lcfitname): if lcfit == lcfitname: df = df.rename(columns={c: name}) else: - self.logger.warning(f"Aggregation {self.name} for LCFIT {lcfitname} is dropping column {c} as it doesnt match.") + self.logger.warning( + f"Aggregation {self.name} for LCFIT {lcfitname} is dropping column {c} as it doesnt match." 
+ ) df = df.drop(columns=[c]) df2 = df.fillna(0.0) df2.insert(0, "VARNAMES:", ["SN:"] * df2.shape[0]) - df2.to_csv(self.output_dfs_key[index][lc_index], index=False, float_format="%0.4f", sep=" ") + df2.to_csv( + self.output_dfs_key[index][lc_index], + index=False, + float_format="%0.4f", + sep=" ", + ) def _plot(self, index): - cmd = ["python", self.python_file, self.output_dfs[index], self.output_dir, f"{index}"] + cmd = [ + "python", + self.python_file, + self.output_dfs[index], + self.output_dir, + f"{index}", + ] self.logger.debug(f"Invoking command {' '.join(cmd)}") try: - subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=self.output_dir, check=True) + subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + check=True, + ) self.logger.info(f"Finished invoking {self.python_file}") except subprocess.CalledProcessError: return False @@ -463,12 +608,18 @@ def _get_aggregator_dir(base_output_dir, stage_number, agg_name): recal_simtask = None recal_aggtask = None if recalibration: - recal_sim = [i for i, s in enumerate(sim_tasks) if s.name == recalibration] + recal_sim = [ + i for i, s in enumerate(sim_tasks) if s.name == recalibration + ] if len(recal_sim) == 0: - Task.fail_config(f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}") + Task.fail_config( + f"Recalibration sim {recalibration} not in the list of available sims: {[s.name for s in sim_tasks]}" + ) elif len(recal_sim) > 1: - Task.fail_config(f"Recalibration aggregation {recalibration} not in the list of available aggs: {[s.name for s in sim_tasks]}") + Task.fail_config( + f"Recalibration aggregation {recalibration} not in the list of available aggs: {[s.name for s in sim_tasks]}" + ) # Move the recal sim task to the front of the queue so it executes first recal_sim_index = recal_sim[0] @@ -476,27 +627,45 @@ def _get_aggregator_dir(base_output_dir, stage_number, agg_name): sim_tasks.insert(0, sim_tasks.pop(recal_sim_index)) for sim_task in sim_tasks: - if mask_sim not in sim_task.name or mask not in sim_task.name and recal_simtask != sim_task: + if ( + mask_sim not in sim_task.name + or mask not in sim_task.name + and recal_simtask != sim_task + ): continue agg_name2 = f"{agg_name}_{sim_task.name}" deps = [ c for c in classifier_tasks - if mask in c.name and mask_clas in c.name and c.mode == Classifier.PREDICT and sim_task in c.get_simulation_dependency() + if mask in c.name + and mask_clas in c.name + and c.mode == Classifier.PREDICT + and sim_task in c.get_simulation_dependency() ] if len(deps) == 0: deps = [sim_task] if recalibration and sim_task != recal_simtask: if recal_aggtask is None: - Task.fail_config(f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order.") + Task.fail_config( + f"The aggregator task for {recalibration} has not been made yet. Sam probably screwed up dependency order." 
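# Editor's sketch with a toy list, not part of the patch: the recalibration sim
# is popped to the front of sim_tasks above, so its Aggregator is constructed
# before any aggregation task that depends on it.
sim_names = ["SIM_A", "SIM_RECAL", "SIM_B"]
sim_names.insert(0, sim_names.pop(sim_names.index("SIM_RECAL")))
assert sim_names == ["SIM_RECAL", "SIM_A", "SIM_B"]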
+ ) else: deps.append(recal_aggtask) - a = Aggregator(agg_name2, _get_aggregator_dir(base_output_dir, stage_number, agg_name2), config, deps, options, recal_aggtask) + a = Aggregator( + agg_name2, + _get_aggregator_dir(base_output_dir, stage_number, agg_name2), + config, + deps, + options, + recal_aggtask, + ) if sim_task == recal_simtask: recal_aggtask = a - Task.logger.info(f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs") + Task.logger.info( + f"Creating aggregation task {agg_name2} for {sim_task.name} with {a.num_jobs} jobs" + ) tasks.append(a) return tasks diff --git a/pippin/analyse.py b/pippin/analyse.py index 2aa21b69..980b0f76 100644 --- a/pippin/analyse.py +++ b/pippin/analyse.py @@ -7,7 +7,14 @@ import numpy as np from pippin.biascor import BiasCor -from pippin.config import mkdirs, get_config, ensure_list, get_data_loc, generic_open, merge_dict +from pippin.config import ( + mkdirs, + get_config, + ensure_list, + get_data_loc, + generic_open, + merge_dict, +) from pippin.cosmofitters.cosmofit import CosmoFit from pippin.cosmofitters.cosmomc import CosmoMC from pippin.cosmofitters.wfit import WFit @@ -15,8 +22,10 @@ from pippin.task import Task -class AnalyseChains(Task): # TODO: Define the location of the output so we can run the lc fitting on it. - """ Smack the data into something that looks like the simulated data +class AnalyseChains( + Task +): # TODO: Define the location of the output so we can run the lc fitting on it. + """Smack the data into something that looks like the simulated data CONFIGURATION ============= @@ -48,12 +57,18 @@ def __init__(self, name, output_dir, config, options, dependencies=None): self.logfile = os.path.join(self.output_dir, "output.log") - self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_ANALYSE_" + os.path.basename(output_dir) + self.job_name = ( + os.path.basename(Path(output_dir).parents[1]) + + "_ANALYSE_" + + os.path.basename(output_dir) + ) self.path_to_codes = [] self.done_files = [] - self.plot_code_dir = os.path.join(os.path.dirname(inspect.stack()[0][1]), "external") + self.plot_code_dir = os.path.join( + os.path.dirname(inspect.stack()[0][1]), "external" + ) self.covopts = options.get("COVOPTS") self.singular_blind = options.get("SINGULAR_BLIND", False) @@ -101,21 +116,36 @@ def __init__(self, name, output_dir, config, options, dependencies=None): for c in self.cosmomc_deps: for covopt in c.output["covopts"]: self.cosmomc_input_files.append(c.output["base_dict"][covopt]) - self.cosmomc_output_files.append(c.output["label"] + "_" + covopt + ".csv.gz") + self.cosmomc_output_files.append( + c.output["label"] + "_" + covopt + ".csv.gz" + ) self.cosmomc_covopts.append(covopt) self.names.append(c.output["label"].replace("_", " ") + " " + covopt) for p in c.output["cosmology_params"]: if p not in self.params: self.params.append(p) - self.logger.debug(f"Analyse task will create CosmoMC plots with {len(self.cosmomc_input_files)} covopts/plots") + self.logger.debug( + f"Analyse task will create CosmoMC plots with {len(self.cosmomc_input_files)} covopts/plots" + ) self.wsummary_files = [b.output["w_summary"] for b in self.biascor_deps] # Get the fitres and m0diff files we'd want to parse for Hubble diagram plotting - self.biascor_fitres_input_files = [os.path.join(m, "FITOPT000_MUOPT000.FITRES.gz") for b in self.biascor_deps for m in b.output["m0dif_dirs"]] - self.biascor_prob_col_names = [b.output["prob_column_name"] for b in self.biascor_deps for m in b.output["m0dif_dirs"]] + 
self.biascor_fitres_input_files = [ + os.path.join(m, "FITOPT000_MUOPT000.FITRES.gz") + for b in self.biascor_deps + for m in b.output["m0dif_dirs"] + ] + self.biascor_prob_col_names = [ + b.output["prob_column_name"] + for b in self.biascor_deps + for m in b.output["m0dif_dirs"] + ] self.biascor_fitres_output_files = [ - b.name + "__" + os.path.basename(m).replace("OUTPUT_BBCFIT", "1") + "__FITOPT0_MUOPT0.fitres.gz" + b.name + + "__" + + os.path.basename(m).replace("OUTPUT_BBCFIT", "1") + + "__FITOPT0_MUOPT0.fitres.gz" for b in self.biascor_deps for m in b.output["m0dif_dirs"] ] @@ -127,8 +157,9 @@ def __init__(self, name, output_dir, config, options, dependencies=None): self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) - + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) self.slurm = """{sbatch_header} {task_setup} @@ -153,11 +184,17 @@ def get_slurm_raw(self): def add_plot_script_to_run(self, script_name): script_path = get_data_loc(script_name, extra=self.plot_code_dir) if script_path is None: - self.fail_config(f"Cannot resolve script {script_name} relative to {self.plot_code_dir}. Please use a variable or abs path.") + self.fail_config( + f"Cannot resolve script {script_name} relative to {self.plot_code_dir}. Please use a variable or abs path." + ) else: self.logger.debug(f"Adding script path {script_path} to plotting code.") self.path_to_codes.append(script_path) - self.done_files.append(os.path.join(self.output_dir, os.path.basename(script_name).split(".")[0] + ".done")) + self.done_files.append( + os.path.join( + self.output_dir, os.path.basename(script_name).split(".")[0] + ".done" + ) + ) def _check_completion(self, squeue): num_success = 0 @@ -166,7 +203,9 @@ def _check_completion(self, squeue): self.logger.debug(f"Done file found at {f}") with open(f) as ff: if "FAILURE" in ff.read(): - self.logger.error(f"Done file reported failure. Check output log {self.logfile}") + self.logger.error( + f"Done file reported failure. 
Check output log {self.logfile}" + ) return Task.FINISHED_FAILURE else: num_success += 1 @@ -182,7 +221,6 @@ def _check_completion(self, squeue): return self.check_for_job(squeue, self.job_name) def _run(self): - # Get the m0diff files for everything for b in self.biascor_deps: for m in b.output["m0dif_dirs"]: @@ -190,27 +228,59 @@ def _run(self): sim_number = 1 if os.path.basename(m).isdigit(): sim_number = int(os.path.basename(m)) - files = [f for f in sorted(os.listdir(m)) if f.endswith(".M0DIF") or f.endswith(".M0DIF.gz")] + files = [ + f + for f in sorted(os.listdir(m)) + if f.endswith(".M0DIF") or f.endswith(".M0DIF.gz") + ] for f in files: muopt_num = int(f.split("MUOPT")[-1].split(".")[0]) fitopt_num = int(f.split("FITOPT")[-1].split("_")[0]) if muopt_num == 0: muopt = "DEFAULT" else: - muopt = b.output["muopts"][muopt_num - 1] # Because 0 is default + muopt = b.output["muopts"][ + muopt_num - 1 + ] # Because 0 is default if fitopt_num == 0: fitopt = "DEFAULT" else: fitopt = b.output["fitopt_index"][fitopt_num] - self.biascor_m0diffs.append((b.name, sim_number, muopt, muopt_num, fitopt, fitopt_num, os.path.join(m, f))) - - data_fitres_files = [os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"]) for l in self.lcfit_deps if l.output["is_data"]] + self.biascor_m0diffs.append( + ( + b.name, + sim_number, + muopt, + muopt_num, + fitopt, + fitopt_num, + os.path.join(m, f), + ) + ) + + data_fitres_files = [ + os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"]) + for l in self.lcfit_deps + if l.output["is_data"] + ] data_fitres_output = [d.split("/")[-4] + ".csv.gz" for d in data_fitres_files] - sim_fitres_files = [os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"]) for l in self.lcfit_deps if not l.output["is_data"]] + sim_fitres_files = [ + os.path.join(l.output["fitres_dirs"][0], l.output["fitopt_map"]["DEFAULT"]) + for l in self.lcfit_deps + if not l.output["is_data"] + ] sim_fitres_output = [d.split("/")[-4] + ".csv.gz" for d in sim_fitres_files] - types = list(set([a for l in self.lcfit_deps for a in l.sim_task.output["types_dict"]["IA"]])) + types = list( + set( + [ + a + for l in self.lcfit_deps + for a in l.sim_task.output["types_dict"]["IA"] + ] + ) + ) input_yml_file = "input.yml" output_dict = { "COSMOMC": { @@ -244,35 +314,33 @@ def _run(self): "IA_TYPES": types, }, } - + if self.batch_file is None: if self.gpu: self.sbatch_header = self.sbatch_gpu_header else: self.sbatch_header = self.sbatch_cpu_header else: - with open(self.batch_file, 'r') as f: + with open(self.batch_file, "r") as f: self.sbatch_header = f.read() self.sbatch_header = self.clean_header(self.sbatch_header) header_dict = { - "REPLACE_NAME": self.job_name, - "REPLACE_WALLTIME": "1:00:00", - "REPLACE_LOGFILE": self.logfile, - "REPLACE_MEM": "20GB", - "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"] - } + "REPLACE_NAME": self.job_name, + "REPLACE_WALLTIME": "1:00:00", + "REPLACE_LOGFILE": self.logfile, + "REPLACE_MEM": "20GB", + "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"], + } header_dict = merge_dict(header_dict, self.batch_replace) self.update_header(header_dict) - setup_dict = { - "output_dir": self.output_dir - } + setup_dict = {"output_dir": self.output_dir} format_dict = { - "sbatch_header": self.sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['analyse']), - "input_yml": input_yml_file - } + "sbatch_header": self.sbatch_header, + "task_setup": self.update_setup(setup_dict, 
self.task_setup["analyse"]), + "input_yml": input_yml_file, + } final_slurm = self.get_slurm_raw().format(**format_dict) new_hash = self.get_hash_from_string(final_slurm + json.dumps(output_dict)) @@ -299,7 +367,9 @@ def _run(self): return True @staticmethod - def get_tasks(configs, prior_tasks, base_output_dir, stage_number, prefix, global_config): + def get_tasks( + configs, prior_tasks, base_output_dir, stage_number, prefix, global_config + ): def _get_analyse_dir(base_output_dir, stage_number, name): return f"{base_output_dir}/{stage_number}_ANALYSE/{name}" @@ -314,23 +384,43 @@ def _get_analyse_dir(base_output_dir, stage_number, name): mask_cosmofit = config.get("MASK_COSMOFIT") mask_biascor = config.get("MASK_BIASCOR") if config.get("HISTOGRAM") is not None: - Task.fail_config("Sorry to do this, but please change HISTOGRAM into MASK_LCFIT to bring it into line with others.") + Task.fail_config( + "Sorry to do this, but please change HISTOGRAM into MASK_LCFIT to bring it into line with others." + ) mask_lcfit = config.get("MASK_LCFIT") # TODO: Add aggregation to compile all the plots here - deps_cosmofit = Task.match_tasks_of_type(mask_cosmofit, prior_tasks, CosmoFit, match_none=False, allowed_failure=True) + deps_cosmofit = Task.match_tasks_of_type( + mask_cosmofit, + prior_tasks, + CosmoFit, + match_none=False, + allowed_failure=True, + ) Task.logger.debug(f"deps_cosmofit: {deps_cosmofit}") - deps_biascor = Task.match_tasks_of_type(mask_biascor, prior_tasks, BiasCor, match_none=False) + deps_biascor = Task.match_tasks_of_type( + mask_biascor, prior_tasks, BiasCor, match_none=False + ) Task.logger.debug(f"deps_biascor: {deps_biascor}") - deps_lcfit = Task.match_tasks_of_type(mask_lcfit, prior_tasks, SNANALightCurveFit, match_none=False) + deps_lcfit = Task.match_tasks_of_type( + mask_lcfit, prior_tasks, SNANALightCurveFit, match_none=False + ) Task.logger.debug(f"deps_lcfit: {deps_lcfit}") deps = deps_cosmofit + deps_biascor + deps_lcfit if len(deps) == 0: Task.fail_config(f"Analyse task {cname} has no dependencies!") - a = AnalyseChains(cname, _get_analyse_dir(base_output_dir, stage_number, cname), config, options, deps) - Task.logger.info(f"Creating Analyse task {cname} for {[c.name for c in deps]} with {a.num_jobs} jobs") + a = AnalyseChains( + cname, + _get_analyse_dir(base_output_dir, stage_number, cname), + config, + options, + deps, + ) + Task.logger.info( + f"Creating Analyse task {cname} for {[c.name for c in deps]} with {a.num_jobs} jobs" + ) tasks.append(a) return tasks diff --git a/pippin/base.py b/pippin/base.py index 1a8c3a11..d1ea0707 100644 --- a/pippin/base.py +++ b/pippin/base.py @@ -5,7 +5,9 @@ class ConfigBasedExecutable(Task, ABC): - def __init__(self, name, output_dir, config, base_file, default_assignment, dependencies=None): + def __init__( + self, name, output_dir, config, base_file, default_assignment, dependencies=None + ): super().__init__(name, output_dir, config=config, dependencies=dependencies) self.default_assignment = default_assignment self.base_file = base_file @@ -29,7 +31,9 @@ def process_yaml(self, index): self.base = self.base[index:] def delete_property(self, name, section_start=None, section_end=None): - self.set_property(name, None, section_start=section_start, section_end=section_end) + self.set_property( + name, None, section_start=section_start, section_end=section_end + ) def get_output_string(self): return yaml.safe_dump(self.yaml, width=2048) + "\n".join(self.base) + "\n" @@ -40,7 +44,7 @@ def write_output_file(self, path): 
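# Editor's sketch, not part of the patch: the get_property/set_property helpers
# below treat each base-file line as "NAME <assignment> value". A toy
# restatement, assuming "=" as the assignment token (the real class takes it
# from default_assignment):
def _parse_property(line, name, assignment="="):
    # Returns the value if this line assigns NAME, else None.
    if line.strip().upper().startswith(name.upper()):
        return line.split(assignment, maxsplit=1)[1].strip()
    return None

assert _parse_property("GENVERSION = MY_SIM", "genversion") == "MY_SIM"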
self.logger.info(f"Input file written to {path}") def get_property(self, name, assignment=None): - """ Get a property from the base file + """Get a property from the base file Parameters ---------- @@ -60,9 +64,17 @@ def get_property(self, name, assignment=None): return line.split(assignment, maxsplit=1)[1] return None - def set_property(self, name, value, section_start=None, section_end=None, assignment=None, only_add=False): - """ Ensures the property name value pair is set in the base file. - + def set_property( + self, + name, + value, + section_start=None, + section_end=None, + assignment=None, + only_add=False, + ): + """Ensures the property name value pair is set in the base file. + Set value to None to remove a property Parameters @@ -97,7 +109,11 @@ def set_property(self, name, value, section_start=None, section_end=None, assign else: continue - if not only_add and modified_line and modified_line.split()[0] == name.upper(): + if ( + not only_add + and modified_line + and modified_line.split()[0] == name.upper() + ): # Replace existing option or remove it if value is None: self.base[i] = "" @@ -109,7 +125,11 @@ def set_property(self, name, value, section_start=None, section_end=None, assign added = True break - if value is not None and reached_section and (section_end is not None and line.strip().startswith(section_end)): + if ( + value is not None + and reached_section + and (section_end is not None and line.strip().startswith(section_end)) + ): # Option doesn't exist, lets add it self.base.insert(i, desired_line) added = True @@ -121,4 +141,11 @@ def set_property(self, name, value, section_start=None, section_end=None, assign if others is not None: for o in others: - self.set_property(name, o, section_start=section_start, section_end=section_end, assignment=assignment, only_add=True) + self.set_property( + name, + o, + section_start=section_start, + section_end=section_end, + assignment=assignment, + only_add=True, + ) diff --git a/pippin/biascor.py b/pippin/biascor.py index b21bfb05..4fc830b4 100644 --- a/pippin/biascor.py +++ b/pippin/biascor.py @@ -7,7 +7,15 @@ from pippin.base import ConfigBasedExecutable from pippin.classifiers.classifier import Classifier -from pippin.config import chown_dir, mkdirs, get_config, ensure_list, get_data_loc, read_yaml, compress_dir +from pippin.config import ( + chown_dir, + mkdirs, + get_config, + ensure_list, + get_data_loc, + read_yaml, + compress_dir, +) from pippin.merge import Merger from pippin.task import Task @@ -22,7 +30,9 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi self.logging_file = os.path.join(self.output_dir, "output.log") self.global_config = get_config() - self.batch_replace = options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) batch_mem = self.batch_replace.get("REPLACE_MEM", None) if batch_mem is not None: self.yaml["CONFIG"]["BATCH_MEM"] = batch_mem @@ -39,8 +49,9 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi self.merged_iasim_fitopts = [0 for _ in self.merged_iasim] else: self.merged_iasim_fitopts = ensure_list(self.merged_iasim_fitopts) - assert len(self.merged_iasim_fitopts) == len(self.merged_iasim), \ - f"SIMFILE_BIASCOR_FITOPTS must be the same length as SIMFILE_BIASCOR ({len(self.merged_iasim)}), but is length {len(self.merged_iasim_fitopts)}" + assert len(self.merged_iasim_fitopts) == len( + self.merged_iasim + ), 
f"SIMFILE_BIASCOR_FITOPTS must be the same length as SIMFILE_BIASCOR ({len(self.merged_iasim)}), but is length {len(self.merged_iasim_fitopts)}" self.logger.debug(f"SIMFILE_BIASCOR_FITOPTS: {self.merged_iasim_fitopts}") self.merged_ccsim = config.get("SIMFILE_CCPRIOR") self.merged_ccsim_fitopts = config.get("SIMFILE_CCPRIOR_FITOPTS") @@ -49,8 +60,9 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi self.merged_ccsim_fitopts = [0 for _ in self.merged_ccsim] else: self.merged_ccsim_fitopts = ensure_list(self.merged_ccsim_fitopts) - assert len(self.merged_ccsim_fitopts) == len(self.merged_ccsim), \ - f"SIMFILE_CCPRIOR_FITOPTS must be the same length as SIMFILE_CCPRIOR ({len(self.merged_ccsim)}), but is length {len(self.merged_ccsim_fitopts)}" + assert len(self.merged_ccsim_fitopts) == len( + self.merged_ccsim + ), f"SIMFILE_CCPRIOR_FITOPTS must be the same length as SIMFILE_CCPRIOR ({len(self.merged_ccsim)}), but is length {len(self.merged_ccsim_fitopts)}" self.logger.debug(f"SIMFILE_CCPRIOR_FITOPTS: {self.merged_ccsim_fitopts}") self.classifier = config.get("CLASSIFIER") @@ -69,10 +81,16 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi self.output["blind"] = self.blind self.genversions = [m.output["genversion"] for m in self.merged_data] self.num_verions = [len(m.output["fitres_dirs"]) for m in self.merged_data] - self.output["fitopt_files"] = [m.output.get("fitopt_file") for m in self.merged_data] - self.genversion = "_".join(self.sim_names) + ("" if self.classifier is None else "_" + self.classifier.name) + self.output["fitopt_files"] = [ + m.output.get("fitopt_file") for m in self.merged_data + ] + self.genversion = "_".join(self.sim_names) + ( + "" if self.classifier is None else "_" + self.classifier.name + ) - self.config_filename = f"{self.name}.input" # Make sure this syncs with the tmp file name + self.config_filename = ( + f"{self.name}.input" # Make sure this syncs with the tmp file name + ) self.config_path = os.path.join(self.output_dir, self.config_filename) self.kill_file = self.config_path.replace(".input", "_KILL.LOG") self.job_name = os.path.basename(self.config_path) @@ -92,13 +110,19 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi if self.use_recalibrated: new_name = self.probability_column_name.replace("PROB_", "CPROB_") - self.logger.debug(f"Updating prob column name from {self.probability_column_name} to {new_name}. I hope it exists!") + self.logger.debug( + f"Updating prob column name from {self.probability_column_name} to {new_name}. I hope it exists!" 
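# Editor's sketch with a toy value, not part of the patch: USE_RECALIBRATED
# simply swaps the PROB_ prefix for the aggregator's recalibrated CPROB_
# columns, exactly as in the replace() call above.
prob_column = "PROB_SNN_TEST"
assert prob_column.replace("PROB_", "CPROB_") == "CPROB_SNN_TEST"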
+ ) self.probability_column_name = new_name self.output["fit_output_dir"] = self.fit_output_dir - self.output["NSPLITRAN"] = "NSPLITRAN" in [x.upper() for x in self.options.keys()] + self.output["NSPLITRAN"] = "NSPLITRAN" in [ + x.upper() for x in self.options.keys() + ] if self.output["NSPLITRAN"]: - self.output["NSPLITRAN_VAL"] = {x.upper(): y for x, y in self.options.items()}["NSPLITRAN"] + self.output["NSPLITRAN_VAL"] = { + x.upper(): y for x, y in self.options.items() + }["NSPLITRAN"] self.w_summary = os.path.join(self.fit_output_dir, "BBC_SUMMARY_wfit.FITRES") self.output["w_summary"] = self.w_summary @@ -113,15 +137,16 @@ def __init__(self, name, output_dir, config, dependencies, options, global_confi self.output["muopts"] = self.muopt_order self.output["hubble_plot"] = self.output_plots - self.devel = self.options.get('devel', 0) + self.devel = self.options.get("devel", 0) self.logger.debug(f"Devel option: {self.devel}") - self.do_iterate = False # Temp flag to stop iterating as BBC will reiterate natively + self.do_iterate = ( + False # Temp flag to stop iterating as BBC will reiterate natively + ) self.logger.debug(f"Do iterate: {self.do_iterate}") self.logger.debug(f"SNANA_DIR: {os.environ['SNANA_DIR']}") def set_m0dif_dirs(self): - versions = None # Check if the SUBMIT.INFO exists submit_info = os.path.join(self.fit_output_dir, "SUBMIT.INFO") @@ -134,16 +159,26 @@ def set_m0dif_dirs(self): else: num_dirs = self.num_verions[0] if self.output["NSPLITRAN"]: - self.output["subdirs"] = [f"OUTPUT_BBCFIT-{i + 1:04d}" for i in range(self.output["NSPLITRAN_VAL"])] + self.output["subdirs"] = [ + f"OUTPUT_BBCFIT-{i + 1:04d}" + for i in range(self.output["NSPLITRAN_VAL"]) + ] else: if num_dirs == 1: self.output["subdirs"] = ["OUTPUT_BBCFIT"] else: - self.output["subdirs"] = [f"OUTPUT_BBCFIT-{i + 1:04d}" for i in range(num_dirs)] + self.output["subdirs"] = [ + f"OUTPUT_BBCFIT-{i + 1:04d}" for i in range(num_dirs) + ] - self.output["m0dif_dirs"] = [os.path.join(self.fit_output_dir, s) for s in self.output["subdirs"]] + self.output["m0dif_dirs"] = [ + os.path.join(self.fit_output_dir, s) for s in self.output["subdirs"] + ] self.output_plots = [ - os.path.join(m, f"{self.name}_{(str(int(os.path.basename(m))) + '_') if os.path.basename(m).isdigit() else ''}hubble.png") + os.path.join( + m, + f"{self.name}_{(str(int(os.path.basename(m))) + '_') if os.path.basename(m).isdigit() else ''}hubble.png", + ) for m in self.output["m0dif_dirs"] ] @@ -158,35 +193,67 @@ def get_blind(self, config, options): def kill_and_fail(self): with open(self.kill_file, "w") as f: self.logger.info(f"Killing remaining jobs for {self.name}") - command = ["submit_batch_jobs.sh", "--kill", os.path.basename(self.config_path)] - subprocess.run([' '.join(command)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True) + command = [ + "submit_batch_jobs.sh", + "--kill", + os.path.basename(self.config_path), + ] + subprocess.run( + [" ".join(command)], + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + shell=True, + ) return Task.FINISHED_FAILURE def check_issues(self, kill=True): log_files = [self.logging_file] - dirs = [self.output_dir, self.fit_output_dir, os.path.join(self.fit_output_dir, "SCRIPTS_BBCFIT")] + self.output["m0dif_dirs"] + dirs = [ + self.output_dir, + self.fit_output_dir, + os.path.join(self.fit_output_dir, "SCRIPTS_BBCFIT"), + ] + self.output["m0dif_dirs"] for dir in dirs: if os.path.exists(dir): - log_files += [os.path.join(dir, f) for f in os.listdir(dir) if 
f.upper().endswith(".LOG")] - self.scan_files_for_error(log_files, "FATAL ERROR ABORT", "QOSMaxSubmitJobPerUserLimit", "DUE TO TIME LIMIT") + log_files += [ + os.path.join(dir, f) + for f in os.listdir(dir) + if f.upper().endswith(".LOG") + ] + self.scan_files_for_error( + log_files, + "FATAL ERROR ABORT", + "QOSMaxSubmitJobPerUserLimit", + "DUE TO TIME LIMIT", + ) if kill: return self.kill_and_fail() else: return Task.FINISHED_FAILURE def submit_reject_phase(self): - """ Merges the reject lists for each version, saves it to the output dir, modifies the input file, and resubmits if needed + """Merges the reject lists for each version, saves it to the output dir, modifies the input file, and resubmits if needed Returns: true if the job is resubmited, false otherwise """ - self.logger.info("Checking for rejected SNID after round 1 of BiasCor has finished") + self.logger.info( + "Checking for rejected SNID after round 1 of BiasCor has finished" + ) rejects = None for folder in self.output["m0dif_dirs"]: - - num_fitres_files = len([f for f in os.listdir(folder) if f.startswith("FITOPT") and ".FITRES" in f]) + num_fitres_files = len( + [ + f + for f in os.listdir(folder) + if f.startswith("FITOPT") and ".FITRES" in f + ] + ) if num_fitres_files < 2: - self.logger.debug(f"M0DIF dir {folder} has only {num_fitres_files} FITRES file, so rejecting wont do anything. Not taking it into account.") + self.logger.debug( + f"M0DIF dir {folder} has only {num_fitres_files} FITRES file, so rejecting wont do anything. Not taking it into account." + ) continue path = os.path.join(folder, "BBC_REJECT_SUMMARY.LIST") df = pd.read_csv(path, delim_whitespace=True, comment="#") @@ -196,7 +263,9 @@ def submit_reject_phase(self): else: rejects = rejects.append(df) if rejects is None or not rejects.shape[0]: - self.logger.info("No rejected SNIDs found, not rerunning, task finishing successfully") + self.logger.info( + "No rejected SNIDs found, not rerunning, task finishing successfully" + ) return Task.FINISHED_SUCCESS else: self.logger.info(f"Found {rejects.shape[0]} rejected SNIDs, will resubmit") @@ -219,7 +288,13 @@ def submit_reject_phase(self): command = ["submit_batch_jobs.sh", os.path.basename(self.config_filename)] self.logger.debug(f"Running command: {' '.join(command)}") with open(self.logging_file, "w") as f: - subprocess.run([' '.join(command)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True) + subprocess.run( + [" ".join(command)], + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + shell=True, + ) self.logger.notice(f"RESUBMITTED: BiasCor {self.name} task") return 1 @@ -230,20 +305,28 @@ def move_to_next_phase(self): pass return self.submit_reject_phase() else: - self.logger.info(f"On run iteration {self.run_iteration}, finishing successfully") + self.logger.info( + f"On run iteration {self.run_iteration}, finishing successfully" + ) return Task.FINISHED_SUCCESS def _check_completion(self, squeue): if os.path.exists(self.done_file): - self.logger.debug(f"Done file found for {self.name}, biascor task finishing") + self.logger.debug( + f"Done file found for {self.name}, biascor task finishing" + ) with open(self.done_file) as f: content = f.read().upper() if "FAIL" in content or "STOP" in content: - self.logger.error(f"Done file reporting failure! Check log in {self.logging_file} and other logs") + self.logger.error( + f"Done file reporting failure! 
Check log in {self.logging_file} and other logs" + ) return self.check_issues() else: - self.logger.debug(f"Found {self.w_summary}, task finished successfully") + self.logger.debug( + f"Found {self.w_summary}, task finished successfully" + ) return self.move_to_next_phase() elif not os.path.exists(self.merge_log): self.logger.error("MERGE.LOG was not created, job died on submission") @@ -254,12 +337,34 @@ def _check_completion(self, squeue): def get_simfile_biascor(self, ia_sims, fitopt_num=None): if (ia_sims is not None) and (fitopt_num is None): fitopt_num = [0 for _ in ia_sims] - return None if ia_sims is None else ",".join([os.path.join(ia_sims[i].output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz") for (i, n) in enumerate(fitopt_num)]) + return ( + None + if ia_sims is None + else ",".join( + [ + os.path.join( + ia_sims[i].output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz" + ) + for (i, n) in enumerate(fitopt_num) + ] + ) + ) def get_simfile_ccprior(self, cc_sims, fitopt_num=None): if (cc_sims is not None) and (fitopt_num is None): fitopt_num = [0 for _ in cc_sims] - return None if cc_sims is None else ",".join([os.path.join(cc_sims[i].output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz") for (i, n) in enumerate(fitopt_num)]) + return ( + None + if cc_sims is None + else ",".join( + [ + os.path.join( + cc_sims[i].output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz" + ) + for (i, n) in enumerate(fitopt_num) + ] + ) + ) def get_fitopt_map(self, datas): fitopts = {} @@ -281,7 +386,10 @@ def get_fitopt_map(self, datas): # Now for each of the labels we've found in all files, construct the output dict # Which is just FITOPT004: {DES_NAME: FITOPT004, LOWZ_NAME: FITOPT029}... etc index = 0 - result = {"SURVEY_LIST": " ".join([d.output["SURVEY"] for d in datas]), "FITOPT000": " ".join(["FITOPT000" for d in datas])} + result = { + "SURVEY_LIST": " ".join([d.output["SURVEY"] for d in datas]), + "FITOPT000": " ".join(["FITOPT000" for d in datas]), + } index_map = ["DEFAULT"] for label, mapping in fitopts.items(): index += 1 @@ -291,30 +399,40 @@ def get_fitopt_map(self, datas): return result, index_map def write_input(self): - if self.merged_iasim is not None: - for (i, m) in enumerate(self.merged_iasim): + for i, m in enumerate(self.merged_iasim): if len(m.output["fitres_dirs"]) > 1: - self.logger.warning(f"Your IA sim {m} has multiple versions! Using 0 index from options {m.output['fitres_dirs']}") + self.logger.warning( + f"Your IA sim {m} has multiple versions! Using 0 index from options {m.output['fitres_dirs']}" + ) self.logger.debug(f"Fitres directory: {m.output['fitres_dirs'][0]}") n = self.merged_iasim_fitopts[i] - assert os.path.exists(os.path.join(m.output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz")), f"FITOPT{n:03} does not exist for your IA sim {m}" + assert os.path.exists( + os.path.join(m.output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz") + ), f"FITOPT{n:03} does not exist for your IA sim {m}" if self.merged_ccsim is not None: - for (i, m) in enumerate(self.merged_ccsim): + for i, m in enumerate(self.merged_ccsim): if len(m.output["fitres_dirs"]) > 1: - self.logger.warning(f"Your CC sim {m} has multiple versions! Using 0 index from options {m.output['fitres_dirs']}") + self.logger.warning( + f"Your CC sim {m} has multiple versions! 
Using 0 index from options {m.output['fitres_dirs']}" + ) self.logger.debug(f"Fitres directory: {m.output['fitres_dirs'][0]}") n = self.merged_ccsim_fitopts[i] - assert os.path.exists(os.path.join(m.output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz")), f"FITOPT{n:03} does not exist for your CC sim {m}" - - - self.bias_cor_fits = self.get_simfile_biascor(self.merged_iasim, self.merged_iasim_fitopts) - self.cc_prior_fits = self.get_simfile_ccprior(self.merged_ccsim, self.merged_ccsim_fitopts) + assert os.path.exists( + os.path.join(m.output["fitres_dirs"][0], f"FITOPT{n:03}.FITRES.gz") + ), f"FITOPT{n:03} does not exist for your CC sim {m}" + + self.bias_cor_fits = self.get_simfile_biascor( + self.merged_iasim, self.merged_iasim_fitopts + ) + self.cc_prior_fits = self.get_simfile_ccprior( + self.merged_ccsim, self.merged_ccsim_fitopts + ) self.data = [m.output["lc_output_dir"] for m in self.merged_data] self.data_fitres = [m.output["fitres_file"] for m in self.merged_data] - #print('MERGED DATA') - #print(self.yaml) - #print('------------') + # print('MERGED DATA') + # print(self.yaml) + # print('------------') self.yaml["FITOPT_MAP"], fitopt_index = self.get_fitopt_map(self.merged_data) self.output["fitopt_index"] = fitopt_index @@ -375,24 +493,30 @@ def write_input(self): muopt_scales[label] = value.get("SCALE", 1.0) mu_str = f"/{label}/ " if value.get("SIMFILE_BIASCOR"): - value_simfile_biascor = value.get('SIMFILE_BIASCOR') - value_simfile_biascor_fitopts = value.get('SIMFILE_BIASCOR_FITOPTS') + value_simfile_biascor = value.get("SIMFILE_BIASCOR") + value_simfile_biascor_fitopts = value.get("SIMFILE_BIASCOR_FITOPTS") if value_simfile_biascor_fitopts is None: value_simfile_biascor_fitopts = [0 for _ in value_simfile_biascor] else: - value_simfile_biascor_fitopts = ensure_list(value_simfile_biascor_fitopts) - assert len(value_simfile_biascor_fitopts) == len(value_simfile_biascor), \ - f"SIMFILE_BIASCOR_FITOPTS must be the same length as SIMFILE_BIASCOR ({len(value_simfile_biascor)}), but is length {len(value_simfile_biascor_fitopt)}" + value_simfile_biascor_fitopts = ensure_list( + value_simfile_biascor_fitopts + ) + assert len(value_simfile_biascor_fitopts) == len( + value_simfile_biascor + ), f"SIMFILE_BIASCOR_FITOPTS must be the same length as SIMFILE_BIASCOR ({len(value_simfile_biascor)}), but is length {len(value_simfile_biascor_fitopt)}" mu_str += f"simfile_biascor={self.get_simfile_biascor(value_simfile_biascor, value_simfile_biascor_fitopts)} " if value.get("SIMFILE_CCPRIOR"): - value_simfile_ccprior = value.get('SIMFILE_CCPRIOR') - value_simfile_ccprior_fitopts = value.get('SIMFILE_CCPRIOR_FITOPTS') + value_simfile_ccprior = value.get("SIMFILE_CCPRIOR") + value_simfile_ccprior_fitopts = value.get("SIMFILE_CCPRIOR_FITOPTS") if value_simfile_ccprior_fitopts is None: value_simfile_ccprior_fitopts = [0 for _ in value_simfile_ccprior] else: - value_simfile_ccprior_fitopts = ensure_list(value_simfile_ccprior_fitopts) - assert len(value_simfile_ccprior_fitopts) == len(value_simfile_ccprior), \ - f"SIMFILE_CCPRIOR_FITOPTS must be the same length as SIMFILE_CCPRIOR ({len(value_simfile_ccprior)}), but is length {len(value_simfile_ccprior_fitopt)}" + value_simfile_ccprior_fitopts = ensure_list( + value_simfile_ccprior_fitopts + ) + assert len(value_simfile_ccprior_fitopts) == len( + value_simfile_ccprior + ), f"SIMFILE_CCPRIOR_FITOPTS must be the same length as SIMFILE_CCPRIOR ({len(value_simfile_ccprior)}), but is length {len(value_simfile_ccprior_fitopt)}" mu_str += 
f"simfile_ccprior={self.get_simfile_ccprior(value_simfile_ccprior, value_simfile_ccprior_fitopts)} " if value.get("CLASSIFIER"): cname = self.prob_cols[value.get("CLASSIFIER").name] @@ -403,7 +527,9 @@ def write_input(self): if value.get("FITOPT") is not None: mu_str += f"FITOPT={value.get('FITOPT')} " for opt, opt_value in value.get("OPTS", {}).items(): - self.logger.info(f"In MUOPT {label}, found OPTS flag for myopt with opt {opt} and value {opt_value}") + self.logger.info( + f"In MUOPT {label}, found OPTS flag for myopt with opt {opt} and value {opt_value}" + ) if "CUTWIN_" in opt: opt2 = opt.replace("CUTWIN_", "") if opt2 == "PROB_IA": @@ -449,7 +575,13 @@ def _run(self): self.logger.debug(f"Will output log at {self.logging_file}") self.logger.debug(f"Running command: {' '.join(command)}") with open(self.logging_file, "w") as f: - subprocess.run([' '.join(command)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True) + subprocess.run( + [" ".join(command)], + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + shell=True, + ) chown_dir(self.output_dir) self.set_m0dif_dirs() else: @@ -460,7 +592,11 @@ def _run(self): @staticmethod def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config): merge_tasks = Task.get_task_of_type(prior_tasks, Merger) - prob_cols = {k: v for d in [t.output["classifier_merge"] for t in merge_tasks] for k, v in d.items()} + prob_cols = { + k: v + for d in [t.output["classifier_merge"] for t in merge_tasks] + for k, v in d.items() + } classifier_tasks = Task.get_task_of_type(prior_tasks, Classifier) tasks = [] @@ -484,8 +620,12 @@ def resolve_classifiers(names): task = [c for c in classifier_tasks if c.name in names] if len(task) == 0: if len(names) > 1: - Task.fail_config(f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!") - Task.logger.info(f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead.") + Task.fail_config( + f"CLASSIFIERS {names} do not match any classifiers. If these are prob column names, you must specify only one!" + ) + Task.logger.info( + f"CLASSIFIERS {names} matched no classifiers. Checking prob column names instead." + ) task = [c for c in classifier_tasks if prob_cols[c.name] in names] if len(task) == 0: choices = [prob_cols[c.name] for c in task] @@ -498,7 +638,9 @@ def resolve_classifiers(names): if len(choices) == 1: task = [task[0]] else: - Task.fail_config(f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}") + Task.fail_config( + f"Found multiple classifiers. Please instead specify a column name. Your choices: {choices}" + ) return task[0] # We only care about the prob column name def resolve_merged_fitres_files(name, classifier_name): @@ -511,12 +653,17 @@ def resolve_merged_fitres_files(name, classifier_name): message = f"Resolved multiple merge tasks {task} for name {name}" Task.fail_config(message) else: - if classifier_name is not None and classifier_name not in task[0].output["classifier_names"]: - if prob_cols[classifier_name] not in [prob_cols[n] for n in task[0].output['classifier_names']]: + if ( + classifier_name is not None + and classifier_name not in task[0].output["classifier_names"] + ): + if prob_cols[classifier_name] not in [ + prob_cols[n] for n in task[0].output["classifier_names"] + ]: Task.logger.warning( f"When constructing Biascor {gname}, merge input {name} does not have classifier {classifier_name}. 
" f"If this is a spec confirmed sample, or an EXTERNAL task, all good, else check this." - ) + ) return task[0] # Ensure classifiers point to the same prob column @@ -529,17 +676,22 @@ def validate_classifiers(classifier_names): if name in prob_cols.values(): prob_col.append(name) else: - Task.fail_config(f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!") + Task.fail_config( + f"Classifier {name} has no prob column name in {prob_cols}. This should never happen!" + ) else: prob_col.append(col) if len(set(prob_col)) > 1: - Task.fail_config(f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage.") + Task.fail_config( + f"Classifiers {classifier_names} map to different probability columns: {prob_cols}, you may need to map them to the same name via MERGE_CLASSIFIERS in the AGGREGATION stage." + ) else: - Task.logger.debug(f"Classifiers {classifier_names} map to {prob_col[0]}") - + Task.logger.debug( + f"Classifiers {classifier_names} map to {prob_col[0]}" + ) def resolve_conf(subdict, default=None): - """ Resolve the sub-dictionary and keep track of all the dependencies """ + """Resolve the sub-dictionary and keep track of all the dependencies""" deps = [] # If this is a muopt, allow access to the base config's resolution @@ -551,29 +703,36 @@ def resolve_conf(subdict, default=None): if classifier_names is not None: classifier_names = ensure_list(classifier_names) validate_classifiers(classifier_names) - #Task.logger.debug(f"XXX names: {classifier_names}") + # Task.logger.debug(f"XXX names: {classifier_names}") # Only if all classifiers point to the same prob_column should you continue classifier_task = None if classifier_names is not None: classifier_task = resolve_classifiers(classifier_names) - #Task.logger.debug(f"XXX tasks: {classifier_task}") - classifier_dep = classifier_task or default.get("CLASSIFIER") # For resolving merge tasks + # Task.logger.debug(f"XXX tasks: {classifier_task}") + classifier_dep = classifier_task or default.get( + "CLASSIFIER" + ) # For resolving merge tasks if classifier_dep is not None: classifier_dep = classifier_dep.name - #Task.logger.debug(f"XXX deps: {classifier_dep}") + # Task.logger.debug(f"XXX deps: {classifier_dep}") if "CLASSIFIER" in subdict: subdict["CLASSIFIER"] = classifier_task if classifier_task is not None: deps.append(classifier_task) - #Task.logger.debug(f"XXX global deps: {deps}") + # Task.logger.debug(f"XXX global deps: {deps}") # Get the Ia sims simfile_ia = subdict.get("SIMFILE_BIASCOR") if default is None and simfile_ia is None: - Task.fail_config(f"You must specify SIMFILE_BIASCOR for the default biascor. Supply a simulation name that has a merged output") + Task.fail_config( + f"You must specify SIMFILE_BIASCOR for the default biascor. 
Supply a simulation name that has a merged output" + ) if simfile_ia is not None: simfile_ia = ensure_list(simfile_ia) - simfile_ia_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_ia] + simfile_ia_tasks = [ + resolve_merged_fitres_files(s, classifier_dep) + for s in simfile_ia + ] deps += simfile_ia_tasks subdict["SIMFILE_BIASCOR"] = simfile_ia_tasks @@ -584,7 +743,10 @@ def resolve_conf(subdict, default=None): Task.logger.warning(message) if simfile_cc is not None: simfile_cc = ensure_list(simfile_cc) - simfile_cc_tasks = [resolve_merged_fitres_files(s, classifier_dep) for s in simfile_cc] + simfile_cc_tasks = [ + resolve_merged_fitres_files(s, classifier_dep) + for s in simfile_cc + ] deps += simfile_cc_tasks subdict["SIMFILE_CCPRIOR"] = simfile_cc_tasks @@ -594,11 +756,15 @@ def resolve_conf(subdict, default=None): # Resolve the data section data_names = config.get("DATA") if data_names is None: - Task.fail_config("For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task") + Task.fail_config( + "For BIASCOR tasks you need to specify an input DATA which is a mask for a merged task" + ) data_names = ensure_list(data_names) class_task = config.get("CLASSIFIER") class_name = class_task.name if class_task is not None else None - data_tasks = [resolve_merged_fitres_files(s, class_name) for s in data_names] + data_tasks = [ + resolve_merged_fitres_files(s, class_name) for s in data_names + ] deps += data_tasks config["DATA"] = data_tasks @@ -609,7 +775,14 @@ def resolve_conf(subdict, default=None): for label, mu_conf in muopts.items(): deps += resolve_conf(mu_conf, default=config) - task = BiasCor(name, _get_biascor_output_dir(base_output_dir, stage_number, name), config, deps, options, global_config) + task = BiasCor( + name, + _get_biascor_output_dir(base_output_dir, stage_number, name), + config, + deps, + options, + global_config, + ) Task.logger.info(f"Creating aggregation task {name} with {task.num_jobs}") tasks.append(task) diff --git a/pippin/classifiers/classifier.py b/pippin/classifiers/classifier.py index d06206c3..d46fa570 100644 --- a/pippin/classifiers/classifier.py +++ b/pippin/classifiers/classifier.py @@ -7,8 +7,9 @@ from pippin.snana_sim import SNANASimulation from pippin.snana_fit import SNANALightCurveFit + class Classifier(Task): - """ Classification task + """Classification task CONFIGURATION: ============== @@ -41,7 +42,17 @@ class Classifier(Task): TRAIN = 0 PREDICT = 1 - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): super().__init__(name, output_dir, config=config, dependencies=dependencies) self.options = options self.index = index @@ -52,7 +63,7 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= @abstractmethod def predict(self): - """ Predict probabilities for given dependencies + """Predict probabilities for given dependencies :return: true or false for success in launching the job """ @@ -60,7 +71,7 @@ def predict(self): @abstractmethod def train(self): - """ Train a model to file for given dependencies + """Train a model to file for given dependencies :return: true or false for success in launching the job """ @@ -68,17 +79,16 @@ def train(self): @staticmethod def get_requirements(config): - """ Return what data is actively used by the classifier + """Return what data is actively used by the 
classifier :param config: the input dictionary `OPTS` from the config file :return: a two tuple - (needs simulation photometry, needs a fitres file) """ return True, True - @staticmethod def get_optional_requirements(config): - """ Return what data *may* be used by the classier. + """Return what data *may* be used by the classifier. Default behaviour: if OPTIONAL_MASK != "": True, True @@ -94,7 +104,7 @@ def get_optional_requirements(config): opt_sim = ("OPTIONAL_MASK" in config) or ("OPTIONAL_MASK_SIM" in config) opt_fit = ("OPTIONAL_MASK" in config) or ("OPTIONAL_MASK_FIT" in config) - return opt_sim, opt_fit + return opt_sim, opt_fit def get_fit_dependency(self, output=True): fit_deps = [] @@ -118,13 +128,17 @@ def validate_model(self): if self.mode == Classifier.PREDICT: model = self.options.get("MODEL") if model is None: - Task.fail_config(f"Classifier {self.name} is in predict mode but does not have a model specified") + Task.fail_config( + f"Classifier {self.name} is in predict mode but does not have a model specified" + ) model_classifier = self.get_model_classifier() if model_classifier is not None and model_classifier.name == model: return True path = get_data_loc(model) if not os.path.exists(path): - Task.fail_config(f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path") + Task.fail_config( + f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path" + ) return True def get_model_classifier(self): @@ -164,8 +178,18 @@ def get_prob_column_name(self): def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config): from pippin.classifiers.factory import ClassifierFactory - def _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=None, extra=None): - sim_name = "" if sim_name is None or fit_name is not None else "_" + sim_name + def _get_clas_output_dir( + base_output_dir, + stage_number, + sim_name, + fit_name, + clas_name, + index=None, + extra=None, + ): + sim_name = ( + "" if sim_name is None or fit_name is not None else "_" + sim_name + ) fit_name = "" if fit_name is None else "_" + fit_name extra_name = "" if extra is None else "_" + extra index = "" if index is None else f"_{index}" @@ -174,9 +198,16 @@ def _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas def get_num_ranseed(sim_tasks, lcfit_tasks): num = 0 if len(sim_tasks) > 0: - return min([len(sim_task.output["sim_folders"]) for sim_task in sim_tasks]) + return min( + [len(sim_task.output["sim_folders"]) for sim_task in sim_tasks] + ) if len(lcfit_tasks) > 0: - return min([len(lcfit_task.output["fitres_dirs"]) for lcfit_task in lcfit_tasks]) + return min( + [ + len(lcfit_task.output["fitres_dirs"]) + for lcfit_task in lcfit_tasks + ] + ) raise ValueError("Classifier dependency has no sim_task or lcfit_task?") tasks = [] @@ -188,11 +219,18 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): cls = ClassifierFactory.get(name) options = config.get("OPTS", {}) if options == None: - Task.fail_config(f"Classifier {clas_name} has no OPTS specified -- either remove the OPTS keyword or specify some options under it") + Task.fail_config( + f"Classifier {clas_name} has no OPTS specified -- either remove the OPTS keyword or specify some options under it" + ) if "MODE" not in config: - Task.fail_config(f"Classifier task {clas_name} needs to specify MODE as train or predict") + Task.fail_config( + f"Classifier task {clas_name} needs to specify MODE as train or
predict" + ) mode = config["MODE"].lower() - assert mode in ["train", "predict"], "MODE should be either train or predict" + assert mode in [ + "train", + "predict", + ], "MODE should be either train or predict" if mode == "train": mode = Classifier.TRAIN else: @@ -200,11 +238,17 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): # Prevent mode = predict and SIM_FRACTION < 1 if mode == Classifier.PREDICT and options.get("SIM_FRACTION", 1) > 1: - Task.fail_config("SIM_FRACTION must be 1 (all sims included) for predict mode") + Task.fail_config( + "SIM_FRACTION must be 1 (all sims included) for predict mode" + ) # Validate that train is not used on certain classifiers if mode == Classifier.TRAIN: - assert name not in ["PerfectClassifier", "UnityClassifier", "FitProbClassifier"], f"Can not use train mode with {name}" + assert name not in [ + "PerfectClassifier", + "UnityClassifier", + "FitProbClassifier", + ], f"Can not use train mode with {name}" needs_sim, needs_lc = cls.get_requirements(options) @@ -213,10 +257,10 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): opt_deps = [] if opt_sim or opt_lc: # Get all optional masks - mask = options.get("OPTIONAL_MASK", "") + mask = options.get("OPTIONAL_MASK", "") mask_sim = options.get("OPTIONAL_MASK_SIM", "") mask_fit = options.get("OPTIONAL_MASK_FIT", "") - + # If no optional masks are set, use base masks if not any([mask, mask_sim, mask_fit]): mask = config.get("MASK", "") @@ -227,7 +271,9 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): optional_sim_tasks = [] if opt_sim: if not any([mask, mask_sim]): - Task.logger.debug(f"No optional sim masks set, all sim tasks included as dependendencies") + Task.logger.debug( + f"No optional sim masks set, all sim tasks included as dependendencies" + ) optional_sim_tasks = sim_tasks else: for s in sim_tasks: @@ -236,14 +282,20 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): elif mask and mask in s.name: optional_sim_tasks.append(s) if len(optional_sim_tasks) == 0: - Task.logger.warn(f"Optional SIM dependency but no matching sim tasks for MASK: {mask} or MASK_SIM: {mask_sim}") + Task.logger.warn( + f"Optional SIM dependency but no matching sim tasks for MASK: {mask} or MASK_SIM: {mask_sim}" + ) else: - Task.logger.debug(f"Found {len(optional_sim_tasks)} optional SIM dependencies") + Task.logger.debug( + f"Found {len(optional_sim_tasks)} optional SIM dependencies" + ) # Get optional lcfit tasks optional_lcfit_tasks = [] if opt_lc: if not any([mask, mask_fit]): - Task.logger.debug(f"No optional lcfit masks set, all lcfit tasks included as dependendencies") + Task.logger.debug( + f"No optional lcfit masks set, all lcfit tasks included as dependendencies" + ) optional_lcfit_tasks = lcfit_tasks else: for l in lcfit_tasks: @@ -252,9 +304,13 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): elif mask and mask in l.name: optional_lcfit_tasks.append(l) if len(optional_lcfit_tasks) == 0: - Task.logger.warn(f"Optional LCFIT dependency but no matching lcfit tasks for MASK: {mask} or MASK_FIT: {mask_fit}") + Task.logger.warn( + f"Optional LCFIT dependency but no matching lcfit tasks for MASK: {mask} or MASK_FIT: {mask_fit}" + ) else: - Task.logger.debug(f"Found {len(optional_lcfit_tasks)} optional LCFIT dependencies") + Task.logger.debug( + f"Found {len(optional_lcfit_tasks)} optional LCFIT dependencies" + ) opt_deps = optional_sim_tasks + optional_lcfit_tasks runs = [] @@ -280,7 +336,9 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): elif needs_sim: runs = [[(s, None)] for s in sim_tasks] else: - Task.logger.warn(f"Classifier 
{name} does not need sims or fits. Wat.") + Task.logger.warn( + f"Classifier {name} does not need sims or fits. Wat." + ) num_gen = 0 mask = config.get("MASK", "") @@ -295,7 +353,9 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): matched_combined = len(run) > 1 else: if len(run) > 1: - Task.logger.warn(f"Classifier {name} has multiple tasks -- this should only occur when COMBINE_MASK is specified. Using first task.") + Task.logger.warn( + f"Classifier {name} has multiple tasks -- this should only occur when COMBINE_MASK is specified. Using first task." + ) s, l = run[0] sim_name = s.name if s is not None else None @@ -310,8 +370,16 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): matched_fit = matched_fit and mask_sim in sim_name if not matched_fit or not matched_sim or not matched_combined: continue - sim_deps = [sim_fit_tuple[0] for sim_fit_tuple in run if sim_fit_tuple[0] is not None] - fit_deps = [sim_fit_tuple[1] for sim_fit_tuple in run if sim_fit_tuple[1] is not None] + sim_deps = [ + sim_fit_tuple[0] + for sim_fit_tuple in run + if sim_fit_tuple[0] is not None + ] + fit_deps = [ + sim_fit_tuple[1] + for sim_fit_tuple in run + if sim_fit_tuple[1] is not None + ] model = options.get("MODEL") @@ -332,8 +400,16 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): deps = sim_deps + fit_deps + opt_deps - sim_name = "_".join([s.name for s in sim_deps if s is not None]) if len(sim_deps) > 0 else None - fit_name = "_".join([l.name for l in fit_deps if l is not None]) if len(fit_deps) > 0 else None + sim_name = ( + "_".join([s.name for s in sim_deps if s is not None]) + if len(sim_deps) > 0 + else None + ) + fit_name = ( + "_".join([l.name for l in fit_deps if l is not None]) + if len(fit_deps) > 0 + else None + ) if model is not None: if "/" in model or "." in model: @@ -345,8 +421,25 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): indexes = get_num_ranseed(sim_deps, fit_deps) for i in range(indexes): num = i + 1 if indexes > 1 else None - clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra) - cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i, model_name=extra) + clas_output_dir = _get_clas_output_dir( + base_output_dir, + stage_number, + sim_name, + fit_name, + clas_name, + index=num, + extra=extra, + ) + cc = cls( + clas_name, + clas_output_dir, + config, + deps, + mode, + options, + index=i, + model_name=extra, + ) Task.logger.info( f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}" ) @@ -354,34 +447,70 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): tasks.append(cc) else: - Task.fail_config(f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}") + Task.fail_config( + f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}" + ) else: if len(tasks) == 0: - Task.fail_config(f"Your model {model} has not yet been defined.") + Task.fail_config( + f"Your model {model} has not yet been defined." + ) for t in tasks: if model == t.name: # deps.append(t) extra = t.get_unique_name() - assert isinstance(t, cls), f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!" + assert isinstance( + t, cls + ), f"Model {clas_name} with class {cls} has model {model} with class {t.__class__}, they should match!" 
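For orientation, the MODEL handling above reduces to a path-versus-task-name rule: a value containing "/" or "." is treated as a serialised model file, while anything else must name an earlier training task of the same classifier class. Below is a minimal sketch of that rule; resolve_model is illustrative rather than pippin API, and the real code additionally routes paths through get_output_loc.

import os

def resolve_model(model, training_tasks):
    # "/" or "." marks a serialised model file on disk.
    if "/" in model or "." in model:
        if not os.path.exists(model):
            raise FileNotFoundError(f"Model {model} looks like a path, but nothing is there")
        return model
    # Otherwise the value must name an earlier training task (assumed to expose .name).
    for t in training_tasks:
        if t.name == model:
            return t
    raise ValueError(f"Model {model} has not yet been defined")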
indexes = get_num_ranseed(sim_deps, fit_deps) for i in range(indexes): num = i + 1 if indexes > 1 else None - clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num, extra=extra) - cc = cls(clas_name, clas_output_dir, config, deps + [t], mode, options, index=i) + clas_output_dir = _get_clas_output_dir( + base_output_dir, + stage_number, + sim_name, + fit_name, + clas_name, + index=num, + extra=extra, + ) + cc = cls( + clas_name, + clas_output_dir, + config, + deps + [t], + mode, + options, + index=i, + ) Task.logger.info( f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}" ) num_gen += 1 tasks.append(cc) else: - indexes = get_num_ranseed(sim_deps, fit_deps) for i in range(indexes): num = i + 1 if indexes > 1 else None - clas_output_dir = _get_clas_output_dir(base_output_dir, stage_number, sim_name, fit_name, clas_name, index=num) - cc = cls(clas_name, clas_output_dir, config, deps, mode, options, index=i) + clas_output_dir = _get_clas_output_dir( + base_output_dir, + stage_number, + sim_name, + fit_name, + clas_name, + index=num, + ) + cc = cls( + clas_name, + clas_output_dir, + config, + deps, + mode, + options, + index=i, + ) Task.logger.info( f"Creating classification task {name} with {cc.num_jobs} jobs, for LC fit {fit_name} on simulation {sim_name} and index {i}" ) @@ -389,5 +518,7 @@ def get_num_ranseed(sim_tasks, lcfit_tasks): tasks.append(cc) if num_gen == 0: - Task.fail_config(f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits") + Task.fail_config( + f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits" + ) return tasks diff --git a/pippin/classifiers/fitprob.py b/pippin/classifiers/fitprob.py index e2b73b4a..2919659c 100644 --- a/pippin/classifiers/fitprob.py +++ b/pippin/classifiers/fitprob.py @@ -6,7 +6,7 @@ class FitProbClassifier(Classifier): - """ FitProb classifier + """FitProb classifier CONFIGURATION: ============== @@ -28,8 +28,27 @@ class FitProbClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.output_file = None self.passed = False self.num_jobs = 1 # This is the default. Can get this from options if needed. @@ -43,16 +62,24 @@ def classify(self): mkdirs(self.output_dir) input = self.get_fit_dependency()[0] if len(self.get_fit_dependency()) > 1: - self.logger.warning(f"Found more than one fit dependency for {self.name}, possibly because COMBINE_MASK is being used. FITPROB doesn't currently support this. Using first one.") - fitres_file = os.path.join(input["fitres_dirs"][self.index], input["fitopt_map"][self.fitopt]) + self.logger.warning( + f"Found more than one fit dependency for {self.name}, possibly because COMBINE_MASK is being used. FITPROB doesn't currently support this. Using first one." 
+ ) + fitres_file = os.path.join( + input["fitres_dirs"][self.index], input["fitopt_map"][self.fitopt] + ) self.logger.debug(f"Looking for {fitres_file}") if not os.path.exists(fitres_file): - self.logger.error(f"FITRES file could not be found at {fitres_file}, classifer has nothing to work with") + self.logger.error( + f"FITRES file could not be found at {fitres_file}, classifier has nothing to work with" + ) self.passed = False return False df = pd.read_csv(fitres_file, delim_whitespace=True, comment="#") - df = df[["CID", "FITPROB"]].rename(columns={"FITPROB": self.get_prob_column_name()}) + df = df[["CID", "FITPROB"]].rename( + columns={"FITPROB": self.get_prob_column_name()} + ) self.logger.info(f"Saving probabilities to {self.output_file}") df.to_csv(self.output_file, index=False, float_format="%0.4f") diff --git a/pippin/classifiers/nearest_neighbor.py b/pippin/classifiers/nearest_neighbor.py index 158dc9b6..8d6ec864 100644 --- a/pippin/classifiers/nearest_neighbor.py +++ b/pippin/classifiers/nearest_neighbor.py @@ -9,7 +9,7 @@ class NearestNeighborClassifier(Classifier): - """ Nearest Neighbour classifier + """Nearest Neighbour classifier CONFIGURATION ============= @@ -33,8 +33,27 @@ class NearestNeighborClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.passed = False self.num_jobs = 40 self.outfile_train = f"{output_dir}/NN_trainResult.out" @@ -53,7 +72,9 @@ def setup(self): lcfit = self.get_fit_dependency() self.fitopt = self.options.get("FITOPT", "DEFAULT") self.fitres_filename = lcfit["fitopt_map"][self.fitopt] - self.fitres_path = os.path.abspath(os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename)) + self.fitres_path = os.path.abspath( + os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename) + ) def train(self): # Created April 2019 by R.Kessler @@ -89,7 +110,10 @@ def prepare_train_job(self): nml_file_train1 = f"{temp_dir}/{genversion}-2.nml" nml_file_train2 = f"{self.output_dir}/{genversion}-2.nml" - train_info_local = {"outfile_NNtrain": outfile_train, "nml_file_NNtrain": nml_file_train2} + train_info_local = { + "outfile_NNtrain": outfile_train, + "nml_file_NNtrain": nml_file_train2, + } # construct sed to copy original NMLFILE and to # + replace OUTDIR: @@ -117,7 +141,9 @@ def prepare_train_job(self): # make sure that the new NML file is really there if not os.path.isfile(nml_file_train1): - self.logger.error(f"Unable to create {nml_file_train1} with sed command {sed_command}") + self.logger.error( + f"Unable to create {nml_file_train1} with sed command {sed_command}" + ) return None # check that expected FITRES ref file is really there.
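The FitProbClassifier.classify step above is deliberately trivial: the whole "classifier" is a column rename on the FITRES table. A standalone sketch of the same idea, with illustrative file and column names:

import pandas as pd

def fitprob_predictions(fitres_file, prob_col, out_csv="predictions.csv"):
    # FITRES files are whitespace-delimited tables with '#' comment lines.
    df = pd.read_csv(fitres_file, delim_whitespace=True, comment="#")
    # SNANA's FITPROB already behaves like a probability, so the classifier
    # simply relabels it as this task's probability column.
    df = df[["CID", "FITPROB"]].rename(columns={"FITPROB": prob_col})
    df.to_csv(out_csv, index=False, float_format="%0.4f")
    return df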
@@ -130,7 +156,9 @@ def prepare_train_job(self): f.write("\n# NNINP below added by prepare_NNtrainJob\n") f.write("\n&NNINP \n") f.write(" NEARNBR_TRAINFILE_PATH = '%s' \n" % fitres_dir) - f.write(" NEARNBR_TRAINFILE_LIST = '%s' \n" % os.path.basename(fitres_file)) + f.write( + " NEARNBR_TRAINFILE_LIST = '%s' \n" % os.path.basename(fitres_file) + ) f.write(" NEARNBR_SEPMAX_VARDEF = '%s' \n" % self.nn_options) f.write(" NEARNBR_TRUETYPE_VARNAME = 'SIM_TYPE_INDEX' \n") f.write(" NEARNBR_TRAIN_ODDEVEN = T \n") @@ -153,7 +181,11 @@ def prepare_train_job(self): return None, train_info_local def run_train_job(self): - cmd = ["split_and_fit.pl", self.train_info_local["nml_file_NNtrain"], "NOPROMPT"] + cmd = [ + "split_and_fit.pl", + self.train_info_local["nml_file_NNtrain"], + "NOPROMPT", + ] self.logger.debug(f"Launching training via {cmd}") with open(self.logging_file, "w") as f: subprocess.run(cmd, stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir) @@ -182,7 +214,9 @@ def _check_completion(self, squeue): return 0 else: if os.path.exists(self.outfile_predict): - self.logger.debug(f"Predictions can be found at {self.outfile_predict}") + self.logger.debug( + f"Predictions can be found at {self.outfile_predict}" + ) self.output["predictions_filename"] = self.outfile_predict return Task.FINISHED_SUCCESS else: @@ -193,8 +227,12 @@ def _check_completion(self, squeue): with open(self.logging_file, "r") as f: output_error = False for line in f.read().splitlines(): - if ("ERROR" in line or ("ABORT" in line and " 0 " not in line)) and not output_error: - self.logger.error(f"Fatal error in light curve fitting. See {self.logging_file} for details.") + if ( + "ERROR" in line or ("ABORT" in line and " 0 " not in line) + ) and not output_error: + self.logger.error( + f"Fatal error in light curve fitting. See {self.logging_file} for details." 
+ ) output_error = True if output_error: self.logger.info(f"Excerpt: {line}") @@ -214,10 +252,14 @@ def predict(self): self.setup() model = self.options.get("MODEL") - assert model is not None, "If TRAIN is not specified, you have to point to a model to use" + assert ( + model is not None + ), "If TRAIN is not specified, you have to point to a model to use" for t in self.dependencies: if model == t.name: - self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}") + self.logger.debug( + f"Found task dependency {t.name} with model file {t.output['model_filename']}" + ) model = t.output["model_filename"] model_path = get_output_loc(model) @@ -241,7 +283,12 @@ def predict(self): cmd_job = "%s %s %s" % (job_name, inArgs, outArgs) self.logger.debug(f"Executing command {cmd_job}") with open(self.logging_file, "w") as f: - val = subprocess.run(cmd_job.split(" "), stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir) + val = subprocess.run( + cmd_job.split(" "), + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + ) with open(self.done_file, "w") as f: if val.returncode == 0: f.write("SUCCESS") diff --git a/pippin/classifiers/nearest_neighbor_code.py b/pippin/classifiers/nearest_neighbor_code.py index 6e31f287..28cb7366 100644 --- a/pippin/classifiers/nearest_neighbor_code.py +++ b/pippin/classifiers/nearest_neighbor_code.py @@ -15,29 +15,79 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("nn.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("nn.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("fitres_file", help="the name of the fitrres file to load. For example: somepath/FITOPT000.FITRES") - parser.add_argument("-p", "--predict", help="If in predict mode", action="store_true") - parser.add_argument("-m", "--model", help="Pickled model to load", default="model.pkl", type=str) - parser.add_argument("-d", "--done_file", help="Location to write done file", default="done.txt", type=str) - parser.add_argument("-f", "--features", help="Space separated string of features out of fitres", type=str, nargs="+", default=None) - parser.add_argument("-t", "--types", help="Ia types, space separated list", type=int, nargs="+", default=None) - parser.add_argument("-o", "--output", help="Output CSV of predictions", type=str, default="predictions.csv") - parser.add_argument("-n", "--name", help="Column name for probability", type=str, default="PROB") + parser.add_argument( + "fitres_file", + help="the name of the fitres file to load.
For example: somepath/FITOPT000.FITRES", + ) + parser.add_argument( + "-p", "--predict", help="If in predict mode", action="store_true" + ) + parser.add_argument( + "-m", "--model", help="Pickled model to load", default="model.pkl", type=str + ) + parser.add_argument( + "-d", + "--done_file", + help="Location to write done file", + default="done.txt", + type=str, + ) + parser.add_argument( + "-f", + "--features", + help="Space separated string of features out of fitres", + type=str, + nargs="+", + default=None, + ) + parser.add_argument( + "-t", + "--types", + help="Ia types, space separated list", + type=int, + nargs="+", + default=None, + ) + parser.add_argument( + "-o", + "--output", + help="Output CSV of predictions", + type=str, + default="predictions.csv", + ) + parser.add_argument( + "-n", "--name", help="Column name for probability", type=str, default="PROB" + ) args = parser.parse_args() return args def sanitise_args(args): - """ Set up defaults and do some sanity checks """ + """Set up defaults and do some sanity checks""" if args.features is None: - args.features = ["zHD", "x1", "c", "cERR", "x1ERR", "mBERR", "COV_x1_c", "COV_x1_x0", "COV_c_x0", "FITPROB"] + args.features = [ + "zHD", + "x1", + "c", + "cERR", + "x1ERR", + "mBERR", + "COV_x1_c", + "COV_x1_x0", + "COV_c_x0", + "FITPROB", + ] if args.types is None: args.types = [1, 101] @@ -45,14 +95,18 @@ def sanitise_args(args): logging.info(f"Input fitres_file is {args.fitres_file}") assert os.path.exists(args.fitres_file), f"File {args.fitres_file} does not exist" - assert " " not in args.name, f"Prob column name '{args.name}' should not have spaces" + assert ( + " " not in args.name + ), f"Prob column name '{args.name}' should not have spaces" return args def get_features(filename, features, types): df = pd.read_csv(filename, delim_whitespace=True, comment="#") for f in features: - assert f in df.columns, f"Features {f} is not in DataFrame columns {list(df.columns)}" + assert ( + f in df.columns + ), f"Features {f} is not in DataFrame columns {list(df.columns)}" assert "TYPE" in df.columns, f"DataFrame does not have a TYPE column!" X = df[features].values @@ -67,9 +121,16 @@ def train(args): _, X, y = get_features(args.fitres_file, args.features, args.types) - X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.05, random_state=0) + X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, test_size=0.05, random_state=0 + ) - clf = Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=50, algorithm="kd_tree"))]) + clf = Pipeline( + [ + ("scaler", StandardScaler()), + ("knn", KNeighborsClassifier(n_neighbors=50, algorithm="kd_tree")), + ] + ) logging.info(f"Training NN on feature matrix {X.shape}") clf.fit(X_train, y_train) @@ -84,7 +145,9 @@ def train(args): def predict(args): args = sanitise_args(args) - logging.info(f"Predicting model on file {args.fitres_file} using pickle {args.model}") + logging.info( + f"Predicting model on file {args.fitres_file} using pickle {args.model}" + ) assert os.path.exists(args.model), f"Pickle {args.model} does not exist!" 
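The train() function above is a compact scikit-learn recipe: scale the FITRES features, then fit a 50-neighbour KNN. Stripped of the argument scaffolding it amounts to the sketch below; the Ia labelling via TYPE.isin(types) is an assumption based on how get_features uses the types list, and the file paths are illustrative.

import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("FITOPT000.FITRES", delim_whitespace=True, comment="#")
features = ["zHD", "x1", "c", "cERR", "x1ERR", "mBERR",
            "COV_x1_c", "COV_x1_x0", "COV_c_x0", "FITPROB"]  # defaults above
X = df[features].values
y = df["TYPE"].isin([1, 101]).values  # default Ia types above; labelling assumed

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.05, random_state=0
)
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=50, algorithm="kd_tree")),
])
clf.fit(X_train, y_train)
with open("model.pkl", "wb") as f:
    pickle.dump(clf, f)  # predict mode later reloads this pickle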
with open(args.model, "rb") as f: clf = pickle.load(f) diff --git a/pippin/classifiers/nearest_neighbor_python.py b/pippin/classifiers/nearest_neighbor_python.py index 7abccc9d..a94824b2 100644 --- a/pippin/classifiers/nearest_neighbor_python.py +++ b/pippin/classifiers/nearest_neighbor_python.py @@ -10,7 +10,7 @@ class NearestNeighborPyClassifier(Classifier): - """ Nearest Neighbor Python classifier + """Nearest Neighbor Python classifier CONFIGURATION: ============== @@ -34,16 +34,41 @@ class NearestNeighborPyClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.global_config = get_config() self.num_jobs = 1 self.conda_env = self.global_config["SNIRF"]["conda_env"] self.path_to_classifier = os.path.dirname(inspect.stack()[0][1]) - self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir) - self.features = options.get("FEATURES", "zHD x1 c cERR x1ERR COV_x1_c COV_x1_x0 COV_c_x0 PKMJDERR") + self.job_base_name = ( + os.path.basename(Path(output_dir).parents[1]) + + "__" + + os.path.basename(output_dir) + ) + self.features = options.get( + "FEATURES", "zHD x1 c cERR x1ERR COV_x1_c COV_x1_x0 COV_c_x0 PKMJDERR" + ) # self.model_pk_file = self.get_unique_name() + ".pkl" self.model_pk_file = "model.pkl" @@ -55,7 +80,9 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) self.output["predictions_filename"] = self.predictions_filename self.output["model_filename"] = self.output_pk_file @@ -72,7 +99,9 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= def setup(self): lcfit = self.get_fit_dependency() self.fitres_filename = lcfit["fitopt_map"][self.fitopt] - self.fitres_file = os.path.abspath(os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename)) + self.fitres_file = os.path.abspath( + os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename) + ) def classify(self, command): self.setup() @@ -82,17 +111,17 @@ def classify(self, command): else: self.sbatch_header = self.sbatch_cpu_header else: - with open(self.batch_file, 'r') as f: + with open(self.batch_file, "r") as f: self.sbatch_header = f.read() self.sbatch_header = self.clean_header(self.sbatch_header) header_dict = { - "REPLACE_NAME": self.job_base_name, - "REPLACE_LOGFILE": "output.log", - "REPLACE_WALLTIME": "00:55:00", - "REPLACE_MEM": "8GB", - "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"] - } + "REPLACE_NAME": self.job_base_name, + "REPLACE_LOGFILE": "output.log", + "REPLACE_WALLTIME": "00:55:00", + "REPLACE_MEM": "8GB", + "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"], + } header_dict = merge_dict(header_dict, self.batch_replace) self.update_header(header_dict) @@ -100,14 +129,16 @@ def 
classify(self, command): "job_name": self.job_base_name, "conda_env": self.conda_env, "path_to_classifier": self.path_to_classifier, - "command_opts": command + "command_opts": command, } format_dict = { - "done_file": self.done_file, - "sbatch_header": self.sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['nearest_neighbour']) - } + "done_file": self.done_file, + "sbatch_header": self.sbatch_header, + "task_setup": self.update_setup( + setup_dict, self.task_setup["nearest_neighbour"] + ), + } slurm_script = self.slurm.format(**format_dict) @@ -132,17 +163,26 @@ def predict(self): self.setup() model = self.options.get("MODEL") if model is None: - self.logger.error("If you are in predict model, please specify a MODEL in OPTS. Either a file location or a training task name.") + self.logger.error( + "If you are in predict mode, please specify a MODEL in OPTS. Either a file location or a training task name." + ) return False if not os.path.exists(get_output_loc(model)): # If its not a file, it must be a task for t in self.dependencies: if model == t.name: - self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}") + self.logger.debug( + f"Found task dependency {t.name} with model file {t.output['model_filename']}" + ) model = t.output["model_filename"] else: model = get_output_loc(model) - types = " ".join([str(a) for a in self.get_simulation_dependency().output["types_dict"]["IA"]]) + types = " ".join( + [ + str(a) + for a in self.get_simulation_dependency().output["types_dict"]["IA"] + ] + ) if not types: types = "1" command = ( @@ -158,7 +198,12 @@ def predict(self): return self.classify(command) def train(self): - types = " ".join([str(a) for a in self.get_simulation_dependency().output["types_dict"]["IA"]]) + types = " ".join( + [ + str(a) + for a in self.get_simulation_dependency().output["types_dict"]["IA"] + ] + ) if not types: self.logger.error("No Ia types for a training sim!") return False diff --git a/pippin/classifiers/perfect.py b/pippin/classifiers/perfect.py index 134effda..87179132 100644 --- a/pippin/classifiers/perfect.py +++ b/pippin/classifiers/perfect.py @@ -9,8 +9,9 @@ from pippin.config import chown_dir, mkdirs from pippin.task import Task + class PerfectClassifier(Classifier): - """ Classification task for the SuperNNova classifier. + """Classification task for the perfect classifier. CONFIGURATION ============= @@ -35,8 +36,27 @@ class PerfectClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.output_file = None self.passed = False self.num_jobs = 1 # This is the default. Can get this from options if needed.
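A recurring pattern in these classifiers is an sbatch header built from a REPLACE_* dictionary, merged with BATCH_REPLACE overrides via merge_dict, then substituted into the template by update_header. Both helpers live in pippin's own Task/config machinery; the stand-in below only illustrates the substitution convention and is an assumption, not pippin's implementation.

def update_header(header, header_dict):
    # Keys like REPLACE_MEM are substituted verbatim into the sbatch template;
    # the special APPEND key tacks extra #SBATCH lines onto the end.
    for key, value in header_dict.items():
        if key == "APPEND":
            header += "\n" + "\n".join(value)
        else:
            header = header.replace(key, str(value))
    return header

header = "#SBATCH --job-name=REPLACE_NAME\n#SBATCH --mem=REPLACE_MEM"
print(update_header(header, {
    "REPLACE_NAME": "nn_job",
    "REPLACE_MEM": "8GB",
    "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"],
}))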
@@ -49,7 +69,9 @@ def get_unique_name(self): return self.name def classify(self): - new_hash = self.get_hash_from_string(self.name + f"{self.prob_ia}_{self.prob_cc}") + new_hash = self.get_hash_from_string( + self.name + f"{self.prob_ia}_{self.prob_cc}" + ) if self._check_regenerate(new_hash): shutil.rmtree(self.output_dir, ignore_errors=True) @@ -59,14 +81,21 @@ def classify(self): cid = "CID" s = self.get_simulation_dependency() df = None - phot_dirs = [sim_dep.output["photometry_dirs"][self.index] for sim_dep in s] - headers = [os.path.join(phot_dir, a) for phot_dir in phot_dirs for a in os.listdir(phot_dir) if "HEAD" in a] + phot_dirs = [ + sim_dep.output["photometry_dirs"][self.index] for sim_dep in s + ] + headers = [ + os.path.join(phot_dir, a) + for phot_dir in phot_dirs + for a in os.listdir(phot_dir) + if "HEAD" in a + ] if not headers: Task.fail_config(f"No HEAD fits files found in {phot_dir}!") else: types = defaultdict(list) for t in self.get_simulation_dependency(): - for k, v in t.output['types_dict'].items(): + for k, v in t.output["types_dict"].items(): types[k] = np.unique(types[k] + v) self.logger.debug(f"Input types are {types}") diff --git a/pippin/classifiers/scone.py b/pippin/classifiers/scone.py index 395e6d89..cac263fe 100644 --- a/pippin/classifiers/scone.py +++ b/pippin/classifiers/scone.py @@ -1,4 +1,4 @@ -# Created Mar 2024 by R.Kessler and H.Qu +# Created Mar 2024 by R.Kessler and H.Qu # Refactor pippin interface to scone to accept and modify # a scone-input file. @@ -16,15 +16,26 @@ SCONE_SHELL_SCRIPT = "run.py" # top-level script under $SCONE_DIR -KEYLIST_SCONE_INPUT = [ 'init_env_train', 'init_env_heatmaps', - 'prescale_heatmaps', 'nevt_select_heatmaps', - 'batch_size', 'categorical', 'class_balanced', - 'num_epochs', 'num_mjd_bins', 'num_wavelength_bins', - 'mode', 'trained_model', 'prob_column_name' ] +KEYLIST_SCONE_INPUT = [ + "init_env_train", + "init_env_heatmaps", + "prescale_heatmaps", + "nevt_select_heatmaps", + "batch_size", + "categorical", + "class_balanced", + "num_epochs", + "num_mjd_bins", + "num_wavelength_bins", + "mode", + "trained_model", + "prob_column_name", +] + # ========================================== class SconeClassifier(Classifier): - """ convolutional neural network-based SN photometric classifier + """convolutional neural network-based SN photometric classifier for details, see https://arxiv.org/abs/2106.04370, https://arxiv.org/abs/2111.05539, https://arxiv.org/abs/2207.09440 CONFIGURATION: @@ -58,172 +69,218 @@ class SconeClassifier(Classifier): """ - def __new__(cls, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - # XXX DEPRECATION - # If no BASE file is present, run legacy version of Scone - # Avoid recursive nonsense by making sure the type of `cls` is SconeClassifier - if cls == SconeClassifier and config.get('BASE') is None: - # Have to import later because SconeClassifier must exist prior to importing SconeLegacyClassifier - from pippin.classifiers.scone_legacy import SconeLegacyClassifier - cls = SconeLegacyClassifier - return super().__new__(cls) - - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) - self.global_config = get_config() - self.options = options - - # - - - - - - - - # special checks to help users cope with some changes - if mode == 'predict' and 'MODEL' in options: - self.options['TRAINED_MODEL'] = 
self.options['MODEL'] - - self.gpu = self.options.get("GPU", False) - self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] - self.init_env = self.global_config["SCONE"]["init_env_cpu"] if not self.gpu else self.global_config["SCONE"]["init_env_gpu"] - self.path_to_classifier = self.global_config["SCONE"]["location"] - - self.combine_mask = "COMBINE_MASK" in config - - self.select_lcfit = self.options.get("OPTIONAL_MASK_FIT", None) # RK May 3 2024 - scone_input_file = config.get('BASE') # refactor by passing scone input file to pippin - if scone_input_file is not None: - scone_input_file = get_data_loc(scone_input_file) - self.scone_input_file = scone_input_file - - output_path_obj = Path(self.output_dir) - heatmaps_path_obj = output_path_obj / "heatmaps" - - - self.job_base_name = output_path_obj.parents[1].name + "__" + output_path_obj.name - - self.batch_replace = self.options.get("BATCH_REPLACE", - self.global_config.get("BATCH_REPLACE", {})) - - self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") - - remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) - self.keep_heatmaps = not remake_heatmaps - - return + def __new__( + cls, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + # XXX DEPRECATION + # If no BASE file is present, run legacy version of Scone + # Avoid recursive nonsense by making sure the type of `cls` is SconeClassifier + if cls == SconeClassifier and config.get("BASE") is None: + # Have to import later because SconeClassifier must exist prior to importing SconeLegacyClassifier + from pippin.classifiers.scone_legacy import SconeLegacyClassifier + + cls = SconeLegacyClassifier + return super().__new__(cls) + + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) + self.global_config = get_config() + self.options = options + + # - - - - - - - + # special checks to help users cope with some changes + if mode == "predict" and "MODEL" in options: + self.options["TRAINED_MODEL"] = self.options["MODEL"] + + self.gpu = self.options.get("GPU", False) + self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] + self.init_env = ( + self.global_config["SCONE"]["init_env_cpu"] + if not self.gpu + else self.global_config["SCONE"]["init_env_gpu"] + ) + self.path_to_classifier = self.global_config["SCONE"]["location"] + + self.combine_mask = "COMBINE_MASK" in config + + self.select_lcfit = self.options.get("OPTIONAL_MASK_FIT", None) # RK May 3 2024 + scone_input_file = config.get( + "BASE" + ) # refactor by passing scone input file to pippin + if scone_input_file is not None: + scone_input_file = get_data_loc(scone_input_file) + self.scone_input_file = scone_input_file + + output_path_obj = Path(self.output_dir) + heatmaps_path_obj = output_path_obj / "heatmaps" + + self.job_base_name = ( + output_path_obj.parents[1].name + "__" + output_path_obj.name + ) + + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) + + self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") + + remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) + self.keep_heatmaps = not remake_heatmaps + + return def classify(self, mode): - self.logger.info(f"============ Prepare refactored SCONE with mode = {mode} =============") - failed = False - if 
Path(self.done_file).exists(): - self.logger.debug(f"Found done file at {self.done_file}") - with open(self.done_file) as f: - if "SUCCESS" not in f.read().upper(): - failed = True - - scone_input_file = self.scone_input_file - - # - - - - - sim_deps = self.get_simulation_dependency() - sim_dirs = [sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps] - - # prepare scone input lines needed to create hash, - # but don't create scone input file yet. - scone_input_lines = self.prepare_scone_input_lines(sim_dirs,mode) - - str_config = ' '.join(scone_input_lines) - new_hash = self.get_hash_from_string(str_config) - - if self._check_regenerate(new_hash) or failed: - self.logger.debug("Regenerating scone") - else: - self.logger.info("scone hash check passed, not rerunning") - self.should_be_done() - return True - - # later, perhaps check to preserve heatmaps ?? - if os.path.exists(self.output_dir): - shutil.rmtree(self.output_dir) - os.makedirs(self.output_dir) - - # write scone input file, and beware that name of scone - # input file is updated - scone_input_base = os.path.basename(self.scone_input_file) - self.scone_input_file = self.output_dir + '/' + 'PIP_' + scone_input_base - with open(self.scone_input_file,"wt") as i: - for line in scone_input_lines: - i.write(f"{line}\n") - - self.save_new_hash(new_hash) - - path = Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT - path = path if path.exists() else Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT - cmd = f"python {str(path)} " \ - f"--config_path {self.scone_input_file} " - # f"--sbatch_job_name {self.job_base_name} " + self.logger.info( + f"============ Prepare refactored SCONE with mode = {mode} =============" + ) + failed = False + if Path(self.done_file).exists(): + self.logger.debug(f"Found done file at {self.done_file}") + with open(self.done_file) as f: + if "SUCCESS" not in f.read().upper(): + failed = True - self.logger.info(f"Running command: {cmd}") - subprocess.run([cmd], shell=True) + scone_input_file = self.scone_input_file - return True + # - - - - + sim_deps = self.get_simulation_dependency() + sim_dirs = [ + sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps + ] + + # prepare scone input lines needed to create hash, + # but don't create scone input file yet. + scone_input_lines = self.prepare_scone_input_lines(sim_dirs, mode) + + str_config = " ".join(scone_input_lines) + new_hash = self.get_hash_from_string(str_config) + + if self._check_regenerate(new_hash) or failed: + self.logger.debug("Regenerating scone") + else: + self.logger.info("scone hash check passed, not rerunning") + self.should_be_done() + return True + + # later, perhaps check to preserve heatmaps ?? 
+ if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + os.makedirs(self.output_dir) + + # write scone input file, and beware that name of scone + # input file is updated + scone_input_base = os.path.basename(self.scone_input_file) + self.scone_input_file = self.output_dir + "/" + "PIP_" + scone_input_base + with open(self.scone_input_file, "wt") as i: + for line in scone_input_lines: + i.write(f"{line}\n") + + self.save_new_hash(new_hash) + + path = Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT + path = ( + path + if path.exists() + else Path(self.path_to_classifier) / SCONE_SHELL_SCRIPT + ) + cmd = f"python {str(path)} " f"--config_path {self.scone_input_file} " + # f"--sbatch_job_name {self.job_base_name} " + + self.logger.info(f"Running command: {cmd}") + subprocess.run([cmd], shell=True) - def prepare_scone_input_lines(self, sim_dirs, mode ): + return True + def prepare_scone_input_lines(self, sim_dirs, mode): # Created Apr 2024 by R.Kessler - # Read base scone input and make a few modification such as + # Read base scone input and make a few modification such as # the sim data dirs, and other substitutions defined in pippin input. # Method returns list of lines for modified scone-config input file. # Original comments and input layout are preserved. config_lines = [] scone_input_file = self.scone_input_file - options_local = self.options.copy() # make local copy + options_local = self.options.copy() # make local copy # set local mode as if it were an override key in pippin input file - options_local['MODE'] = mode + options_local["MODE"] = mode - if mode == 'predict' : - options_local['PROB_COLUMN_NAME'] = self.get_prob_column_name() + if mode == "predict": + options_local["PROB_COLUMN_NAME"] = self.get_prob_column_name() # - - - - flag_remove_line = False - with open(scone_input_file, 'r') as i: - inp_config = i.read().split('\n') + with open(scone_input_file, "r") as i: + inp_config = i.read().split("\n") key_replace_dict = {} - key_remove_list = [ 'input_data_paths:' , 'snid_select_files:', - 'sbatch_job_name:' ] + key_remove_list = [ + "input_data_paths:", + "snid_select_files:", + "sbatch_job_name:", + ] for line_in in inp_config: line_out = line_in wdlist = line_in.split() - nwd = len(wdlist) - if nwd == 0 : + nwd = len(wdlist) + if nwd == 0: flag_remove_line = False else: - if wdlist[0] == 'output_path:' : - line_out = line_in.replace(wdlist[1],self.output_dir) + if wdlist[0] == "output_path:": + line_out = line_in.replace(wdlist[1], self.output_dir) - # goofy logic to remove original input_data_paths - if flag_remove_line and wdlist[0] != '-' : + # goofy logic to remove original input_data_paths + if flag_remove_line and wdlist[0] != "-": flag_remove_line = False if wdlist[0] in key_remove_list: flag_remove_line = True # check all possible scone keys that can be overwritten/added for key in KEYLIST_SCONE_INPUT: - if wdlist[0] == key + ':' : + if wdlist[0] == key + ":": key_pippin = key.upper() if key_pippin in options_local: key_replace_dict[key_pippin] = True val_replace = options_local[key_pippin] - line_out = line_in.replace(wdlist[1],str(val_replace)) + line_out = line_in.replace(wdlist[1], str(val_replace)) # remove prescale for predict mode - if mode == 'predict' and 'prescale' in wdlist[0]: + if mode == "predict" and "prescale" in wdlist[0]: line_out = f"# WARNING: {wdlist[0]} removed for {mode} mode." 
- - if not flag_remove_line : + if not flag_remove_line: config_lines.append(line_out) - - # - - - - - - - - - - + + # - - - - - - - - - - # add extra info from pippin config_lines.append(f"") config_lines.append(f"# ======================================= ") @@ -231,7 +288,7 @@ def prepare_scone_input_lines(self, sim_dirs, mode ): # pass sbatch_job_name via config since there are other sbatch config # keys already. Could also pass via command line arg --sbatch_job_name. - config_lines.append(f"sbatch_job_name: {self.job_base_name}\n") + config_lines.append(f"sbatch_job_name: {self.job_base_name}\n") config_lines.append(f"input_data_paths:") for sim_dir in sim_dirs: @@ -249,22 +306,21 @@ def prepare_scone_input_lines(self, sim_dirs, mode ): # check option to select events passing LCFIT - if self.select_lcfit: - config_lines.append(f'') - config_lines.append(f'# Train on events passing LCFIT') - config_lines.append('snid_select_files:') + if self.select_lcfit: + config_lines.append(f"") + config_lines.append(f"# Train on events passing LCFIT") + config_lines.append("snid_select_files:") lcfit_deps = self.get_fit_dependency() - #self.logger.info(f"\n xxx lcfit_deps = \n{lcfit_deps}\n") + # self.logger.info(f"\n xxx lcfit_deps = \n{lcfit_deps}\n") for tmp_dict in lcfit_deps: - fitres_dir = tmp_dict['fitres_dirs'][self.index] - fitopt_base_file = tmp_dict['fitopt_map']['DEFAULT'] - fitres_file = f"{fitres_dir}/{fitopt_base_file}" + fitres_dir = tmp_dict["fitres_dirs"][self.index] + fitopt_base_file = tmp_dict["fitopt_map"]["DEFAULT"] + fitres_file = f"{fitres_dir}/{fitopt_base_file}" config_lines.append(f" - {fitres_file}") return config_lines - - #def get_optional_requirements(config): + # def get_optional_requirements(config): # # Created May 3 2024 by R.Kessler and P.Armstrong # if config.get("SELECT_LCFIT", False): # return False, True # wait for LCFIT task @@ -284,13 +340,20 @@ def _check_completion(self, squeue): return Task.FINISHED_FAILURE pred_path = str(Path(self.output_dir) / "predictions.csv") - #predictions = pd.read_csv(pred_path) - #if "pred_labels" in predictions.columns: + # predictions = pd.read_csv(pred_path) + # if "pred_labels" in predictions.columns: # predictions = predictions[["snid", "pred_labels"]] # make sure snid is the first col # predictions = predictions.rename(columns={"pred_labels": self.get_prob_column_name()}) # predictions.to_csv(pred_path, index=False) - #self.logger.info(f"Predictions file can be found at {pred_path}") - self.output.update({"model_filename": self.options.get("MODEL", str(Path(self.output_dir) / "trained_model")), "predictions_filename": pred_path}) + # self.logger.info(f"Predictions file can be found at {pred_path}") + self.output.update( + { + "model_filename": self.options.get( + "MODEL", str(Path(self.output_dir) / "trained_model") + ), + "predictions_filename": pred_path, + } + ) return Task.FINISHED_SUCCESS return self.check_for_job(squeue, self.job_base_name) @@ -301,8 +364,10 @@ def _heatmap_creation_success(self): with open(self.heatmaps_done_file, "r") as donefile: if "CREATE HEATMAPS FAILURE" in donefile.read(): return False - return Path(self.heatmaps_path).exists() and (Path(self.heatmaps_path) / "done.log").exists() - + return ( + Path(self.heatmaps_path).exists() + and (Path(self.heatmaps_path) / "done.log").exists() + ) @staticmethod def get_requirements(options): diff --git a/pippin/classifiers/scone_legacy.py b/pippin/classifiers/scone_legacy.py index 35d8a153..4738d978 100644 --- a/pippin/classifiers/scone_legacy.py +++ 
b/pippin/classifiers/scone_legacy.py @@ -11,8 +11,9 @@ from pippin.config import get_config, get_output_loc, mkdirs, get_data_loc, merge_dict from pippin.task import Task + class SconeLegacyClassifier(SconeClassifier): - """ convolutional neural network-based SN photometric classifier + """convolutional neural network-based SN photometric classifier for details, see https://arxiv.org/abs/2106.04370, https://arxiv.org/abs/2111.05539, https://arxiv.org/abs/2207.09440 CONFIGURATION: @@ -45,145 +46,193 @@ class SconeLegacyClassifier(SconeClassifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) - self.logger.warning(f'Using Legacy Scone version, pass a Scone input file via `BASE: /path/to/input.yml` to use the latest Scone version.') - self.global_config = get_config() - self.options = options - - self.gpu = self.options.get("GPU", False) - self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] - self.init_env = self.global_config["SCONE"]["init_env_cpu"] if not self.gpu else self.global_config["SCONE"]["init_env_gpu"] - self.path_to_classifier = self.global_config["SCONE"]["location"] - self.combine_mask = "COMBINE_MASK" in config - - output_path_obj = Path(self.output_dir) - heatmaps_path_obj = output_path_obj / "heatmaps" - - self.job_base_name = output_path_obj.parents[1].name + "__" + output_path_obj.name - - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) - self.slurm = """{sbatch_header} + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) + self.logger.warning( + f"Using Legacy Scone version, pass a Scone input file via `BASE: /path/to/input.yml` to use the latest Scone version." 
+ ) + self.global_config = get_config() + self.options = options + + self.gpu = self.options.get("GPU", False) + self.init_env_heatmaps = self.global_config["SCONE"]["init_env_cpu"] + self.init_env = ( + self.global_config["SCONE"]["init_env_cpu"] + if not self.gpu + else self.global_config["SCONE"]["init_env_gpu"] + ) + self.path_to_classifier = self.global_config["SCONE"]["location"] + self.combine_mask = "COMBINE_MASK" in config + + output_path_obj = Path(self.output_dir) + heatmaps_path_obj = output_path_obj / "heatmaps" + + self.job_base_name = ( + output_path_obj.parents[1].name + "__" + output_path_obj.name + ) + + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) + self.slurm = """{sbatch_header} {task_setup}""" - self.config_path = str(output_path_obj / "model_config.yml") - self.logfile = str(output_path_obj / "output.log") - self.model_sbatch_job_path = str(output_path_obj / "job.slurm") + self.config_path = str(output_path_obj / "model_config.yml") + self.logfile = str(output_path_obj / "output.log") + self.model_sbatch_job_path = str(output_path_obj / "job.slurm") - self.heatmaps_path = str(heatmaps_path_obj) - self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") - self.heatmaps_sbatch_header_path = str(heatmaps_path_obj / "sbatch_header.sh") - self.heatmaps_log_path = str(heatmaps_path_obj / f"create_heatmaps__{Path(self.config_path).name.split('.')[0]}.log") + self.heatmaps_path = str(heatmaps_path_obj) + self.heatmaps_done_file = str(heatmaps_path_obj / "done.txt") + self.heatmaps_sbatch_header_path = str(heatmaps_path_obj / "sbatch_header.sh") + self.heatmaps_log_path = str( + heatmaps_path_obj + / f"create_heatmaps__{Path(self.config_path).name.split('.')[0]}.log" + ) - remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) - self.keep_heatmaps = not remake_heatmaps + remake_heatmaps = self.options.get("REMAKE_HEATMAPS", False) + self.keep_heatmaps = not remake_heatmaps def make_sbatch_header(self, option_name, header_dict, use_gpu=False): - sbatch_header_template = self.options.get(option_name) - sbatch_header = self.sbatch_gpu_header if use_gpu else self.sbatch_cpu_header + sbatch_header_template = self.options.get(option_name) + sbatch_header = self.sbatch_gpu_header if use_gpu else self.sbatch_cpu_header - if sbatch_header_template is not None: - self.logger.debug(f"batch file found at {sbatch_header_template}") - with open(get_data_loc(sbatch_header_template), 'r') as f: - sbatch_header = f.read() + if sbatch_header_template is not None: + self.logger.debug(f"batch file found at {sbatch_header_template}") + with open(get_data_loc(sbatch_header_template), "r") as f: + sbatch_header = f.read() - sbatch_header = self.clean_header(sbatch_header) + sbatch_header = self.clean_header(sbatch_header) - header_dict = merge_dict(header_dict, self.batch_replace) - return self._update_header(sbatch_header, header_dict) + header_dict = merge_dict(header_dict, self.batch_replace) + return self._update_header(sbatch_header, header_dict) def make_heatmaps_sbatch_header(self): - self.logger.info("heatmaps not created, creating now") - shutil.rmtree(self.output_dir, ignore_errors=True) - mkdirs(self.heatmaps_path) + self.logger.info("heatmaps not created, creating now") + shutil.rmtree(self.output_dir, ignore_errors=True) + mkdirs(self.heatmaps_path) - # TODO: if externally specified batchfile exists, have to parse desired logfile path from it - header_dict = { + # TODO: if externally specified batchfile exists, 
have to parse desired logfile path from it + header_dict = { "REPLACE_LOGFILE": self.heatmaps_log_path, - "REPLACE_WALLTIME": "12:00:00", #TODO: change to scale with # of heatmaps expected + "REPLACE_WALLTIME": "12:00:00", # TODO: change to scale with # of heatmaps expected "REPLACE_MEM": self.options.get("HEATMAPS_MEM", "32GB"), - } - heatmaps_sbatch_header = self.make_sbatch_header("HEATMAPS_BATCH_FILE", header_dict) + } + heatmaps_sbatch_header = self.make_sbatch_header( + "HEATMAPS_BATCH_FILE", header_dict + ) - with open(self.heatmaps_sbatch_header_path, "w+") as f: - f.write(heatmaps_sbatch_header) + with open(self.heatmaps_sbatch_header_path, "w+") as f: + f.write(heatmaps_sbatch_header) def make_model_sbatch_script(self): - header_dict = { - "REPLACE_NAME": self.job_base_name, - "REPLACE_LOGFILE": str(Path(self.output_dir) / "output.log"), - "REPLACE_MEM": self.options.get("MODEL_MEM", "64GB"), - "REPLACE_WALLTIME": "4:00:00" if self.gpu else "12:00:00", # 4h is max for gpu - } - model_sbatch_header = self.make_sbatch_header("MODEL_BATCH_FILE", header_dict, use_gpu=self.gpu) - - setup_dict = { - "init_env": self.init_env, - "path_to_classifier": self.path_to_classifier, - "heatmaps_path": self.heatmaps_path, - "config_path": self.config_path, - "done_file": self.done_file, - } - - format_dict = { - "sbatch_header": model_sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['scone']) - } - - self.logger.info(f"Running SCONE model, slurm job written to {self.model_sbatch_job_path}") - slurm_script = self.slurm.format(**format_dict) - - with open(self.model_sbatch_job_path, "w") as f: - f.write(slurm_script) - - return slurm_script + header_dict = { + "REPLACE_NAME": self.job_base_name, + "REPLACE_LOGFILE": str(Path(self.output_dir) / "output.log"), + "REPLACE_MEM": self.options.get("MODEL_MEM", "64GB"), + "REPLACE_WALLTIME": "4:00:00" + if self.gpu + else "12:00:00", # 4h is max for gpu + } + model_sbatch_header = self.make_sbatch_header( + "MODEL_BATCH_FILE", header_dict, use_gpu=self.gpu + ) + + setup_dict = { + "init_env": self.init_env, + "path_to_classifier": self.path_to_classifier, + "heatmaps_path": self.heatmaps_path, + "config_path": self.config_path, + "done_file": self.done_file, + } + + format_dict = { + "sbatch_header": model_sbatch_header, + "task_setup": self.update_setup(setup_dict, self.task_setup["scone"]), + } + + self.logger.info( + f"Running SCONE model, slurm job written to {self.model_sbatch_job_path}" + ) + slurm_script = self.slurm.format(**format_dict) + + with open(self.model_sbatch_job_path, "w") as f: + f.write(slurm_script) + + return slurm_script def classify(self, mode): - failed = False - if Path(self.done_file).exists(): - self.logger.debug(f"Found done file at {self.done_file}") - with open(self.done_file) as f: - if "SUCCESS" not in f.read().upper(): - failed = True + failed = False + if Path(self.done_file).exists(): + self.logger.debug(f"Found done file at {self.done_file}") + with open(self.done_file) as f: + if "SUCCESS" not in f.read().upper(): + failed = True + + heatmaps_created = self._heatmap_creation_success() and self.keep_heatmaps + + sim_deps = self.get_simulation_dependency() + sim_dirs = [ + sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps + ] + + lcdata_paths = [ + path for path in self._get_lcdata_paths(sim_dirs) if "PHOT.FITS" in path + ] + metadata_paths = [ + path.replace("PHOT.FITS", "HEAD.FITS") for path in lcdata_paths + ] + + str_config = self._make_config( + metadata_paths, 
lcdata_paths, mode, heatmaps_created + ) + new_hash = self.get_hash_from_string(str_config) + + if self._check_regenerate(new_hash) or failed: + self.logger.debug("Regenerating") + else: + self.logger.info("Hash check passed, not rerunning") + self.should_be_done() + return True - heatmaps_created = self._heatmap_creation_success() and self.keep_heatmaps + if not heatmaps_created: + # this deletes the whole directory tree, don't write anything before this + self.make_heatmaps_sbatch_header() - sim_deps = self.get_simulation_dependency() - sim_dirs = [sim_dep.output["photometry_dirs"][self.index] for sim_dep in sim_deps] + self.save_new_hash(new_hash) + with open(self.config_path, "w+") as cfgfile: + cfgfile.write(str_config) - lcdata_paths = [path for path in self._get_lcdata_paths(sim_dirs) if "PHOT.FITS" in path] - metadata_paths = [path.replace("PHOT.FITS", "HEAD.FITS") for path in lcdata_paths] + slurm_script = self.make_model_sbatch_script() - str_config = self._make_config(metadata_paths, lcdata_paths, mode, heatmaps_created) - new_hash = self.get_hash_from_string(str_config) + # TODO: nersc needs `module load esslurm` to sbatch gpu jobs, maybe make + # this shell command to a file so diff systems can define their own + file_to_run = "run_legacy.py" + path = Path(self.path_to_classifier) / file_to_run + cmd = f"python {str(path)} --config_path {self.config_path}" + subprocess.run([cmd], shell=True) + self.logger.info(f"Running command: {cmd}") - if self._check_regenerate(new_hash) or failed: - self.logger.debug("Regenerating") - else: - self.logger.info("Hash check passed, not rerunning") - self.should_be_done() return True - if not heatmaps_created: - # this deletes the whole directory tree, don't write anything before this - self.make_heatmaps_sbatch_header() - - self.save_new_hash(new_hash) - with open(self.config_path, "w+") as cfgfile: - cfgfile.write(str_config) - - slurm_script = self.make_model_sbatch_script() - - # TODO: nersc needs `module load esslurm` to sbatch gpu jobs, maybe make - # this shell command to a file so diff systems can define their own - file_to_run = 'run_legacy.py' - path = Path(self.path_to_classifier) / file_to_run - cmd = f"python {str(path)} --config_path {self.config_path}" - subprocess.run([cmd], shell=True) - self.logger.info(f"Running command: {cmd}") - - return True - def predict(self): return self.classify("predict") @@ -193,7 +242,7 @@ def train(self): def _get_types(self): types = {} for t in self.get_simulation_dependency(): - for k, v in t.output['types'].items(): + for k, v in t.output["types"].items(): if k not in types: types[k] = v return types @@ -207,11 +256,13 @@ def _make_config(self, metadata_paths, lcdata_paths, mode, heatmaps_created): # info for heatmap creation if not heatmaps_created: - config["sbatch_header_path"] = self.heatmaps_sbatch_header_path + config["sbatch_header_path"] = self.heatmaps_sbatch_header_path config["heatmaps_donefile"] = self.heatmaps_done_file config["heatmaps_logfile"] = self.heatmaps_log_path - config["sim_fraction"] = self.options.get("SIM_FRACTION", 1) # 1/sim_fraction % of simulated SNe will be used for the model + config["sim_fraction"] = self.options.get( + "SIM_FRACTION", 1 + ) # 1/sim_fraction % of simulated SNe will be used for the model config["heatmaps_path"] = self.heatmaps_path config["model_sbatch_job_path"] = self.model_sbatch_job_path config["num_wavelength_bins"] = self.options.get("NUM_WAVELENGTH_BINS", 32) @@ -221,21 +272,27 @@ def _make_config(self, metadata_paths, lcdata_paths, 
mode, heatmaps_created): # info for classification model config["categorical"] = self.options.get("CATEGORICAL", False) - config["num_epochs"] = self.options.get("NUM_EPOCHS", 400) # TODO: replace num epochs with autostop: stop training when slope plateaus? - config["batch_size"] = self.options.get("BATCH_SIZE", 32) # TODO: replace with percentage of total size? + config["num_epochs"] = self.options.get( + "NUM_EPOCHS", 400 + ) # TODO: replace num epochs with autostop: stop training when slope plateaus? + config["batch_size"] = self.options.get( + "BATCH_SIZE", 32 + ) # TODO: replace with percentage of total size? config["Ia_fraction"] = self.options.get("IA_FRACTION", 0.5) config["output_path"] = self.output_dir config["trained_model"] = self.options.get("MODEL", None) config["kcor_file"] = self.options.get("KCOR_FILE", None) config["mode"] = mode config["job_base_name"] = self.job_base_name - config["class_balanced"] = (mode == "train") + config["class_balanced"] = mode == "train" types = self._get_types() if types is not None: - types = {int(k): v for k, v in types.items()} # sometimes the keys are strings, sometimes ints - self.logger.info(f"input types from sim found, types set to {types}") - config["sn_type_id_to_name"] = types + types = { + int(k): v for k, v in types.items() + } # sometimes the keys are strings, sometimes ints + self.logger.info(f"input types from sim found, types set to {types}") + config["sn_type_id_to_name"] = types return yaml.dump(config) @@ -249,11 +306,22 @@ def _check_completion(self, squeue): pred_path = str(Path(self.output_dir) / "predictions.csv") predictions = pd.read_csv(pred_path) if "pred_labels" in predictions.columns: - predictions = predictions[["snid", "pred_labels"]] # make sure snid is the first col - predictions = predictions.rename(columns={"pred_labels": self.get_prob_column_name()}) + predictions = predictions[ + ["snid", "pred_labels"] + ] # make sure snid is the first col + predictions = predictions.rename( + columns={"pred_labels": self.get_prob_column_name()} + ) predictions.to_csv(pred_path, index=False) self.logger.info(f"Predictions file can be found at {pred_path}") - self.output.update({"model_filename": self.options.get("MODEL", str(Path(self.output_dir) / "trained_model")), "predictions_filename": pred_path}) + self.output.update( + { + "model_filename": self.options.get( + "MODEL", str(Path(self.output_dir) / "trained_model") + ), + "predictions_filename": pred_path, + } + ) return Task.FINISHED_SUCCESS return self.check_for_job(squeue, self.job_base_name) @@ -263,29 +331,42 @@ def _heatmap_creation_success(self): with open(self.heatmaps_done_file, "r") as donefile: if "CREATE HEATMAPS FAILURE" in donefile.read(): return False - return Path(self.heatmaps_path).exists() and (Path(self.heatmaps_path) / "done.log").exists() + return ( + Path(self.heatmaps_path).exists() + and (Path(self.heatmaps_path) / "done.log").exists() + ) def num_jobs_in_queue(self): - squeue = [i.strip() for i in subprocess.check_output(f"squeue -h -u $USER -o '%.200j'", shell=True, text=True).splitlines()] + squeue = [ + i.strip() + for i in subprocess.check_output( + f"squeue -h -u $USER -o '%.200j'", shell=True, text=True + ).splitlines() + ] self.logger.debug(f"{squeue}") return self.check_for_job(squeue, self.job_base_name) @staticmethod def _get_lcdata_paths(sim_dirs): - lcdata_paths = [str(f.resolve()) for sim_dir in sim_dirs for f in Path(sim_dir).iterdir() if "PHOT" in f.name] + lcdata_paths = [ + str(f.resolve()) + for sim_dir in sim_dirs + for f 
in Path(sim_dir).iterdir() + if "PHOT" in f.name + ] return lcdata_paths @staticmethod def _update_header(header, header_dict): - for key, value in header_dict.items(): - if key in header: - header = header.replace(key, str(value)) - append_list = header_dict.get("APPEND") - if append_list is not None: - lines = header.split('\n') - lines += append_list - header = '\n'.join(lines) - return header + for key, value in header_dict.items(): + if key in header: + header = header.replace(key, str(value)) + append_list = header_dict.get("APPEND") + if append_list is not None: + lines = header.split("\n") + lines += append_list + header = "\n".join(lines) + return header @staticmethod def get_requirements(options): diff --git a/pippin/classifiers/snirf.py b/pippin/classifiers/snirf.py index a3a31b94..14279863 100644 --- a/pippin/classifiers/snirf.py +++ b/pippin/classifiers/snirf.py @@ -10,7 +10,7 @@ class SnirfClassifier(Classifier): - """ SNIRF classifier + """SNIRF classifier CONFIGURATION: ============== @@ -38,14 +38,39 @@ class SnirfClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.global_config = get_config() self.num_jobs = 4 self.conda_env = self.global_config["SNIRF"]["conda_env"] - self.path_to_classifier = get_output_loc(self.global_config["SNIRF"]["location"]) - self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir) + self.path_to_classifier = get_output_loc( + self.global_config["SNIRF"]["location"] + ) + self.job_base_name = ( + os.path.basename(Path(output_dir).parents[1]) + + "__" + + os.path.basename(output_dir) + ) self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR") self.validate_model() @@ -58,7 +83,9 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) self.slurm = """{sbatch_header} {task_setup} @@ -67,9 +94,13 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= def setup(self): lcfit = self.get_fit_dependency()[0] if len(self.get_fit_dependency()) > 1: - self.logger.warning(f"Found multiple fit dependencies for SNIRF possibly because COMBINE_MASK is being used, but SNIRF does not currently support this. Using the first one: {lcfit.name}") + self.logger.warning( + f"Found multiple fit dependencies for SNIRF possibly because COMBINE_MASK is being used, but SNIRF does not currently support this. 
Using the first one: {lcfit.name}"
+            )
         self.fitres_filename = lcfit["fitopt_map"][self.fitopt]
-        self.fitres_file = os.path.abspath(os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename))
+        self.fitres_file = os.path.abspath(
+            os.path.join(lcfit["fitres_dirs"][self.index], self.fitres_filename)
+        )

     def classify(self, command):
         if self.batch_file is None:
@@ -78,17 +109,17 @@ def classify(self, command):
             else:
                 self.sbatch_header = self.sbatch_cpu_header
         else:
-            with open(self.batch_file, 'r') as f:
+            with open(self.batch_file, "r") as f:
                 self.sbatch_header = f.read()
                 self.sbatch_header = self.clean_header(self.sbatch_header)

         header_dict = {
-                "REPLACE_NAME": self.job_base_name,
-                "REPLACE_LOGFILE": "output.log",
-                "REPLACE_WALLTIME": "15:00:00",
-                "REPLACE_MEM": "3GB",
-                "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"]
-        }
+            "REPLACE_NAME": self.job_base_name,
+            "REPLACE_LOGFILE": "output.log",
+            "REPLACE_WALLTIME": "15:00:00",
+            "REPLACE_MEM": "3GB",
+            "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=4"],
+        }
         header_dict = merge_dict(header_dict, self.batch_replace)
         self.update_header(header_dict)
@@ -101,9 +132,8 @@ def classify(self, command):

         format_dict = {
             "sbatch_header": self.sbatch_header,
-            "task_setup": self.update_setup(setup_dict, self.task_setup['snirf'])
-
-        }
+            "task_setup": self.update_setup(setup_dict, self.task_setup["snirf"]),
+        }
         slurm_script = self.slurm.format(**format_dict)
         new_hash = self.get_hash_from_string(slurm_script)
@@ -127,10 +157,26 @@ def classify(self, command):

     def get_rf_conf(self):
         leaf_opts = (
-            (f"--n_estimators {self.options.get('N_ESTIMATORS')} " if self.options.get("N_ESTIMATORS") is not None else "")
-            + (f"--min_samples_split {self.options.get('MIN_SAMPLES_SPLIT')} " if self.options.get("MIN_SAMPLES_SPLIT") is not None else "")
-            + (f"--min_samples_leaf {self.options.get('MIN_SAMPLES_LEAF')} " if self.options.get("MIN_SAMPLES_LEAF") is not None else "")
-            + (f"--max_depth {self.options.get('MAX_DEPTH')} " if self.options.get("MAX_DEPTH") is not None else "")
+            (
+                f"--n_estimators {self.options.get('N_ESTIMATORS')} "
+                if self.options.get("N_ESTIMATORS") is not None
+                else ""
+            )
+            + (
+                f"--min_samples_split {self.options.get('MIN_SAMPLES_SPLIT')} "
+                if self.options.get("MIN_SAMPLES_SPLIT") is not None
+                else ""
+            )
+            + (
+                f"--min_samples_leaf {self.options.get('MIN_SAMPLES_LEAF')} "
+                if self.options.get("MIN_SAMPLES_LEAF") is not None
+                else ""
+            )
+            + (
+                f"--max_depth {self.options.get('MAX_DEPTH')} "
+                if self.options.get("MAX_DEPTH") is not None
+                else ""
+            )
         )
         return leaf_opts
@@ -138,7 +184,9 @@ def predict(self):
         self.setup()
         model = self.options.get("MODEL")
         if model is None:
-            self.logger.error("If you are in predict model, please specify a MODEL in OPTS. Either a file location or a training task name.")
+            self.logger.error(
+                "If you are in predict mode, please specify a MODEL in OPTS. Either a file location or a training task name."
+ ) return False potential_path = get_output_loc(model) if os.path.exists(potential_path): @@ -146,11 +194,15 @@ def predict(self): model = potential_path else: if "/" in model: - self.logger.warning(f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}") + self.logger.warning( + f"Your model {model} looks like a path, but I couldn't find a model at {potential_path}" + ) # If its not a file, it must be a task for t in self.dependencies: if model == t.name: - self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}") + self.logger.debug( + f"Found task dependency {t.name} with model file {t.output['model_filename']}" + ) model = t.output["model_filename"] command = ( f"--nc 4 " @@ -195,21 +247,41 @@ def _check_completion(self, squeue): if self.mode == Classifier.PREDICT: # Rename output file myself # First check to see if this is already done - predictions_filename = os.path.join(self.output_dir, "predictions.csv") + predictions_filename = os.path.join( + self.output_dir, "predictions.csv" + ) if not os.path.exists(predictions_filename): # Find the output file - output_files = [i for i in os.listdir(self.output_dir) if i.endswith("Classes.txt")] + output_files = [ + i + for i in os.listdir(self.output_dir) + if i.endswith("Classes.txt") + ] if len(output_files) != 1: - self.logger.error(f"Could not find the output file in {self.output_dir}") + self.logger.error( + f"Could not find the output file in {self.output_dir}" + ) return Task.FINISHED_FAILURE - df = pd.read_csv(os.path.join(self.output_dir, output_files[0]), delim_whitespace=True) + df = pd.read_csv( + os.path.join(self.output_dir, output_files[0]), + delim_whitespace=True, + ) df_final = df[["CID", "RFprobability0"]] - df_final = df_final.rename(columns={"CID": "SNID", "RFprobability0": self.get_prob_column_name()}) - df_final.to_csv(predictions_filename, index=False, float_format="%0.4f") + df_final = df_final.rename( + columns={ + "CID": "SNID", + "RFprobability0": self.get_prob_column_name(), + } + ) + df_final.to_csv( + predictions_filename, index=False, float_format="%0.4f" + ) self.output["predictions_filename"] = predictions_filename else: self.output["model_filename"] = [ - os.path.join(self.output_dir, f) for f in os.listdir(self.output_dir) if f.startswith(self.model_pk_file) + os.path.join(self.output_dir, f) + for f in os.listdir(self.output_dir) + if f.startswith(self.model_pk_file) ][0] return Task.FINISHED_SUCCESS return self.check_for_job(squeue, self.job_base_name) diff --git a/pippin/classifiers/supernnova.py b/pippin/classifiers/supernnova.py index bba19b86..cd080566 100644 --- a/pippin/classifiers/supernnova.py +++ b/pippin/classifiers/supernnova.py @@ -5,14 +5,21 @@ import pickle from collections import OrderedDict from pippin.classifiers.classifier import Classifier -from pippin.config import chown_dir, mkdirs, get_config, get_output_loc, get_data_loc, merge_dict +from pippin.config import ( + chown_dir, + mkdirs, + get_config, + get_output_loc, + get_data_loc, + merge_dict, +) from pippin.task import Task from time import sleep import numpy as np class SuperNNovaClassifier(Classifier): - """ Classification task for the SuperNNova classifier. + """Classification task for the SuperNNova classifier. 
CONFIGURATION ============= @@ -41,8 +48,27 @@ class SuperNNovaClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.global_config = get_config() self.dump_dir = output_dir + "/dump" self.job_base_name = os.path.basename(output_dir) @@ -52,13 +78,13 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= self.done_file2 = os.path.join(self.output_dir, "done_task2.txt") self.variant = options.get("VARIANT", "vanilla").lower() # Redshift can be True, False, 'zpho', 'zspe', or 'none' - redshift = options.get("REDSHIFT", 'zspe') + redshift = options.get("REDSHIFT", "zspe") # Not sure how python deals with strings and bools, so just being careful if redshift == True: redshift = "zspe" elif redshift == False: redshift = "none" - if redshift not in ['zpho', 'zspe', 'none']: + if redshift not in ["zpho", "zspe", "none"]: self.logger.warning(f"Unknown redshift option ['zpho', 'zspe', 'none']") self.redshift = redshift self.norm = options.get("NORM", "cosmo") @@ -71,32 +97,42 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index= # Must be a list self.list_filters = options.get("LIST_FILTERS", None) if self.list_filters is None: - self.list_filters = ['g', 'i', 'r', 'z'] - assert isinstance(self.list_filters, list), f"LIST_FILTERS must be a list, instead got {type(self.list_filters)}" + self.list_filters = ["g", "i", "r", "z"] + assert isinstance( + self.list_filters, list + ), f"LIST_FILTERS must be a list, instead got {type(self.list_filters)}" # Can either be a yml dictionary or a str filepath to a txt files containing all mappings self.sntypes = options.get("SNTYPES", None) if self.sntypes is None: self.sntypes = {} elif isinstance(self.sntypes, str): sntypes_path = get_data_loc(self.sntypes) - assert sntypes_path is not None, f"SNTYPES: {self.sntypes} does not resolve to a path." + assert ( + sntypes_path is not None + ), f"SNTYPES: {self.sntypes} does not resolve to a path." 
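# Editor's sketch (illustrative, not part of this diff): the SNTYPES text
# file consumed just below is assumed to hold two whitespace-separated
# columns, a numeric type ID followed by a label, e.g.
#
#     101 Ia
#     120 II
#     130 Ibc
#
# np.loadtxt(sntypes_path, dtype=str) reads this as rows of strings, so the
# dict comprehension below yields {"101": "Ia", "120": "II", "130": "Ibc"},
# the same dict shape SNTYPES can also be given directly as yml.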
self.logger.debug(f"Reading in SNTYPES from {sntypes_path}")
             sntypes_raw = np.loadtxt(sntypes_path, dtype=str)
             self.sntypes = {i[0]: i[1] for i in sntypes_raw}
-        assert isinstance(self.sntypes, dict), f"SNTYPES must be a dict, instead got {type(self.sntypes)}"
+        assert isinstance(
+            self.sntypes, dict
+        ), f"SNTYPES must be a dict, instead got {type(self.sntypes)}"

         # Setup yml files
         self.data_yml_file = options.get("DATA_YML", None)
         self.output_data_yml = os.path.join(self.output_dir, "data.yml")
         self.classification_yml_file = options.get("CLASSIFICATION_YML", None)
-        self.output_classification_yml = os.path.join(self.output_dir, "classification.yml")
+        self.output_classification_yml = os.path.join(
+            self.output_dir, "classification.yml"
+        )
         # XOR - only runs if either but not both yml's are None
         if (self.data_yml_file is None) ^ (self.classification_yml_file is None):
-            self.logger.error(f"If using yml inputs, both 'DATA_YML' (currently {self.data_yml} and 'CLASSIFICATION_YML' (currently {self.classification_yml}) must be provided")
+            self.logger.error(
+                f"If using yml inputs, both 'DATA_YML' (currently {self.data_yml_file}) and 'CLASSIFICATION_YML' (currently {self.classification_yml_file}) must be provided"
+            )
         elif self.data_yml_file is not None:
-            with open(self.data_yml_file, 'r') as f:
+            with open(self.data_yml_file, "r") as f:
                 self.data_yml = f.read()
-            with open(self.classification_yml_file, 'r') as f:
+            with open(self.classification_yml_file, "r") as f:
                 self.classification_yml = f.read()
             self.has_yml = True
             self.variant = self.get_variant_from_yml(self.classification_yml)
@@ -108,7 +144,9 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index=
         self.batch_file = self.options.get("BATCH_FILE")
         if self.batch_file is not None:
             self.batch_file = get_data_loc(self.batch_file)
-        self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}))
+        self.batch_replace = self.options.get(
+            "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})
+        )

         self.validate_model()
@@ -119,18 +157,24 @@ def __init__(self, name, output_dir, config, dependencies, mode, options, index=
             "cosmo_quantile",
             "none",
         ], f"Norm option is set to {self.norm}, needs to be one of 'global', 'cosmo', 'perfilter', 'cosmo_quantile"
-        assert self.variant in ["vanilla", "variational", "bayesian"], f"Variant {self.variant} is not vanilla, variational or bayesian"
+        assert self.variant in [
+            "vanilla",
+            "variational",
+            "bayesian",
+        ], f"Variant {self.variant} is not vanilla, variational or bayesian"
         self.slurm = """{sbatch_header}
{task_setup}
"""
         self.conda_env = self.global_config["SuperNNova"]["conda_env"]
-        self.path_to_classifier = get_output_loc(self.global_config["SuperNNova"]["location"])
+        self.path_to_classifier = get_output_loc(
+            self.global_config["SuperNNova"]["location"]
+        )

     def get_variant_from_yml(self, yml_file):
         if "model" in yml_file:
             self.logger.debug("Detected model in yml file")
-            stripped = "".join(yml_file.split(' '))
+            stripped = "".join(yml_file.split(" "))
             if "model:bayesian" in stripped:
                 self.logger.debug("Detected bayesian model")
                 return "bayesian"
@@ -141,7 +185,11 @@ def get_variant_from_yml(self, yml_file):
         return "vanilla"

     def update_yml(self):
-        replace_dict = {"DONE_FILE": self.done_file, "DUMP_DIR": self.dump_dir, "RAW_DIR": self.raw_dir}
+        replace_dict = {
+            "DONE_FILE": self.done_file,
+            "DUMP_DIR": self.dump_dir,
+            "RAW_DIR": self.raw_dir,
+        }
         for key, value in replace_dict.items():
             self.data_yml =
self.data_yml.replace(key, value) self.classification_yml = self.classification_yml.replace(key, value) @@ -152,8 +200,14 @@ def get_model_and_pred(self): self.logger.debug(f"Max Tries: {max_tries}") try: model_folder = self.dump_dir + "/models" - files = [f for f in os.listdir(model_folder) if os.path.isdir(os.path.join(model_folder, f))] - assert len(files) == 1, f"Did not find singular output file: {str(files)}" + files = [ + f + for f in os.listdir(model_folder) + if os.path.isdir(os.path.join(model_folder, f)) + ] + assert ( + len(files) == 1 + ), f"Did not find singular output file: {str(files)}" saved_dir = os.path.abspath(os.path.join(model_folder, files[0])) subfiles = list(os.listdir(saved_dir)) @@ -162,10 +216,18 @@ def get_model_and_pred(self): model_file = os.path.join(saved_dir, model_files[0]) self.logger.debug(f"Found model file {model_file}") else: - self.logger.debug("No model found. Not an issue if you've specified a model.") + self.logger.debug( + "No model found. Not an issue if you've specified a model." + ) model_file = None - ending = "_aggregated.pickle" if self.variant in ["variational", "bayesian"] else ".pickle" - pred_files = [f for f in subfiles if f.startswith("PRED") and f.endswith(ending)] + ending = ( + "_aggregated.pickle" + if self.variant in ["variational", "bayesian"] + else ".pickle" + ) + pred_files = [ + f for f in subfiles if f.startswith("PRED") and f.endswith(ending) + ] self.logger.debug(pred_files) pred_file = pred_files[0] self.logger.debug(f"Success after {100-max_tries} tries.") @@ -185,7 +247,7 @@ def predict(self): def get_types(self): types = {} for t in self.get_simulation_dependency(): - for k, v in t.output['types'].items(): + for k, v in t.output["types"].items(): if k not in types: types[k] = v return types @@ -194,11 +256,15 @@ def classify(self, training): model = self.options.get("MODEL") model_path = "" if not training: - assert model is not None, "If TRAIN is not specified, you have to point to a model to use" + assert ( + model is not None + ), "If TRAIN is not specified, you have to point to a model to use" if not os.path.exists(get_output_loc(model)): for t in self.dependencies: if model == t.name: - self.logger.debug(f"Found task dependency {t.name} with model file {t.output['model_filename']}") + self.logger.debug( + f"Found task dependency {t.name} with model file {t.output['model_filename']}" + ) model = t.output["model_filename"] model_path = get_output_loc(model) self.logger.debug(f"Looking for model in {model_path}") @@ -249,7 +315,9 @@ def classify(self, training): if not has_ia: self.logger.debug("No Ia type found, injecting type") types[1] = "Ia" - types = dict(sorted(types.items(), key=lambda x: -1 if x[0] == 1 else x[0])) + types = dict( + sorted(types.items(), key=lambda x: -1 if x[0] == 1 else x[0]) + ) self.logger.debug(f"Inject types with Ias are {types}") if not has_cc: self.logger.debug("No cc type found, injecting type") @@ -264,14 +332,26 @@ def classify(self, training): str_list_filters = " ".join(self.list_filters) self.logger.debug(f"Filter list set to {str_list_filters}") - sim_dep = self.get_simulation_dependency()[0] # only taking the first one because SNN internally takes a single fits dir as input + sim_dep = self.get_simulation_dependency()[ + 0 + ] # only taking the first one because SNN internally takes a single fits dir as input if len(self.get_simulation_dependency()) > 1: - self.logger.warning(f"Found more than one simulation dependency, possibly because COMBINE_MASK is being used. 
SuperNNova doesn't currently support this. Using only the first sim dependency: {sim_dep.name}") + self.logger.warning( + f"Found more than one simulation dependency, possibly because COMBINE_MASK is being used. SuperNNova doesn't currently support this. Using only the first sim dependency: {sim_dep.name}" + ) light_curve_dir = sim_dep.output["photometry_dirs"][self.index] self.raw_dir = light_curve_dir fit = self.get_fit_dependency() - fit_dir = f"" if ((fit is None) or (len(fit) == 0)) else f"--fits_dir {fit[self.index]['fitres_dirs']}" - cyclic = "--cyclic" if self.variant in ["vanilla", "variational"] and self.cyclic else "" + fit_dir = ( + f"" + if ((fit is None) or (len(fit) == 0)) + else f"--fits_dir {fit[self.index]['fitres_dirs']}" + ) + cyclic = ( + "--cyclic" + if self.variant in ["vanilla", "variational"] and self.cyclic + else "" + ) batch_size = f"--batch_size {self.batch_size}" num_layers = f"--num_layers {self.num_layers}" hidden_dim = f"--hidden_dim {self.hidden_dim}" @@ -291,11 +371,10 @@ def classify(self, training): else: self.sbatch_header = self.sbatch_cpu_header else: - with open(self.batch_file, 'r') as f: + with open(self.batch_file, "r") as f: self.sbatch_header = f.read() self.sbatch_header = self.clean_header(self.sbatch_header) - if self.has_yml: self.update_yml() setup_file = "supernnova_yml" @@ -303,12 +382,12 @@ def classify(self, training): setup_file = "supernnova" header_dict = { - "REPLACE_NAME": self.job_base_name, - "REPLACE_WALLTIME": "23:00:00", - "REPLACE_LOGFILE": "output.log", - "REPLACE_MEM": "32GB", - "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"] - } + "REPLACE_NAME": self.job_base_name, + "REPLACE_WALLTIME": "23:00:00", + "REPLACE_LOGFILE": "output.log", + "REPLACE_MEM": "32GB", + "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"], + } header_dict = merge_dict(header_dict, self.batch_replace) self.update_header(header_dict) @@ -342,16 +421,18 @@ def classify(self, training): "hidden_dim": hidden_dim, "data_yml": self.output_data_yml, "classification_yml": self.output_classification_yml, - "classification_command": "train_rnn" if training else "validate_rnn" + "classification_command": "train_rnn" if training else "validate_rnn", } format_dict = { "sbatch_header": self.sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup[setup_file]) - } + "task_setup": self.update_setup(setup_dict, self.task_setup[setup_file]), + } slurm_output_file = self.output_dir + "/job.slurm" - self.logger.info(f"Running SuperNNova, slurm job outputting to {slurm_output_file}") + self.logger.info( + f"Running SuperNNova, slurm job outputting to {slurm_output_file}" + ) slurm_text = self.slurm.format(**format_dict) new_hash = self.get_hash_from_string(slurm_text) @@ -363,9 +444,9 @@ def classify(self, training): shutil.rmtree(self.output_dir, ignore_errors=True) mkdirs(self.output_dir) if self.has_yml: - with open(self.output_data_yml, 'w') as f: + with open(self.output_data_yml, "w") as f: f.write(self.data_yml) - with open(self.output_classification_yml, 'w') as f: + with open(self.output_classification_yml, "w") as f: f.write(self.classification_yml) self.save_new_hash(new_hash) @@ -373,7 +454,9 @@ def classify(self, training): with open(slurm_output_file, "w") as f: f.write(slurm_text) - self.logger.info(f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova") + self.logger.info( + f"Submitting batch job to {'train' if training else 'predict using'} SuperNNova" + ) subprocess.run(["sbatch", 
slurm_output_file], cwd=self.output_dir)

         return True
@@ -394,14 +477,26 @@ def _check_completion(self, squeue):
                 new_model_file = os.path.join(self.output_dir, f"model.pt")

                 if not os.path.exists(new_pred_file) or not os.path.exists(new_model_file):
-                    self.logger.info("Updating model location or generating predictions file")
+                    self.logger.info(
+                        "Updating model location or generating predictions file"
+                    )
                     model, predictions = self.get_model_and_pred()

                     if not os.path.exists(new_model_file):
                         if model is not None:
                             shutil.move(model, new_model_file)
-                            args_old, args_new = os.path.abspath(os.path.join(os.path.dirname(model), "cli_args.json")), self.output_dir + "/cli_args.json"
-                            norm_old, norm_new = os.path.abspath(os.path.join(os.path.dirname(model), "data_norm.json")), self.output_dir + "/data_norm.json"
+                            args_old, args_new = (
+                                os.path.abspath(
+                                    os.path.join(os.path.dirname(model), "cli_args.json")
+                                ),
+                                self.output_dir + "/cli_args.json",
+                            )
+                            norm_old, norm_new = (
+                                os.path.abspath(
+                                    os.path.join(os.path.dirname(model), "data_norm.json")
+                                ),
+                                self.output_dir + "/data_norm.json",
+                            )
                             shutil.move(args_old, args_new)
                             shutil.move(norm_old, norm_new)
                             self.logger.info(f"Model file can be found at {new_model_file}")
@@ -411,18 +506,35 @@ def _check_completion(self, squeue):
                         self.logger.debug(dataframe)
                         self.logger.debug(self.variant)
                         if self.variant in ["variational", "bayesian"]:
-                            final_dataframe = dataframe[["SNID", "all_class0_median", "all_class0_std"]]
+                            final_dataframe = dataframe[
+                                ["SNID", "all_class0_median", "all_class0_std"]
+                            ]
                             final_dataframe = final_dataframe.rename(
-                                columns={"all_class0_median": self.get_prob_column_name(), "all_class0_std": self.get_prob_column_name() + "_ERR"}
+                                columns={
+                                    "all_class0_median": self.get_prob_column_name(),
+                                    "all_class0_std": self.get_prob_column_name()
+                                    + "_ERR",
+                                }
                             )
                         else:
                             final_dataframe = dataframe[["SNID", "all_class0"]]
-                            final_dataframe = final_dataframe.rename(columns={"all_class0": self.get_prob_column_name()})
-                        final_dataframe.to_csv(new_pred_file, index=False, float_format="%0.4f")
-                        self.logger.info(f"Predictions file can be found at {new_pred_file}")
+                            final_dataframe = final_dataframe.rename(
+                                columns={"all_class0": self.get_prob_column_name()}
+                            )
+                        final_dataframe.to_csv(
+                            new_pred_file, index=False, float_format="%0.4f"
+                        )
+                        self.logger.info(
+                            f"Predictions file can be found at {new_pred_file}"
+                        )
                     chown_dir(self.output_dir)
-                self.output.update({"model_filename": new_model_file, "predictions_filename": new_pred_file})
+                self.output.update(
+                    {
+                        "model_filename": new_model_file,
+                        "predictions_filename": new_pred_file,
+                    }
+                )
                 return Task.FINISHED_SUCCESS
         else:
             return self.check_for_job(squeue, self.job_base_name)
diff --git a/pippin/classifiers/unity.py b/pippin/classifiers/unity.py
index cc8dc0e8..d00dda9a 100644
--- a/pippin/classifiers/unity.py
+++ b/pippin/classifiers/unity.py
@@ -11,7 +11,7 @@

 class UnityClassifier(Classifier):
-    """ Classification task for the SuperNNova classifier.
+    """Classification task for the Unity classifier.
CONFIGURATION ============= @@ -33,8 +33,27 @@ class UnityClassifier(Classifier): """ - def __init__(self, name, output_dir, config, dependencies, mode, options, index=0, model_name=None): - super().__init__(name, output_dir, config, dependencies, mode, options, index=index, model_name=model_name) + def __init__( + self, + name, + output_dir, + config, + dependencies, + mode, + options, + index=0, + model_name=None, + ): + super().__init__( + name, + output_dir, + config, + dependencies, + mode, + options, + index=index, + model_name=model_name, + ) self.output_file = None self.passed = False self.num_jobs = 1 # This is the default. Can get this from options if needed. @@ -55,13 +74,21 @@ def classify(self): s = self.get_simulation_dependency() df = None phot_dir = s.output["photometry_dirs"][self.index] - headers = [os.path.join(phot_dir, a) for a in os.listdir(phot_dir) if "HEAD" in a] + headers = [ + os.path.join(phot_dir, a) + for a in os.listdir(phot_dir) + if "HEAD" in a + ] if len(headers) == 0: - self.logger.warning(f"No HEAD fits files found in {phot_dir}! Going to do it manually, this may not work.") + self.logger.warning( + f"No HEAD fits files found in {phot_dir}! Going to do it manually, this may not work." + ) cmd = "grep --exclude-dir=* SNID: * | awk -F ':' '{print $3}'" self.logger.debug(f"Running command {cmd}") - process = subprocess.run(cmd, capture_output=True, cwd=phot_dir, shell=True) + process = subprocess.run( + cmd, capture_output=True, cwd=phot_dir, shell=True + ) output = process.stdout.decode("ascii").split("\n") output = [x for x in output if x] @@ -74,7 +101,9 @@ def classify(self): with fits.open(h) as hdul: data = hdul[1].data snid = np.array(data.field("SNID")) - dataframe = pd.DataFrame({cid: snid, name: np.ones(snid.shape)}) + dataframe = pd.DataFrame( + {cid: snid, name: np.ones(snid.shape)} + ) dataframe[cid] = dataframe[cid].apply(str) dataframe[cid] = dataframe[cid].str.strip() if df is None: diff --git a/pippin/config.py b/pippin/config.py index 04309102..34675f0d 100644 --- a/pippin/config.py +++ b/pippin/config.py @@ -8,37 +8,38 @@ import tarfile import gzip + def compress_dir(output_filename, source_dir): logging.info(f"Compressing {source_dir} to {output_filename}") with tarfile.open(output_filename, "w:gz") as tar: tar.add(source_dir, arcname=os.path.basename(source_dir)) shutil.rmtree(source_dir) + def uncompress_dir(output_dir, source_filename): logging.info(f"Uncompressing {source_filename} to {output_dir}") with tarfile.open(source_filename, "r:gz") as tar: + def is_within_directory(directory, target): - abs_directory = os.path.abspath(directory) abs_target = os.path.abspath(target) - + prefix = os.path.commonprefix([abs_directory, abs_target]) - + return prefix == abs_directory - + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - for member in tar.getmembers(): member_path = os.path.join(path, member.name) if not is_within_directory(path, member_path): raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - + + tar.extractall(path, members, numeric_owner=numeric_owner) + safe_extract(tar, path=output_dir) os.remove(source_filename) + def singleton(fn): instance = None @@ -52,7 +53,6 @@ def get(*args, **kwargs): def merge_dict(original, extra): - for key, value in extra.items(): if isinstance(value, dict): node = original.setdefault(key, {}) @@ -66,7 +66,9 @@ def merge_dict(original, extra): @singleton def get_config(initial_path=None, 
overwrites=None): - this_dir = os.path.abspath(os.path.join(os.path.dirname(inspect.stack()[0][1]), "..")) + this_dir = os.path.abspath( + os.path.join(os.path.dirname(inspect.stack()[0][1]), "..") + ) if initial_path is None: filename = os.path.abspath(os.path.join(this_dir, "cfg.yml")) else: @@ -95,7 +97,9 @@ def get_output_dir(): if "$" in output_dir: raise ValueError(f"Could not resolve variable in path: {output_dir}") if not output_dir.startswith("/"): - output_dir = os.path.abspath(os.path.dirname(inspect.stack()[0][1]) + "/../" + output_dir) + output_dir = os.path.abspath( + os.path.dirname(inspect.stack()[0][1]) + "/../" + output_dir + ) return output_dir @@ -114,7 +118,9 @@ def get_data_loc(path, extra=None): if "$" in path: path = os.path.expandvars(path) if "$" in path: - logging.error(f"Unable to resolve the variable in {path}, please check to see if it is set in your environment") + logging.error( + f"Unable to resolve the variable in {path}, please check to see if it is set in your environment" + ) return None return path if path.startswith("/"): @@ -127,7 +133,9 @@ def get_data_loc(path, extra=None): new_path = os.path.join(data_dir, path) if os.path.exists(new_path): return new_path - logging.error(f"Unable to find relative path {path} when searching through the data directories: {data_dirs}") + logging.error( + f"Unable to find relative path {path} when searching through the data directories: {data_dirs}" + ) return None @@ -220,14 +228,18 @@ def chown_dir(directory, walk=True): for d in dirs: if not os.path.islink(os.path.join(root, d)): try: - os.chown(os.path.join(root, d), -1, group_id, follow_symlinks=False) + os.chown( + os.path.join(root, d), -1, group_id, follow_symlinks=False + ) os.chmod(os.path.join(root, d), 0o2775) except Exception as e: logger.warning(f"Chown error: {os.path.join(root, d)}") for f in files: if not os.path.islink(os.path.join(root, f)): try: - os.chown(os.path.join(root, f), -1, group_id, follow_symlinks=False) + os.chown( + os.path.join(root, f), -1, group_id, follow_symlinks=False + ) os.chmod(os.path.join(root, f), 0o664) except Exception as e: logger.warning(f"Chown error: {os.path.join(root, f)}") @@ -238,6 +250,7 @@ def ensure_list(a): return a return [a] + def generic_open(fpath, mode="r"): """ Check that fpath exists, identify whether it is compressed or uncompressed, and open it @@ -246,10 +259,11 @@ def generic_open(fpath, mode="r"): if not os.path.exists(fpath): logging.error(f"Path doesn't exist: {fpath}") if ".gz" in fpath: - return gzip.open(fpath, mode) + return gzip.open(fpath, mode) else: return open(fpath, mode) + if __name__ == "__main__": c = get_config() print(c.sections()) diff --git a/pippin/cosmofitters/cosmofit.py b/pippin/cosmofitters/cosmofit.py index aea16cf3..80c3a3e6 100644 --- a/pippin/cosmofitters/cosmofit.py +++ b/pippin/cosmofitters/cosmofit.py @@ -1,5 +1,6 @@ from pippin.task import Task + class CosmoFit(Task): """ @@ -16,23 +17,32 @@ class CosmoFit(Task): ====== """ - def get_tasks(task_config, prior_tasks, output_dir, stage_num, prefix, global_config): + def get_tasks( + task_config, prior_tasks, output_dir, stage_num, prefix, global_config + ): from pippin.cosmofitters.factory import FitterFactory + Task.logger.debug("Setting up CosmoFit tasks") tasks = [] for fitter_name in task_config.get("COSMOFIT", []): - Task.logger.info(f"Found fitter of type {fitter_name}, generating tasks.") + Task.logger.info(f"Found fitter of type {fitter_name}, generating tasks.") config = {fitter_name: 
task_config["COSMOFIT"][fitter_name]}
             Task.logger.debug(f"Config for {fitter_name}: {config}")
             fitter = FitterFactory.get(fitter_name.lower())
             Task.logger.debug(f"Fitter class for {fitter_name}: {fitter}")
             if fitter is None:
-                Task.logger.error(f"Fitter of type {fitter_name} not found, perhaps it's a typo? Skipping.")
+                Task.logger.error(
+                    f"Fitter of type {fitter_name} not found, perhaps it's a typo? Skipping."
+                )
                 continue
-            Task.logger.debug(f"get_task function for {fitter_name}: {fitter.get_tasks}")
-            ts = fitter.get_tasks(config, prior_tasks, output_dir, stage_num, prefix, global_config)
+            Task.logger.debug(
+                f"get_task function for {fitter_name}: {fitter.get_tasks}"
+            )
+            ts = fitter.get_tasks(
+                config, prior_tasks, output_dir, stage_num, prefix, global_config
+            )
             Task.logger.debug(f"{fitter} tasks: {ts}")
             tasks += ts
         if len(tasks) == 0:
diff --git a/pippin/cosmofitters/cosmomc.py b/pippin/cosmofitters/cosmomc.py
index c0e96f3d..1cd9cedc 100644
--- a/pippin/cosmofitters/cosmomc.py
+++ b/pippin/cosmofitters/cosmomc.py
@@ -11,8 +11,10 @@
 from pippin.task import Task


-class CosmoMC(CosmoFit): # TODO: Define the location of the output so we can run the lc fitting on it.
-    """ Run cosmomc given an ini file
+class CosmoMC(
+    CosmoFit
+):  # TODO: Define the location of the output so we can run the lc fitting on it.
+    """Run cosmomc given an ini file

     CONFIGURATION
     =============
@@ -41,19 +43,29 @@ class CosmoMC(CosmoFit): # TODO: Define the location of the output so we can ru

     """

-    def __init__(self, name, output_dir, config, options, global_config, dependencies=None):
+    def __init__(
+        self, name, output_dir, config, options, global_config, dependencies=None
+    ):
         super().__init__(name, output_dir, config=config, dependencies=dependencies)
         self.options = options
         self.global_config = global_config
-        self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_COSMOMC_" + name
+        self.job_name = (
+            os.path.basename(Path(output_dir).parents[1]) + "_COSMOMC_" + name
+        )
         self.logfile = os.path.join(self.output_dir, "output.log")

         self.path_to_cosmomc = get_output_loc(self.global_config["CosmoMC"]["location"])

         self.create_cov_dep = self.get_dep(CreateCov)
-        self.blind = np.all(self.create_cov_dep.output["blind"]) if self.create_cov_dep is not None else self.options.get("BLIND", False)
-        assert isinstance(self.blind, (bool, np.bool_)), "Blind should be set to a boolan value!"
+        self.blind = (
+            np.all(self.create_cov_dep.output["blind"])
+            if self.create_cov_dep is not None
+            else self.options.get("BLIND", False)
+        )
+        assert isinstance(
+            self.blind, (bool, np.bool_)
+        ), "Blind should be set to a boolean value!"
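# Editor's note (illustrative, not part of this diff): the blinding logic
# above inherits from the CreateCov dependency when one exists, staying
# blinded only if every dependency is blinded:
#     np.all([True, True])  -> True   (remain blinded)
#     np.all([True, False]) -> False  (unblind)
# With no dependency it falls back to OPTS, e.g. BLIND: True, which is why
# the assert then insists on a boolean (or numpy bool) value.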
self.ini_prefix = options.get("INI").replace(".ini", "") self.static = self.ini_prefix.replace(".ini", "") in ["cmb_omw", "cmb_omol"] self.static_path = "cosmomc_static_chains/" @@ -71,7 +83,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.covopts = options.get("COVOPTS") or list(avail_cov_opts.keys()) self.covopts_numbers = [avail_cov_opts[k] for k in self.covopts] - self.ini_files = [f"{self.ini_prefix}_{num}.ini" for num in self.covopts_numbers] + self.ini_files = [ + f"{self.ini_prefix}_{num}.ini" for num in self.covopts_numbers + ] self.output["hubble_plot"] = self.create_cov_dep.output["hubble_plot"] self.output["bcor_name"] = self.create_cov_dep.output["bcor_name"] @@ -79,15 +93,24 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.num_jobs = len(self.covopts) self.ntasks = 10 - self.logger.debug(f"Num Walkers: {self.num_walkers}") + self.logger.debug(f"Num Walkers: {self.num_walkers}") self.chain_dir = os.path.join(self.output_dir, "chains/") - self.param_dict = {l: os.path.join(self.chain_dir, i.replace(".ini", ".paramnames")) for l, i in zip(self.covopts, self.ini_files)} + self.param_dict = { + l: os.path.join(self.chain_dir, i.replace(".ini", ".paramnames")) + for l, i in zip(self.covopts, self.ini_files) + } self.done_files = [f"done_{num}.txt" for num in self.covopts_numbers] self.chain_dict = { - l: os.path.join(self.chain_dir, i.replace(".ini", f"_{n + 1}.txt")) for l, i in zip(self.covopts, self.ini_files) for n in range(self.ntasks) + l: os.path.join(self.chain_dir, i.replace(".ini", f"_{n + 1}.txt")) + for l, i in zip(self.covopts, self.ini_files) + for n in range(self.ntasks) + } + self.base_dict = { + l: os.path.join(self.chain_dir, i.replace(".ini", "")) + for l, i in zip(self.covopts, self.ini_files) + for n in range(self.ntasks) } - self.base_dict = {l: os.path.join(self.chain_dir, i.replace(".ini", "")) for l, i in zip(self.covopts, self.ini_files) for n in range(self.ntasks)} self.output["chain_dir"] = self.chain_dir self.output["param_dict"] = self.param_dict self.output["chain_dict"] = self.chain_dict @@ -96,13 +119,25 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.output["blind"] = self.blind self.output["label"] = ( - self.options.get("LABEL", f"({' + '.join(self.ini_prefix.upper().split('_')[:-1])})") + self.options.get( + "LABEL", f"({' + '.join(self.ini_prefix.upper().split('_')[:-1])})" + ) + " " - + (self.create_cov_dep.output["name"] if self.create_cov_dep is not None else "") + + ( + self.create_cov_dep.output["name"] + if self.create_cov_dep is not None + else "" + ) ) # TODO: Better logic here please final = self.ini_prefix.split("_")[-1] - ps = {"omw": ["omegam", "w"], "flatomol": ["omegam"], "omol": ["omegam", "omegal"], "wnu": ["w", "nu"], "wwa": ["w", "wa"]} + ps = { + "omw": ["omegam", "w"], + "flatomol": ["omegam"], + "omol": ["omegam", "omegal"], + "wnu": ["w", "nu"], + "wwa": ["w", "wa"], + } if final not in ps.keys(): self.fail_config( f"The filename passed in ({self.ini_prefix}) needs to have format 'components_cosmology.ini', where the cosmology is omw, omol, wnu or wwa. Is this a custom file?" 
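(Editor's sketch of the "components_cosmology.ini" naming convention enforced above; the prefix "sn_cmb_omw" is a hypothetical example, while the split/lookup mirrors the code.)

ps = {
    "omw": ["omegam", "w"],
    "flatomol": ["omegam"],
    "omol": ["omegam", "omegal"],
    "wnu": ["w", "nu"],
    "wwa": ["w", "wa"],
}
ini_prefix = "sn_cmb_omw"
final = ini_prefix.split("_")[-1]  # "omw", the cosmology suffix
assert final in ps, "unknown cosmology suffix -> fail_config fires as above"
params = ps[final]  # ["omegam", "w"]
# The default LABEL joins the leading components:
label = f"({' + '.join(ini_prefix.upper().split('_')[:-1])})"  # "(SN + CMB)"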
@@ -112,7 +147,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) self.slurm = """{sbatch_header} {task_setup} @@ -129,7 +166,9 @@ def _check_completion(self, squeue): self.logger.debug(f"Done file found at f{self.done_file}") with open(self.done_file) as f: if "FAILURE" in f.read(): - self.logger.error(f"Done file reported failure. Check output log {self.logfile}") + self.logger.error( + f"Done file reported failure. Check output log {self.logfile}" + ) return Task.FINISHED_FAILURE else: return Task.FINISHED_SUCCESS @@ -140,7 +179,9 @@ def _check_completion(self, squeue): if os.path.exists(df): with open(df) as f: if "FAILURE" in f.read(): - self.logger.error(f"Done file {d} reported failure. Check output log {self.logfile}") + self.logger.error( + f"Done file {d} reported failure. Check output log {self.logfile}" + ) return Task.FINISHED_FAILURE else: all_files = False @@ -161,7 +202,9 @@ def _check_completion(self, squeue): if os.path.exists(self.logfile): with open(self.logfile) as f: if "CANCELLED AT" in f.read(): - self.logger.debug(f"The job was cancelled! Check {self.logfile} for details") + self.logger.debug( + f"The job was cancelled! Check {self.logfile} for details" + ) return Task.FINISHED_FAILURE return self.check_for_job(squeue, self.job_name) @@ -175,19 +218,26 @@ def get_ini_file(self): path = os.path.join(directory, file) self.logger.debug(f"Path: {path}") if not os.path.exists(path): - self.logger.error(f"Cannot find the file {path}, make sure you specified a correct INI string matching an existing template") + self.logger.error( + f"Cannot find the file {path}, make sure you specified a correct INI string matching an existing template" + ) return None self.logger.debug(f"Reading in {path} to format") with open(path) as f: input_files.append( - f.read().format(**{"path_to_cosmomc": self.path_to_cosmomc, "ini_dir": self.create_cov_dep.output["ini_dir"], "root_dir": self.chain_dir}) + f.read().format( + **{ + "path_to_cosmomc": self.path_to_cosmomc, + "ini_dir": self.create_cov_dep.output["ini_dir"], + "root_dir": self.chain_dir, + } + ) ) self.logger.debug(f"Input Files: {input_files}") return input_files def _run(self): - if self.static: self.logger.info("CMB only constraints detected, copying static files") @@ -196,7 +246,6 @@ def _run(self): self.logger.error("Seems like we can't find the static chains...") return False else: - new_hash = self.get_hash_from_string(cosmomc_static_loc) if self._check_regenerate(new_hash): @@ -223,17 +272,20 @@ def _run(self): else: self.sbatch_header = self.sbatch_cpu_header else: - with open(self.batch_file, 'r') as f: + with open(self.batch_file, "r") as f: self.sbatch_header = f.read() self.sbatch_header = self.clean_header(self.sbatch_header) - header_dict = { "REPLACE_NAME": self.job_name, "REPLACE_WALLTIME": "34:00:00", "REPLACE_LOGFILE": self.logfile, "REPLACE_MEM": "2GB", - "APPEND": [f"#SBATCH --ntasks={self.ntasks}", f"#SBATCH --array=1-{len(self.ini_files)}", "#SBATCH --cpus-per-task=1"] + "APPEND": [ + f"#SBATCH --ntasks={self.ntasks}", + f"#SBATCH --array=1-{len(self.ini_files)}", + "#SBATCH --cpus-per-task=1", + ], } header_dict = merge_dict(header_dict, 
self.batch_replace) self.update_header(header_dict) @@ -249,7 +301,7 @@ def _run(self): format_dict = { "sbatch_header": self.sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['cosmomc']) + "task_setup": self.update_setup(setup_dict, self.task_setup["cosmomc"]), } final_slurm = self.slurm.format(**format_dict) @@ -285,7 +337,6 @@ def _run(self): @staticmethod def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config): - create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov) def _get_cosmomc_dir(base_output_dir, stage_number, name): @@ -302,7 +353,13 @@ def _get_cosmomc_dir(base_output_dir, stage_number, name): # Check if this is static. Could scan the folder, but dont have all the chains yet. # TODO: Update this when I have all the chains if options.get("INI") in ["cmb_omw", "cmb_omol"]: - a = CosmoMC(cname, _get_cosmomc_dir(base_output_dir, stage_number, cname), config, options, global_config) + a = CosmoMC( + cname, + _get_cosmomc_dir(base_output_dir, stage_number, cname), + config, + options, + global_config, + ) Task.logger.info(f"Creating CosmoMC task {cname} for {a.num_jobs} jobs") tasks.append(a) @@ -311,11 +368,22 @@ def _get_cosmomc_dir(base_output_dir, stage_number, name): if mask not in ctask.name: continue name = f"COSMOMC_{cname}_{ctask.name}" - a = CosmoMC(name, _get_cosmomc_dir(base_output_dir, stage_number, name), config, options, global_config, dependencies=[ctask]) - Task.logger.info(f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs") + a = CosmoMC( + name, + _get_cosmomc_dir(base_output_dir, stage_number, name), + config, + options, + global_config, + dependencies=[ctask], + ) + Task.logger.info( + f"Creating CosmoMC task {name} for {ctask.name} with {a.num_jobs} jobs" + ) tasks.append(a) if len(create_cov_tasks) == 0: - Task.fail_config(f"CosmoMC task {cname} has no create_cov task to run on!") + Task.fail_config( + f"CosmoMC task {cname} has no create_cov task to run on!" 
+ ) return tasks diff --git a/pippin/cosmofitters/factory.py b/pippin/cosmofitters/factory.py index 4833cf2a..4909be34 100644 --- a/pippin/cosmofitters/factory.py +++ b/pippin/cosmofitters/factory.py @@ -1,6 +1,7 @@ from pippin.cosmofitters.cosmomc import CosmoMC from pippin.cosmofitters.wfit import WFit + class FitterFactory: ids = {} @@ -12,5 +13,6 @@ def get(cls, name): def add_factory(cls, fitter_class): cls.ids[fitter_class.__name__.lower()] = fitter_class + FitterFactory.add_factory(CosmoMC) FitterFactory.add_factory(WFit) diff --git a/pippin/cosmofitters/wfit.py b/pippin/cosmofitters/wfit.py index 8f9a585b..3df01f0c 100644 --- a/pippin/cosmofitters/wfit.py +++ b/pippin/cosmofitters/wfit.py @@ -11,8 +11,11 @@ from pippin.base import ConfigBasedExecutable from pippin.task import Task + class WFit(ConfigBasedExecutable, CosmoFit): - def __init__(self, name, output_dir, create_cov_tasks, config, options, global_config): + def __init__( + self, name, output_dir, create_cov_tasks, config, options, global_config + ): # First check if all required options exist # In this case, WFITOPTS must exist with at least 1 entry @@ -24,7 +27,14 @@ def __init__(self, name, output_dir, create_cov_tasks, config, options, global_c Task.fail_config(f"WFITOPTS for task {name} does not have any options!") base_file = get_data_loc("wfit/input_file.INPUT") - super().__init__(name, output_dir, config, base_file, default_assignment=": ", dependencies=create_cov_tasks) + super().__init__( + name, + output_dir, + config, + base_file, + default_assignment=": ", + dependencies=create_cov_tasks, + ) self.num_jobs = len(self.wfitopts) self.create_cov_tasks = create_cov_tasks @@ -37,7 +47,9 @@ def __init__(self, name, output_dir, create_cov_tasks, config, options, global_c self.global_config = global_config self.done_file = os.path.join(self.output_dir, "output", "ALL.DONE") - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) batch_mem = self.batch_replace.get("REPLACE_MEM", None) if batch_mem is not None: self.yaml["CONFIG"]["BATCH_MEM"] = batch_mem @@ -48,7 +60,7 @@ def __init__(self, name, output_dir, create_cov_tasks, config, options, global_c self.logfile = os.path.join(self.output_dir, "output.log") self.input_name = f"{self.job_name}.INPUT" self.input_file = os.path.join(self.output_dir, self.input_name) - + def _check_completion(self, squeue): if os.path.exists(self.done_file): self.logger.debug(f"Done file found at {self.done_file}") @@ -56,7 +68,9 @@ def _check_completion(self, squeue): if "SUCCESS" in f.read(): return Task.FINISHED_SUCCESS else: - self.logger.error(f"Done file reported failure. Check output log {self.logfile}") + self.logger.error( + f"Done file reported failure. 
Check output log {self.logfile}" + ) self.scan_files_for_error([self.logfile], "ERROR", "EXCEPTION") return Task.FINISHED_FAILURE return self.check_for_job(squeue, self.job_name) @@ -71,7 +85,7 @@ def _run(self): if k == "WFITOPTS": k = "WFITOPT" self.yaml["CONFIG"][k] = v - + final_output_for_hash = self.get_output_string() new_hash = self.get_hash_from_string(final_output_for_hash) @@ -86,12 +100,20 @@ def _run(self): f.write(self.get_output_string()) cmd = ["submit_batch_jobs.sh", os.path.basename(self.input_file)] - self.logger.debug(f"Submitting wfit job: {' '.join(cmd)} in cwd: {self.output_dir}") + self.logger.debug( + f"Submitting wfit job: {' '.join(cmd)} in cwd: {self.output_dir}" + ) self.logger.debug(f"Logging to {self.logfile}") - with open(self.logfile, 'w') as f: - subprocess.run(' '.join(cmd), stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True) + with open(self.logfile, "w") as f: + subprocess.run( + " ".join(cmd), + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + shell=True, + ) chown_dir(self.output_dir) - + else: self.should_be_done() self.logger.info("Has check passed, not rerunning") @@ -99,7 +121,6 @@ def _run(self): @staticmethod def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config): - create_cov_tasks = Task.get_task_of_type(prior_tasks, CreateCov) def _get_wfit_dir(base_output_dir, stage_number, name): @@ -118,7 +139,14 @@ def _get_wfit_dir(base_output_dir, stage_number, name): if len(ctasks) == 0: Task.fail_config(f"WFit task {name} has no create_cov task to run on!") - t = WFit(name, _get_wfit_dir(base_output_dir, stage_number, name), ctasks, config, options, global_config) + t = WFit( + name, + _get_wfit_dir(base_output_dir, stage_number, name), + ctasks, + config, + options, + global_config, + ) Task.logger.info(f"Creating WFit task {name} {t.num_jobs} jobs") tasks.append(t) diff --git a/pippin/create_cov.py b/pippin/create_cov.py index 9bf2382b..c3818a3c 100644 --- a/pippin/create_cov.py +++ b/pippin/create_cov.py @@ -10,8 +10,9 @@ import pippin.cosmofitters.cosmomc as cosmomc from pippin.config import get_data_loc, get_config, read_yaml, mkdirs, chown_dir + class CreateCov(ConfigBasedExecutable): - """ Create covariance matrices and data from salt2mu used for cosmomc and wfit. + """Create covariance matrices and data from salt2mu used for cosmomc and wfit. 
Run through submit_batch CONFIGURATION: @@ -39,23 +40,34 @@ class CreateCov(ConfigBasedExecutable): """ - - def __init__(self, name, output_dir, config, options, global_config, dependencies=None): - + def __init__( + self, name, output_dir, config, options, global_config, dependencies=None + ): base_file = get_data_loc("create_cov/COVMAT.input") - super().__init__(name, output_dir, config, base_file, default_assignment=": ", dependencies = dependencies) + super().__init__( + name, + output_dir, + config, + base_file, + default_assignment=": ", + dependencies=dependencies, + ) if options is None: options = {} self.options = options self.templates_dir = self.options.get("INI_DIR", "cosmomc_templates") self.global_config = get_config() - self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_CREATE_COV_" + name + self.job_name = ( + os.path.basename(Path(output_dir).parents[1]) + "_CREATE_COV_" + name + ) self.config_dir = os.path.join(self.output_dir, "output") self.wfit_inpdir = [] for d in dependencies: for subdir in d.output["subdirs"]: - self.wfit_inpdir.append(os.path.join(self.config_dir, f"{self.name}_{d.name}_{subdir}")) + self.wfit_inpdir.append( + os.path.join(self.config_dir, f"{self.name}_{d.name}_{subdir}") + ) self.done_file = os.path.join(self.config_dir, "ALL.DONE") self.input_name = f"{self.job_name}.INPUT" self.input_file = os.path.join(self.output_dir, self.input_name) @@ -79,7 +91,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie if num_jobs > 20: num_jobs = 20 BATCH_INFO = f"sbatch {BATCH_FILE} {num_jobs}" - BATCH_REPLACE = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + BATCH_REPLACE = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) if BATCH_REPLACE != {}: BATCH_MEM = BATCH_REPLACE.get("REPLACE_MEM", None) BATCH_WALLTIME = BATCH_REPLACE.get("REPLACE_WALLTIME", None) @@ -93,13 +107,13 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.yaml["CONFIG"]["BATCH_WALLTIME"] = BATCH_WALLTIME # create_covariance.py input file - self.input_covmat_file = get_data_loc("create_cov/input_file.txt") + self.input_covmat_file = get_data_loc("create_cov/input_file.txt") self.output_covmat_file = os.path.join(self.output_dir, "input_file.txt") self.prepare_cosmomc = self.config.get("COSMOMC", False) if self.prepare_cosmomc: self.logger.info("Generating CosmoMC output") else: - self.logger.info("Not generating CosmoMC output") + self.logger.info("Not generating CosmoMC output") self.sys_file_in = self.get_sys_file_in() self.sys_file_out = os.path.join(self.output_dir, "sys_scale.yml") self.calibration_set = options.get("CALIBRATORS", []) @@ -117,7 +131,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie # Output self.output["blind"] = [d.output["blind"] for d in self.dependencies] - self.output["hubble_plot"] = [d.output["hubble_plot"] for d in self.dependencies] + self.output["hubble_plot"] = [ + d.output["hubble_plot"] for d in self.dependencies + ] covopts_map = {"ALL": 0} for i, covopt in enumerate(self.options.get("COVOPTS", [])): covopts_map[covopt.split("]")[0][1:]] = i + 1 @@ -129,7 +145,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie def add_dependent(self, task): self.dependents.append(task) if isinstance(task, cosmomc.CosmoMC): - self.logger.info("CosmoMC task found, CreateCov will generate CosmoMC output") + self.logger.info( + "CosmoMC task found, 
CreateCov will generate CosmoMC output" + ) self.prepare_cosmomc = True def _check_completion(self, squeue): @@ -137,7 +155,9 @@ def _check_completion(self, squeue): self.logger.debug(f"Done file found at {self.done_file}") with open(self.done_file) as f: if "FAIL" in f.read(): - self.logger.error(f"Done file reported failure. Check output log {self.logfile}") + self.logger.error( + f"Done file reported failure. Check output log {self.logfile}" + ) self.scan_files_for_error([self.logfile], "ERROR", "EXCEPTION") return Task.FINISHED_FAILURE else: @@ -156,7 +176,9 @@ def load_input_covmat(self): def prepare_input_covmat(self): self.load_input_covmat() if self.prepare_cosmomc: - self.input_covmat_yaml["COSMOMC_TEMPLATES_PATH"] = get_data_loc(self.templates_dir) + self.input_covmat_yaml["COSMOMC_TEMPLATES_PATH"] = get_data_loc( + self.templates_dir + ) else: self.input_covmat_yaml.pop("COSMOMC_TEMPLATES_PATH", None) self.input_covmat_yaml["SYS_SCALE_FILE"] = self.sys_file_out @@ -175,9 +197,11 @@ def get_bbc_outdirs(self): def get_covmatopt(self): rebin_x1 = self.options.get("REBINNED_X1", 0) rebin_c = self.options.get("REBINNED_C", 0) - if (rebin_x1 + rebin_c > 0): + if rebin_x1 + rebin_c > 0: if (rebin_x1 == 0) or (rebin_c == 0): - Task.fail_config(f"If rebin, both REBINNED_X1 ({rebin_x1}) and REBINNED_C ({rebin_c}) must be greater than 0") + Task.fail_config( + f"If rebin, both REBINNED_X1 ({rebin_x1}) and REBINNED_C ({rebin_c}) must be greater than 0" + ) else: cmd = f"--nbin_x1 {rebin_x1} --nbin_c {rebin_c}" elif self.options.get("SUBTRACT_VPEC", False): @@ -203,7 +227,9 @@ def get_sys_file_in(self): for d in self.dependencies: fitopt_files = [] fitopt_files += [f for f in d.output["fitopt_files"] if f is not None] - assert len(set(fitopt_files)) < 2, f"Cannot automatically determine scaling from FITOPT file as you have multiple files: {fitopt_files}" + assert ( + len(set(fitopt_files)) < 2 + ), f"Cannot automatically determine scaling from FITOPT file as you have multiple files: {fitopt_files}" if (len(fitopt_files) > 0) and (path is None): path = fitopt_files[0] break @@ -216,13 +242,20 @@ def get_scales_from_fitopt_file(self): return {} self.logger.debug(f"Loading sys scaling from {self.sys_file_in}") yaml = read_yaml(self.sys_file_in) - if 'FLAG_USE_SAME_EVENTS' in yaml.keys(): - yaml.pop('FLAG_USE_SAME_EVENTS') - raw = {k: float(v.split(maxsplit=1)[0]) for _, d in yaml.items() for k, v in d.items()} + if "FLAG_USE_SAME_EVENTS" in yaml.keys(): + yaml.pop("FLAG_USE_SAME_EVENTS") + raw = { + k: float(v.split(maxsplit=1)[0]) + for _, d in yaml.items() + for k, v in d.items() + } return raw def get_sys_scale(self): - return {**self.get_scales_from_fitopt_file(), **self.options.get("FITOPT_SCALES", {})} + return { + **self.get_scales_from_fitopt_file(), + **self.options.get("FITOPT_SCALES", {}), + } def _run(self): sys_scale = self.get_sys_scale() @@ -248,22 +281,28 @@ def _run(self): f.write(yaml.safe_dump(self.input_covmat_yaml, width=2048)) cmd = ["submit_batch_jobs.sh", os.path.basename(self.input_file)] - self.logger.debug(f"Submitting CreateCov job: {' '.join(cmd)} in cwd: {self.output_dir}") + self.logger.debug( + f"Submitting CreateCov job: {' '.join(cmd)} in cwd: {self.output_dir}" + ) self.logger.debug(f"Logging to {self.logfile}") - with open(self.logfile, 'w') as f: - subprocess.run(' '.join(cmd), stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir, shell=True) + with open(self.logfile, "w") as f: + subprocess.run( + " ".join(cmd), + stdout=f, + stderr=subprocess.STDOUT, + 
cwd=self.output_dir, + shell=True, + ) chown_dir(self.output_dir) else: self.should_be_done() self.logger.info("Hash check passed, not rerunning") return True - @staticmethod def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_config): - biascor_tasks = Task.get_task_of_type(prior_tasks, BiasCor) - + def _get_createcov_dir(base_output_dir, stage_number, name): return f"{base_output_dir}/{stage_number}_CREATE_COV/{name}" @@ -277,9 +316,18 @@ def _get_createcov_dir(base_output_dir, stage_number, name): btasks = [btask for btask in biascor_tasks if mask in btask.name] if len(btasks) == 0: - Task.fail_config(f"Create cov task {cname} has no biascor tasks matching mask {mask}") - - t = CreateCov(cname, _get_createcov_dir(base_output_dir, stage_number, cname), config, options, global_config, dependencies=btasks) + Task.fail_config( + f"Create cov task {cname} has no biascor tasks matching mask {mask}" + ) + + t = CreateCov( + cname, + _get_createcov_dir(base_output_dir, stage_number, cname), + config, + options, + global_config, + dependencies=btasks, + ) tasks.append(t) return tasks diff --git a/pippin/dataprep.py b/pippin/dataprep.py index 7727c1fe..13f657b6 100644 --- a/pippin/dataprep.py +++ b/pippin/dataprep.py @@ -4,12 +4,21 @@ import os from collections import OrderedDict from pathlib import Path -from pippin.config import mkdirs, get_output_loc, get_config, get_data_loc, read_yaml, merge_dict +from pippin.config import ( + mkdirs, + get_output_loc, + get_config, + get_data_loc, + read_yaml, + merge_dict, +) from pippin.task import Task -class DataPrep(Task): # TODO: Define the location of the output so we can run the lc fitting on it. - """ Smack the data into something that looks like the simulated data +class DataPrep( + Task +): # TODO: Define the location of the output so we can run the lc fitting on it. 
+ """Smack the data into something that looks like the simulated data OUTPUTS: ======== @@ -26,7 +35,9 @@ class DataPrep(Task): # TODO: Define the location of the output so we can run t is_sim: bool - whether or not the input is a simulation """ - def __init__(self, name, output_dir, config, options, global_config, dependencies=None): + def __init__( + self, name, output_dir, config, options, global_config, dependencies=None + ): super().__init__(name, output_dir, config=config, dependencies=dependencies) self.options = options self.global_config = get_config() @@ -47,7 +58,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie if self.unparsed_raw == "$SCRATCH_SIMDIR" or "SNDATA_ROOT/SIM" in self.raw_dir: self.logger.debug("Removing PRIVATE_DATA_PATH from NML file") self.data_path = "" - self.job_name = os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name + self.job_name = ( + os.path.basename(Path(output_dir).parents[1]) + "_DATAPREP_" + self.name + ) self.output_info = os.path.join(self.output_dir, f"{self.genversion}.YAML") self.output["genversion"] = self.genversion @@ -66,7 +79,27 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.types_dict = options.get("TYPES") if self.types_dict is None: - self.types_dict = {"IA": [1], "NONIA": [2, 20, 21, 22, 29, 30, 31, 32, 33, 39, 40, 41, 42, 43, 80, 81]} + self.types_dict = { + "IA": [1], + "NONIA": [ + 2, + 20, + 21, + 22, + 29, + 30, + 31, + 32, + 33, + 39, + 40, + 41, + 42, + 43, + 80, + 81, + ], + } else: for key in self.types_dict.keys(): self.types_dict[key] = [int(c) for c in self.types_dict[key]] @@ -74,7 +107,9 @@ def __init__(self, name, output_dir, config, options, global_config, dependencie self.batch_file = self.options.get("BATCH_FILE") if self.batch_file is not None: self.batch_file = get_data_loc(self.batch_file) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) self.logger.debug(f"\tIA types are {self.types_dict['IA']}") self.logger.debug(f"\tNONIA types are {self.types_dict['NONIA']}") @@ -123,11 +158,15 @@ def _check_completion(self, squeue): self.logger.debug(f"Done file found at {self.done_file}") with open(self.done_file) as f: if "FAILURE" in f.read(): - self.logger.info(f"Done file reported failure. Check output log {self.logfile}") + self.logger.info( + f"Done file reported failure. 
Check output log {self.logfile}" + ) return Task.FINISHED_FAILURE else: if not os.path.exists(self.output_info): - self.logger.exception(f"Cannot find output info file {self.output_info}") + self.logger.exception( + f"Cannot find output info file {self.output_info}" + ) return Task.FINISHED_FAILURE else: content = read_yaml(self.output_info) @@ -138,43 +177,44 @@ def _check_completion(self, squeue): return self.check_for_job(squeue, self.job_name) def _run(self): - val_p = self.options.get("PHOTFLAG_DETECT") val_c = self.options.get("CUTWIN_SNR_NODETECT") photflag = f"PHOTFLAG_DETECT = {val_p}" if val_p else "" cutwin = f"CUTWIN_SNR_NODETECT = {val_c}" if val_c else "" command_string = self.clump_command.format( - genversion=self.genversion, data_path=self.data_path, opt_setpkmjd=self.opt_setpkmjd, photflag=photflag, cutwin_snr_nodetect=cutwin, photflag_mskrej=self.photflag_mskrej + genversion=self.genversion, + data_path=self.data_path, + opt_setpkmjd=self.opt_setpkmjd, + photflag=photflag, + cutwin_snr_nodetect=cutwin, + photflag_mskrej=self.photflag_mskrej, ) - + if self.batch_file is None: if self.gpu: self.sbatch_header = self.sbatch_gpu_header else: self.sbatch_header = self.sbatch_cpu_header else: - with open(self.batch_file, 'r') as f: + with open(self.batch_file, "r") as f: self.sbatch_header = f.read() self.sbatch_header = self.clean_header(self.sbatch_header) header_dict = { - "REPLACE_NAME": self.job_name, - "REPLACE_WALLTIME": "0:20:00", - "REPLACE_LOGFILE": self.logfile, - "REPLACE_MEM": "2GB", - "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"] - } + "REPLACE_NAME": self.job_name, + "REPLACE_WALLTIME": "0:20:00", + "REPLACE_LOGFILE": self.logfile, + "REPLACE_MEM": "2GB", + "APPEND": ["#SBATCH --ntasks=1", "#SBATCH --cpus-per-task=1"], + } header_dict = merge_dict(header_dict, self.batch_replace) self.update_header(header_dict) - setup_dict = { - "path_to_task": self.path_to_task, - "done_file": self.done_file - } + setup_dict = {"path_to_task": self.path_to_task, "done_file": self.done_file} format_dict = { - "sbatch_header": self.sbatch_header, - "task_setup": self.update_setup(setup_dict, self.task_setup['dataprep']) - } - #format_dict = {"job_name": self.job_name, "log_file": self.logfile, "path_to_task": self.path_to_task, "done_file": self.done_file} + "sbatch_header": self.sbatch_header, + "task_setup": self.update_setup(setup_dict, self.task_setup["dataprep"]), + } + # format_dict = {"job_name": self.job_name, "log_file": self.logfile, "path_to_task": self.path_to_task, "done_file": self.done_file} final_slurm = self.slurm.format(**format_dict) new_hash = self.get_hash_from_string(command_string + final_slurm) @@ -199,14 +239,20 @@ def _run(self): return True @staticmethod - def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config): + def get_tasks( + config, prior_tasks, base_output_dir, stage_number, prefix, global_config + ): tasks = [] for name in config.get("DATAPREP", []): output_dir = f"{base_output_dir}/{stage_number}_DATAPREP/{name}" options = config["DATAPREP"][name].get("OPTS") if options is None and config["DATAPREP"][name].get("EXTERNAL") is None: Task.fail_config(f"DATAPREP task {name} needs to specify OPTS!") - s = DataPrep(name, output_dir, config["DATAPREP"][name], options, global_config) - Task.logger.debug(f"Creating data prep task {name} with {s.num_jobs} jobs, output to {output_dir}") + s = DataPrep( + name, output_dir, config["DATAPREP"][name], options, global_config + ) + Task.logger.debug( + f"Creating 
data prep task {name} with {s.num_jobs} jobs, output to {output_dir}" + ) tasks.append(s) return tasks diff --git a/pippin/external/aggregator_plot.py b/pippin/external/aggregator_plot.py index 493e44d4..5b6625ca 100644 --- a/pippin/external/aggregator_plot.py +++ b/pippin/external/aggregator_plot.py @@ -7,7 +7,20 @@ import seaborn as sb from scipy.stats import binned_statistic -colours = ["#1976D2", "#8BC34A", "#E53935", "#673AB7", "#F2D026", "#9E9E9E", "#4FC3F7", "#E91E63", "#43A047", "#795548", "#333333", "#FB8C00"] * 2 +colours = [ + "#1976D2", + "#8BC34A", + "#E53935", + "#673AB7", + "#F2D026", + "#9E9E9E", + "#4FC3F7", + "#E91E63", + "#43A047", + "#795548", + "#333333", + "#FB8C00", +] * 2 def plot_corr(df, output_dir, index): @@ -27,12 +40,16 @@ def plot_prob_acc(df, output_dir, index): prob_bins = np.linspace(0, 1, 21) bin_center = 0.5 * (prob_bins[1:] + prob_bins[:-1]) - columns = [c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR")] + columns = [ + c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR") + ] fig, ax = plt.subplots(figsize=(8, 6)) for c, col in zip(columns, colours): data, truth = _get_data_and_truth(df[c], df["IA"]) - actual_prob, _, _ = binned_statistic(data, truth.astype(np.float), bins=prob_bins, statistic="mean") + actual_prob, _, _ = binned_statistic( + data, truth.astype(float), bins=prob_bins, statistic="mean" + ) ax.plot(bin_center, actual_prob, label=c, c=col) ax.plot(prob_bins, prob_bins, label="Expected", color="k", ls="--") ax.legend(loc=4, frameon=False, markerfirst=False) @@ -49,7 +66,12 @@ def _get_matrix(classified, truth): false_positive = classified & ~truth true_negative = ~classified & ~truth false_negative = ~classified & truth - return true_positives.sum(), false_positive.sum(), true_negative.sum(), false_negative.sum() + return ( + true_positives.sum(), + false_positive.sum(), + true_negative.sum(), + false_negative.sum(), + ) def _get_metrics(classified, truth): @@ -73,7 +95,9 @@ def plot_thresholds(df, output_dir, index): logging.debug("Making threshold plot") thresholds = np.linspace(0.5, 0.999, 100) - columns = [c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR")] + columns = [ + c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR") + ] fig, ax = plt.subplots(figsize=(7, 5)) ls = ["-", "--", ":", ":-", "-", "--", ":"] @@ -89,7 +113,9 @@ def plot_thresholds(df, output_dir, index): res[key] = [] res[key].append(metrics[key]) for key, l in zip(keys, ls): - ax.plot(thresholds, res[key], color=col, linestyle=l, label=f"{c[5:]} {key}") + ax.plot( + thresholds, res[key], color=col, linestyle=l, label=f"{c[5:]} {key}" + ) ax.set_xlabel("Classification probability threshold") ax.legend(loc=3, frameon=False, ncol=2) @@ -103,7 +129,9 @@ def plot_pr(df, output_dir, index): logging.debug("Making pr plot") thresholds = np.linspace(0.01, 1, 100) - columns = [c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR")] + columns = [ + c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR") + ] fig, ax = plt.subplots(figsize=(7, 5)) @@ -132,7 +160,9 @@ def plot_roc(df, output_dir, index): logging.debug("Making roc plot") thresholds = np.linspace(0.01, 0.999, 100) - columns = [c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR")] + columns = [ + c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR") + ] fig, ax = plt.subplots(figsize=(7, 5)) @@ -162,7 +192,9 @@ def plot_comparison(df, output_dir, 
index): df = df.sample(frac=frac) logging.debug("Making comparison plot") - columns = [c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR")] + columns = [ + c for c in df.columns if c.startswith("PROB_") and not c.endswith("_ERR") + ] n = len(columns) scale = 1.5 @@ -179,8 +211,21 @@ def plot_comparison(df, output_dir, index): ax.axis("off") continue elif i == j: - h, _, _ = ax.hist(df[label1], bins=bins, histtype="stepfilled", linewidth=2, alpha=0.3, color=colours[i]) - ax.hist(df[label1], bins=bins, histtype="step", linewidth=1.5, color=colours[i]) + h, _, _ = ax.hist( + df[label1], + bins=bins, + histtype="stepfilled", + linewidth=2, + alpha=0.3, + color=colours[i], + ) + ax.hist( + df[label1], + bins=bins, + histtype="step", + linewidth=1.5, + color=colours[i], + ) ax.set_yticklabels([]) ax.tick_params(axis="y", left=False) ax.set_xlim(*lim) @@ -189,7 +234,9 @@ def plot_comparison(df, output_dir, index): if j == 0: ax.spines["left"].set_visible(False) if j == n - 1: - ax.set_xlabel(label1.replace("PROB_", "").replace("_", "\n"), fontsize=10) + ax.set_xlabel( + label1.replace("PROB_", "").replace("_", "\n"), fontsize=10 + ) else: ax.set_xticklabels([]) else: @@ -204,9 +251,13 @@ def plot_comparison(df, output_dir, index): ax.set_yticklabels([]) ax.tick_params(axis="y", left=False) else: - ax.set_ylabel(label1.replace("PROB_", "").replace("_", "\n"), fontsize=10) + ax.set_ylabel( + label1.replace("PROB_", "").replace("_", "\n"), fontsize=10 + ) if i == n - 1: - ax.set_xlabel(label2.replace("PROB_", "").replace("_", "\n"), fontsize=10) + ax.set_xlabel( + label2.replace("PROB_", "").replace("_", "\n"), fontsize=10 + ) else: ax.set_xticklabels([]) plt.subplots_adjust(hspace=0.0, wspace=0) @@ -217,7 +268,6 @@ def plot_comparison(df, output_dir, index): def plot(df, output_dir, index): - plot_corr(df, output_dir, index) plot_prob_acc(df, output_dir, index) # plot_thresholds(df, output_dir, index) @@ -236,7 +286,11 @@ def plot(df, output_dir, index): fmt = "[%(levelname)8s |%(filename)21s:%(lineno)3d] %(message)s" logging_filename = os.path.join(args.output_dir, "output_plots.log") - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[logging.FileHandler(logging_filename), logging.StreamHandler()]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[logging.FileHandler(logging_filename), logging.StreamHandler()], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.info(f"Input csv is {args.mergedcsv}") diff --git a/pippin/external/compute_map.py b/pippin/external/compute_map.py index cdf01b47..e9d55ce7 100644 --- a/pippin/external/compute_map.py +++ b/pippin/external/compute_map.py @@ -48,7 +48,13 @@ def parse_data(filename): # mask_fitprob = all_data["FITPROB"] > 0.01 # mask_ratio = all_data["RATIO"] > 0 - mask = mask_errtest_high & mask_errtest_low & mask_sbfluxcal & mask_5yr & mask_nolowz_y1 # & mask_fitprob # & mask_fluxcalsim & mask_ratio + mask = ( + mask_errtest_high + & mask_errtest_low + & mask_sbfluxcal + & mask_5yr + & mask_nolowz_y1 + ) # & mask_fitprob # & mask_fluxcalsim & mask_ratio print_drop(mask_errtest_high, "errtest < 10") print_drop(mask_errtest_low, "errtest >= 0.1") print_drop(mask_sbfluxcal, "SBFLUXCAL > 0") @@ -125,7 +131,11 @@ def rejmean(x, debug=False): df_s = get_data("sim_obs.pkl", cut, cut_fitprob=cut_fitprob) data = [ - [("LOGSNR", np.arange(0, 3.1, 1)), ("SBMAG", np.arange(20, 30, 2)), ("PSF", np.array([1, 3, 5]))], + [ + ("LOGSNR", np.arange(0, 3.1, 1)), + ("SBMAG", np.arange(20, 30, 2)), + 
("PSF", np.array([1, 3, 5])), + ], [("LOGSNR", np.arange(0, 3.1, 1)), ("SBMAG", np.arange(20, 30, 1))], [("SBMAG", np.arange(20, 30, 1))], ] @@ -141,7 +151,9 @@ def rejmean(x, debug=False): shape = [] for k, bins in maps: if bins is None: - bins = np.linspace(df_f[k].quantile(0.001), df_f[k].quantile(0.999), 7) + bins = np.linspace( + df_f[k].quantile(0.001), df_f[k].quantile(0.999), 7 + ) bcs.append(0.5 * (bins[:-1] + bins[1:])) shape.append(bcs[-1].size) indices_f.append(np.digitize(df_f[k], bins=bins) - 1) @@ -149,14 +161,22 @@ def rejmean(x, debug=False): data = {} for field_name, color in zip(field_names, ("viridis", "magma")): - fields_list = ["E1", "E2", "S1", "S2", "C1", "C2", "X1", "X2"] if field_name == "SHALLOW" else ["C3", "X3"] + fields_list = ( + ["E1", "E2", "S1", "S2", "C1", "C2", "X1", "X2"] + if field_name == "SHALLOW" + else ["C3", "X3"] + ) for band_index, band in enumerate(bands): print(f"Doing field {field_name} and band {band}") # Select down to field and band - mask_f = (df_f["IFILTOBS"] == band_index + 2) & (np.isin(df_f["FIELD"], fields_list)) - mask_s = (df_s["IFILTOBS"] == band_index + 2) & (np.isin(df_s["FIELD"], fields_list)) + mask_f = (df_f["IFILTOBS"] == band_index + 2) & ( + np.isin(df_f["FIELD"], fields_list) + ) + mask_s = (df_s["IFILTOBS"] == band_index + 2) & ( + np.isin(df_s["FIELD"], fields_list) + ) df_f2 = df_f[mask_f] df_s2 = df_s[mask_s] indices_f2 = [i[mask_f] for i in indices_f] @@ -203,7 +223,11 @@ def rejmean(x, debug=False): x2 = xx[non_nan] ind2 = ind[:, non_nan] - x = griddata(ind2.T, x2, ind.T, method="nearest").T.flatten().reshape((x.shape)) + x = ( + griddata(ind2.T, x2, ind.T, method="nearest") + .T.flatten() + .reshape((x.shape)) + ) # Save to output data[n + field_name + band] = x @@ -211,7 +235,9 @@ def rejmean(x, debug=False): # Create output files for j, n in enumerate(["SIM", "FAKES"]): output_string = [] - output_string.append("DEFINE_FIELDGROUP: SHALLOW E1+E2+S1+S2+C1+C2+X1+X2") + output_string.append( + "DEFINE_FIELDGROUP: SHALLOW E1+E2+S1+S2+C1+C2+X1+X2" + ) output_string.append("DEFINE_FIELDGROUP: DEEP C3+X3\n") names = [m[0] for m in maps] for field in field_names: @@ -229,5 +255,7 @@ def rejmean(x, debug=False): bc.append(f"{value:0.3f}") output_string.append("ROW: " + " ".join(bc)) output_string.append("ENDMAP:\n") - with open(f"maps/DES5YR_{n}_ERRORFUDGES_DIFFIMG_{'_'.join(names)}.DAT", "w") as ff: + with open( + f"maps/DES5YR_{n}_ERRORFUDGES_DIFFIMG_{'_'.join(names)}.DAT", "w" + ) as ff: ff.write("\n".join(output_string)) diff --git a/pippin/external/create_covariance_staticbins.py b/pippin/external/create_covariance_staticbins.py index c54471b6..545266cb 100644 --- a/pippin/external/create_covariance_staticbins.py +++ b/pippin/external/create_covariance_staticbins.py @@ -96,7 +96,14 @@ def dataset(output_dir, base_output, strex1, strex2, sys=1): if sys == 1: g.write("has_mag_covmat = T\n") - g.write("mag_covmat_file = " + output_dir + "/sys_" + base_output + strex1 + ".txt\n") + g.write( + "mag_covmat_file = " + + output_dir + + "/sys_" + + base_output + + strex1 + + ".txt\n" + ) # g.write('mag_covmat_file = '+os.getcwd()+'/'+output_dir+'/sys_'+base_output+strex1+'.txt\n') if sys != 1: g.write("has_mag_covmat = F\n") @@ -115,15 +122,32 @@ def fullcosmo(base_output, file1, lc1, mat1, output_dir="COSMO"): headn = linef(file1, "zCMB") data1 = np.genfromtxt(file1, skip_header=headn, names=True, comments="#") - cid = np.genfromtxt(file1, skip_header=headn, usecols=(1), comments="#", dtype="str")[1:] + cid = np.genfromtxt( + 
file1, skip_header=headn, usecols=(1), comments="#", dtype="str" + )[1:] z1 = data1["zHD"].astype(float) mu = data1["MU"].astype(float) muerr = data1["MUERR"].astype(float) - f1 = open(output_dir + "/lcparam_" + base_output + ".txt", "w") # this is the file for cosmomc - f1.write("#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec \n") # standard format + f1 = open( + output_dir + "/lcparam_" + base_output + ".txt", "w" + ) # this is the file for cosmomc + f1.write( + "#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec \n" + ) # standard format for x in range(0, len(z1)): - f1.write(cid[x] + " " + str(z1[x]) + " " + str(z1[x]) + " 0.0 " + str(mu[x] - 19.35) + " " + str(muerr[x]) + " 0 0 0 0 0 0 0 0 0 0 0\n") + f1.write( + cid[x] + + " " + + str(z1[x]) + + " " + + str(z1[x]) + + " 0.0 " + + str(mu[x] - 19.35) + + " " + + str(muerr[x]) + + " 0 0 0 0 0 0 0 0 0 0 0\n" + ) f1.close() g = open(output_dir + "/" + base_output + ".dataset", "w") h = open(output_dir + "/" + base_output + "_nosys.dataset", "w") @@ -189,7 +213,12 @@ def avgmat(base_output, mat1, mat2, lc1, lc2, output_dir="COSMO"): cosmo2 = FlatLambdaCDM(H0=70, Om0=0.3) - list1, z1, mb1, mb1e = np.loadtxt(output_dir + "/lcparam_" + lc1 + ".txt", usecols=(0, 1, 4, 5), unpack=True, dtype="string") + list1, z1, mb1, mb1e = np.loadtxt( + output_dir + "/lcparam_" + lc1 + ".txt", + usecols=(0, 1, 4, 5), + unpack=True, + dtype="string", + ) z1 = z1.astype(float) mb1 = mb1.astype(float) mb1e = mb1e.astype(float) @@ -198,7 +227,12 @@ def avgmat(base_output, mat1, mat2, lc1, lc2, output_dir="COSMO"): mu_syn1 = 5.0 * (np.log10(x)) + 25.0 - 19.35 mu1 = mb1 - mu_syn1 - list2, z2, mb2, mb2e = np.loadtxt(output_dir + "/lcparam_" + lc2 + ".txt", usecols=(0, 1, 4, 5), unpack=True, dtype="string") + list2, z2, mb2, mb2e = np.loadtxt( + output_dir + "/lcparam_" + lc2 + ".txt", + usecols=(0, 1, 4, 5), + unpack=True, + dtype="string", + ) z2 = z1 # using z1 so z lines up mb2 = mb2.astype(float) @@ -216,10 +250,25 @@ def avgmat(base_output, mat1, mat2, lc1, lc2, output_dir="COSMO"): # stop # print z1 # stop - f1 = open(output_dir + "/lcparam_" + base_output + ".txt", "w") # this is the file for cosmomc - f1.write("#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n") # standard format + f1 = open( + output_dir + "/lcparam_" + base_output + ".txt", "w" + ) # this is the file for cosmomc + f1.write( + "#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n" + ) # standard format for x in range(0, len(z1)): - f1.write(str(list1[x]) + " " + str(z1[x]) + " " + str(z1[x]) + " 0.0 " + str(mua[x]) + " " + str(muae[x]) + " 0 0 0 0 0 0 0 0 0 0 0 0\n") + f1.write( + str(list1[x]) + + " " + + str(z1[x]) + + " " + + str(z1[x]) + + " 0.0 " + + str(mua[x]) + + " " + + str(muae[x]) + + " 0 0 0 0 0 0 0 0 0 0 0 0\n" + ) f1.close() print(output_dir + "/sys_" + mat1 + ".txt") print(output_dir + "/sys_" + mat2 + ".txt") @@ -255,7 +304,12 @@ def avgmat_Ngrid(base_output, mats, lcs, output_dir="COSMO"): lists, zs, mbs, mbes, xs, mu_syns, mus = [], [], [], [], [], [], [] for mat, lc in zip(mats, lcs): - list1, z1, mb1, mb1e = np.loadtxt(output_dir + "/lcparam_" + lc + ".txt", usecols=(0, 1, 4, 5), unpack=True, dtype="string") + list1, z1, mb1, mb1e = np.loadtxt( + output_dir + "/lcparam_" + lc + ".txt", + usecols=(0, 1, 4, 5), + unpack=True, + dtype="string", + ) z1 = z1.astype(float) 
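# [Editor's aside -- illustrative sketch, not part of the patch] The recurring
# expression `5.0 * (np.log10(x)) + 25.0 - 19.35` in these lcparam writers is the
# distance modulus mu = 5 log10(d_L / 10 pc) = 5 log10(d_L [Mpc]) + 25, shifted by
# the script's assumed SN Ia absolute magnitude M_B = -19.35 so it compares
# directly with the apparent-magnitude column written above. A minimal
# stand-alone version, assuming the same fiducial cosmology used in avgmat:
#
#     import numpy as np
#     from astropy.cosmology import FlatLambdaCDM
#
#     cosmo = FlatLambdaCDM(H0=70, Om0=0.3)
#     d_l = cosmo.luminosity_distance(z1).value      # luminosity distance in Mpc
#     mu_syn = 5.0 * np.log10(d_l) + 25.0 - 19.35    # reference "magnitude"
#     hubble_residual = mb1 - mu_syn                 # what avgmat stores as mu1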
mb1 = mb1.astype(float) mb1e = mb1e.astype(float) @@ -297,17 +351,34 @@ def avgmat_Ngrid(base_output, mats, lcs, output_dir="COSMO"): # stop # print z1 # stop - f1 = open(output_dir + "/lcparam_" + base_output + ".txt", "w") # this is the file for cosmomc - f1.write("#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n") # standard format + f1 = open( + output_dir + "/lcparam_" + base_output + ".txt", "w" + ) # this is the file for cosmomc + f1.write( + "#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n" + ) # standard format for x in range(0, len(zs[0])): - f1.write(str(lists[0][x]) + " " + str(zs[0][x]) + " " + str(zs[0][x]) + " 0.0 " + str(mua[x]) + " " + str(muae[x]) + " 0 0 0 0 0 0 0 0 0 0 0 0\n") + f1.write( + str(lists[0][x]) + + " " + + str(zs[0][x]) + + " " + + str(zs[0][x]) + + " 0.0 " + + str(mua[x]) + + " " + + str(muae[x]) + + " 0 0 0 0 0 0 0 0 0 0 0 0\n" + ) f1.close() # print output_dir+'/sys_'+mat1+'.txt' # print output_dir+'/sys_'+mat2+'.txt' syss = [] for mat, lc in zip(mats, lcs): - sys1 = np.loadtxt(output_dir + "/sys_" + mat + ".txt", unpack=True, dtype="string") + sys1 = np.loadtxt( + output_dir + "/sys_" + mat + ".txt", unpack=True, dtype="string" + ) sys1 = sys1.astype(float) syss.append(sys1) # print syss[0].shape @@ -373,7 +444,13 @@ def sysmat( look_dir = os.path.join(topdir, subdir) - file_lines = sorted([os.path.join(look_dir, x) for x in os.listdir(look_dir) if x.endswith(".M0DIF")]) + file_lines = sorted( + [ + os.path.join(look_dir, x) + for x in os.listdir(look_dir) + if x.endswith(".M0DIF") + ] + ) if not file_lines: print("No M0DIF files!!! This makes me sad!!! Im done here!!") @@ -381,11 +458,15 @@ def sysmat( if not os.path.exists(topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG"): print(topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG") - print("Log file not there. No M0DIF files!!! This makes me sad!!! Im done here!!") + print( + "Log file not there. No M0DIF files!!! This makes me sad!!! I'm done here!!" + ) return 0 if os.path.isfile(topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG"): - log_lines = open(topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG", "r").readlines() + log_lines = open( + topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG", "r" + ).readlines() print(topdir + "/SALT2mu_FITSCRIPTS/FITJOBS_SUMMARY.LOG") filesize = len(file_lines) # read in number of M0DIF files @@ -430,7 +511,9 @@ def sysmat( FITOPT_var1 = np.append(FITOPT_var1, "FITOPT" + mu_split[1]) FITOPT_var2 = np.append(FITOPT_var2, mu_split[2][1:-1]) - if (os.path.isfile(sysfile) & (sysfile != "NONE") & (errscales == "NONE")) | ((sysfile == "NONE") & (errscales != "NONE")): + if (os.path.isfile(sysfile) & (sysfile != "NONE") & (errscales == "NONE")) | ( + (sysfile == "NONE") & (errscales != "NONE") + ): if os.path.isfile(sysfile) & (sysfile != "NONE") & (errscales == "NONE"): if (os.path.isfile(sysfile) == False) & (sysfile != "NONE"): print("That " + sysfile + " doesnt exist. Grrrr. Have to leave") @@ -448,27 +531,37 @@ def sysmat( SYSOPT_var3 = np.append(SYSOPT_var3, mu_split[3]) if (sysfile == "NONE") & (errscales == "NONE"): - print( + "WARNING: All systematics have default scaling with no cuts. This is really dangerous!" 
+ ) SYSOPT_var1 = [] if (sysfile != "NONE") & (errscales != "NONE"): - print( + "You have a list of systematics in your inFile and in your included file. That is one too many lists. We have to stop" + ) topfile = os.path.join(look_dir, topfile) print(f"TOPFILE IS {topfile}") skipc = linef(topfile, "VARNAMES") if topfile != "": - z1, mu1, mu1e = np.loadtxt(topfile, usecols=(4, 5, 6), unpack=True, dtype="str", skiprows=skipc + 1) + z1, mu1, mu1e = np.loadtxt( + topfile, usecols=(4, 5, 6), unpack=True, dtype="str", skiprows=skipc + 1 + ) if topfile == "": - z1, mu1, mu1e = np.loadtxt(topfile, usecols=(4, 5, 6), unpack=True, dtype="str", skiprows=skipc + 1) + z1, mu1, mu1e = np.loadtxt( + topfile, usecols=(4, 5, 6), unpack=True, dtype="str", skiprows=skipc + 1 + ) print("topfile", topfile) mu1 = mu1.astype(float) mu1e = mu1e.astype(float) z1 = z1.astype(float) # xxa=[mu1e<90] - xxa = [mu1e < np.inf] # CHANGED BY DILLON HERE to get covmats all the same size for multiple sims + xxa = [ + mu1e < np.inf + ] # CHANGED BY DILLON HERE to get covmats all the same size for multiple sims z1 = z1[xxa] mu1 = mu1[xxa] mu1e = mu1e[xxa] @@ -477,10 +570,25 @@ def sysmat( mu_syn = 5.0 * (np.log10(x)) + 25.0 - 19.35 mu_syn1 = mu_syn + mu1 - f1 = open(output_dir + "/lcparam_" + base_output + ".txt", "w") # this is the file for cosmomc - f1.write("#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n") # standard format + f1 = open( + output_dir + "/lcparam_" + base_output + ".txt", "w" + ) # this is the file for cosmomc + f1.write( + "#name zcmb zhel dz mb dmb x1 dx1 color dcolor 3rdvar d3rdvar cov_m_s cov_m_c cov_s_c set ra dec biascor \n" + ) # standard format for x in range(0, len(z1)): - f1.write(str(x) + " " + str(z1[x]) + " " + str(z1[x]) + " 0.0 " + str(mu_syn1[x]) + " " + str(mu1e[x]) + " 0 0 0 0 0 0 0 0 0 0 0 0\n") + f1.write( + str(x) + + " " + + str(z1[x]) + + " " + + str(z1[x]) + + " 0.0 " + + str(mu_syn1[x]) + + " " + + str(mu1e[x]) + + " 0 0 0 0 0 0 0 0 0 0 0 0\n" + ) f1.close() bigmatmm = np.zeros((len(z1), len(z1), sysnum + 1)) + 0.000000 @@ -493,7 +601,13 @@ def sysmat( xx1 = FITOPT_var1 == file_lines[xco].split("_")[-2] xx2 = MUOPT_var1 == file_lines[xco].split("_")[-1][:-6] skipc = linef(file_lines[xco], "VARNAMES") - z2, mu2, mu2e = np.loadtxt(file_lines[xco], usecols=(4, 5, 6), unpack=True, dtype="str", skiprows=skipc + 1) + z2, mu2, mu2e = np.loadtxt( + file_lines[xco], + usecols=(4, 5, 6), + unpack=True, + dtype="str", + skiprows=skipc + 1, + ) print(file_lines[xco]) mu2 = mu2.astype(float) mu2e = mu2e.astype(float) @@ -526,11 +640,26 @@ def sysmat( sys_ratio = float(SYSOPT_var3[y1]) # print sys_ratio # stop - print("Have a systematic from " + str(SYSOPT_var1[y1]) + str(SYSOPT_var2[y1]) + " of " + str(SYSOPT_var3[y1])) - logf.write("Have a systematic from " + str(SYSOPT_var1[y1]) + str(SYSOPT_var2[y1]) + " of " + str(SYSOPT_var3[y1]) + "\n") + print( + "Have a systematic from " + + str(SYSOPT_var1[y1]) + + str(SYSOPT_var2[y1]) + + " of " + + str(SYSOPT_var3[y1]) + ) + logf.write( + "Have a systematic from " + + str(SYSOPT_var1[y1]) + + str(SYSOPT_var2[y1]) + + " of " + + str(SYSOPT_var3[y1]) + + "\n" + ) # stop if comatch > 0: - print("WARNING you have had multiple systematics match up!!! That is bad") + print( + "WARNING you have had multiple systematics match up!!! 
That is bad" + ) comatch = comatch + 1 # if ((np.amax(np.absolute(z1-z2)/z1)>0.1)&(sys_ratio>0)): @@ -583,11 +712,24 @@ def sysmat( if syscheck1[0] == "-": print(sys_flag1) print(sys_flag2) - print(FITOPT_var2[xx1][0], MUOPT_var2[xx2][0], (sys_flag1) & (sys_flag2)) + print( + FITOPT_var2[xx1][0], MUOPT_var2[xx2][0], (sys_flag1) & (sys_flag2) + ) # stop if (sys_flag1) & (sys_flag2): logf.write( - FITOPT_var2[xx1][0] + " " + MUOPT_var2[xx2][0] + " " + syscheck1[0:] + " " + syscheck2[0:] + " " + str(x) + " " + str(sys_ratio) + " \n" + FITOPT_var2[xx1][0] + + " " + + MUOPT_var2[xx2][0] + + " " + + syscheck1[0:] + + " " + + syscheck2[0:] + + " " + + str(x) + + " " + + str(sys_ratio) + + " \n" ) bigmatmm[:, :, x] = np.add(bigmatmm[:, :, x], np.multiply(dmm, 1.0)) @@ -658,12 +800,22 @@ def makeini(outputdir, baseoutput, basedir, datasetnum=0): # dvin_nosn_ocmb_omol.ini print("we are making ini files!") svec = ["omw", "wwa", "omol"] - gvec = ["sn_", "sn_bao_", "sn_cmb_", "sn_cmb_bao_", "sn_prior_", "cmb_", "nohubble_sn_"] + gvec = [ + "sn_", + "sn_bao_", + "sn_cmb_", + "sn_cmb_bao_", + "sn_prior_", + "cmb_", + "nohubble_sn_", + ] for ss in svec: for gg in gvec: if os.path.isfile(basedir + "/" + gg + ss + ".ini"): g = open(basedir + "/" + gg + ss + ".ini", "r") - h = open(outputdir + "/" + gg + ss + "_" + str(int(datasetnum)) + ".ini", "w") + h = open( + outputdir + "/" + gg + ss + "_" + str(int(datasetnum)) + ".ini", "w" + ) with open(basedir + "/" + gg + ss + ".ini", "r") as f: content = f.readlines() for x in content: @@ -681,7 +833,6 @@ def write_done(filename, success=True): if __name__ == "__main__": - # parse input argument(s) if len(sys.argv) < 3: raise ValueError("Must give INFILE argument\n-->ABORT") @@ -741,14 +892,26 @@ def write_done(filename, success=True): ) print(FileInfo.OUTPUTDIR) # DILLON: I'm editing here for giving full outputdir path not relative to cwd - with open("/".join(FileInfo.OUTPUTDIR.split("/")[:-1]) + "/covopt.dict", "w") as f: + with open( + "/".join(FileInfo.OUTPUTDIR.split("/")[:-1]) + "/covopt.dict", "w" + ) as f: for d in range(len(FileInfo.COVOPT) + 1): - makeini(FileInfo.OUTPUTDIR, FileInfo.BASEOUTPUT, FileInfo.COSMOMC_TEMPLATES, datasetnum=d) + makeini( + FileInfo.OUTPUTDIR, + FileInfo.BASEOUTPUT, + FileInfo.COSMOMC_TEMPLATES, + datasetnum=d, + ) if d == 0: covwrite = "ALLSYS" else: if FileInfo.COVOPT[d - 1]: - covwrite = FileInfo.COVOPT[d - 1][0].replace("[", "").replace("'", "").replace("]", "") + covwrite = ( + FileInfo.COVOPT[d - 1][0] + .replace("[", "") + .replace("'", "") + .replace("]", "") + ) f.write("%d\t%s\n" % (d, covwrite)) print("Copying base.ini file over") diff --git a/pippin/external/parse_biascor.py b/pippin/external/parse_biascor.py index 1d40a42e..de0920e1 100644 --- a/pippin/external/parse_biascor.py +++ b/pippin/external/parse_biascor.py @@ -15,7 +15,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_biascor.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_biascor.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("chainconsumer").setLevel(logging.WARNING) @@ -55,7 +59,9 @@ def make_summary_file(wfit_files, args): df_all = None for f in wfit_files: logging.debug(f"Reading in wfit_summary {f}") - df = pd.read_csv(f, delim_whitespace=True, 
comment="#").drop(columns=["VARNAMES:", "ROW"]) + df = pd.read_csv(f, delim_whitespace=True, comment="#").drop( + columns=["VARNAMES:", "ROW"] + ) name = os.path.basename(os.path.dirname(os.path.dirname(f))) df["name"] = name logging.debug(f"Read {f}, contents are: {df}") diff --git a/pippin/external/parse_cosmomc.py b/pippin/external/parse_cosmomc.py index 1447860d..3128d931 100644 --- a/pippin/external/parse_cosmomc.py +++ b/pippin/external/parse_cosmomc.py @@ -21,7 +21,9 @@ def load_params(file): def load_chains(files, all_cols, use_cols=None): header = ["weights", "likelihood"] + all_cols - data = [pd.read_csv(f, delim_whitespace=True, header=None, names=header) for f in files] + data = [ + pd.read_csv(f, delim_whitespace=True, header=None, names=header) for f in files + ] # Remove burn in by cutting off first 30% data = [d.iloc[int(d.shape[0] * 0.3) :, :] for d in data] @@ -45,8 +47,15 @@ def get_chain_files(basename): folder = os.path.dirname(basename) logging.info(f"Looking for chains in folder {folder}") base = os.path.basename(basename) - files = [os.path.join(folder, f) for f in sorted(os.listdir(folder)) if base in f and f.endswith(".txt")] - fail(f"No chain files found for {os.path.join(folder, basename)}", condition=len(files) == 0) + files = [ + os.path.join(folder, f) + for f in sorted(os.listdir(folder)) + if base in f and f.endswith(".txt") + ] + fail( + f"No chain files found for {os.path.join(folder, basename)}", + condition=len(files) == 0, + ) logging.info(f"{len(files)} chains found for basename {basename}") return files @@ -54,7 +63,11 @@ def get_chain_files(basename): def setup_logging(): fmt = "[%(levelname)8s |%(filename)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("parse_cosmomc.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("parse_cosmomc.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) @@ -65,7 +78,9 @@ def blind(chain, names, columns_to_blind, index=0): try: ii = names.index(c) scale = np.random.normal(loc=1, scale=0.0, size=1000)[321 + i] - offset = np.random.normal(loc=0, scale=10.0, size=1000)[343 + i + (index + 10)] + offset = np.random.normal(loc=0, scale=10.0, size=1000)[ + 343 + i + (index + 10) + ] chain[:, ii] = chain[:, ii] * scale + np.std(chain[:, ii]) * offset except ValueError as e: logging.warning(f"Cannot find blinding column {c} in list of names {names}") @@ -83,7 +98,8 @@ def get_arguments(): if config.get("NAMES") is not None: assert len(config["NAMES"]) == len(config["INPUT_FILES"]), ( - "You should specify one name per base file you pass in." + f" Have {len(config['FILES'])} base names and {len(config['NAMES'])} names" + "You should specify one name per base file you pass in." 
+ + f" Have {len(config['INPUT_FILES'])} base names and {len(config['NAMES'])} names" ) return config @@ -99,14 +115,23 @@ def parse_chains(basename, outname, args, index): if blind_params: blind(chain, params or names, blind_params, index=index) labels = [ - f"${l}" + (r"\ \mathrm{Blinded}" if blind_params is not None and u in blind_params else "") + "$" + f"${l}" + + ( + r"\ \mathrm{Blinded}" + if blind_params is not None and u in blind_params + else "" + ) + + "$" for u in params for l, n in zip(labels, names) if n == u ] # Turn into new df - output_df = pd.DataFrame(np.vstack((weights, likelihood, chain.T)).T, columns=["_weight", "_likelihood"] + labels) + output_df = pd.DataFrame( + np.vstack((weights, likelihood, chain.T)).T, + columns=["_weight", "_likelihood"] + labels, + ) output_df.to_csv(outname, float_format="%0.5f", index=False) logging.info(f"Chain for {basename} has shape {chain.shape}") @@ -123,10 +148,16 @@ def parse_chains(basename, outname, args, index): biases = {} b = 1 - truth = {"$\\Omega_m$": 0.3, "$w\\ \\mathrm{Blinded}$": -1.0, "$\\Omega_\\Lambda$": 0.7} + truth = { + "$\\Omega_m$": 0.3, + "$w\\ \\mathrm{Blinded}$": -1.0, + "$\\Omega_\\Lambda$": 0.7, + } shift_params = truth if args.get("SHIFT") else None - for index, (basename, outname) in enumerate(zip(args.get("INPUT_FILES"), args.get("PARSED_FILES"))): + for index, (basename, outname) in enumerate( + zip(args.get("INPUT_FILES"), args.get("PARSED_FILES")) + ): if args.get("NAMES"): name = args.get("NAMES")[index].replace("_", " ") else: diff --git a/pippin/external/parse_lcfit.py b/pippin/external/parse_lcfit.py index 0e8e82de..c97ce76b 100644 --- a/pippin/external/parse_lcfit.py +++ b/pippin/external/parse_lcfit.py @@ -12,7 +12,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("parse_lcfit.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("parse_lcfit.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("chainconsumer").setLevel(logging.WARNING) @@ -34,7 +38,9 @@ def add_muref(df, filename, alpha=0.14, beta=3.1, om=0.311, h0=70, MB=-19.361): cols = ["zHD", "x1", "mB", "c"] for c in cols: if c not in df.columns: - logging.exception(f"Filename {filename} has no column {c}, has {df.columns}") + logging.exception( + f"Filename {filename} has no column {c}, has {df.columns}" + ) cosmo = FlatLambdaCDM(h0, om) cosmo_dist_mod = cosmo.distmod(df["zHD"]).value obs_dist_mod = df["mB"] + alpha * df["x1"] - beta * df["c"] - MB @@ -43,7 +49,6 @@ def add_muref(df, filename, alpha=0.14, beta=3.1, om=0.311, h0=70, MB=-19.361): def load_file(infile, outfile): - logging.info(f"Attempting to load in original file {infile}") df = pd.read_csv(infile, delim_whitespace=True, comment="#") @@ -66,8 +71,18 @@ def load_file(infile, outfile): logging.warning("Warning, no Ia types specified, assuming 1 and 101.") args["IA_TYPES"] = [1, 101] - data_dfs = [load_file(f, fo) for f, fo in zip(args.get("DATA_FITRES_INPUT", []), args.get("DATA_FITRES_PARSED", []))] - sim_dfs = [load_file(f, fo) for f, fo in zip(args.get("SIM_FITRES_INPUT", []), args.get("SIM_FITRES_PARSED", []))] + data_dfs = [ + load_file(f, fo) + for f, fo in zip( + args.get("DATA_FITRES_INPUT", []), args.get("DATA_FITRES_PARSED", []) + ) + ] + sim_dfs = [ + load_file(f, fo) + for f, fo in zip( + 
args.get("SIM_FITRES_INPUT", []), args.get("SIM_FITRES_PARSED", []) + ) + ] logging.info(f"Finishing gracefully") diff --git a/pippin/external/plot_biascor.py b/pippin/external/plot_biascor.py index a502d8ec..e832592e 100644 --- a/pippin/external/plot_biascor.py +++ b/pippin/external/plot_biascor.py @@ -14,7 +14,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_biascor.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_biascor.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("chainconsumer").setLevel(logging.WARNING) @@ -47,7 +51,7 @@ def plot_single_file(source_file, df): labels = [r"$\Omega_m$", "$w$", r"$\sigma_{int}$"] for index, row in df.iterrows(): means = [row["omm"], row["w"], row["sigint"]] - cov = np.diag([row["omm_sig"] ** 2, row["w_sig"] ** 2, 0.01 ** 2]) + cov = np.diag([row["omm_sig"] ** 2, row["w_sig"] ** 2, 0.01**2]) c.add_covariance(means, cov, parameters=labels, name=f"Realisation {index}") c.plotter.plot_summary(errorbar=True, filename=output_file) del c @@ -64,10 +68,14 @@ def plot_all_files(df_all): means = [df["omm"].mean(), df["w"].mean(), df["sigint"].mean()] if df.shape[0] < 2: name2 = name + " (showing mean error)" - cov = np.diag([df["omm_sig"].mean() ** 2, df["w_sig"].mean() ** 2, 0.01 ** 2]) + cov = np.diag( + [df["omm_sig"].mean() ** 2, df["w_sig"].mean() ** 2, 0.01**2] + ) else: name2 = name + " (showing scatter error)" - cov = np.diag([df["omm"].std() ** 2, df["w"].std() ** 2, df["sigint"].std() ** 2]) + cov = np.diag( + [df["omm"].std() ** 2, df["w"].std() ** 2, df["sigint"].std() ** 2] + ) c.add_covariance(means, cov, parameters=labels, name=name2.replace("_", "\\_")) data.append([name, df["w"].mean(), df["w"].std(), df["w_sig"].mean()]) wdf = pd.DataFrame(data, columns=["name", "mean_w", "scatter_mean_w", "mean_std_w"]) @@ -112,14 +120,31 @@ def plot_scatter_comp(df_all): ax.axis("off") continue elif i == j: - h, _, _ = ax.hist(ws[i, :], bins=bins, histtype="stepfilled", linewidth=2, alpha=0.3, color=cols[i]) - ax.hist(ws[i, :], bins=bins, histtype="step", linewidth=1.5, color=cols[i]) + h, _, _ = ax.hist( + ws[i, :], + bins=bins, + histtype="stepfilled", + linewidth=2, + alpha=0.3, + color=cols[i], + ) + ax.hist( + ws[i, :], + bins=bins, + histtype="step", + linewidth=1.5, + color=cols[i], + ) ax.set_yticklabels([]) ax.tick_params(axis="y", left=False) ax.set_xlim(*lim) if bins[0] < -1 < bins[-1]: - yval = interp1d(0.5 * (bins[:-1] + bins[1:]), h, kind="nearest")([-1.0])[0] - ax.plot([-1.0, -1.0], [0, yval], color="k", lw=1, ls="--", alpha=0.4) + yval = interp1d( + 0.5 * (bins[:-1] + bins[1:]), h, kind="nearest" + )([-1.0])[0] + ax.plot( + [-1.0, -1.0], [0, yval], color="k", lw=1, ls="--", alpha=0.4 + ) ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) if j == 0: @@ -130,10 +155,14 @@ def plot_scatter_comp(df_all): a1 = ws[j, :] a2 = ws[i, :] c = np.abs(a1 - a2) - ax.scatter(a1, a2, s=2, c=c, cmap="viridis_r", vmin=-0.02, vmax=0.05) + ax.scatter( + a1, a2, s=2, c=c, cmap="viridis_r", vmin=-0.02, vmax=0.05 + ) ax.set_xlim(*lim) ax.set_ylim(*lim) - ax.plot([min_w, max_w], [min_w, max_w], c="k", lw=1, alpha=0.8, ls=":") + ax.plot( + [min_w, max_w], [min_w, max_w], c="k", lw=1, alpha=0.8, ls=":" + ) ax.axvline(-1.0, color="k", lw=1, ls="--", alpha=0.4) 
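# [Editor's aside -- illustrative note, not part of the patch] The dashed guides
# drawn by the axvline/axhline pair here mark the cosmological-constant reference
# w = -1 on both axes of each off-diagonal panel, complementing the dotted
# identity line plotted just above; unbiased realisations should therefore
# scatter around the point (-1, -1). A hypothetical named constant (W_REF is not
# in the original) would make that intent explicit:
#
#     W_REF = -1.0  # LambdaCDM equation-of-state reference
#     ax.axvline(W_REF, color="k", lw=1, ls="--", alpha=0.4)
#     ax.axhline(W_REF, color="k", lw=1, ls="--", alpha=0.4)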
ax.axhline(-1.0, color="k", lw=1, ls="--", alpha=0.4) @@ -151,7 +180,9 @@ def plot_scatter_comp(df_all): def make_hubble_plot(fitres_file, m0diff_file, prob_col_name, args): - logging.info(f"Making Hubble plot from FITRES file {fitres_file} and M0DIF file {m0diff_file}") + logging.info( + f"Making Hubble plot from FITRES file {fitres_file} and M0DIF file {m0diff_file}" + ) # Note that the fitres file has mu and fit 0, m0diff will have to select down to it name, sim_num, *_ = fitres_file.split("__") @@ -159,7 +190,12 @@ def make_hubble_plot(fitres_file, m0diff_file, prob_col_name, args): df = pd.read_csv(fitres_file, delim_whitespace=True, comment="#") dfm = pd.read_csv(m0diff_file) - dfm = dfm[(dfm.name == name) & (dfm.sim_num == sim_num) & (dfm.muopt_num == 0) & (dfm.fitopt_num == 0)] + dfm = dfm[ + (dfm.name == name) + & (dfm.sim_num == sim_num) + & (dfm.muopt_num == 0) + & (dfm.fitopt_num == 0) + ] from astropy.cosmology import FlatwCDM import numpy as np @@ -208,13 +244,29 @@ def make_hubble_plot(fitres_file, m0diff_file, prob_col_name, args): n = v * num_sn_fit contam_data = f"$R_{{CC, data}} = {v:0.4f} (\\approx {int(n)} SN)$" if "scalePCC" in line and "+-" in line: - scalepcc = "scalePCC = $" + line.split("=")[-1].strip().replace("+-", r"\pm") + "$" + scalepcc = ( + "scalePCC = $" + + line.split("=")[-1].strip().replace("+-", r"\pm") + + "$" + ) if prob_col_name is not None: prob_label = prob_col_name.replace("PROB_", "").replace("_", " ") classifier_text = f"Classifier = {prob_label}" else: classifier_text = "No Classification" - label = "\n".join([num_sn, alpha, beta, sigint, gamma, scalepcc, contam_true, contam_data, classifier_text]) + label = "\n".join( + [ + num_sn, + alpha, + beta, + sigint, + gamma, + scalepcc, + contam_true, + contam_data, + classifier_text, + ] + ) label = label.replace("\n\n", "\n").replace("\n\n", "\n") dfz = df["zHD"] zs = np.linspace(dfz.min(), dfz.max(), 500) @@ -227,7 +279,9 @@ def make_hubble_plot(fitres_file, m0diff_file, prob_col_name, args): if zs.min() > n_thresh: n_space = 0.01 subsec = False - z_a = np.logspace(np.log10(min(0.01, zs.min() * 0.9)), np.log10(n_thresh), int(n_space * n_trans)) + z_a = np.logspace( + np.log10(min(0.01, zs.min() * 0.9)), np.log10(n_thresh), int(n_space * n_trans) + ) z_b = np.linspace(n_thresh, zs.max() * 1.01, 1 + int((1 - n_space) * n_trans))[1:] z_trans = np.concatenate((z_a, z_b)) z_scale = np.arange(n_trans) @@ -248,7 +302,12 @@ def tranz(zs): x_tick_t = tranz(x_ticks) x_ticks_mt = tranz(x_ticks_m) - fig, axes = plt.subplots(figsize=(7, 5), nrows=2, sharex=True, gridspec_kw={"height_ratios": [1.5, 1], "hspace": 0}) + fig, axes = plt.subplots( + figsize=(7, 5), + nrows=2, + sharex=True, + gridspec_kw={"height_ratios": [1.5, 1], "hspace": 0}, + ) logging.info(f"Hubble plot prob colour given by column {prob_col_name}") if prob_col_name is not None: @@ -273,12 +332,21 @@ def tranz(zs): sub2 = -dfm["MUREF"] sub3 = 0 ax.set_ylabel(r"$\mu$") - ax.annotate(label, (0.98, 0.02), xycoords="axes fraction", horizontalalignment="right", verticalalignment="bottom", fontsize=8) + ax.annotate( + label, + (0.98, 0.02), + xycoords="axes fraction", + horizontalalignment="right", + verticalalignment="bottom", + fontsize=8, + ) alpha = 0.7 ax.set_xlabel("$z$") if subsec: - ax.axvline(tranz(n_thresh), c="#888888", alpha=0.4, zorder=0, lw=0.7, ls="--") + ax.axvline( + tranz(n_thresh), c="#888888", alpha=0.4, zorder=0, lw=0.7, ls="--" + ) if prob_col_name is None or df[prob_col_name].min() >= 1.0: cc = df["IDSURVEY"] @@ -292,15 
+360,42 @@ def tranz(zs): cmap = "inferno" # Plot each point - ax.errorbar(tranz(dfz), df["MU"] - sub, yerr=df["MUERR"], fmt="none", elinewidth=0.5, c="#AAAAAA", alpha=0.5 * alpha) - h = ax.scatter(tranz(dfz), df["MU"] - sub, c=cc, s=1, zorder=2, alpha=alpha, vmax=vmax, cmap=cmap) + ax.errorbar( + tranz(dfz), + df["MU"] - sub, + yerr=df["MUERR"], + fmt="none", + elinewidth=0.5, + c="#AAAAAA", + alpha=0.5 * alpha, + ) + h = ax.scatter( + tranz(dfz), + df["MU"] - sub, + c=cc, + s=1, + zorder=2, + alpha=alpha, + vmax=vmax, + cmap=cmap, + ) if not args.get("BLIND", []): # Plot ref cosmology ax.plot(tranz(zs), distmod - sub3, c="k", zorder=-1, lw=0.5, alpha=0.7) # Plot m0diff - ax.errorbar(tranz(dfm["z"]), dfm["MUDIF"] - sub2, yerr=dfm["MUDIFERR"], fmt="o", mew=0.5, capsize=3, elinewidth=0.5, c="k", ms=4) + ax.errorbar( + tranz(dfm["z"]), + dfm["MUDIF"] - sub2, + yerr=dfm["MUDIFERR"], + fmt="o", + mew=0.5, + capsize=3, + elinewidth=0.5, + c="k", + ms=4, + ) ax.set_xticks(x_tick_t) ax.set_xticks(x_ticks_mt, minor=True) ax.set_xticklabels(x_ticks) @@ -310,7 +405,9 @@ def tranz(zs): ax.set_yticklabels([]) ax.set_yticks([]) if color_prob: - cbar = fig.colorbar(h, ax=axes, orientation="vertical", fraction=0.1, pad=0.01, aspect=40) + cbar = fig.colorbar( + h, ax=axes, orientation="vertical", fraction=0.1, pad=0.01, aspect=40 + ) cbar.set_label("Prob Ia") fp = fitres_file.replace(".fitres.gz", ".png") @@ -344,7 +441,13 @@ def make_m0diff_plot(m0diff_file): ncols = 1 nrows = (n + (ncols - 1)) // ncols - fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 1 + 1.5 * nrows), squeeze=False, sharex=True) + fig, axes = plt.subplots( + nrows=nrows, + ncols=ncols, + figsize=(10, 1 + 1.5 * nrows), + squeeze=False, + sharex=True, + ) axes = axes.flatten() for (name, df), ax in zip(dfg, axes): @@ -374,7 +477,9 @@ def make_m0diff_plot(m0diff_file): c = None diff = df2.MUDIF.to_numpy() - base.MUDIF.to_numpy() - ax.plot(df2.z, diff, label=label, ls=ls, alpha=alpha, zorder=zorder, c=c, lw=1) + ax.plot( + df2.z, diff, label=label, ls=ls, alpha=alpha, zorder=zorder, c=c, lw=1 + ) if len(dfg2) > 10: ax.legend(bbox_to_anchor=(0.5, -0.1), ncol=2) @@ -390,7 +495,6 @@ def make_m0diff_plot(m0diff_file): setup_logging() args = get_arguments() try: - # Plot wfit distributions wfit_file = args.get("WFIT_SUMMARY_OUTPUT") df_all = load_file(wfit_file) diff --git a/pippin/external/plot_cosmomc.py b/pippin/external/plot_cosmomc.py index 060856cb..c8534b6e 100644 --- a/pippin/external/plot_cosmomc.py +++ b/pippin/external/plot_cosmomc.py @@ -17,7 +17,11 @@ def fail(msg, condition=True): def setup_logging(): fmt = "[%(levelname)8s |%(filename)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_cosmomc.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_cosmomc.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) @@ -33,7 +37,8 @@ def get_arguments(): if config.get("NAMES") is not None: assert len(config["NAMES"]) == len(config["PARSED_FILES"]), ( - "You should specify one name per base file you pass in." + f" Have {len(config['PARSED_FILES'])} base names and {len(config['NAMES'])} names" + "You should specify one name per base file you pass in." 
+ + f" Have {len(config['PARSED_FILES'])} base names and {len(config['NAMES'])} names" ) return config @@ -42,7 +47,12 @@ def load_output(basename): if os.path.exists(basename): logging.warning(f"Loading in pre-saved CSV file from {basename}") df = pd.read_csv(basename) - return df["_weight"].values, df["_likelihood"].values, df.iloc[:, 2:].to_numpy(), list(df.columns[2:]) + return ( + df["_weight"].values, + df["_likelihood"].values, + df.iloc[:, 2:].to_numpy(), + list(df.columns[2:]), + ) else: return None @@ -58,11 +68,17 @@ def load_output(basename): do_full = False biases = {} b = 1 - truth = {"$\\Omega_m$": 0.3, "$w\\ \\mathrm{Blinded}$": -1.0, "$\\Omega_\\Lambda$": 0.7} + truth = { + "$\\Omega_m$": 0.3, + "$w\\ \\mathrm{Blinded}$": -1.0, + "$\\Omega_\\Lambda$": 0.7, + } shift_params = truth if args.get("SHIFT") else None num_parsed = len(args.get("PARSED_FILES")) - for index, (basename, covopt) in enumerate(zip(args.get("PARSED_FILES"), args.get("PARSED_COVOPTS"))): + for index, (basename, covopt) in enumerate( + zip(args.get("PARSED_FILES"), args.get("PARSED_COVOPTS")) + ): if plot_covopts is not None and covopt not in plot_covopts: continue if args.get("NAMES"): @@ -92,7 +108,15 @@ def load_output(basename): weights *= prior if num_parsed > 30: name = "CosmoMC Fit" - c.add_chain(chain, weights=weights, parameters=labels, name=name, posterior=-likelihood, shift_params=shift_params, linestyle=linestyle) + c.add_chain( + chain, + weights=weights, + parameters=labels, + name=name, + posterior=-likelihood, + shift_params=shift_params, + linestyle=linestyle, + ) # Write all our glorious output out = args.get("OUTPUT_NAME") diff --git a/pippin/external/plot_efficiency.py b/pippin/external/plot_efficiency.py index f4422a5a..2bc9a74a 100644 --- a/pippin/external/plot_efficiency.py +++ b/pippin/external/plot_efficiency.py @@ -13,7 +13,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_biascor.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_biascor.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("chainconsumer").setLevel(logging.WARNING) @@ -28,7 +32,10 @@ def get_arguments(): config = yaml.safe_load(f) config.update(config["LCFIT"]) if config.get("FIELDS") is None: - config["FIELDS"] = [["X3", "C3"], ["E1", "E2", "C1", "C2", "S1", "S2", "X1", "X2"]] + config["FIELDS"] = [ + ["X3", "C3"], + ["E1", "E2", "C1", "C2", "S1", "S2", "X1", "X2"], + ] return config @@ -38,9 +45,10 @@ def load_file(file): def plot_efficiency(data_all, sims, fields): - for i, sim in enumerate(sims): - fig, axes = plt.subplots(len(fields), 3, figsize=(12, 1 + 2 * len(fields)), squeeze=False) + fig, axes = plt.subplots( + len(fields), 3, figsize=(12, 1 + 2 * len(fields)), squeeze=False + ) cols = ["HOST_MAG_i", "HOST_MAG_r", "zHD"] field_eff = {} @@ -49,7 +57,6 @@ def plot_efficiency(data_all, sims, fields): s = sim[np.isin(sim["FIELD"], field)] for c, ax in zip(cols, row): - if c == "zHD": bins = np.arange(0.15, 1.55, 0.1) else: @@ -87,13 +94,29 @@ def plot_efficiency(data_all, sims, fields): ratio2 = ratio2 / ratio2.max() ratio3 = np.concatenate((ratio2, np.zeros(100))) - bc3 = interp1d(np.arange(bc2.size), bc2, bounds_error=False, fill_value="extrapolate")(np.arange(bc2.size + 20)) + bc3 = interp1d( + np.arange(bc2.size), + bc2, + 
bounds_error=False, + fill_value="extrapolate", + )(np.arange(bc2.size + 20)) smoothed_ratio = gaussian_filter(ratio3, sigma=4, mode="nearest")[:-80] smoothed_ratio = smoothed_ratio / smoothed_ratio.max() - err = np.sqrt((err_data / hist_data) ** 2 + (err_sim / hist_sim) ** 2) * ratio - - ddf = pd.DataFrame({c: bc, "Ndata": hist_data, "Nsim": hist_sim, "eff": ratio, "eff_err": err}) + err = ( + np.sqrt((err_data / hist_data) ** 2 + (err_sim / hist_sim) ** 2) + * ratio + ) + + ddf = pd.DataFrame( + { + c: bc, + "Ndata": hist_data, + "Nsim": hist_sim, + "eff": ratio, + "eff_err": err, + } + ) ddf.to_csv(f"eff_{c}.csv", index=False, float_format="%0.4f") ax.plot(bc, ratio, linewidth=0.5) @@ -118,12 +141,20 @@ def plot_efficiency(data_all, sims, fields): save_efficiency_file(field_eff, fields) fig.tight_layout() - fig.savefig(f"efficiency_{i}.png", bbox_inches="tight", dpi=150, transparent=True) + fig.savefig( + f"efficiency_{i}.png", bbox_inches="tight", dpi=150, transparent=True + ) def save_efficiency_file(field_eff, fields): labels = field_eff.keys() - name_map = {"HOST_MAG_i": "i_obs", "HOST_MAG_r": "r_obs", "HOST_MAG_z": "z_obs", "HOST_MAG_g": "g_obs", "zHD": "ZTRUE"} + name_map = { + "HOST_MAG_i": "i_obs", + "HOST_MAG_r": "r_obs", + "HOST_MAG_z": "z_obs", + "HOST_MAG_g": "g_obs", + "zHD": "ZTRUE", + } for c in labels: with open(f"efficiency_{c}.dat", "w") as f: header = f"OPT_EXTRAP: 1\n\n" @@ -173,19 +204,25 @@ def plot_efficiency2d(data_all, sims, fields): ratio = hist_data / hist_sim # ratio = ratio / ratio[np.isfinite(ratio)].max() - im = ax[0].imshow(hist_data.T, origin="lower", extent=[min_i, max_i, min_r, max_r]) + im = ax[0].imshow( + hist_data.T, origin="lower", extent=[min_i, max_i, min_r, max_r] + ) ax[0].set_title("Data " + ff) divider = make_axes_locatable(ax[0]) cax = divider.append_axes("right", size="5%", pad=0.05) fig.colorbar(im, cax=cax, orientation="vertical") - im = ax[1].imshow(hist_sim.T, origin="lower", extent=[min_i, max_i, min_r, max_r]) + im = ax[1].imshow( + hist_sim.T, origin="lower", extent=[min_i, max_i, min_r, max_r] + ) ax[1].set_title("Sim " + ff) divider = make_axes_locatable(ax[1]) cax = divider.append_axes("right", size="5%", pad=0.05) fig.colorbar(im, cax=cax, orientation="vertical") - im = ax[2].imshow(ratio.T, origin="lower", extent=[min_i, max_i, min_r, max_r]) + im = ax[2].imshow( + ratio.T, origin="lower", extent=[min_i, max_i, min_r, max_r] + ) ax[2].set_title("Ratio " + ff) divider = make_axes_locatable(ax[2]) cax = divider.append_axes("right", size="5%", pad=0.05) @@ -196,16 +233,27 @@ def plot_efficiency2d(data_all, sims, fields): a.set_ylabel(cr) fig.tight_layout() - fig.savefig(f"efficiency2d_{i}.png", bbox_inches="tight", dpi=150, transparent=True) + fig.savefig( + f"efficiency2d_{i}.png", bbox_inches="tight", dpi=150, transparent=True + ) def get_means_and_errors(x, y, bins): means, *_ = binned_statistic(x, y, bins=bins, statistic="mean") - err, *_ = binned_statistic(x, y, bins=bins, statistic=lambda x: np.std(x) / np.sqrt(x.size)) + err, *_ = binned_statistic( + x, y, bins=bins, statistic=lambda x: np.std(x) / np.sqrt(x.size) + ) std, *_ = binned_statistic(x, y, bins=bins, statistic=lambda x: np.std(x)) std_err, *_ = binned_statistic( - x, y, bins=bins, statistic=lambda x: np.sqrt((1 / x.size) * (moment(x, 4) - (((x.size - 3) / (x.size - 1)) * np.var(x) ** 2))) / (2 * np.std(x)) + x, + y, + bins=bins, + statistic=lambda x: np.sqrt( + (1 / x.size) + * (moment(x, 4) - (((x.size - 3) / (x.size - 1)) * np.var(x) ** 2)) + ) + / (2 * 
np.std(x)), ) return means, err, std, std_err @@ -225,9 +273,10 @@ def get_means_and_errors(x, y, bins): if "HOST_MAG_i" not in sim_dfs[0].columns: logging.info("HOST_MAG_i not in output fitres, not computing efficiencies") else: - if len(data_dfs) > 1: - logging.info("Please specify only one data file if you want to calculate efficiency") + logging.info( + "Please specify only one data file if you want to calculate efficiency" + ) else: for d in data_dfs + sim_dfs: d["HOST_MAG_i-r"] = d["HOST_MAG_i"] - d["HOST_MAG_r"] diff --git a/pippin/external/plot_errbudget.py b/pippin/external/plot_errbudget.py index 6d70dc89..8db51549 100644 --- a/pippin/external/plot_errbudget.py +++ b/pippin/external/plot_errbudget.py @@ -10,7 +10,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(filename)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_errbudget.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_errbudget.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) @@ -24,7 +28,9 @@ def get_arguments(): # Set up command line arguments parser = argparse.ArgumentParser() parser.add_argument("input_file", help="Input yml file", type=str) - parser.add_argument("-d", "--donefile", help="Path of done file", type=str, default="errbudget.done") + parser.add_argument( + "-d", "--donefile", help="Path of done file", type=str, default="errbudget.done" + ) args = parser.parse_args() with open(args.input_file, "r") as f: @@ -33,7 +39,8 @@ def get_arguments(): if config.get("NAMES") is not None: assert len(config["NAMES"]) == len(config["PARSED_FILES"]), ( - "You should specify one name per base file you pass in." + f" Have {len(config['PARSED_FILES'])} base names and {len(config['NAMES'])} names" + "You should specify one name per base file you pass in." + + f" Have {len(config['PARSED_FILES'])} base names and {len(config['NAMES'])} names" ) return config @@ -42,7 +49,12 @@ def load_output(basename): if os.path.exists(basename): logging.info(f"Loading in pre-saved CSV file from {basename}") df = pd.read_csv(basename) - return df["_weight"].values, df["_likelihood"].values, df.iloc[:, 2:].to_numpy(), list(df.columns[2:]) + return ( + df["_weight"].values, + df["_likelihood"].values, + df.iloc[:, 2:].to_numpy(), + list(df.columns[2:]), + ) else: fail(f"Cannot find file {basename}") return None @@ -75,19 +87,29 @@ def get_entry(name, syst, filename): logging.info("Making Error Budgets") budget_labels = [n.split()[-1] for n in names] - bases = [(b, i) for i, b in enumerate(budget_labels) if b in ["NOSYS", "STAT", "STATONLY"]] + bases = [ + (b, i) + for i, b in enumerate(budget_labels) + if b in ["NOSYS", "STAT", "STATONLY"] + ] if len(bases): base, base_index = bases[0] - data = [get_entry(n, b, f) for n, b, f in zip(names, budget_labels, files)] + data = [ + get_entry(n, b, f) for n, b, f in zip(names, budget_labels, files) + ] others = [d for i, d in enumerate(data) if i != base_index] base_df = data[base_index] df_all = pd.concat([base_df] + others).reset_index() - df_all.columns = [c.replace(r"\ \mathrm{Blinded}", "") for c in df_all.columns] + df_all.columns = [ + c.replace(r"\ \mathrm{Blinded}", "") for c in df_all.columns + ] # Save out all the means + stds to file unchanged. 
- df_all.to_csv("errbudget_all_uncertainties.csv", index=False, float_format="%0.4f") + df_all.to_csv( + "errbudget_all_uncertainties.csv", index=False, float_format="%0.4f" + ) # At this point, we have all the data loaded into a single dataframe, and now we group by name, compute the metrics, and save to file dfg = df_all.groupby("name") @@ -97,21 +119,37 @@ def get_entry(name, syst, filename): logging.info(f"Determining error budget for {name}") output_filename = f"errbudget_{name}.txt".replace(" ", "_") - nosys_mask = df.covopt.str.upper().isin(["NOSYS", "NO_SYS", "STAT", "STATONLY", "STAT_ONLY"]) - assert nosys_mask.sum() == 1, f"Multiple potential no systematic covopts found for name {name}, this is an issue" + nosys_mask = df.covopt.str.upper().isin( + ["NOSYS", "NO_SYS", "STAT", "STATONLY", "STAT_ONLY"] + ) + assert ( + nosys_mask.sum() == 1 + ), f"Expected exactly one no-systematics covopt for name {name}, this is an issue" avg_cols = [c for c in df.columns if c.endswith(" avg")] std_cols = [c for c in df.columns if c.endswith(" std")] - delta_cols = [c.replace(" avg", " delta") for c in df.columns if c.endswith(" avg")] - contrib_cols = [c.replace(" std", " contrib") for c in df.columns if c.endswith(" std")] - - df[delta_cols] = df.loc[:, avg_cols] - df.loc[nosys_mask, avg_cols].to_numpy() + delta_cols = [ + c.replace(" avg", " delta") + for c in df.columns + if c.endswith(" avg") + ] + contrib_cols = [ + c.replace(" std", " contrib") + for c in df.columns + if c.endswith(" std") + ] + + df[delta_cols] = ( + df.loc[:, avg_cols] - df.loc[nosys_mask, avg_cols].to_numpy() + ) min_var = (df.loc[nosys_mask, std_cols].to_numpy()) ** 2 df[contrib_cols] = np.sqrt(df.loc[:, std_cols] ** 2 - min_var) df = df.reindex(sorted(df.columns)[::-1], axis=1) - df.to_latex(output_filename, index=False, escape=False, float_format="%0.3f") + df.to_latex( + output_filename, index=False, escape=False, float_format="%0.3f" + ) rep_file = output_filename.replace(".txt", "_repr.txt") pd.set_option("display.max_rows", 500) diff --git a/pippin/external/plot_histogram.py b/pippin/external/plot_histogram.py index 58f188fc..11dfeeea 100644 --- a/pippin/external/plot_histogram.py +++ b/pippin/external/plot_histogram.py @@ -14,7 +14,11 @@ def setup_logging(): fmt = "[%(levelname)8s |%(funcName)21s:%(lineno)3d] %(message)s" handler = logging.StreamHandler(sys.stdout) - logging.basicConfig(level=logging.DEBUG, format=fmt, handlers=[handler, logging.FileHandler("plot_biascor.log")]) + logging.basicConfig( + level=logging.DEBUG, + format=fmt, + handlers=[handler, logging.FileHandler("plot_biascor.log")], + ) logging.getLogger("matplotlib").setLevel(logging.ERROR) logging.getLogger("chainconsumer").setLevel(logging.WARNING) @@ -51,24 +55,63 @@ def plot_histograms(data, sims, types, figname): "SNRMAX1", "SNRMAX2", "SNRMAX3", - #"SNRMAX_g", - #"SNRMAX_r", - #"SNRMAX_i", - #"SNRMAX_z", - ["zHD","c"], - ["zHD","x1"], - ["zHD","HOST_LOGMASS"], + # "SNRMAX_g", + # "SNRMAX_r", + # "SNRMAX_i", + # "SNRMAX_z", + ["zHD", "c"], + ["zHD", "x1"], + ["zHD", "HOST_LOGMASS"], "NDOF", - #"chi2_g", - #"chi2_r", - #"chi2_i", - #"chi2_z", + # "chi2_g", + # "chi2_r", + # "chi2_i", + # "chi2_z", + "__MUDIFF", + ] + restricted = [ + "FITCHI2", + "SNRMAX1", + "SNRMAX2", + "SNRMAX3", + "SNRMAX_g", + "SNRMAX_r", + "SNRMAX_i", + "SNRMAX_z", + "chi2_g", + "chi2_r", + "chi2_i", + "chi2_z", + ] + logs = [ + "SNRMAX1", + "SNRMAX2", + "SNRMAX3", + "SNRMAX_g", + "SNRMAX_r", + "SNRMAX_i", + "SNRMAX_z", + "FITCHI2", + "chi2_g", + "chi2_r", + "chi2_i",
+ "chi2_z", "__MUDIFF", ] - restricted = ["FITCHI2", "SNRMAX1", "SNRMAX2", "SNRMAX3", "SNRMAX_g", "SNRMAX_r", "SNRMAX_i", "SNRMAX_z", "chi2_g", "chi2_r", "chi2_i", "chi2_z"] - logs = ["SNRMAX1", "SNRMAX2", "SNRMAX3", "SNRMAX_g", "SNRMAX_r", "SNRMAX_i", "SNRMAX_z", "FITCHI2", "chi2_g", "chi2_r", "chi2_i", "chi2_z", "__MUDIFF"] - cs = ["#1976D2", "#FB8C00", "#8BC34A", "#E53935", "#4FC3F7", "#43A047", "#F2D026", "#673AB7", "#FFB300", "#E91E63", "#F2D026"] * 3 + cs = [ + "#1976D2", + "#FB8C00", + "#8BC34A", + "#E53935", + "#4FC3F7", + "#43A047", + "#F2D026", + "#673AB7", + "#FFB300", + "#E91E63", + "#F2D026", + ] * 3 usecols = [] for c in cols: if isinstance(c, list): @@ -84,7 +127,12 @@ def plot_histograms(data, sims, types, figname): x[0].loc[x[0][c] < -10, c] = -9 ncols = (len(cols) + 3) // 3 - fig, axes = plt.subplots(3, ncols, figsize=(1 + 2.5 * ncols, 8), gridspec_kw={"wspace": 0.13, "hspace": 0.4}) + fig, axes = plt.subplots( + 3, + ncols, + figsize=(1 + 2.5 * ncols, 8), + gridspec_kw={"wspace": 0.13, "hspace": 0.4}, + ) for ax in axes.flatten(): ax.set_axis_off() @@ -92,7 +140,7 @@ def plot_histograms(data, sims, types, figname): ax.set_axis_on() u = 0.95 if c in restricted else 0.99 - #HISTOGRAM + # HISTOGRAM if not isinstance(c, list): minv = min([x[0][c].quantile(0.01) for x in data + sims]) maxv = max([x[0][c].quantile(u) for x in data + sims]) @@ -104,7 +152,15 @@ def plot_histograms(data, sims, types, figname): err = np.sqrt(hist) area = (bins[1] - bins[0]) * hist.sum() delta = (bc[1] - bc[0]) / 20 - ax.errorbar(bc + i * delta, hist / area, yerr=err / area, fmt="o", ms=2, elinewidth=0.75, label=n) + ax.errorbar( + bc + i * delta, + hist / area, + yerr=err / area, + fmt="o", + ms=2, + elinewidth=0.75, + label=n, + ) lw = 1 if len(sims) < 3 else 0.5 for index, (s, n) in enumerate(sims): @@ -116,11 +172,35 @@ def plot_histograms(data, sims, types, figname): hist, _ = np.histogram(s[c], bins=bins) area = (bins[1] - bins[0]) * hist.sum() - ax.hist(s[c], bins=bins, histtype="step", weights=np.ones(s[c].shape) / area, label=n, linewidth=lw, color=cs[index]) + ax.hist( + s[c], + bins=bins, + histtype="step", + weights=np.ones(s[c].shape) / area, + label=n, + linewidth=lw, + color=cs[index], + ) if len(sims) == 1 and nonia.shape[0] > 10 and len(data) == 1: logging.info(f"Nonia shape is {nonia.shape}") - ax.hist(ia[c], bins=bins, histtype="step", weights=np.ones(ia[c].shape) / area, linestyle="--", label=n + " Ia only", linewidth=1) - ax.hist(nonia[c], bins=bins, histtype="step", weights=np.ones(nonia[c].shape) / area, linestyle=":", label=n + " CC only", linewidth=1) + ax.hist( + ia[c], + bins=bins, + histtype="step", + weights=np.ones(ia[c].shape) / area, + linestyle="--", + label=n + " Ia only", + linewidth=1, + ) + ax.hist( + nonia[c], + bins=bins, + histtype="step", + weights=np.ones(nonia[c].shape) / area, + linestyle=":", + label=n + " CC only", + linewidth=1, + ) if "MUDIFF" in c: ax.set_xlabel("FAKE MUDIFF") @@ -141,12 +221,14 @@ def plot_histograms(data, sims, types, figname): for i, (s, n) in enumerate(sims): sim_col = s[c] - + sim_hist, _ = np.histogram(sim_col, bins=bins) sim_err = 1 / np.sqrt(data_hist) sim_dist, _ = np.histogram(sim_col, bins=bins, density=True) - dist_error = np.sqrt((data_dist * data_err) ** 2 + (sim_dist * sim_err) ** 2) + dist_error = np.sqrt( + (data_dist * data_err) ** 2 + (sim_dist * sim_err) ** 2 + ) dist_diff = data_dist - sim_dist chi2 = np.nansum(((dist_diff / dist_error) ** 2)) @@ -172,45 +254,62 @@ def plot_histograms(data, sims, types, 
figname): for i, (s, n) in enumerate(sims): sim_xcol = s[c[0]] sim_ycol = s[c[1]] - bin_medians, bin_edges, binnumber = binned_statistic(sim_xcol, - sim_ycol, - statistic='median', - bins=bins) - bincenters = (bin_edges[:-1]+ bin_edges[1:])/2. - ax.plot(bincenters,bin_medians,label=n,alpha=.9,color=cs[i]) + bin_medians, bin_edges, binnumber = binned_statistic( + sim_xcol, sim_ycol, statistic="median", bins=bins + ) + bincenters = (bin_edges[:-1] + bin_edges[1:]) / 2.0 + ax.plot(bincenters, bin_medians, label=n, alpha=0.9, color=cs[i]) for i, (d, n) in enumerate(data): data_xcol = d[c[0]] data_ycol = d[c[1]] try: - bin_medians, bin_edges, binnumber = binned_statistic(data_xcol, - data_ycol, - statistic='median', - bins=bins) - bin_stds, bin_edges, binnumber = binned_statistic(data_xcol, - data_ycol, - statistic='std', - bins=bins) - bin_counts, bin_edges, binnumber = binned_statistic(data_xcol, - data_ycol, - statistic='count', - bins=bins) - - - bincenters = (bin_edges[:-1]+ bin_edges[1:])/2. - ax.errorbar(bincenters,bin_medians,yerr=bin_stds/np.sqrt(bin_counts),fmt='o',label=n,alpha=.9) + bin_medians, bin_edges, binnumber = binned_statistic( + data_xcol, data_ycol, statistic="median", bins=bins + ) + bin_stds, bin_edges, binnumber = binned_statistic( + data_xcol, data_ycol, statistic="std", bins=bins + ) + bin_counts, bin_edges, binnumber = binned_statistic( + data_xcol, data_ycol, statistic="count", bins=bins + ) + + bincenters = (bin_edges[:-1] + bin_edges[1:]) / 2.0 + ax.errorbar( + bincenters, + bin_medians, + yerr=bin_stds / np.sqrt(bin_counts), + fmt="o", + label=n, + alpha=0.9, + ) except: pass ax.set_xlabel(c[0]) ax.set_ylabel(c[1]) - #ax.legend() + # ax.legend() handles, labels = ax.get_legend_handles_labels() - bb = (fig.subplotpars.left, fig.subplotpars.top + 0.02, fig.subplotpars.right - fig.subplotpars.left, 0.1) - - #for ax in axes.flatten(): + bb = ( + fig.subplotpars.left, + fig.subplotpars.top + 0.02, + fig.subplotpars.right - fig.subplotpars.left, + 0.1, + ) + + # for ax in axes.flatten(): # ax.set_yticklabels([]) - fig.legend(handles, labels, loc="upper center", ncol=4, mode="expand", frameon=False, bbox_to_anchor=bb, borderaxespad=0.0, bbox_transform=fig.transFigure) + fig.legend( + handles, + labels, + loc="upper center", + ncol=4, + mode="expand", + frameon=False, + bbox_to_anchor=bb, + borderaxespad=0.0, + bbox_transform=fig.transFigure, + ) # plt.legend(bbox_to_anchor=(-3, 2.3, 4.0, 0.2), loc="lower left", mode="expand", ncol=3, frameon=False) # plt.tight_layout(rect=[0, 0, 0.75, 1]) fig.savefig(figname, bbox_inches="tight", dpi=600) @@ -220,7 +319,9 @@ def get_means_and_errors(x, y, bins): x = np.array(x) y = np.array(y) means, *_ = binned_statistic(x, y, bins=bins, statistic="mean") - err, *_ = binned_statistic(x, y, bins=bins, statistic=lambda x: np.std(x) / np.sqrt(len(x))) + err, *_ = binned_statistic( + x, y, bins=bins, statistic=lambda x: np.std(x) / np.sqrt(len(x)) + ) std, *_ = binned_statistic(x, y, bins=bins, statistic=lambda x: np.std(x)) std_err, *_ = binned_statistic( @@ -229,13 +330,19 @@ def get_means_and_errors(x, y, bins): bins=bins, statistic=lambda x: np.nan if len(x) < 20 - else np.sqrt((1 / len(x)) * (moment(x, 4) - (((len(x) - 3) / (len(x) - 1)) * np.var(x) ** 2))) / (2 * np.std(x)), + else np.sqrt( + (1 / len(x)) + * (moment(x, 4) - (((len(x) - 3) / (len(x) - 1)) * np.var(x) ** 2)) + ) + / (2 * np.std(x)), ) return means, err, std, std_err def plot_redshift_evolution(data, sims, types, figname): - fig, axes = plt.subplots(2, 2, figsize=(6, 
4), sharex=True, gridspec_kw={"hspace": 0.0, "wspace": 0.4}) + fig, axes = plt.subplots( + 2, 2, figsize=(6, 4), sharex=True, gridspec_kw={"hspace": 0.0, "wspace": 0.4} + ) cols = ["x1", "c"] for c, row in zip(cols, axes.T): @@ -253,8 +360,19 @@ def plot_redshift_evolution(data, sims, types, figname): if d.shape[0] == 0: continue means, err, std, std_err = get_means_and_errors(d["zHD"], d[c], bins=bins) - ax0.errorbar(bc, means, yerr=err, fmt="o", ms=2, elinewidth=0.75, zorder=20, label=n) - ax1.errorbar(bc, std, yerr=std_err, fmt="o", ms=2, elinewidth=0.75, zorder=20, label=n) + ax0.errorbar( + bc, means, yerr=err, fmt="o", ms=2, elinewidth=0.75, zorder=20, label=n + ) + ax1.errorbar( + bc, + std, + yerr=std_err, + fmt="o", + ms=2, + elinewidth=0.75, + zorder=20, + label=n, + ) for sim, n in sims: if sim.shape[0] == 0: @@ -265,14 +383,20 @@ def plot_redshift_evolution(data, sims, types, figname): has_nonia = nonia.shape[0] > 0 if has_nonia and len(sims) == 1: - groups = [(sim, "-", 10, " all"), (ia, "--", 3, " Ia"), (nonia, ":", 2, " CC")] + groups = [ + (sim, "-", 10, " all"), + (ia, "--", 3, " Ia"), + (nonia, ":", 2, " CC"), + ] else: groups = [(sim, "-", 10, "")] for s, ls, z, n2 in groups: if s.shape[0] < 100: continue - means, err, std, std_err = get_means_and_errors(s["zHD"], s[c], bins=bins) + means, err, std, std_err = get_means_and_errors( + s["zHD"], s[c], bins=bins + ) ax0.plot(bc, means, ls=ls, zorder=z, label=n + n2) ax0.fill_between(bc, means - err, means + err, alpha=0.1, zorder=z) ax1.plot(bc, std, ls=ls, zorder=z, label=n + n2) @@ -284,7 +408,13 @@ def plot_redshift_evolution(data, sims, types, figname): ax1.set_xlim(*lim) axes[1, 0].set_xlabel("z") axes[1, 1].set_xlabel("z") - plt.legend(bbox_to_anchor=(-1.2, 2, 2.1, 0.2), loc="lower left", mode="expand", ncol=2, frameon=False) + plt.legend( + bbox_to_anchor=(-1.2, 2, 2.1, 0.2), + loc="lower left", + mode="expand", + ncol=2, + frameon=False, + ) # plt.tight_layout(rect=[0, 0, 0.75, 1]) fig.savefig(figname, bbox_inches="tight", dpi=150, transparent=True) @@ -313,15 +443,29 @@ def plot_redshift_evolution(data, sims, types, figname): plot_redshift_evolution(data_dfs, sim_dfs, args["IA_TYPES"], "redshift.png") try: - fields = [(["X3", "C3"], "DEEP"), (["C1", "C2", "S1", "S2", "E1", "E2", "X1", "X2"], "SHALLOW")] + fields = [ + (["X3", "C3"], "DEEP"), + (["C1", "C2", "S1", "S2", "E1", "E2", "X1", "X2"], "SHALLOW"), + ] for f, n in fields: data_masks = [np.isin(d["FIELD"], f) for d, _ in data_dfs] sim_masks = [np.isin(s["FIELD"], f) for s, _ in sim_dfs] - masked_data_dfs = [(d[0].loc[m, :], d[1]) for d, m in zip(data_dfs, data_masks)] - masked_sim_dfs = [(d[0].loc[m, :], d[1]) for d, m in zip(sim_dfs, sim_masks)] - plot_histograms(masked_data_dfs, masked_sim_dfs, args["IA_TYPES"], f"hist_{n}.png") - plot_redshift_evolution(masked_data_dfs, masked_sim_dfs, args["IA_TYPES"], f"redshift_{n}.png") + masked_data_dfs = [ + (d[0].loc[m, :], d[1]) for d, m in zip(data_dfs, data_masks) + ] + masked_sim_dfs = [ + (d[0].loc[m, :], d[1]) for d, m in zip(sim_dfs, sim_masks) + ] + plot_histograms( + masked_data_dfs, masked_sim_dfs, args["IA_TYPES"], f"hist_{n}.png" + ) + plot_redshift_evolution( + masked_data_dfs, + masked_sim_dfs, + args["IA_TYPES"], + f"redshift_{n}.png", + ) except: logging.info(f"NO DES Fields, skipping...") diff --git a/pippin/manager.py b/pippin/manager.py index d6e834cb..19ef3d56 100644 --- a/pippin/manager.py +++ b/pippin/manager.py @@ -8,7 +8,15 @@ from pippin.analyse import AnalyseChains from pippin.biascor import 
BiasCor from pippin.classifiers.classifier import Classifier -from pippin.config import get_logger, get_config, get_output_dir, mkdirs, chown_dir, chown_file, get_data_loc +from pippin.config import ( + get_logger, + get_config, + get_output_dir, + mkdirs, + chown_dir, + chown_file, + get_data_loc, +) from pippin.cosmofitters.cosmofit import CosmoFit from pippin.create_cov import CreateCov from pippin.dataprep import DataPrep @@ -19,8 +27,30 @@ class Manager: - task_order = [DataPrep, SNANASimulation, SNANALightCurveFit, Classifier, Aggregator, Merger, BiasCor, CreateCov, CosmoFit, AnalyseChains] - stages = ["DATAPREP", "SIM", "LCFIT", "CLASSIFY", "AGGREGATE", "MERGE", "BIASCOR", "CREATE_COV", "COSMOFIT", "ANALYSE"] + task_order = [ + DataPrep, + SNANASimulation, + SNANALightCurveFit, + Classifier, + Aggregator, + Merger, + BiasCor, + CreateCov, + CosmoFit, + AnalyseChains, + ] + stages = [ + "DATAPREP", + "SIM", + "LCFIT", + "CLASSIFY", + "AGGREGATE", + "MERGE", + "BIASCOR", + "CREATE_COV", + "COSMOFIT", + "ANALYSE", + ] def __init__(self, filename, config_path, config_raw, config, message_store): self.logger = get_logger() @@ -36,15 +66,21 @@ def __init__(self, filename, config_path, config_raw, config, message_store): self.max_jobs = int(self.global_config["QUEUE"]["max_jobs"]) self.max_jobs_gpu = int(self.global_config["QUEUE"]["max_gpu_jobs"]) self.max_jobs_in_queue = int(self.global_config["QUEUE"]["max_jobs_in_queue"]) - self.max_jobs_in_queue_gpu = int(self.global_config["QUEUE"]["max_gpu_jobs_in_queue"]) + self.max_jobs_in_queue_gpu = int( + self.global_config["QUEUE"]["max_gpu_jobs_in_queue"] + ) self.logger.debug(self.global_config.keys()) - self.sbatch_cpu_path = get_data_loc(self.global_config["SBATCH"]["cpu_location"]) - with open(self.sbatch_cpu_path, 'r') as f: + self.sbatch_cpu_path = get_data_loc( + self.global_config["SBATCH"]["cpu_location"] + ) + with open(self.sbatch_cpu_path, "r") as f: self.sbatch_cpu_header = f.read() - self.sbatch_gpu_path = get_data_loc(self.global_config["SBATCH"]["gpu_location"]) - with open(self.sbatch_gpu_path, 'r') as f: + self.sbatch_gpu_path = get_data_loc( + self.global_config["SBATCH"]["gpu_location"] + ) + with open(self.sbatch_gpu_path, "r") as f: self.sbatch_gpu_header = f.read() self.sbatch_cpu_header = self.clean_header(self.sbatch_cpu_header) self.sbatch_gpu_header = self.clean_header(self.sbatch_gpu_header) @@ -68,11 +104,21 @@ def __init__(self, filename, config_path, config_raw, config, message_store): self.blocked = [] def load_task_setup(self): - tasks = ['cosmomc', 'snirf', 'analyse', 'supernnova', 'nearest_neighbour', 'create_cov', 'supernnova_yml', 'scone', 'dataprep'] + tasks = [ + "cosmomc", + "snirf", + "analyse", + "supernnova", + "nearest_neighbour", + "create_cov", + "supernnova_yml", + "scone", + "dataprep", + ] self.task_setup = {} for task in tasks: - with open(get_data_loc(f"{self.setup_task_location}/{task}"), 'r') as f: - self.task_setup[task] = f.read() + with open(get_data_loc(f"{self.setup_task_location}/{task}"), "r") as f: + self.task_setup[task] = f.read() def get_force_refresh(self, task): if self.start is None: @@ -85,7 +131,9 @@ def get_force_refresh(self, task): self.logger.error(f"Task {task} did not match any class in the task order!") index = 0 force = index >= self.start - self.logger.debug(f"Start set! Task {task} has index {index}, start index set {self.start}, so returning {force}") + self.logger.debug( + f"Start set! 
Task {task} has index {index}, start index set {self.start}, so returning {force}" + ) return force def get_force_ignore(self, task): @@ -99,7 +147,9 @@ def get_force_ignore(self, task): self.logger.error(f"Task {task} did not match any class in the task order!") assert index is not None force_ignore = index <= self.force_ignore_stage - self.logger.debug(f"Task {task} has index {index}, ignore index is {self.force_ignore_stage}, so returning force_ignore={force_ignore}") + self.logger.debug( + f"Task {task} has index {index}, ignore index is {self.force_ignore_stage}, so returning force_ignore={force_ignore}" + ) return force_ignore def set_force_refresh(self, force_refresh): @@ -109,10 +159,10 @@ def set_force_ignore_stage(self, force_ignore_stage): self.force_ignore_stage = self.resolve_stage(force_ignore_stage) def clean_header(self, header): - lines = header.split('\n') - mask = lambda x: (len(x) > 0) and (x[0] == '#') and ('xxxx' not in x) + lines = header.split("\n") + mask = lambda x: (len(x) > 0) and (x[0] == "#") and ("xxxx" not in x) lines = filter(mask, lines) - header = '\n'.join(lines) + header = "\n".join(lines) return header def set_start(self, stage): @@ -128,18 +178,28 @@ def resolve_stage(self, stage): num = int(stage) else: key = stage.upper() - assert key in Manager.stages, f"Stage {key} is not in recognised keys {Manager.stages}" + assert ( + key in Manager.stages + ), f"Stage {key} is not in recognised keys {Manager.stages}" num = Manager.stages.index(key) - assert 0 <= num < len(Manager.stages), f"Stage {num} is not in recognised values is not valid - from 0 to {len(Manager.stages) - 1}" + assert ( + 0 <= num < len(Manager.stages) + ), f"Stage {num} is not valid - must be from 0 to {len(Manager.stages) - 1}" return num def get_tasks(self, config): - total_tasks = [] try: for i, task in enumerate(Manager.task_order): if self.finish is None or i <= self.finish: - new_tasks = task.get_tasks(config, total_tasks, self.output_dir, i, self.prefix, self.global_config) + new_tasks = task.get_tasks( + config, + total_tasks, + self.output_dir, + i, + self.prefix, + self.global_config, + ) if new_tasks is not None: total_tasks += new_tasks except Exception as e: @@ -155,7 +215,11 @@ def get_tasks(self, config): return total_tasks def get_num_running_jobs(self): - num_jobs = int(subprocess.check_output("squeue -ho %A -u $USER | wc -l", shell=True, stderr=subprocess.STDOUT)) + num_jobs = int( + subprocess.check_output( + "squeue -ho %A -u $USER | wc -l", shell=True, stderr=subprocess.STDOUT + ) + ) return num_jobs def get_task_to_run(self): @@ -164,12 +228,19 @@ def get_task_to_run(self): for dep in t.dependencies: if dep not in self.done: can_run = False - if t.gpu and self.num_jobs_queue_gpu + t.num_jobs >= self.max_jobs_in_queue_gpu: - self.logger.warning(f"Cant submit {t} because GPU NUM_JOBS {t.num_jobs} would exceed {self.num_jobs_queue_gpu}/{self.max_jobs_in_queue_gpu}") + if ( + t.gpu + and self.num_jobs_queue_gpu + t.num_jobs >= self.max_jobs_in_queue_gpu + ): + self.logger.warning( + f"Can't submit {t} because GPU NUM_JOBS {t.num_jobs} would exceed {self.num_jobs_queue_gpu}/{self.max_jobs_in_queue_gpu}" + ) can_run = False if not t.gpu and self.num_jobs_queue + t.num_jobs >= self.max_jobs_in_queue: - self.logger.warning(f"Cant submit {t} because NUM_JOBS {t.num_jobs} would exceed {self.num_jobs_queue}/{self.max_jobs_in_queue}") + self.logger.warning( + f"Can't submit {t} because NUM_JOBS {t.num_jobs} would exceed
{self.num_jobs_queue}/{self.max_jobs_in_queue}" + ) can_run = False if can_run: @@ -202,7 +273,6 @@ def fail_task(self, t): modified = True break - def log_status(self): self.logger.debug("") self.logger.debug(f"Status as of {time.ctime()}:") @@ -264,12 +334,13 @@ def get_dashboard_line(self, stage, tasks, prnt=True): def print_dashboard(self): all_tasks = self.tasks + self.running + self.done + self.failed + self.blocked - self.logger.info("-------------------") self.logger.info("CURRENT TASK STATUS") options = ["WAITING", "RUNNING", "DONE", "FAILED", "BLOCKED"] - header = "Key: " + " ".join([self.get_string_with_colour(o, o.lower()) for o in options]) + header = "Key: " + " ".join( + [self.get_string_with_colour(o, o.lower()) for o in options] + ) self.logger.info(header) for name, task_class in zip(Manager.stages, Manager.task_order): tasks = self.get_subtasks(task_class, all_tasks) @@ -278,12 +349,14 @@ def print_dashboard(self): self.logger.info("-------------------") try: - with open(self.dashboard, 'w') as f: + with open(self.dashboard, "w") as f: f.write("-------------------\n") f.write("CURRENT TASK STATUS\n") options = ["WAITING", "RUNNING", "DONE", "FAILED", "BLOCKED"] - header = "Key: " + " ".join([self.get_string_with_colour(o, o.lower()) for o in options]) + header = "Key: " + " ".join( + [self.get_string_with_colour(o, o.lower()) for o in options] + ) f.write(header + "\n") for name, task_class in zip(Manager.stages, Manager.task_order): tasks = self.get_subtasks(task_class, all_tasks) @@ -305,7 +378,9 @@ def execute(self, check_config, compress_output, uncompress_output): self.logger.info(f"Output will be located in {self.output_dir}") if check_config: self.logger.info("Only verifying config, not launching anything") - assert not (compress_output and uncompress_output), "-C / --compress and -U / --uncompress are mutually exclusive" + assert not ( + compress_output and uncompress_output + ), "-C / --compress and -U / --uncompress are mutually exclusive" # Whilst compressing is being debugged, false by default self.compress = False if compress_output: @@ -324,7 +399,6 @@ def execute(self, check_config, compress_output, uncompress_output): self.num_jobs_queue_gpu = 0 squeue = None - if check_config: if compress_output: self.compress_all() @@ -339,12 +413,16 @@ def execute(self, check_config, compress_output, uncompress_output): max_sleep_time = self.global_config["OUTPUT"]["max_ping_frequency"] current_sleep_time = start_sleep_time - config_file_output = os.path.join(self.output_dir, os.path.basename(self.filename_path)) + config_file_output = os.path.join( + self.output_dir, os.path.basename(self.filename_path) + ) if not check_config and self.filename_path != config_file_output: - self.logger.info(f"Saving processed and parsed config file to {config_file_output}") - with open(config_file_output, 'w') as f: + self.logger.info( + f"Saving processed and parsed config file to {config_file_output}" + ) + with open(config_file_output, "w") as f: f.write(self.file_raw) - #shutil.copy(self.filename_path, config_file_output) + # shutil.copy(self.filename_path, config_file_output) chown_file(config_file_output) # Welcome to the primary loop @@ -362,7 +440,6 @@ def execute(self, check_config, compress_output, uncompress_output): # Submit new jobs if needed while self.num_jobs_queue < self.max_jobs: - t = self.get_task_to_run() if t is not None: self.logger.info("") @@ -381,9 +458,7 @@ def execute(self, check_config, compress_output, uncompress_output): if started: if t.gpu: 
self.num_jobs_queue_gpu += t.num_jobs - message = ( - f"LAUNCHED: {t} with {t.num_jobs} GPU NUM_JOBS. Total GPU NUM_JOBS now {self.num_jobs_queue_gpu}/{self.max_jobs_in_queue_gpu}" - ) + message = f"LAUNCHED: {t} with {t.num_jobs} GPU NUM_JOBS. Total GPU NUM_JOBS now {self.num_jobs_queue_gpu}/{self.max_jobs_in_queue_gpu}" else: self.num_jobs_queue += t.num_jobs message = f"LAUNCHED: {t} with {t.num_jobs} NUM_JOBS. Total NUM_JOBS now {self.num_jobs_queue}/{self.max_jobs_in_queue}" @@ -414,14 +489,24 @@ def execute(self, check_config, compress_output, uncompress_output): current_sleep_time *= 2 if current_sleep_time > max_sleep_time: current_sleep_time = max_sleep_time - p = subprocess.run(f"squeue -h -u $USER -o '%.j'", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.run( + f"squeue -h -u $USER -o '%.j'", + shell=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) if (p.returncode != 0) or (p.stderr != ""): - self.logger.error(f"Command '{p.args}' failed with exit status '{p.returncode}' and error '{p.stderr.strip()}'") + self.logger.error( + f"Command '{p.args}' failed with exit status '{p.returncode}' and error '{p.stderr.strip()}'" + ) else: squeue = [i.strip() for i in p.stdout.splitlines()] n = len(squeue) if n == 0 or n > self.max_jobs: - self.logger.debug(f"Squeue is reporting {n} NUM_JOBS in the queue... this is either 0 or toeing the line as to too many") + self.logger.debug( + f"Squeue is reporting {n} NUM_JOBS in the queue... this is either 0 or close to the maximum allowed" + ) num_errs = self.log_finals() return num_errs @@ -435,9 +520,13 @@ def check_task_completion(self, t, squeue): for task in t.dependencies: self.logger.debug(f"Modifying dependency task {task.name}") task.dependents.remove(t) - #t.dependencies.remove(task) - self.logger.debug(f"Task {task.name} has dependencies: {task.dependencies}") - self.logger.debug(f"Task {task.name} has dependents: {task.dependents}") + # t.dependencies.remove(task) + self.logger.debug( + f"Task {task.name} has dependencies: {task.dependencies}" + ) + self.logger.debug( + f"Task {task.name} has dependents: {task.dependents}" + ) if len(task.dependents) == 0: if self.compress: task.compress() @@ -450,7 +539,9 @@ def check_task_completion(self, t, squeue): self.num_jobs_queue -= t.num_jobs if result == Task.FINISHED_SUCCESS: self.running.remove(t) - self.logger.notice(f"FINISHED: {t} with {t.num_jobs} NUM_JOBS. NUM_JOBS now {self.num_jobs_queue}") + self.logger.notice( + f"FINISHED: {t} with {t.num_jobs} NUM_JOBS.
NUM_JOBS now {self.num_jobs_queue}" + ) self.done.append(t) if self.compress: if len(t.dependents) == 0: diff --git a/pippin/merge.py b/pippin/merge.py index a46d6bef..43eabac6 100644 --- a/pippin/merge.py +++ b/pippin/merge.py @@ -11,7 +11,7 @@ class Merger(Task): - """ Merge fitres files and aggregator output + """Merge fitres files and aggregator output CONFIGURATION: ============== @@ -57,7 +57,10 @@ def __init__(self, name, output_dir, config, dependencies, options): self.suboutput_dir = os.path.join(self.output_dir, "output") self.done_file = os.path.join(self.suboutput_dir, "ALL.DONE") - self.fitres_outdirs = [os.path.join(self.suboutput_dir, os.path.basename(f)) for f in self.lc_fit["fitres_dirs"]] + self.fitres_outdirs = [ + os.path.join(self.suboutput_dir, os.path.basename(f)) + for f in self.lc_fit["fitres_dirs"] + ] self.output["lc_output_dir"] = self.suboutput_dir self.output["fitres_dirs"] = self.fitres_outdirs self.output["genversion"] = self.lc_fit["genversion"] @@ -81,7 +84,9 @@ def get_agg_dep(self): def _check_completion(self, squeue): if os.path.exists(self.done_file): - self.logger.debug(f"Merger finished, see combined fitres at {self.suboutput_dir}") + self.logger.debug( + f"Merger finished, see combined fitres at {self.suboutput_dir}" + ) return Task.FINISHED_SUCCESS else: output_error = False @@ -89,7 +94,9 @@ def _check_completion(self, squeue): with open(self.logfile, "r") as f: for line in f.read().splitlines(): if "ERROR" in line or "ABORT" in line: - self.logger.error(f"Fatal error in combine_fitres. See {self.logfile} for details.") + self.logger.error( + f"Fatal error in combine_fitres. See {self.logfile} for details." + ) output_error = True if output_error: self.logger.info(f"Excerpt: {line}") @@ -98,7 +105,9 @@ def _check_completion(self, squeue): os.remove(self.hash_file) chown_dir(self.output_dir) else: - self.logger.error("Combine task failed with no output log. Please debug") + self.logger.error( + "Combine task failed with no output log. 
Please debug" + ) return Task.FINISHED_FAILURE def add_to_fitres(self, fitres_file, outdir, lcfit, index=0): @@ -121,13 +130,21 @@ def add_to_fitres(self, fitres_file, outdir, lcfit, index=0): try: self.logger.debug(f"Executing command {' '.join(command)}") with open(self.logfile, "w+") as f: - subprocess.run(command, stdout=f, stderr=subprocess.STDOUT, cwd=outdir, check=True) + subprocess.run( + command, + stdout=f, + stderr=subprocess.STDOUT, + cwd=outdir, + check=True, + ) except subprocess.CalledProcessError as e: self.logger.error(f"Error invoking command {command}") raise e else: - self.logger.info("Empty aggregation result found, not invoking combine_fitres.exe") + self.logger.info( + "Empty aggregation result found, not invoking combine_fitres.exe" + ) self.logger.debug(f"Copying file {fitres_file} to {outdir}") shutil.copy(fitres_file, outdir) @@ -139,16 +156,29 @@ def _run(self): self.output["SURVEY_ID"] = self.lc_fit["SURVEY_ID"] fitres_files, symlink_files = [], [] - for index, (fitres_dir, outdir) in enumerate(zip(self.lc_fit["fitres_dirs"], self.fitres_outdirs)): + for index, (fitres_dir, outdir) in enumerate( + zip(self.lc_fit["fitres_dirs"], self.fitres_outdirs) + ): files = os.listdir(fitres_dir) fitres_files += [ - (fitres_dir, outdir, f, index, self.lc_fit["name"]) for f in files if "FITRES" in f and not os.path.islink(os.path.join(fitres_dir, f)) + (fitres_dir, outdir, f, index, self.lc_fit["name"]) + for f in files + if "FITRES" in f and not os.path.islink(os.path.join(fitres_dir, f)) ] symlink_files += [ - (fitres_dir, outdir, f, index, self.lc_fit["name"]) for f in files if "FITRES" in f and os.path.islink(os.path.join(fitres_dir, f)) + (fitres_dir, outdir, f, index, self.lc_fit["name"]) + for f in files + if "FITRES" in f and os.path.islink(os.path.join(fitres_dir, f)) ] - new_hash = self.get_hash_from_string(" ".join([a + b + c + f"{d}" + e for a, b, c, d, e in (fitres_files + symlink_files)])) + new_hash = self.get_hash_from_string( + " ".join( + [ + a + b + c + f"{d}" + e + for a, b, c, d, e in (fitres_files + symlink_files) + ] + ) + ) if self._check_regenerate(new_hash): shutil.rmtree(self.output_dir, ignore_errors=True) self.logger.debug("Regenerating, running combine_fitres") @@ -158,11 +188,18 @@ def _run(self): mkdirs(fitres_dir) for f in fitres_files: if f[1] == fitres_dir: - self.add_to_fitres(os.path.join(f[0], f[2]), f[1], f[4], index=f[3]) + self.add_to_fitres( + os.path.join(f[0], f[2]), f[1], f[4], index=f[3] + ) for s in symlink_files: if s[1] == fitres_dir: - self.logger.debug(f"Creating symlink for {os.path.join(s[1], s[2])} to {os.path.join(s[1], 'FITOPT000.FITRES.gz')}") - os.symlink(os.path.join(s[1], "FITOPT000.FITRES.gz"), os.path.join(s[1], s[2])) + self.logger.debug( + f"Creating symlink for {os.path.join(s[1], s[2])} to {os.path.join(s[1], 'FITOPT000.FITRES.gz')}" + ) + os.symlink( + os.path.join(s[1], "FITOPT000.FITRES.gz"), + os.path.join(s[1], s[2]), + ) self.logger.debug(f"Copying MERGE.LOG") filenames = ["MERGE.LOG", "SUBMIT.INFO"] @@ -192,7 +229,9 @@ def get_tasks(c, prior_tasks, base_output_dir, stage_number, prefix, global_conf lcfit_tasks = Task.get_task_of_type(prior_tasks, SNANALightCurveFit) tasks = [] - def _get_merge_output_dir(base_output_dir, stage_number, merge_name, lcfit_name): + def _get_merge_output_dir( + base_output_dir, stage_number, merge_name, lcfit_name + ): return f"{base_output_dir}/{stage_number}_MERGE/{merge_name}_{lcfit_name}" for name in c.get("MERGE", []): @@ -229,9 +268,21 @@ def 
_get_merge_output_dir(base_output_dir, stage_number, merge_name, lcfit_name) num_gen += 1 merge_name2 = f"{name}_{lcfit.name}" - task = Merger(merge_name2, _get_merge_output_dir(base_output_dir, stage_number, name, lcfit.name), config, [lcfit, agg], options) - Task.logger.info(f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs") + task = Merger( + merge_name2, + _get_merge_output_dir( + base_output_dir, stage_number, name, lcfit.name + ), + config, + [lcfit, agg], + options, + ) + Task.logger.info( + f"Creating merge task {merge_name2} for {lcfit.name} and {agg.name} with {task.num_jobs} jobs" + ) tasks.append(task) if num_gen == 0: - Task.fail_config(f"Merger {name} with mask {mask} matched no combination of aggregators and fits") + Task.fail_config( + f"Merger {name} with mask {mask} matched no combination of aggregators and fits" + ) return tasks diff --git a/pippin/snana_fit.py b/pippin/snana_fit.py index c290ec49..0a5ef7f5 100644 --- a/pippin/snana_fit.py +++ b/pippin/snana_fit.py @@ -30,7 +30,6 @@ class SNANALightCurveFit(ConfigBasedExecutable): """ def __init__(self, name, output_dir, sim_task, config, global_config): - self.config = config self.global_config = global_config @@ -41,11 +40,15 @@ def __init__(self, name, output_dir, sim_task, config, global_config): if self.base_file is None: Task.fail_config(f"Base file {base} cannot be found for task {name}") - super().__init__(name, output_dir, config, self.base_file, " = ", dependencies=[sim_task]) + super().__init__( + name, output_dir, config, self.base_file, " = ", dependencies=[sim_task] + ) self.options = self.config.get("OPTS", {}) - self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})) + self.batch_replace = self.options.get( + "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}) + ) batch_mem = self.batch_replace.get("REPLACE_MEM", None) if batch_mem is not None: self.yaml["CONFIG"]["BATCH_MEM"] = batch_mem @@ -58,7 +61,10 @@ def __init__(self, name, output_dir, sim_task, config, global_config): self.config_path = self.output_dir + "/FIT_" + self.sim_version + ".nml" self.lc_output_dir = os.path.join(self.output_dir, "output") self.lc_log_dir = os.path.join(self.lc_output_dir, "SPLIT_JOBS_LCFIT") - self.fitres_dirs = [os.path.join(self.lc_output_dir, os.path.basename(s)) for s in self.sim_task.output["sim_folders"]] + self.fitres_dirs = [ + os.path.join(self.lc_output_dir, os.path.basename(s)) + for s in self.sim_task.output["sim_folders"] + ] self.logging_file = self.config_path.replace(".nml", ".LOG") self.kill_file = self.config_path.replace(".input", "_KILL.LOG") @@ -95,11 +101,15 @@ def __init__(self, name, output_dir, sim_task, config, global_config): self.logger.debug("Num jobs set by NUM_JOBS option") else: try: - property = self.options.get("BATCH_INFO") or self.yaml["CONFIG"].get("BATCH_INFO") + property = self.options.get("BATCH_INFO") or self.yaml["CONFIG"].get( + "BATCH_INFO" + ) self.num_jobs = int(property.split()[-1]) self.logger.debug("Num jobs set by BATCH_INFO") except Exception: - self.logger.warning("Could not determine BATCH_INFO for job, setting num_jobs to 10") + self.logger.warning( + "Could not determine BATCH_INFO for job, setting num_jobs to 10" + ) self.num_jobs = 10 self.logger.debug("Num jobs set to default") @@ -118,13 +128,19 @@ def validate_fitopts(self, config): potential_path = get_data_loc(f) if potential_path is not None and os.path.exists(potential_path): if has_file: - raise 
ValueError("It seems that you're trying to load in two files for the FITOPTS! Please specify only one file path!") + raise ValueError( + "It seems that you're trying to load in two files for the FITOPTS! Please specify only one file path!" + ) self.logger.debug(f"Loading in fitopts from {potential_path}") y = read_yaml(potential_path) - assert isinstance(y, dict), "New FITOPT format for external files is a yaml dictionary. See global.yml for an example." + assert isinstance( + y, dict + ), "New FITOPT format for external files is a yaml dictionary. See global.yml for an example." has_file = True self.raw_fitopts.append(y) - self.logger.debug(f"Loaded a fitopt dictionary file from {potential_path}") + self.logger.debug( + f"Loaded a fitopt dictionary file from {potential_path}" + ) self.output["fitopt_file"] = potential_path else: assert f.strip().startswith( @@ -134,7 +150,7 @@ def validate_fitopts(self, config): self.raw_fitopts.append(f) def compute_fitopts(self): - """ Runs after the sim/data to locate the survey """ + """Runs after the sim/data to locate the survey""" survey = self.get_sim_dependency()["SURVEY"] @@ -152,14 +168,20 @@ def compute_fitopts(self): self.logger.debug(f"FLAG_USE_SAME_EVENTS: {values}") self.use_same_events = values if key in ["GLOBAL", survey]: - assert isinstance(values, dict), "Fitopt values should be a dict of label: scale command" + assert isinstance( + values, dict + ), "Fitopt values should be a dict of label: scale command" for label, scale_command in values.items(): scale, command = scale_command.split(maxsplit=1) fitopt = f"/{label}/ {command}" - self.logger.debug(f"Adding FITOPT from {key}: {fitopt} for SURVEY: {survey}") + self.logger.debug( + f"Adding FITOPT from {key}: {fitopt} for SURVEY: {survey}" + ) fitopts.append(fitopt) else: - raise ValueError(f"Fitopt item {f} is not a string or dictionary, what on earth is it?") + raise ValueError( + f"Fitopt item {f} is not a string or dictionary, what on earth is it?" 
+ ) # Map the fitopt outputs self.logger.debug(f"USE_SAME_EVENTS: {self.use_same_events}") @@ -175,7 +197,9 @@ def compute_fitopts(self): self.yaml["CONFIG"]["FITOPT"] = fitopts self.output["fitopt_map"] = mapped self.output["fitopt_index"] = mapped2 - self.output["fitres_file"] = os.path.join(self.fitres_dirs[0], mapped["DEFAULT"]) + self.output["fitres_file"] = os.path.join( + self.fitres_dirs[0], mapped["DEFAULT"] + ) def get_sim_dependency(self): for t in self.dependencies: @@ -184,7 +208,11 @@ def get_sim_dependency(self): return None def print_stats(self): - folders = [f for f in os.listdir(self.lc_output_dir) if os.path.isdir(self.lc_output_dir + "/" + f)] + folders = [ + f + for f in os.listdir(self.lc_output_dir) + if os.path.isdir(self.lc_output_dir + "/" + f) + ] if len(folders) > 5: self.logger.debug(f"Have {len(folders)} folders, only showing first five!") folders = folders[:5] @@ -195,9 +223,18 @@ def print_stats(self): if not os.path.exists(full_path): self.logger.info(f"{full_path} not found, seeing if it was gzipped") full_path += ".gz" - data = pd.read_csv(full_path, delim_whitespace=True, comment="#", compression="infer") + data = pd.read_csv( + full_path, delim_whitespace=True, comment="#", compression="infer" + ) d = data.groupby("TYPE").agg(num=("CID", "count")) - self.logger.info("Types: " + (" ".join([f"{k}:{v}" for k, v in zip(d.index, d["num"].values)]))) + self.logger.info( + "Types: " + + ( + " ".join( + [f"{k}:{v}" for k, v in zip(d.index, d["num"].values)] + ) + ) + ) d.to_csv(os.path.join(path, "stats.txt")) except Exception as e: self.logger.error(f"Cannot load {path}") @@ -206,7 +243,7 @@ def print_stats(self): return True def set_snlcinp(self, name, value): - """ Ensures the property name value pair is set in the SNLCINP section. + """Ensures the property name value pair is set in the SNLCINP section. Parameters ---------- @@ -219,7 +256,7 @@ def set_snlcinp(self, name, value): self.set_property(name, value, section_start="&SNLCINP", section_end="&END") def set_fitinp(self, name, value): - """ Ensures the property name value pair is set in the FITINP section. + """Ensures the property name value pair is set in the FITINP section. 
Parameters ---------- @@ -251,14 +288,13 @@ def ensure_quotes_good(self, value): return value def write_nml(self): - # Parse config, first SNLCINP and then FITINP for key, value in self.config.get("SNLCINP", {}).items(): self.set_snlcinp(key, value) for key, value in self.config.get("FITINP", {}).items(): self.set_fitinp(key, value) for key, value in self.options.items(): - #print(key,value) + # print(key,value) self.yaml["CONFIG"][key] = value self.compute_fitopts() @@ -274,8 +310,12 @@ def write_nml(self): if isinstance(self.sim_task, DataPrep): data_path = self.sim_task.output["data_path"] if "SNDATA_ROOT/lcmerge" not in data_path: - self.set_snlcinp("PRIVATE_DATA_PATH", f"'{self.sim_task.output['data_path']}'") - self.set_snlcinp("VERSION_PHOTOMETRY", f"'{self.sim_task.output['genversion']}'") + self.set_snlcinp( + "PRIVATE_DATA_PATH", f"'{self.sim_task.output['data_path']}'" + ) + self.set_snlcinp( + "VERSION_PHOTOMETRY", f"'{self.sim_task.output['genversion']}'" + ) # We want to do our hashing check here string_to_hash = self.get_output_string() @@ -304,28 +344,49 @@ def _run(self): return True self.logger.info(f"Light curve fitting outputting to {self.logging_file}") with open(self.logging_file, "w") as f: - subprocess.run(["submit_batch_jobs.sh", os.path.basename(self.config_path)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir) + subprocess.run( + ["submit_batch_jobs.sh", os.path.basename(self.config_path)], + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + ) return True def kill_and_fail(self): with open(self.kill_file, "w") as f: self.logger.info(f"Killing remaining jobs for {self.name}") - subprocess.run(["submit_batch_jobs.sh", "--kill", os.path.basename(self.config_path)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir) + subprocess.run( + ["submit_batch_jobs.sh", "--kill", os.path.basename(self.config_path)], + stdout=f, + stderr=subprocess.STDOUT, + cwd=self.output_dir, + ) return Task.FINISHED_FAILURE def check_issues(self): log_files = [] + self.log_files if os.path.exists(self.lc_log_dir): - log_files += [os.path.join(self.lc_log_dir, f) for f in os.listdir(self.lc_log_dir) if f.upper().endswith(".LOG")] - - self.scan_files_for_error(log_files, "FATAL ERROR ABORT", "QOSMaxSubmitJobPerUserLimit", "DUE TO TIME LIMIT") + log_files += [ + os.path.join(self.lc_log_dir, f) + for f in os.listdir(self.lc_log_dir) + if f.upper().endswith(".LOG") + ] + + self.scan_files_for_error( + log_files, + "FATAL ERROR ABORT", + "QOSMaxSubmitJobPerUserLimit", + "DUE TO TIME LIMIT", + ) return Task.FINISHED_FAILURE def _check_completion(self, squeue): if os.path.exists(self.done_file): self.logger.info("Light curve done file found") if not os.path.exists(self.logging_file): - self.logger.info(f"{self.logging_file} not found, checking FITOPT existence") + self.logger.info( + f"{self.logging_file} not found, checking FITOPT existence" + ) success = self.print_stats() if not success: return Task.FINISHED_FAILURE @@ -336,7 +397,15 @@ def _check_completion(self, squeue): y = read_yaml(self.merge_log) if "MERGE" in y.keys(): for i, row in enumerate(y["MERGE"]): - state, iver, fitopt, n_all, n_snanacut, n_fitcut, cpu = row + ( + state, + iver, + fitopt, + n_all, + n_snanacut, + n_fitcut, + cpu, + ) = row if cpu < 60: units = "minutes" else: @@ -346,7 +415,9 @@ def _check_completion(self, squeue): f"LCFIT {i + 1} fit {n_all} events. 
{n_snanacut} passed SNANA cuts, {n_fitcut} passed fitcuts, taking {cpu:0.1f} CPU {units}" ) else: - self.logger.error(f"File {self.merge_log} does not have a MERGE section - did it die?") + self.logger.error( + f"File {self.merge_log} does not have a MERGE section - did it die?" + ) return Task.FINISHED_FAILURE if "SURVEY" in y.keys(): self.output["SURVEY"] = y["SURVEY"] @@ -359,7 +430,9 @@ def _check_completion(self, squeue): else: return Task.FINISHED_FAILURE else: - self.logger.debug(f"Done file reporting failure, scanning log files in {self.lc_log_dir}") + self.logger.debug( + f"Done file reporting failure, scanning log files in {self.lc_log_dir}" + ) return self.check_issues() elif not os.path.exists(self.merge_log): self.logger.error("MERGE.LOG was not created, job died on submission") @@ -368,22 +441,40 @@ def _check_completion(self, squeue): return self.check_for_job(squeue, os.path.basename(self.config_path)) @staticmethod - def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config): + def get_tasks( + config, prior_tasks, base_output_dir, stage_number, prefix, global_config + ): tasks = [] - all_deps = Task.match_tasks_of_type(None, prior_tasks, DataPrep, SNANASimulation) + all_deps = Task.match_tasks_of_type( + None, prior_tasks, DataPrep, SNANASimulation + ) for fit_name in config.get("LCFIT", []): num_matches = 0 fit_config = config["LCFIT"][fit_name] mask = fit_config.get("MASK", "") - sim_tasks = Task.match_tasks_of_type(mask, prior_tasks, DataPrep, SNANASimulation) + sim_tasks = Task.match_tasks_of_type( + mask, prior_tasks, DataPrep, SNANASimulation + ) for sim in sim_tasks: num_matches += 1 - fit_output_dir = f"{base_output_dir}/{stage_number}_LCFIT/{fit_name}_{sim.name}" - f = SNANALightCurveFit(f"{fit_name}_{sim.name}", fit_output_dir, sim, fit_config, global_config) - Task.logger.info(f"Creating fitting task {fit_name} with {f.num_jobs} jobs, for simulation {sim.name}") + fit_output_dir = ( + f"{base_output_dir}/{stage_number}_LCFIT/{fit_name}_{sim.name}" + ) + f = SNANALightCurveFit( + f"{fit_name}_{sim.name}", + fit_output_dir, + sim, + fit_config, + global_config, + ) + Task.logger.info( + f"Creating fitting task {fit_name} with {f.num_jobs} jobs, for simulation {sim.name}" + ) tasks.append(f) if num_matches == 0: - Task.fail_config(f"LCFIT task {fit_name} with mask '{mask}' matched no sim_names: {[sim.name for sim in all_deps]}") + Task.fail_config( + f"LCFIT task {fit_name} with mask '{mask}' matched no sim_names: {[sim.name for sim in all_deps]}" + ) return tasks diff --git a/pippin/snana_sim.py b/pippin/snana_sim.py index 3acac0b2..62906040 100644 --- a/pippin/snana_sim.py +++ b/pippin/snana_sim.py @@ -5,11 +5,20 @@ import json from pippin.base import ConfigBasedExecutable -from pippin.config import chown_dir, copytree, mkdirs, get_data_loc, get_hash, read_yaml, get_config +from pippin.config import ( + chown_dir, + copytree, + mkdirs, + get_data_loc, + get_hash, + read_yaml, + get_config, +) from pippin.task import Task + class SNANASimulation(ConfigBasedExecutable): - """ Merge fitres files and aggregator output + """Run a SNANA simulation CONFIGURATION: ============== @@ -37,12 +46,13 @@ class SNANASimulation(ConfigBasedExecutable): blind: bool - whether to blind cosmo results """ - def __init__(self, name, output_dir, config, global_config, combine="combine.input"): + def __init__( + self, name, output_dir, config, global_config, combine="combine.input" + ): self.data_dirs = global_config["DATA_DIRS"] base_file
diff --git a/pippin/snana_sim.py b/pippin/snana_sim.py
index 3acac0b2..62906040 100644
--- a/pippin/snana_sim.py
+++ b/pippin/snana_sim.py
@@ -5,11 +5,20 @@
 import json

 from pippin.base import ConfigBasedExecutable
-from pippin.config import chown_dir, copytree, mkdirs, get_data_loc, get_hash, read_yaml, get_config
+from pippin.config import (
+    chown_dir,
+    copytree,
+    mkdirs,
+    get_data_loc,
+    get_hash,
+    read_yaml,
+    get_config,
+)
 from pippin.task import Task

+
 class SNANASimulation(ConfigBasedExecutable):
-    """ Merge fitres files and aggregator output
+    """Run a SNANA simulation

     CONFIGURATION:
     ==============
@@ -37,12 +46,13 @@ class SNANASimulation(ConfigBasedExecutable):
         blind: bool - whether to blind cosmo results
     """

-    def __init__(self, name, output_dir, config, global_config, combine="combine.input"):
+    def __init__(
+        self, name, output_dir, config, global_config, combine="combine.input"
+    ):
         self.data_dirs = global_config["DATA_DIRS"]
         base_file = get_data_loc(combine)
         super().__init__(name, output_dir, config, base_file, ": ")
-
         # Check for any replacements
         path_sndata_sim = get_config().get("SNANA").get("sim_dir")
         self.logger.debug(f"Setting PATH_SNDATA_SIM to {path_sndata_sim}")
@@ -61,8 +71,10 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp
         self.reserved_top = ["GENVERSION", "GLOBAL", "OPTS", "EXTERNAL"]
         self.config_path = f"{self.output_dir}/{self.genversion}.input"  # Make sure this syncs with the tmp file name
         self.global_config = global_config
-
-        self.batch_replace = self.options.get("BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {}))
+
+        self.batch_replace = self.options.get(
+            "BATCH_REPLACE", self.global_config.get("BATCH_REPLACE", {})
+        )
         batch_mem = self.batch_replace.get("REPLACE_MEM", None)
         if batch_mem is not None:
             self.yaml["CONFIG"]["BATCH_MEM"] = batch_mem
@@ -87,10 +99,14 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp
             d = self.config[k]
             base_file = d.get("BASE")
             if base_file is None:
-                Task.fail_config(f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file")
+                Task.fail_config(
+                    f"Your simulation component {k} for sim name {self.name} needs to specify a BASE input file"
+                )
             base_path = get_data_loc(base_file)
             if base_path is None:
-                Task.fail_config(f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}")
+                Task.fail_config(
+                    f"Cannot find sim component {k} base file at {base_path} for sim name {self.name}"
+                )

             gentype, genmodel = None, None
             with open(base_path) as f:
@@ -102,22 +118,32 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp

                 gentype = gentype or d.get("GENTYPE")
                 if gentype is None:
-                    self.fail_config(f"The simulation component {k} needs to specify a GENTYPE in its input file")
+                    self.fail_config(
+                        f"The simulation component {k} needs to specify a GENTYPE in its input file"
+                    )
                 gentype = int(gentype)
                 genmodel = genmodel or d.get("GENMODEL")

                 if not gentype:
-                    gentype_sublist = self.get_simInput_key_values(base_path, ['GENTYPE'])
+                    gentype_sublist = self.get_simInput_key_values(
+                        base_path, ["GENTYPE"]
+                    )["GENTYPE"]
                     self.logger.debug(f"gentype_sublist: {gentype_sublist}")
                     if len(gentype_sublist) == 0:
-                        Task.fail_config(f"Cannot find GENTYPE for component {k} and base file {base_path}, or any included files")
+                        Task.fail_config(
+                            f"Cannot find GENTYPE for component {k} and base file {base_path}, or any included files"
+                        )
                     else:
                         gentype = gentype_sublist[0]

                 if not genmodel:
-                    genmodel_sublist = self.get_simInput_key_values(base_path, ['GENMODEL'])["GENMODEL"]
+                    genmodel_sublist = self.get_simInput_key_values(
+                        base_path, ["GENMODEL"]
+                    )["GENMODEL"]
                     self.logger.debug(f"genmodel_sublist: {genmodel_sublist}")
                     if len(genmodel_sublist) == 0:
-                        Task.fail_config(f"Cannot find GENMODEL for component {k} and base file {base_path}, or any included files")
+                        Task.fail_config(
+                            f"Cannot find GENMODEL for component {k} and base file {base_path}, or any included files"
+                        )
                     else:
                         genmodel = genmodel_sublist[0]

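The fallback chain above is easy to miss in the wrapping noise: values found in the component's BASE file or YAML block win, and only if those come up empty does the code scan the base file plus its INCLUDE files via get_simInput_key_values. A condensed sketch of that precedence, using plain dicts as stand-ins for the parsed sources:

# Sketch of the GENTYPE/GENMODEL lookup order; all inputs are toy stand-ins.
def resolve_key(key, yaml_component, base_file_value, include_scan):
    value = base_file_value or yaml_component.get(key)
    if not value:
        sublist = include_scan.get(key, [])  # shape of get_simInput_key_values output
        if not sublist:
            raise ValueError(f"Cannot find {key} in base file or any included files")
        value = sublist[0]
    return value


print(resolve_key("GENMODEL", {}, None, {"GENMODEL": ["SALT2"]}))  # SALT2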
r.startswith("RANSEED_") + ] + value = ( + int(self.config["GLOBAL"][rankeys[0]].split(" ")[0]) if rankeys else 1 + ) self.set_num_jobs(2 * value) self.output["blind"] = self.options.get("BLIND", False) @@ -149,7 +181,9 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp # Determine if all the top level input files exist if len(self.base_ia + self.base_cc) == 0: - Task.fail_config("Your sim has no components specified! Please add something to simulate!") + Task.fail_config( + "Your sim has no components specified! Please add something to simulate!" + ) # Try to determine how many jobs will be put in the queue # First see if it's been explicitly set @@ -165,25 +199,37 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp # If its not set, lets check for ranseed_repeat or ranseed_change if batch_info is None: - ranseed_repeat = self.config.get("GLOBAL", {}).get("RANSEED_REPEAT") - ranseed_change = self.config.get("GLOBAL", {}).get("RANSEED_CHANGE") + ranseed_repeat = self.config.get("GLOBAL", {}).get( + "RANSEED_REPEAT" + ) + ranseed_change = self.config.get("GLOBAL", {}).get( + "RANSEED_CHANGE" + ) default = self.yaml.get("CONFIG", {}).get("RANSEED_REPEAT") ranseed = ranseed_repeat or ranseed_change or default if ranseed: num_jobs = int(ranseed.strip().split()[0]) - self.logger.debug(f"Found a randseed with {num_jobs}, deriving batch info") + self.logger.debug( + f"Found a randseed with {num_jobs}, deriving batch info" + ) comps = default_batch_info.strip().split() comps[-1] = str(num_jobs) self.derived_batch_info = " ".join(comps) self.num_jobs = num_jobs - self.logger.debug(f"Num jobs set by RANSEED to {self.num_jobs}") + self.logger.debug( + f"Num jobs set by RANSEED to {self.num_jobs}" + ) else: # self.logger.debug(f"BATCH INFO property detected as {property}") self.num_jobs = int(batch_info.split()[-1]) - self.logger.debug(f"Num jobs set by BATCH_INFO to {self.num_jobs}") + self.logger.debug( + f"Num jobs set by BATCH_INFO to {self.num_jobs}" + ) except Exception: - self.logger.warning(f"Unable to determine how many jobs simulation {self.name} has") + self.logger.warning( + f"Unable to determine how many jobs simulation {self.name} has" + ) self.num_jobs = 1 self.output["genversion"] = self.genversion @@ -201,8 +247,13 @@ def __init__(self, name, output_dir, config, global_config, combine="combine.inp def get_sim_folders(self, base, genversion): if self.output.get("ranseed_change"): num_sims = int(self.output["ranseed_change_val"].split()[0]) - self.logger.debug(f"Detected randseed change with {num_sims} sims, updating sim_folders") - self.sim_folders = [os.path.join(base, genversion) + f"-{i + 1:04d}" for i in range(num_sims)] + self.logger.debug( + f"Detected randseed change with {num_sims} sims, updating sim_folders" + ) + self.sim_folders = [ + os.path.join(base, genversion) + f"-{i + 1:04d}" + for i in range(num_sims) + ] self.logger.debug(f"First sim folder set to {self.sim_folders[0]}") else: @@ -229,7 +280,9 @@ def write_input(self): if k.upper() not in self.reserved_top: run_config = self.config[k] run_config_keys = list(run_config.keys()) - assert "BASE" in run_config_keys, "You must specify a base file for each option" + assert ( + "BASE" in run_config_keys + ), "You must specify a base file for each option" for key in run_config_keys: if key.upper() in self.reserved_keywords: continue @@ -252,7 +305,15 @@ def write_input(self): for key in self.config.get("GLOBAL", []): if key.upper() == "BASE": continue - 
direct_set = ["FORMAT_MASK", "RANSEED_REPEAT", "RANSEED_CHANGE", "BATCH_INFO", "BATCH_MEM", "NGEN_UNIT", "RESET_CIDOFF"] + direct_set = [ + "FORMAT_MASK", + "RANSEED_REPEAT", + "RANSEED_CHANGE", + "BATCH_INFO", + "BATCH_MEM", + "NGEN_UNIT", + "RESET_CIDOFF", + ] if key in direct_set: c[key] = self.config["GLOBAL"][key] else: @@ -304,21 +365,31 @@ def write_input(self): if line.startswith("INPUT_FILE_INCLUDE"): include_file = line.split(":")[-1].strip() include_file_path = get_data_loc(include_file) - self.logger.debug(f"Copying INPUT_FILE_INCLUDE file {include_file_path} to {temp_dir}") + self.logger.debug( + f"Copying INPUT_FILE_INCLUDE file {include_file_path} to {temp_dir}" + ) include_file_basename = os.path.basename(include_file_path) - include_file_output = os.path.join(temp_dir, include_file_basename) + include_file_output = os.path.join( + temp_dir, include_file_basename + ) if include_file_output not in input_copied: - # Copy include file into the temp dir shutil.copy(include_file_path, temp_dir) # Then SED the file to replace the full path with just the basename if include_file != include_file_basename: sed_command = f"sed -i -e 's|{include_file}|{include_file_basename}|g' {copied_path}" - self.logger.debug(f"Running sed command: {sed_command}") - subprocess.run(sed_command, stderr=subprocess.STDOUT, cwd=temp_dir, shell=True) + self.logger.debug( + f"Running sed command: {sed_command}" + ) + subprocess.run( + sed_command, + stderr=subprocess.STDOUT, + cwd=temp_dir, + shell=True, + ) # And make sure we dont do this file again fs.append(include_file_output) @@ -329,7 +400,9 @@ def write_input(self): # Remove any duplicates and order the output files output_files = [f"{temp_dir}/{a}" for a in sorted(os.listdir(temp_dir))] - self.logger.debug(f"{len(output_files)} files used to create simulation. Hashing them.") + self.logger.debug( + f"{len(output_files)} files used to create simulation. Hashing them." 
+        )

         # Get current hash
         new_hash = self.get_hash_from_files(output_files)
@@ -346,7 +419,9 @@ def write_input(self):
                 copytree(temp_dir, self.output_dir)
                 self.save_new_hash(new_hash)
             else:
-                self.logger.error(f"Seems to be an issue with the output dir path: {self.output_dir}")
+                self.logger.error(
+                    f"Seems to be an issue with the output dir path: {self.output_dir}"
+                )

             chown_dir(self.output_dir)
         else:
@@ -355,14 +430,18 @@ def write_input(self):
         return regenerate, new_hash

     def _run(self):
-
         regenerate, new_hash = self.write_input()
         if not regenerate:
             self.should_be_done()
             return True

         with open(self.logging_file, "w") as f:
-            subprocess.run(["submit_batch_jobs.sh", os.path.basename(self.config_path)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir)
+            subprocess.run(
+                ["submit_batch_jobs.sh", os.path.basename(self.config_path)],
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                cwd=self.output_dir,
+            )

         self.logger.info(f"Sim running and logging outputting to {self.logging_file}")
         return True
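write_input above follows the hash-and-skip pattern used across Pippin tasks: hash every staged input file, compare against the hash saved by the previous run, and only resubmit when something changed. The idea in isolation, a minimal sketch with hypothetical helper names (the real methods are get_hash_from_files and save_new_hash):

import hashlib
import os

def hash_files(paths):
    # Stable digest over file contents in sorted order; the exact algorithm
    # Pippin uses is not shown in this diff, md5 here is just illustrative.
    m = hashlib.md5()
    for path in sorted(paths):
        with open(path, "rb") as f:
            m.update(f.read())
    return m.hexdigest()

def needs_regeneration(output_dir, new_hash, hash_file="hash.txt"):
    hash_path = os.path.join(output_dir, hash_file)
    if not os.path.exists(hash_path):
        return True  # first run, nothing saved yet
    with open(hash_path) as f:
        return f.read().strip() != new_hash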
@@ -370,27 +449,43 @@ def _run(self):
     def kill_and_fail(self):
         with open(self.kill_file, "w") as f:
             self.logger.info(f"Killing remaining jobs for {self.name}")
-            subprocess.run(["submit_batch_jobs.sh", "--kill", os.path.basename(self.config_path)], stdout=f, stderr=subprocess.STDOUT, cwd=self.output_dir)
+            subprocess.run(
+                ["submit_batch_jobs.sh", "--kill", os.path.basename(self.config_path)],
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                cwd=self.output_dir,
+            )
         return Task.FINISHED_FAILURE

     def check_issues(self):
         log_files = [self.logging_file]
         if os.path.exists(self.sim_log_dir):
-            log_files += [os.path.join(self.sim_log_dir, f) for f in os.listdir(self.sim_log_dir) if f.upper().endswith(".LOG")]
+            log_files += [
+                os.path.join(self.sim_log_dir, f)
+                for f in os.listdir(self.sim_log_dir)
+                if f.upper().endswith(".LOG")
+            ]
         else:
-            self.logger.warning(f"Warning, sim log dir {self.sim_log_dir} does not exist. Something might have gone terribly wrong")
-        self.scan_files_for_error(log_files, "FATAL ERROR ABORT", "QOSMaxSubmitJobPerUserLimit", "DUE TO TIME LIMIT")
+            self.logger.warning(
+                f"Warning, sim log dir {self.sim_log_dir} does not exist. Something might have gone terribly wrong"
+            )
+        self.scan_files_for_error(
+            log_files,
+            "FATAL ERROR ABORT",
+            "QOSMaxSubmitJobPerUserLimit",
+            "DUE TO TIME LIMIT",
+        )
         return self.kill_and_fail()

     def _check_completion(self, squeue):
-
         if os.path.exists(self.done_file) or not os.path.exists(self.total_summary):
-
             if os.path.exists(self.done_file):
                 self.logger.info(f"Simulation {self.name} found done file!")
                 with open(self.done_file) as f:
                     if "FAIL" in f.read():
-                        self.logger.error(f"Done file {self.done_file} reporting failure")
+                        self.logger.error(
+                            f"Done file {self.done_file} reporting failure"
+                        )
                         return self.check_issues()
             else:
                 self.logger.error("MERGE.LOG was not created, job died on submission")
@@ -400,18 +495,24 @@ def _check_completion(self, squeue):
             y = read_yaml(self.total_summary)
             if "MERGE" in y.keys():
                 for i, row in enumerate(y["MERGE"]):
-                    if len(row) == 6:  # Old version for backward compatibility (before 15/01/2021)
+                    if (
+                        len(row) == 6
+                    ):  # Old version for backward compatibility (before 15/01/2021)
                         state, iver, version, ngen, nwrite, cpu = row
-                    else:  # New MERGE.LOG syntax (after 15/01/2021)
+                    else:  # New MERGE.LOG syntax (after 15/01/2021)
                         state, iver, version, ngen, nwrite, nspec, cpu = row
                     if cpu < 60:
                         units = "minutes"
                     else:
                         cpu = cpu / 60
                         units = "hours"
-                    self.logger.info(f"Simulation {i + 1} generated {ngen} events and wrote {nwrite} to file, taking {cpu:0.1f} CPU {units}")
+                    self.logger.info(
+                        f"Simulation {i + 1} generated {ngen} events and wrote {nwrite} to file, taking {cpu:0.1f} CPU {units}"
+                    )
             else:
-                self.logger.error(f"File {self.total_summary} does not have a MERGE section - did it die?")
+                self.logger.error(
+                    f"File {self.total_summary} does not have a MERGE section - did it die?"
+                )
                 return self.kill_and_fail()
             if "SURVEY" in y.keys():
                 self.output["SURVEY"] = y["SURVEY"]
@@ -424,13 +525,18 @@ def _check_completion(self, squeue):
             self.logger.warning(f"Cannot find {self.total_summary}")

             self.logger.info("Done file found, creating symlinks")
-            s_ends = [os.path.join(self.output_dir, os.path.basename(s)) for s in self.sim_folders]
+            s_ends = [
+                os.path.join(self.output_dir, os.path.basename(s))
+                for s in self.sim_folders
+            ]
             for s, s_end in zip(self.sim_folders, s_ends):
                 if not os.path.exists(s_end):
                     # Check to make sure there isn't a broken symlink at s_end
                     # os.path.exists will return false for broken symlinks, even if one exists
                     if os.path.islink(s_end):
-                        self.logger.error(f"Symlink {s_end} exists and is pointing to a broken or missing directory")
+                        self.logger.error(
+                            f"Symlink {s_end} exists and is pointing to a broken or missing directory"
+                        )
                         return Task.FINISHED_FAILURE
                     else:
                         self.logger.debug(f"Linking {s} -> {s_end}")
@@ -442,7 +548,6 @@ def _check_completion(self, squeue):
         return self.check_for_job(squeue, f"{self.genversion}.input-CPU")

     def get_simInput_key_values(self, sim_input_file, key_list):
-
         # Created Feb 2024 by R.Kessler
         # Example:
         #   input key_list = [ 'GENMODEL', 'GENRANGE_PEAKMJD' ]
@@ -457,56 +562,62 @@ def get_simInput_key_values(self, sim_input_file, key_list):
         # is just a public place to store this utility.
         # .xyz

-
-        INPUT_FILE_LIST = [ sim_input_file ]
+        INPUT_FILE_LIST = [sim_input_file]

         # first find INCLUDE files and append INPUT_FILE_LIST
-        KEYLIST_INCLUDE_FILE = [ 'INPUT_INCLUDE_FILE', 'INPUT_FILE_INCLUDE' ]
-        with open(sim_input_file,"rt") as f:
+        KEYLIST_INCLUDE_FILE = ["INPUT_INCLUDE_FILE", "INPUT_FILE_INCLUDE"]
+        with open(sim_input_file, "rt") as f:
             line_list = f.readlines()
         for line in line_list:
-            line = line.rstrip()  # remove trailine space
+            line = line.rstrip()  # remove trailing space
             wdlist = line.split()
-            if len(wdlist) < 2 : continue
+            if len(wdlist) < 2:
+                continue
             for key in KEYLIST_INCLUDE_FILE:
-                if wdlist[0] == key + ':' :
+                if wdlist[0] == key + ":":
                     inc_file = os.path.expandvars(wdlist[1])
                     INPUT_FILE_LIST.append(inc_file)
-
+
         # - - - - - -
-        # init output dictionary
+        # init output dictionary
         key_dict = {}
-        for key in key_list: key_dict[key] = []
+        for key in key_list:
+            key_dict[key] = []

-        #print(f" xxx util: INPUT_FILE_LIST = {INPUT_FILE_LIST}")
+        # print(f" xxx util: INPUT_FILE_LIST = {INPUT_FILE_LIST}")

         # loop over all of the sim-input files and search for key values
         for inp_file in INPUT_FILE_LIST:
-            f = open(inp_file,"rt")
+            f = open(inp_file, "rt")
             line_list = f.readlines()
             f.close()
-            #print(f" Inspect {inp_file}")
+            # print(f" Inspect {inp_file}")
             for line in line_list:
-                if line.isspace() : continue
-                if line[0:1] == '#' : continue
-                line = line.rstrip()  # remove trailing space
-                line = line.split('#')[0]  # remove comments
+                if line.isspace():
+                    continue
+                if line[0:1] == "#":
+                    continue
+                line = line.rstrip()  # remove trailing space
+                line = line.split("#")[0]  # remove comments
                 wdlist = line.split()
-                if len(wdlist) < 2 : continue
+                if len(wdlist) < 2:
+                    continue
                 for key in key_list:
-                    if wdlist[0] == key + ':' :
-                        args = ' '.join(wdlist[1:])
-                        #print(f" xxx load {key} with {args}")
+                    if wdlist[0] == key + ":":
+                        args = " ".join(wdlist[1:])
+                        # print(f" xxx load {key} with {args}")
                         key_dict[key].append(args)

-        # - - - - - -
+        # - - - - -

         return key_dict
         # end get_simInput_key_values

     @staticmethod
-    def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config):
+    def get_tasks(
+        config, prior_tasks, base_output_dir, stage_number, prefix, global_config
+    ):
         tasks = []
         for sim_name in config.get("SIM", []):
             task_config = config["SIM"][sim_name]
@@ -514,6 +625,8 @@ def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global
             task_config["GENVERSION"] = f"{prefix}_{sim_name}"
             sim_output_dir = f"{base_output_dir}/{stage_number}_SIM/{sim_name}"
             s = SNANASimulation(sim_name, sim_output_dir, task_config, global_config)
-            Task.logger.debug(f"Creating simulation task {sim_name} with {s.num_jobs} jobs, output to {sim_output_dir}")
+            Task.logger.debug(
+                f"Creating simulation task {sim_name} with {s.num_jobs} jobs, output to {sim_output_dir}"
+            )
             tasks.append(s)
         return tasks
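The get_simInput_key_values utility above boils down to simple keyword scanning: a line contributes when its first token is "<KEY>:", with comments stripped and INCLUDE files folded in. A standalone re-implementation of the core rules (minus the INCLUDE recursion) that can be run against a toy input:

import os
import tempfile

def sim_input_key_values(path, key_list):
    # Mirrors the parsing rules above, without INCLUDE-file handling
    key_dict = {key: [] for key in key_list}
    with open(path) as f:
        for line in f:
            if line.isspace() or line.startswith("#"):
                continue
            line = line.split("#")[0].rstrip()  # strip comments and trailing space
            wdlist = line.split()
            if len(wdlist) < 2:
                continue
            for key in key_list:
                if wdlist[0] == key + ":":
                    key_dict[key].append(" ".join(wdlist[1:]))
    return key_dict


with tempfile.NamedTemporaryFile("w", suffix=".input", delete=False) as f:
    f.write("GENMODEL: SALT2\nGENTYPE: 1  # inline comment\n# full-line comment\n")
    path = f.name
print(sim_input_key_values(path, ["GENMODEL", "GENTYPE"]))
# {'GENMODEL': ['SALT2'], 'GENTYPE': ['1']}
os.unlink(path)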
diff --git a/pippin/task.py b/pippin/task.py
index 55182c54..3c30ae5e 100644
--- a/pippin/task.py
+++ b/pippin/task.py
@@ -1,13 +1,22 @@
 import logging
 import shutil
 from abc import ABC, abstractmethod
-from pippin.config import get_logger, get_hash, ensure_list, get_data_loc, read_yaml, compress_dir, uncompress_dir
+from pippin.config import (
+    get_logger,
+    get_hash,
+    ensure_list,
+    get_data_loc,
+    read_yaml,
+    compress_dir,
+    uncompress_dir,
+)
 import tarfile
 import os
 import datetime
 import numpy as np
 import yaml
 import sys
+
 sys.setrecursionlimit(10000)
@@ -16,7 +25,9 @@ class Task(ABC):
     FINISHED_FAILURE = -9
     logger = get_logger()

-    def __init__(self, name, output_dir, dependencies=None, config=None, done_file="done.txt"):
+    def __init__(
+        self, name, output_dir, dependencies=None, config=None, done_file="done.txt"
+    ):
         self.name = name
         self.output_dir = output_dir
         self.num_jobs = 1
@@ -44,16 +55,22 @@ def __init__(self, name, output_dir, dependencies=None, config=None, done_file="
                 if name_match is not None:
                     matching_dirs = [d for d in external_dirs if name_match in d]
                     if len(matching_dirs) == 0:
-                        self.logger.error(f"Task {output_name} has external mapping {name_match} but there were no matching EXTERNAL_DIRS")
+                        self.logger.error(
+                            f"Task {output_name} has external mapping {name_match} but there were no matching EXTERNAL_DIRS"
+                        )
                     else:
                         if len(matching_dirs) > 1:
-                            self.logger.warning(f"Task {output_name} has external mapping {name_match} which matched with multiple EXTERNAL_DIRS: {matching_dirs}. Defaulting to {matching_dirs[0]}")
+                            self.logger.warning(
+                                f"Task {output_name} has external mapping {name_match} which matched with multiple EXTERNAL_DIRS: {matching_dirs}. Defaulting to {matching_dirs[0]}"
+                            )
                         self.logger.info(f"Found external match for {output_name}")
                         self.config["EXTERNAL"] = matching_dirs[0]
                 # If you haven't specified an EXTERNAL_MAP for this output_name, check for exact match
                 elif output_name in external_names:
-                    self.config["EXTERNAL"] = external_dirs[external_names.index(output_name)]
+                    self.config["EXTERNAL"] = external_dirs[
+                        external_names.index(output_name)
+                    ]
                 else:
                     self.logger.info(f"No external match found for {output_name}")
@@ -63,13 +80,19 @@ def __init__(self, name, output_dir, dependencies=None, config=None, done_file="
             self.external = get_data_loc(self.external)
             # External directory might be compressed
             if not os.path.exists(self.external):
-                self.logger.warning(f"External config {self.external} does not exist, checking if it's compressed")
+                self.logger.warning(
+                    f"External config {self.external} does not exist, checking if it's compressed"
+                )
                 compressed_dir = self.external + ".tar.gz"
                 if not os.path.exists(compressed_dir):
-                    self.logger.error(f"{self.external} and {compressed_dir} do not exist")
+                    self.logger.error(
+                        f"{self.external} and {compressed_dir} do not exist"
+                    )
                 else:
                     self.external = compressed_dir
-                    self.logger.debug(f"External config file path resolved to {self.external}")
+                    self.logger.debug(
+                        f"External config file path resolved to {self.external}"
+                    )
                     with tarfile.open(self.external, "r:gz") as tar:
                         for member in tar:
                             if member.isfile():
@@ -82,11 +105,15 @@ def __init__(self, name, output_dir, dependencies=None, config=None, done_file="
                                     conf.update(self.config)
                                     self.config = conf
                                     self.output = external_config.get("OUTPUT", {})
-                                    self.logger.debug("Loaded external config successfully")
+                                    self.logger.debug(
+                                        "Loaded external config successfully"
+                                    )
             else:
                 if os.path.isdir(self.external):
                     self.external = os.path.join(self.external, "config.yml")
-                self.logger.debug(f"External config file path resolved to {self.external}")
+                self.logger.debug(
+                    f"External config file path resolved to {self.external}"
+                )
                 with open(self.external, "r") as f:
                     external_config = yaml.load(f, Loader=yaml.Loader)
                     conf = external_config.get("CONFIG", {})
@@ -113,7 +140,14 @@ def __init__(self, name, output_dir, dependencies=None, config=None, done_file="
         self.force_refresh = False
         self.force_ignore = False

-        self.output.update({"name": name, "output_dir": output_dir, "hash_file": self.hash_file, "done_file": self.done_file})
+        self.output.update(
+            {
+                "name": name,
+                "output_dir": output_dir,
+                "hash_file": self.hash_file,
+                "done_file": self.done_file,
+            }
+        )
         self.config_file = os.path.join(output_dir, "config.yml")

     def add_dependent(self, task):
@@ -145,16 +179,16 @@ def update_header(self, header_dict):
                 self.sbatch_header = self.sbatch_header.replace(key, str(value))
         append_list = header_dict.get("APPEND")
         if append_list is not None:
-            lines = self.sbatch_header.split('\n')
+            lines = self.sbatch_header.split("\n")
             lines += append_list
-            self.sbatch_header = '\n'.join(lines)
+            self.sbatch_header = "\n".join(lines)
         self.logger.debug("Updated header")

     def clean_header(self, header):
-        lines = header.split('\n')
-        mask = lambda x: (len(x) > 0) and (x[0] == '#') and ('Sxxx' not in x)
+        lines = header.split("\n")
+        mask = lambda x: (len(x) > 0) and (x[0] == "#") and ("Sxxx" not in x)
         lines = filter(mask, lines)
-        header = '\n'.join(lines)
+        header = "\n".join(lines)
         return header

     def compress(self):
@@ -175,15 +209,18 @@ def uncompress(self):
             if os.path.exists(source_file):
                 uncompress_dir(os.path.dirname(t.output_dir), source_file)

-
     def _check_regenerate(self, new_hash):
         hash_are_different = new_hash != self.get_old_hash()

         if self.force_ignore:
             if hash_are_different:
-                self.logger.warning(f"Warning, hashes are different for {self}, but force_ignore is True so regenerate=False")
+                self.logger.warning(
+                    f"Warning, hashes are different for {self}, but force_ignore is True so regenerate=False"
+                )
             else:
-                self.logger.debug("Hashes agree and force_ignore is set, returning regenerate=False")
+                self.logger.debug(
+                    "Hashes agree and force_ignore is set, returning regenerate=False"
+                )
             return False
         elif self.force_refresh:
             self.logger.debug("Force refresh is set, returning regenerate=True")
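The regeneration decision above is a three-way priority: force_ignore always wins (never regenerate), force_refresh comes next (always regenerate), and otherwise the stored hash decides. Sketched as a bare function, with the final hash branch assumed from the surrounding logic since the tail of the method falls outside this hunk:

def check_regenerate(force_ignore, force_refresh, old_hash, new_hash):
    if force_ignore:
        return False  # even if hashes differ, as the warning above notes
    if force_refresh:
        return True
    return new_hash != old_hash  # assumed final branch: regenerate on change


assert check_regenerate(True, True, "a", "b") is False
assert check_regenerate(False, True, "a", "a") is True
assert check_regenerate(False, False, "a", "a") is False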
Debug output dir {self.output_dir}" + ) return Task.FINISHED_FAILURE elif self.num_empty > 1 and self.num_empty > self.display_threshold: - self.logger.warning(f"Task {str(self)} has no match for {match} in squeue, warning {self.num_empty}/{self.num_empty_threshold}") + self.logger.warning( + f"Task {str(self)} has no match for {match} in squeue, warning {self.num_empty}/{self.num_empty_threshold}" + ) return 0 return num_jobs @@ -264,7 +305,9 @@ def get_hash_from_files(self, output_files): return new_hash def get_hash_from_string(self, string_to_hash): - hashes = sorted([dep.get_old_hash(quiet=True, required=True) for dep in self.dependencies]) + hashes = sorted( + [dep.get_old_hash(quiet=True, required=True) for dep in self.dependencies] + ) string_to_hash += " ".join(hashes) new_hash = get_hash(string_to_hash) self.logger.debug(f"Current hash set to {new_hash}") @@ -287,27 +330,40 @@ def run(self): if self.external is not None: self.logger.debug(f"Name: {self.name} External: {self.external}") if os.path.exists(self.output_dir) and not self.force_refresh: - self.logger.info(f"Not copying external site, output_dir already exists at {self.output_dir}") + self.logger.info( + f"Not copying external site, output_dir already exists at {self.output_dir}" + ) else: if os.path.exists(self.output_dir): self.logger.debug(f"Removing old directory {self.output_dir}") shutil.rmtree(self.output_dir, ignore_errors=True) if ".tar.gz" in self.external: tardir = os.path.basename(self.external).replace(".tar.gz", "") - self.logger.info(f"Copying files from {self.external} to {self.output_dir}") - - shutil.copyfile(self.external, self.output_dir + '.tar.gz') + self.logger.info( + f"Copying files from {self.external} to {self.output_dir}" + ) + + shutil.copyfile(self.external, self.output_dir + ".tar.gz") self.uncompress() - shutil.move(os.path.join(os.path.dirname(self.output_dir), tardir), self.output_dir) + shutil.move( + os.path.join(os.path.dirname(self.output_dir), tardir), + self.output_dir, + ) else: - self.logger.info(f"Copying from {os.path.dirname(self.external)} to {self.output_dir}") - shutil.copytree(os.path.dirname(self.external), self.output_dir, symlinks=True) + self.logger.info( + f"Copying from {os.path.dirname(self.external)} to {self.output_dir}" + ) + shutil.copytree( + os.path.dirname(self.external), self.output_dir, symlinks=True + ) return True return self._run() def scan_file_for_error(self, path, *error_match, max_lines=10): - assert len(error_match) >= 1, "You need to specify what string to search for. I have nothing." + assert ( + len(error_match) >= 1 + ), "You need to specify what string to search for. I have nothing." 
         found = False
         if not os.path.exists(path):
             self.logger.warning(f"Note, expected log path {path} does not exist")
@@ -324,7 +380,9 @@ def scan_file_for_error(self, path, *error_match, max_lines=10):
                     self.logger.error(f"Excerpt: {line}")
         return found

-    def scan_files_for_error(self, paths, *error_match, max_lines=10, max_erroring_files=3):
+    def scan_files_for_error(
+        self, paths, *error_match, max_lines=10, max_erroring_files=3
+    ):
         num_errors = 0
         self.logger.debug(f"Found {len(paths)} to scan")
         for path in paths:
@@ -333,8 +391,12 @@ def scan_files_for_error(self, paths, *error_match, max_lines=10, max_erroring_f
                 fail_summary = read_yaml(path)
                 for key, dicts in fail_summary.items():
                     if key.startswith("FAILURE-0"):
-                        self.logger.error(f"{key}: {' '.join(dicts.get('ABORT_MESSAGES', 'Unknown message'))}")
-                        self.logger.error(f"{key}: Detailed in {dicts.get('JOB_LOG_FILE', 'Unknown path')}")
+                        self.logger.error(
+                            f"{key}: {' '.join(dicts.get('ABORT_MESSAGES', 'Unknown message'))}"
+                        )
+                        self.logger.error(
+                            f"{key}: Detailed in {dicts.get('JOB_LOG_FILE', 'Unknown path')}"
+                        )
                         num_errors += 1
                     if num_errors > max_erroring_files:
                         break
@@ -363,17 +425,24 @@ def match_tasks(mask, deps, match_none=True, allowed_failure=False):
         for m in mask:
             specific_match = [d for d in matching_deps if m in d.name]
             if len(specific_match) == 0 and not allowed_failure:
-                Task.fail_config(f"Mask '{m}' does not match any deps. Probably a typo. Available options are {deps}")
+                Task.fail_config(
+                    f"Mask '{m}' does not match any deps. Probably a typo. Available options are {deps}"
+                )
         return matching_deps

     @staticmethod
     def match_tasks_of_type(mask, deps, *cls, match_none=True, allowed_failure=False):
-        return Task.match_tasks(mask, Task.get_task_of_type(deps, *cls), match_none=match_none, allowed_failure=allowed_failure)
+        return Task.match_tasks(
+            mask,
+            Task.get_task_of_type(deps, *cls),
+            match_none=match_none,
+            allowed_failure=allowed_failure,
+        )

     @abstractmethod
     def _run(self):
-        """ Execute the primary function of the task
+        """Execute the primary function of the task

         :param force_refresh: to force refresh and rerun - do not pass hash checks
         :return: true or false if the job launched successfully
@@ -391,7 +460,9 @@ def fail_config(message):

     @staticmethod
     @abstractmethod
-    def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config):
+    def get_tasks(
+        config, prior_tasks, base_output_dir, stage_number, prefix, global_config
+    ):
         raise NotImplementedError()

     def get_wall_time_str(self):
@@ -400,7 +471,7 @@ def get_wall_time_str(self):
         return None

     def check_completion(self, squeue):
-        """ Checks if the job has completed.
+        """Checks if the job has completed.

         Invokes `_check_completion` and determines wall time.
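A few hunks back, check_for_job tolerates an empty squeue for several polls before declaring failure, which avoids racing the scheduler between submission and the job actually appearing in the queue. The counting logic, sketched with a plain dict standing in for the Task attributes:

def poll_queue(num_matching_jobs, state):
    # state carries num_empty / num_empty_threshold / display_threshold,
    # mirroring the attributes check_for_job uses above.
    if num_matching_jobs == 0:
        state["num_empty"] += 1
        if state["num_empty"] >= state["num_empty_threshold"]:
            return "FINISHED_FAILURE"
        if state["num_empty"] > 1 and state["num_empty"] > state["display_threshold"]:
            print("warning: still no matching jobs")
        return 0
    return num_matching_jobs


state = {"num_empty": 0, "num_empty_threshold": 3, "display_threshold": 1}
for n in (0, 0, 0):
    print(poll_queue(n, state))  # 0, 0, FINISHED_FAILURE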
@@ -413,28 +484,40 @@ def check_completion(self, squeue):
             if self.start_time is None and os.path.exists(self.hash_file):
                 self.start_time = os.path.getmtime(self.hash_file)
             if self.end_time is not None and self.start_time is not None:
-                self.wall_time = int(self.end_time - self.start_time + 0.5)  # round up
-                self.logger.info(f"Task finished with wall time {self.get_wall_time_str()}")
+                self.wall_time = int(
+                    self.end_time - self.start_time + 0.5
+                )  # round up
+                self.logger.info(
+                    f"Task finished with wall time {self.get_wall_time_str()}"
+                )
             if result == Task.FINISHED_FAILURE:
                 self.clear_hash()
         elif not self.fresh_run:
-            self.logger.error("Hash check had passed, so the task should be done, but it said it wasn't!")
-            self.logger.error(f"This means it probably crashed, have a look in {self.output_dir}")
+            self.logger.error(
+                "Hash check had passed, so the task should be done, but it said it wasn't!"
+            )
+            self.logger.error(
+                f"This means it probably crashed, have a look in {self.output_dir}"
+            )
             self.logger.error(f"Removing hash from {self.hash_file}")
             self.clear_hash()
-            #TODO try rerunning task
+            # TODO try rerunning task
             return Task.FINISHED_FAILURE
-        if self.external is None and result == Task.FINISHED_SUCCESS and not os.path.exists(self.config_file):
+        if (
+            self.external is None
+            and result == Task.FINISHED_SUCCESS
+            and not os.path.exists(self.config_file)
+        ):
             self.write_config()

         return result

     @abstractmethod
     def _check_completion(self, squeue):
-        """ Checks if the job is complete or has failed.
-
-        If it is complete it should also load in the any useful results that
+        """Checks if the job is complete or has failed.
+
+        If it is complete it should also load in any useful results that
         other tasks may need in `self.output` dictionary
-
+
         Such as the location of a trained model or output files.

         :param squeue:
         """
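Putting the abstract interface together: a concrete stage needs _run, _check_completion, and get_tasks. A skeletal, purely hypothetical subclass showing the return-value contract (success/failure sentinels when done, otherwise indications of work still running), not a real Pippin stage:

from pippin.task import Task

class EchoTask(Task):
    """Hypothetical stage used only to illustrate the Task contract."""

    def _run(self):
        # Launch work here; return True if submission succeeded
        self.logger.info(f"{self.name} launched")
        return True

    def _check_completion(self, squeue):
        # Return Task.FINISHED_SUCCESS / Task.FINISHED_FAILURE when done,
        # and put anything downstream tasks need into self.output
        return Task.FINISHED_SUCCESS

    @staticmethod
    def get_tasks(config, prior_tasks, base_output_dir, stage_number, prefix, global_config):
        # One task per entry in a hypothetical ECHO section of the config
        return [
            EchoTask(name, f"{base_output_dir}/{stage_number}_ECHO/{name}")
            for name in config.get("ECHO", [])
        ]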
diff --git a/run.py b/run.py
index 5094d90b..f3dcc6f1 100644
--- a/run.py
+++ b/run.py
@@ -5,11 +5,19 @@
 import coloredlogs
 import signal
 import sys
-from pippin.config import mkdirs, get_logger, get_output_dir, chown_file, get_config, chown_dir
+from pippin.config import (
+    mkdirs,
+    get_logger,
+    get_output_dir,
+    chown_file,
+    get_config,
+    chown_dir,
+)
 from pippin.manager import Manager
 from colorama import init
 import socket

+
 class MessageStore(logging.Handler):
     store = None
@@ -31,7 +39,6 @@ def get_errors(self):


 def setup_logging(config_filename, logging_folder, args):
-
     level = logging.DEBUG if args.verbose else logging.INFO

     logging_filename = f"{logging_folder}/{config_filename}.log"
@@ -56,16 +63,18 @@ def notice(self, message, *args, **kws):
     handlers.append(logging.FileHandler(logging_filename, mode="w"))
     handlers[-1].setLevel(logging.DEBUG)
     handlers[-1].setFormatter(logging.Formatter(fmt_verbose))
-    #logging.basicConfig(level=level, format=fmt, handlers=handlers)
+    # logging.basicConfig(level=level, format=fmt, handlers=handlers)

     for h in handlers:
-        logger.addHandler(h)
+        logger.addHandler(h)

     coloredlogs.install(
         level=level,
         fmt=fmt,
         reconfigure=True,
-        level_styles=coloredlogs.parse_encoded_styles("debug=8;notice=green;warning=yellow;error=red,bold;critical=red,inverse"),
+        level_styles=coloredlogs.parse_encoded_styles(
+            "debug=8;notice=green;warning=yellow;error=red,bold;critical=red,inverse"
+        ),
     )

     logging.getLogger("matplotlib").setLevel(logging.ERROR)
@@ -73,8 +82,9 @@ def notice(self, message, *args, **kws):
     return message_store, logging_filename

+
 def load_yaml(yaml_path):
-    with open(yaml_path, 'r') as f:
+    with open(yaml_path, "r") as f:
         raw = f.read()
     logging.info("Preprocessing yaml")
     yaml_str = preprocess(raw)
@@ -83,12 +93,18 @@ def load_yaml(yaml_path):
     config = yaml.safe_load(yaml_str)
     return yaml_str, config

+
 def preprocess(raw):
     lines = raw.split("\n")
     # Get all lines which start with #
     comment_lines = [line[1:] for line in lines if (len(line) > 0) and (line[0] == "#")]
     # Now get all lines which start and end with %
-    preprocess_lines = [line for line in comment_lines if (len(line.split()) > 0) and (line.split()[0][0] == line.split()[-1][-1] == "%")]
+    preprocess_lines = [
+        line
+        for line in comment_lines
+        if (len(line.split()) > 0)
+        and (line.split()[0][0] == line.split()[-1][-1] == "%")
+    ]
     if len(preprocess_lines) == 0:
         logging.info("No preprocessing found")
         return raw
@@ -105,19 +121,22 @@ def preprocess(raw):
             logging.warning(f"Unknown preprocessing step: {name}, skipping")
     yaml_str = "\n".join(lines)
     return yaml_str
-
+
+
 def preprocess_include(value, lines):
     include_path = os.path.abspath(os.path.expandvars(value[0]))
-    assert os.path.exists(include_path), f"Attempting to include {include_path}, but file cannot be found."
-    with open(include_path, 'r') as f:
+    assert os.path.exists(
+        include_path
+    ), f"Attempting to include {include_path}, but file cannot be found."
+    with open(include_path, "r") as f:
         include_yaml = f.read()
     include_yaml = include_yaml.split("\n")
     index = [i for i, l in enumerate(lines) if value[0] in l][0]
     info = [f"# Anchors included from {include_path}"]
-    return lines[:index] + info + include_yaml + lines[index + 1:]
+    return lines[:index] + info + include_yaml + lines[index + 1 :]
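The preprocessor above only considers comment lines whose first and last words are wrapped in %, so directives can hide from a normal YAML parser; each directive is presumably dispatched by its first word to a handler such as preprocess_include. A sketch of the directive detection on a toy string (the split of the wrapping % tokens is an illustrative tweak, and the directive keyword and path are hypothetical):

def find_directives(raw):
    # A comment line qualifies when its first word starts with '%' and its
    # last word ends with '%', e.g.  "# % include anchors.yml %"
    lines = raw.split("\n")
    comment_lines = [line[1:] for line in lines if line.startswith("#")]
    return [
        line.split()[1:-1]  # drop the wrapping % tokens
        for line in comment_lines
        if line.split() and line.split()[0][0] == line.split()[-1][-1] == "%"
    ]


print(find_directives("# % include anchors.yml %\nSIM: {}"))
# [['include', 'anchors.yml']]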

-def run(args):
+
+def run(args):
     if args is None:
         return None

@@ -127,12 +146,14 @@ def run(args):
     yaml_path = os.path.abspath(os.path.expandvars(args.yaml))
     assert os.path.exists(yaml_path), f"File {yaml_path} cannot be found."
     config_raw, config = load_yaml(yaml_path)
-    #with open(yaml_path, "r") as f:
+    # with open(yaml_path, "r") as f:
     #    config = yaml.safe_load(f)

     overwrites = config.get("GLOBAL")
     if config.get("GLOBALS") is not None:
-        logging.warning("Your config file has a GLOBALS section in it. If you're trying to overwrite cfg.yml, rename this to GLOBAL")
+        logging.warning(
+            "Your config file has a GLOBALS section in it. If you're trying to overwrite cfg.yml, rename this to GLOBAL"
+        )

     cfg = None
     if overwrites:
@@ -154,16 +175,22 @@ def run(args):
     if args.permission:
         return

-    message_store, logging_filename = setup_logging(config_filename, logging_folder, args)
+    message_store, logging_filename = setup_logging(
+        config_filename, logging_folder, args
+    )

     for i, d in enumerate(global_config["DATA_DIRS"]):
         logging.debug(f"Data directory {i + 1} set as {d}")
-        assert d is not None, "Data directory is none, which means it failed to resolve. Check the error message above for why."
+        assert (
+            d is not None
+        ), "Data directory is none, which means it failed to resolve. Check the error message above for why."
+
+    logging.info(
+        f"Running on: {os.environ.get('HOSTNAME', '$HOSTNAME not set')} login node."
+    )

-    logging.info(f"Running on: {os.environ.get('HOSTNAME', '$HOSTNAME not set')} login node.")
-
     manager = Manager(config_filename, yaml_path, config_raw, config, message_store)
-
+
     # Gracefully hand Ctrl-c
     def handler(signum, frame):
         logging.error("Ctrl-c was pressed.")
@@ -173,7 +200,6 @@ def handler(signum, frame):

     signal.signal(signal.SIGINT, handler)

-
     if args.start is not None:
         args.refresh = True
         manager.set_start(args.start)
@@ -185,32 +211,47 @@ def handler(signum, frame):
     chown_file(logging_filename)
     return manager

+
 def get_syntax(options):
     syntax = {}
-    taskname = ["DATAPREP", "SIM", "LCFIT", "CLASSIFY", "AGG", "MERGE", "BIASCOR", "CREATE_COV", "COSMOFIT", "ANALYSE"]
-    syntax["options"] = f"Possible tasks are: ({[(i, task) for i, task in enumerate(taskname)]})"
+    taskname = [
+        "DATAPREP",
+        "SIM",
+        "LCFIT",
+        "CLASSIFY",
+        "AGG",
+        "MERGE",
+        "BIASCOR",
+        "CREATE_COV",
+        "COSMOFIT",
+        "ANALYSE",
+    ]
+    syntax[
+        "options"
+    ] = f"Possible tasks are: ({[(i, task) for i, task in enumerate(taskname)]})"
     if options:
         return syntax

     base = os.path.dirname(os.path.realpath(__file__))
-    with open(f"{base}/README.md", 'r') as f:
+    with open(f"{base}/README.md", "r") as f:
         readme = f.read()
-    lines = readme.split('\n')
+    lines = readme.split("\n")
     start, end = [idx for (idx, line) in enumerate(lines) if "[//]" in line]
     lines = lines[start:end]
-    index = [idx for (idx, line) in enumerate(lines) if "###" == line.split(' ')[0]]
+    index = [idx for (idx, line) in enumerate(lines) if "###" == line.split(" ")[0]]
     tasks = []
     for i in range(len(index)):
         idx = index[i]
         if idx != index[-1]:
-            tasks.append("\n".join(lines[idx+2:index[i+1]-1]))
+            tasks.append("\n".join(lines[idx + 2 : index[i + 1] - 1]))
         else:
-            tasks.append("\n".join(lines[idx+2:-1]))
tasks.append("\n".join(lines[idx + 2 : -1])) for i, name in enumerate(taskname): syntax[name] = tasks[i] return syntax + def print_syntax(s): - syntax = get_syntax(s=="options") + syntax = get_syntax(s == "options") try: keys = list(syntax.keys()) s = int(s) + 1 @@ -225,22 +266,87 @@ def print_syntax(s): print(msg) return None + def get_args(test=False): # Set up command line arguments parser = argparse.ArgumentParser() - parser.add_argument("yaml", help="the name of the yml config file to run. For example: configs/default.yml", type=str, nargs='*') - parser.add_argument("--config", help="Location of global config (i.e. cfg.yml)", default=None, type=str) - parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") - parser.add_argument("-s", "--start", help="Stage to start and force refresh. Accepts either the stage number or name (i.e. 1 or SIM)", default=None) - parser.add_argument("-f", "--finish", help="Stage to finish at (it runs this stage too). Accepts either the stage number or name (i.e. 1 or SIM)", default=None) - parser.add_argument("-r", "--refresh", help="Refresh all tasks, do not use hash", action="store_true") - parser.add_argument("-c", "--check", help="Check if config is valid", action="store_true", default=False) - parser.add_argument("-p", "--permission", help="Fix permissions and groups on all output, don't rerun", action="store_true", default=False) - parser.add_argument("-i", "--ignore", help="Dont rerun tasks with this stage or less. Accepts either the stage number of name (i.e. 1 or SIM)", default=None) - parser.add_argument("-S", "--syntax", help="Get the syntax of the given stage. Accepts either the stage number or name (i.e. 1 or SIM). If run without argument, will tell you all stage numbers / names.", default=None, const="options", type=str, nargs='?') + parser.add_argument( + "yaml", + help="the name of the yml config file to run. For example: configs/default.yml", + type=str, + nargs="*", + ) + parser.add_argument( + "--config", + help="Location of global config (i.e. cfg.yml)", + default=None, + type=str, + ) + parser.add_argument( + "-v", "--verbose", help="increase output verbosity", action="store_true" + ) + parser.add_argument( + "-s", + "--start", + help="Stage to start and force refresh. Accepts either the stage number or name (i.e. 1 or SIM)", + default=None, + ) + parser.add_argument( + "-f", + "--finish", + help="Stage to finish at (it runs this stage too). Accepts either the stage number or name (i.e. 1 or SIM)", + default=None, + ) + parser.add_argument( + "-r", + "--refresh", + help="Refresh all tasks, do not use hash", + action="store_true", + ) + parser.add_argument( + "-c", + "--check", + help="Check if config is valid", + action="store_true", + default=False, + ) + parser.add_argument( + "-p", + "--permission", + help="Fix permissions and groups on all output, don't rerun", + action="store_true", + default=False, + ) + parser.add_argument( + "-i", + "--ignore", + help="Dont rerun tasks with this stage or less. Accepts either the stage number of name (i.e. 1 or SIM)", + default=None, + ) + parser.add_argument( + "-S", + "--syntax", + help="Get the syntax of the given stage. Accepts either the stage number or name (i.e. 1 or SIM). If run without argument, will tell you all stage numbers / names.", + default=None, + const="options", + type=str, + nargs="?", + ) command_group = parser.add_mutually_exclusive_group() - command_group.add_argument("-C", "--compress", help="Compress pippin output during job. 
 def get_args(test=False):
     # Set up command line arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument("yaml", help="the name of the yml config file to run. For example: configs/default.yml", type=str, nargs='*')
-    parser.add_argument("--config", help="Location of global config (i.e. cfg.yml)", default=None, type=str)
-    parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true")
-    parser.add_argument("-s", "--start", help="Stage to start and force refresh. Accepts either the stage number or name (i.e. 1 or SIM)", default=None)
-    parser.add_argument("-f", "--finish", help="Stage to finish at (it runs this stage too). Accepts either the stage number or name (i.e. 1 or SIM)", default=None)
-    parser.add_argument("-r", "--refresh", help="Refresh all tasks, do not use hash", action="store_true")
-    parser.add_argument("-c", "--check", help="Check if config is valid", action="store_true", default=False)
-    parser.add_argument("-p", "--permission", help="Fix permissions and groups on all output, don't rerun", action="store_true", default=False)
-    parser.add_argument("-i", "--ignore", help="Dont rerun tasks with this stage or less. Accepts either the stage number of name (i.e. 1 or SIM)", default=None)
-    parser.add_argument("-S", "--syntax", help="Get the syntax of the given stage. Accepts either the stage number or name (i.e. 1 or SIM). If run without argument, will tell you all stage numbers / names.", default=None, const="options", type=str, nargs='?')
+    parser.add_argument(
+        "yaml",
+        help="the name of the yml config file to run. For example: configs/default.yml",
+        type=str,
+        nargs="*",
+    )
+    parser.add_argument(
+        "--config",
+        help="Location of global config (i.e. cfg.yml)",
+        default=None,
+        type=str,
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="increase output verbosity", action="store_true"
+    )
+    parser.add_argument(
+        "-s",
+        "--start",
+        help="Stage to start and force refresh. Accepts either the stage number or name (i.e. 1 or SIM)",
+        default=None,
+    )
+    parser.add_argument(
+        "-f",
+        "--finish",
+        help="Stage to finish at (it runs this stage too). Accepts either the stage number or name (i.e. 1 or SIM)",
+        default=None,
+    )
+    parser.add_argument(
+        "-r",
+        "--refresh",
+        help="Refresh all tasks, do not use hash",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-c",
+        "--check",
+        help="Check if config is valid",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "-p",
+        "--permission",
+        help="Fix permissions and groups on all output, don't rerun",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "-i",
+        "--ignore",
+        help="Don't rerun tasks with this stage or less. Accepts either the stage number or name (i.e. 1 or SIM)",
+        default=None,
+    )
+    parser.add_argument(
+        "-S",
+        "--syntax",
+        help="Get the syntax of the given stage. Accepts either the stage number or name (i.e. 1 or SIM). If run without argument, will tell you all stage numbers / names.",
+        default=None,
+        const="options",
+        type=str,
+        nargs="?",
+    )
     command_group = parser.add_mutually_exclusive_group()
-    command_group.add_argument("-C", "--compress", help="Compress pippin output during job. Combine with -c / --check in order to compress completed pippin job.", action="store_true", default=False)
-    command_group.add_argument("-U", "--uncompress", help="Do not compress pippin output during job. Combine with -c / --check in order to uncompress completed pippin job. Mutually exclusive with -C / --compress", action="store_true", default=False)
+    command_group.add_argument(
+        "-C",
+        "--compress",
+        help="Compress pippin output during job. Combine with -c / --check in order to compress completed pippin job.",
+        action="store_true",
+        default=False,
+    )
+    command_group.add_argument(
+        "-U",
+        "--uncompress",
+        help="Do not compress pippin output during job. Combine with -c / --check in order to uncompress completed pippin job. Mutually exclusive with -C / --compress",
+        action="store_true",
+        default=False,
+    )

     args = parser.parse_args()
     if args.syntax is not None:
@@ -262,4 +368,4 @@ def get_args(test=False):
     manager = run(args)
     sys.stdout.flush()
     if manager.num_errs > 0:
-        raise(ValueError(f"{manager.num_errs} Errors found"))
+        raise ValueError(f"{manager.num_errs} Errors found")
+ yaml="tests/config_files/valid_classify_sim_with_lcfit.yml", check=True + ) tasks = manager.tasks assert len(tasks) == 3 @@ -152,8 +157,11 @@ def test_classifier_sim_with_opt_lcfit_config_valid(): assert isinstance(deps[0], SNANASimulation) assert isinstance(deps[1], SNANALightCurveFit) + def test_classifier_scone_valid(): - manager = get_manager(yaml="tests/config_files/valid_classify_scone.yml", check=True) + manager = get_manager( + yaml="tests/config_files/valid_classify_scone.yml", check=True + ) tasks = manager.tasks # 1 Sim, 1 LCFit, 4 Scone @@ -166,46 +174,40 @@ def test_classifier_scone_valid(): tests = [ { - 'task': tasks[2], - 'cls': SconeLegacyClassifier, - 'attr': { - 'name': 'LEGACY_SCONE_TRAIN', - 'scone_input_file': None - } + "task": tasks[2], + "cls": SconeLegacyClassifier, + "attr": {"name": "LEGACY_SCONE_TRAIN", "scone_input_file": None}, }, { - 'task': tasks[3], - 'cls': SconeLegacyClassifier, - 'attr': { - 'name': 'LEGACY_SCONE_PREDICT', - 'scone_input_file': None - } + "task": tasks[3], + "cls": SconeLegacyClassifier, + "attr": {"name": "LEGACY_SCONE_PREDICT", "scone_input_file": None}, }, { - 'task': tasks[4], - 'cls': SconeClassifier, - 'attr': { - 'name': 'SCONE_TRAIN', - } + "task": tasks[4], + "cls": SconeClassifier, + "attr": { + "name": "SCONE_TRAIN", + }, }, { - 'task': tasks[5], - 'cls': SconeClassifier, - 'attr': { - 'name': 'SCONE_PREDICT', - } - } + "task": tasks[5], + "cls": SconeClassifier, + "attr": { + "name": "SCONE_PREDICT", + }, + }, ] for test in tests: - task = test['task'] - assert type(task) is test['cls'] - for (attr, val) in test['attr'].items(): + task = test["task"] + assert type(task) is test["cls"] + for attr, val in test["attr"].items(): assert hasattr(task, attr) assert getattr(task, attr) == val -def test_agg_config_valid(): +def test_agg_config_valid(): # This shouldn't raise an error manager = get_manager(yaml="tests/config_files/valid_agg.yml", check=True) tasks = manager.tasks @@ -264,7 +266,9 @@ def test_createcov_config_valid(): manager = get_manager(yaml="tests/config_files/valid_create_cov.yml", check=True) tasks = manager.tasks - assert len(tasks) == 14 # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov) + assert ( + len(tasks) == 14 + ) # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov) assert isinstance(tasks[-1], CreateCov) task = tasks[-1] @@ -281,7 +285,9 @@ def test_cosmomc_config_valid(): manager = get_manager(yaml="tests/config_files/valid_cosmomc.yml", check=True) tasks = manager.tasks - assert len(tasks) == 15 # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov, 1 cosmomc) + assert ( + len(tasks) == 15 + ) # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov, 1 cosmomc) assert isinstance(tasks[-1], CosmoMC) task = tasks[-1] @@ -299,11 +305,15 @@ def test_analyse_config_valid(): manager = get_manager(yaml="tests/config_files/valid_analyse.yml", check=True) tasks = manager.tasks - assert len(tasks) == 16 # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov, 1 cosmomc, 1 analyse) + assert ( + len(tasks) == 16 + ) # (2 sims, 2 lcfit, 4 classifiers, 2 agg, 2 merge, 1 bcor, 1 create cov, 1 cosmomc, 1 analyse) assert isinstance(tasks[-1], AnalyseChains) task = tasks[-1] assert task.output["name"] == "ALL_OMW" - assert len(task.dependencies) == 2 # Create cosmomc for chains, and biascor for hubble diagram + assert ( + len(task.dependencies) == 2 + ) # Create cosmomc for chains, and biascor for hubble diagram assert 
     assert isinstance(task.dependencies[0], (CosmoMC, BiasCor))
     assert isinstance(task.dependencies[1], (CosmoMC, BiasCor))
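New stage tests follow the same shape as the ones above: build a Manager in check mode from a known-good YAML via the same get_manager helper, then assert on task count, types, outputs, and dependencies. A hypothetical template (the YAML path, expected counts, and names are placeholders, not real fixtures):

def test_new_stage_config_valid():
    # Hypothetical template following the pattern above
    manager = get_manager(yaml="tests/config_files/valid_new_stage.yml", check=True)
    tasks = manager.tasks

    assert len(tasks) == 1
    task = tasks[0]
    assert task.output["name"] == "NEW_STAGE_EXAMPLE"
    assert len(task.dependencies) == 0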