Updating behaviour to abort on invalid config more often to save precious time

dessn · Feb 13, 2020 · 16fc734 · 16fc734
1 parent af08ff9
commit 16fc734
Show file tree

Hide file tree

Showing 19 changed files with 209 additions and 34 deletions.
diff --git a/pippin/analyse.py b/pippin/analyse.py
@@ -124,7 +124,7 @@ def get_slurm_raw(self):
         return base
 
     def add_plot_script_to_run(self, script_name):
-        script_path = get_data_loc(self.plot_code_dir, script_name)
+        script_path = get_data_loc(script_name, extra=self.plot_code_dir)
         self.path_to_codes.append(script_path)
         self.done_files.append(os.path.join(self.output_dir, os.path.basename(script_name).split(".")[0] + ".done"))
 

diff --git a/pippin/biascor.py b/pippin/biascor.py
@@ -1,5 +1,4 @@
 import copy
-import inspect
 import shutil
 import subprocess
 import os
@@ -16,9 +15,7 @@
 
 class BiasCor(ConfigBasedExecutable):
     def __init__(self, name, output_dir, dependencies, options, config, global_config):
-        self.global_config = global_config
-        self.data_dirs = global_config["DATA_DIRS"]
-        base = get_data_loc(self.data_dirs, config.get("BASE", "surveys/des/bbc/bbc_5yr.input"))
+        base = get_data_loc(config.get("BASE", "surveys/des/bbc/bbc_5yr.input"))
 
         super().__init__(name, output_dir, base, "=", dependencies=dependencies)
 

diff --git a/pippin/classifiers/classifier.py b/pippin/classifiers/classifier.py
@@ -1,7 +1,7 @@
 import os
 from abc import abstractmethod
 
-from pippin.config import get_output_loc
+from pippin.config import get_output_loc, get_data_loc
 from pippin.dataprep import DataPrep
 from pippin.task import Task
 from pippin.snana_sim import SNANASimulation
@@ -85,6 +85,19 @@ def get_simulation_dependency(self):
                 return t
         return None
 
+    def validate_model(self):
+        if self.mode == Classifier.PREDICT:
+            model = self.options.get("MODEL")
+            if model is None:
+                Task.fail_config(f"Classifier {self.name} is in predict mode but does not have a model specified")
+            model_classifier = self.get_model_classifier()
+            if model_classifier is not None and model_classifier.name == model:
+                return True
+            path = get_data_loc(model)
+            if not os.path.exists(path):
+                Task.fail_config(f"Classifier {self.name} does not have a classifier dependency and model is not a serialised file path")
+        return True
+
     def get_model_classifier(self):
         for t in self.dependencies:
             if isinstance(t, Classifier):
@@ -142,6 +155,8 @@ def get_num_ranseed(sim_task, lcfit_task):
             name = config["CLASSIFIER"]
             cls = ClassifierFactory.get(name)
             options = config.get("OPTS", {})
+            if "MODE" not in config:
+                Task.fail_config(f"Classifier task {clas_name} needs to specify MODE as train or predict")
             mode = config["MODE"].lower()
             assert mode in ["train", "predict"], "MODE should be either train or predict"
             if mode == "train":
@@ -166,6 +181,7 @@ def get_num_ranseed(sim_task, lcfit_task):
             mask_sim = config.get("MASK_SIM", "")
             mask_fit = config.get("MASK_FIT", "")
             for s, l in runs:
+
                 sim_name = s.name if s is not None else None
                 fit_name = l.name if l is not None else None
                 matched_sim = True
@@ -200,7 +216,6 @@ def get_num_ranseed(sim_task, lcfit_task):
                         assert (
                             len(folders) == 1
                         ), f"Training requires one version of the lcfits, you have {len(folders)} for lcfit task {l}. Make sure your training sim doesn't set RANSEED_CHANGE"
-
                 if model is not None:
                     if "/" in model or "." in model:
                         potential_path = get_output_loc(model)
@@ -253,6 +268,5 @@ def get_num_ranseed(sim_task, lcfit_task):
                         tasks.append(cc)
 
             if num_gen == 0:
-                Task.logger.error(f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits")
-                return None  # This should cause pippin to crash, which is probably what we want
+                Task.fail_config(f"Classifier {clas_name} with masks |{mask}|{mask_sim}|{mask_fit}| matched no combination of sims and fits")
         return tasks
diff --git a/pippin/classifiers/nearest_neighbor_python.py b/pippin/classifiers/nearest_neighbor_python.py
@@ -57,6 +57,7 @@ def __init__(self, name, output_dir, dependencies, mode, options, index=0, model
 
         self.output["predictions_filename"] = self.predictions_filename
         self.output["model_filename"] = self.output_pk_file
+        self.validate_model()
 
         self.slurm = """#!/bin/bash
 #SBATCH --job-name={job_name}

diff --git a/pippin/classifiers/snirf.py b/pippin/classifiers/snirf.py
@@ -47,7 +47,8 @@ def __init__(self, name, output_dir, dependencies, mode, options, index=0, model
         self.path_to_classifier = get_output_loc(self.global_config["SNIRF"]["location"])
         self.job_base_name = os.path.basename(Path(output_dir).parents[1]) + "__" + os.path.basename(output_dir)
         self.features = options.get("FEATURES", "x1 c zHD x1ERR cERR PKMJDERR")
-        # self.model_pk_file = self.get_unique_name() + ".pkl"
+        self.validate_model()
+
         self.model_pk_file = "model.pkl"
         self.output_pk_file = os.path.join(self.output_dir, self.model_pk_file)
         self.fitopt = options.get("FITOPT", "DEFAULT")

diff --git a/pippin/classifiers/supernnova.py b/pippin/classifiers/supernnova.py
@@ -49,6 +49,8 @@ def __init__(self, name, output_dir, dependencies, mode, options, index=0, model
         self.variant = options.get("VARIANT", "vanilla").lower()
         self.redshift = "zspe" if options.get("REDSHIFT", True) else "none"
         self.norm = options.get("NORM", "global")
+        self.validate_model()
+
         assert self.norm in ["global", "cosmo", "perfilter"], f"Norm option is set to {self.norm}, needs to be one of 'global', 'cosmo', 'perfilter'"
         assert self.variant in ["vanilla", "variational", "bayesian"], f"Variant {self.variant} is not vanilla, variational or bayesian"
         self.slurm = """#!/bin/bash

diff --git a/pippin/config.py b/pippin/config.py
@@ -47,7 +47,7 @@ def get_config(initial_path=None, overwrites=None):
         config = merge_dict(config, overwrites)
 
     for i, path in enumerate(config["DATA_DIRS"]):
-        updated = get_data_loc([this_dir], path)
+        updated = get_data_loc(path, extra=this_dir)
         if updated is None:
             logging.error(f"Data dir {path} cannot be resolved!")
             assert updated is not None
@@ -67,9 +67,13 @@ def get_output_dir():
     return output_dir
 
 
-def get_data_loc(data_dirs, path):
-    if not isinstance(data_dirs, list):
-        data_dirs = [data_dirs]
+def get_data_loc(path, extra=None):
+    if extra is None:
+        data_dirs = get_config()["DATA_DIRS"]
+        if not isinstance(data_dirs, list):
+            data_dirs = [data_dirs]
+    else:
+        data_dirs = [extra]
     if "$" in path:
         path = os.path.expandvars(path)
         if "$" in path:

diff --git a/pippin/cosmomc.py b/pippin/cosmomc.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 import numpy as np
 
-from pippin.config import mkdirs, get_config, get_output_loc, get_data_loc
+from pippin.config import mkdirs, get_output_loc, get_data_loc
 from pippin.create_cov import CreateCov
 from pippin.task import Task
 
@@ -39,7 +39,6 @@ class CosmoMC(Task):  # TODO: Define the location of the output so we can run th
     """
 
     def __init__(self, name, output_dir, options, global_config, dependencies=None):
-        self.data_dirs = global_config["DATA_DIRS"]
         super().__init__(name, output_dir, dependencies=dependencies)
         self.options = options
         self.global_config = global_config
@@ -198,7 +197,7 @@ def _run(self, force_refresh):
         if self.static:
             self.logger.info("CMB only constraints detected, copying static files")
 
-            cosmomc_static_loc = get_data_loc(self.data_dirs, self.static_path + self.ini_prefix)
+            cosmomc_static_loc = get_data_loc(self.static_path + self.ini_prefix)
             if cosmomc_static_loc is None:
                 self.logger.error("Seems like we can't find the static chains...")
                 return False

diff --git a/pippin/create_cov.py b/pippin/create_cov.py
@@ -37,9 +37,8 @@ class CreateCov(ConfigBasedExecutable):
     """
 
     def __init__(self, name, output_dir, options, global_config, dependencies=None, index=0):
-        self.data_dirs = global_config["DATA_DIRS"]
 
-        base_file = get_data_loc(self.data_dirs, "create_cov/input_file.txt")
+        base_file = get_data_loc("create_cov/input_file.txt")
         super().__init__(name, output_dir, base_file, default_assignment=": ", dependencies=dependencies)
 
         self.options = options
@@ -49,7 +48,7 @@ def __init__(self, name, output_dir, options, global_config, dependencies=None,
         self.path_to_code = os.path.abspath(os.path.dirname(inspect.stack()[0][1]) + "/external")
 
         self.logfile = os.path.join(self.output_dir, "output.log")
-        self.sys_file_in = get_data_loc(self.data_dirs, options.get("SYS_SCALE", "create_cov/sys_scale.LIST"))
+        self.sys_file_in = get_data_loc(options.get("SYS_SCALE", "create_cov/sys_scale.LIST"))
         self.sys_file_out = os.path.join(self.output_dir, "sys_scale.LIST")
         self.chain_dir = os.path.join(self.output_dir, "chains/")
         self.config_dir = os.path.join(self.output_dir, "output")
@@ -94,7 +93,7 @@ def _check_completion(self, squeue):
 
     def calculate_input(self):
         self.logger.debug(f"Calculating input")
-        self.set_property("COSMOMC_TEMPLATES", get_data_loc(self.data_dirs, "cosmomc_templates"))
+        self.set_property("COSMOMC_TEMPLATES", get_data_loc("cosmomc_templates"))
         self.set_property("BASEOUTPUT", self.name)
         self.set_property("SYSFILE", self.sys_file_out)
         self.set_property("TOPDIR", self.biascor_dep.output["fit_output_dir"])

diff --git a/pippin/dataprep.py b/pippin/dataprep.py
@@ -27,7 +27,6 @@ class DataPrep(Task):  # TODO: Define the location of the output so we can run t
     """
 
     def __init__(self, name, output_dir, options, global_config, dependencies=None):
-        self.data_dirs = global_config["DATA_DIRS"]
         super().__init__(name, output_dir, dependencies=dependencies)
         self.options = options
         self.global_config = get_config()
@@ -36,7 +35,7 @@ def __init__(self, name, output_dir, options, global_config, dependencies=None):
         self.conda_env = self.global_config["DataSkimmer"]["conda_env"]
         self.path_to_task = output_dir
 
-        self.raw_dir = get_data_loc(self.data_dirs, self.options.get("RAW_DIR"))
+        self.raw_dir = get_data_loc(self.options.get("RAW_DIR"))
         if self.raw_dir is None:
             Task.fail_config(f"Unable to find {self.options.get('RAW_DIR')}")
 

diff --git a/pippin/snana_fit.py b/pippin/snana_fit.py
@@ -1,4 +1,3 @@
-import inspect
 import os
 import shutil
 import subprocess
@@ -31,15 +30,14 @@ class SNANALightCurveFit(ConfigBasedExecutable):
     """
 
     def __init__(self, name, output_dir, sim_task, config, global_config):
-        self.data_dirs = global_config["DATA_DIRS"]
 
         self.config = config
         self.global_config = global_config
 
         base = config.get("BASE")
         if base is None:
             Task.fail_config(f"You have not specified a BASE nml file for task {name}")
-        self.base_file = get_data_loc(self.data_dirs, base)
+        self.base_file = get_data_loc(base)
         if self.base_file is None:
             Task.fail_config(f"Base file {base} cannot be found for task {name}")
 
@@ -80,7 +78,7 @@ def __init__(self, name, output_dir, sim_task, config, global_config):
         self.logger.debug("Loading fitopts")
         self.fitopts = []
         for f in fitopts:
-            potential_path = get_data_loc(self.data_dirs, f)
+            potential_path = get_data_loc(f)
             if os.path.exists(potential_path):
                 self.logger.debug(f"Loading in fitopts from {potential_path}")
                 with open(potential_path) as f:
@@ -205,8 +203,6 @@ def write_nml(self, force_refresh):
 
         # We want to do our hashing check here
         string_to_hash = self.fitopts + self.base
-        # with open(os.path.abspath(inspect.stack()[0][1]), "r") as f:
-        #     string_to_hash += f.read()
         new_hash = self.get_hash_from_string("".join(string_to_hash))
         old_hash = self.get_old_hash()
         regenerate = force_refresh or (old_hash is None or old_hash != new_hash)

diff --git a/pippin/snana_sim.py b/pippin/snana_sim.py
@@ -42,7 +42,7 @@ class SNANASimulation(ConfigBasedExecutable):
 
     def __init__(self, name, output_dir, genversion, config, global_config, combine="combine.input"):
         self.data_dirs = global_config["DATA_DIRS"]
-        base_file = get_data_loc(self.data_dirs, combine)
+        base_file = get_data_loc(combine)
         super().__init__(name, output_dir, base_file, ": ")
 
         self.genversion = genversion
@@ -76,7 +76,7 @@ def __init__(self, name, output_dir, genversion, config, global_config, combine=
         if len(self.base_ia + self.base_cc) == 0:
             Task.fail_config("Your sim has no components specified! Please add something to simulate!")
         for file in self.base_ia + self.base_cc:
-            if get_data_loc(self.data_dirs, file) is None:
+            if get_data_loc(file) is None:
                 Task.fail_config(f"Cannot find file {file} specified in simulation {self.name}")
 
         # Try to determine how many jobs will be put in the queue
@@ -168,7 +168,7 @@ def write_input(self, force_refresh):
         # Copy the base files across
         input_paths = []
         for f in self.base_ia + self.base_cc:
-            resolved = get_data_loc(self.data_dirs, f)
+            resolved = get_data_loc(f)
             shutil.copy(resolved, temp_dir)
             input_paths.append(os.path.join(temp_dir, os.path.basename(f)))
             self.logger.debug(f"Copying input file {resolved} to {temp_dir}")
@@ -181,14 +181,14 @@ def write_input(self, force_refresh):
         for ff in fs:
             if ff not in input_copied:
                 input_copied.append(ff)
-                path = get_data_loc(self.data_dirs, ff)
+                path = get_data_loc(ff)
                 copied_path = os.path.join(temp_dir, os.path.basename(path))
                 with open(path, "r") as f:
                     for line in f.readlines():
                         line = line.strip()
                         if line.startswith("INPUT_FILE_INCLUDE"):
                             include_file = line.split(":")[-1].strip()
-                            include_file_path = get_data_loc(self.data_dirs, include_file)
+                            include_file_path = get_data_loc(include_file)
                             self.logger.debug(f"Copying INPUT_FILE_INCLUDE file {include_file_path} to {temp_dir}")
 
                             include_file_basename = os.path.basename(include_file_path)

diff --git a/tests/config_files/fail_classify1.yml b/tests/config_files/fail_classify1.yml
@@ -0,0 +1,21 @@
+SIM:
+  ASIM:
+    IA_G10_DES3YR:
+      BASE: surveys/sdss/sims_ia/sn_ia_g10_sdss_3yr.input
+    II:
+      BASE: surveys/sdss/sims_cc/sn_ii_templates.input
+    Ibc:
+      BASE: surveys/sdss/sims_cc/sn_ibc_templates.input
+    GLOBAL:
+      NGEN_UNIT: 1
+      RANSEED_REPEAT: 10 12345
+      SOLID_ANGLE: 10
+
+LCFIT:
+  D:
+    BASE: surveys/des/lcfit_nml/des_5yr.nml
+
+CLASSIFICATION:
+  FITPROBTEST:
+    CLASSIFIER: FitProbClassifier  # No mode, no work
+
diff --git a/tests/config_files/fail_classify2.yml b/tests/config_files/fail_classify2.yml
@@ -0,0 +1,22 @@
+SIM:
+  ASIM:
+    IA_G10_DES3YR:
+      BASE: surveys/sdss/sims_ia/sn_ia_g10_sdss_3yr.input
+    II:
+      BASE: surveys/sdss/sims_cc/sn_ii_templates.input
+    Ibc:
+      BASE: surveys/sdss/sims_cc/sn_ibc_templates.input
+    GLOBAL:
+      NGEN_UNIT: 1
+      RANSEED_REPEAT: 10 12345
+      SOLID_ANGLE: 10
+
+LCFIT:
+  D:
+    BASE: surveys/des/lcfit_nml/des_5yr.nml
+
+CLASSIFICATION:
+  FITPROBTEST:
+    CLASSIFIER: FitProbClassifier  # MODE is set, but MASK_SIM below is meant to match no sim, so no tasks are generated
+    MODE: predict
+    MASK_SIM: NOTHING
diff --git a/tests/config_files/fail_classify3.yml b/tests/config_files/fail_classify3.yml
@@ -0,0 +1,21 @@
+SIM:
+  ASIM:
+    IA_G10_DES3YR:
+      BASE: surveys/sdss/sims_ia/sn_ia_g10_sdss_3yr.input
+    II:
+      BASE: surveys/sdss/sims_cc/sn_ii_templates.input
+    Ibc:
+      BASE: surveys/sdss/sims_cc/sn_ibc_templates.input
+    GLOBAL:
+      NGEN_UNIT: 1
+      RANSEED_REPEAT: 10 12345
+      SOLID_ANGLE: 10
+
+LCFIT:
+  D:
+    BASE: surveys/des/lcfit_nml/des_5yr.nml
+
+CLASSIFICATION:
+  NNTRAIN:
+    CLASSIFIER: NearestNeighborPyClassifier
+    MODE: predict
diff --git a/tests/config_files/fail_classify4.yml b/tests/config_files/fail_classify4.yml
@@ -0,0 +1,23 @@
+SIM:
+  ASIM:
+    IA_G10_DES3YR:
+      BASE: surveys/sdss/sims_ia/sn_ia_g10_sdss_3yr.input
+    II:
+      BASE: surveys/sdss/sims_cc/sn_ii_templates.input
+    Ibc:
+      BASE: surveys/sdss/sims_cc/sn_ibc_templates.input
+    GLOBAL:
+      NGEN_UNIT: 1
+      RANSEED_REPEAT: 10 12345
+      SOLID_ANGLE: 10
+
+LCFIT:
+  D:
+    BASE: surveys/des/lcfit_nml/des_5yr.nml
+
+CLASSIFICATION:
+  NNTRAIN:
+    CLASSIFIER: NearestNeighborPyClassifier  # MODE is set, but MODEL below is neither a classifier task nor an existing file
+    MODE: predict
+    OPTS:
+      MODEL: nothing