path config for a train-test analysis from existing prod (#451)

* path config for a train-test analysis from existing prod * new path of path_model for dl1_to_dl2 step * add pipeline to documentation --------- Co-authored-by: SeiyaNozaki <[email protected]>
cta-observatory · Feb 12, 2024 · 6f35f7b · 6f35f7b
1 parent b351255
commit 6f35f7b
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 0 deletions.
diff --git a/docs/pipeline.rst b/docs/pipeline.rst
@@ -258,3 +258,38 @@ To prepare the lstmcpipe config, you typically want to run **on the cluster**:
 .. code-block::
 
     lstmcpipe_generate_config PathConfigAllSkyFullDL1ab --dec_list dec_2276 --prod_id anothergreatprod --kwargs source_prod_id=whatagreatprod
+
+
+Retrain and apply a model
+=========================
+
+..
+    .. mermaid::
+
+        flowchart LR
+
+        subgraph pa[PROD A]
+            direction TB
+            gamma[DL1b merged gamma training]
+            proton[DL1b merged proton training]
+            gammaps[DL1b gamma testing\n- node 1\n- node  2\n...]
+        end
+
+        gamma & proton --> models
+
+        %% DL1train --> train_pipe((train_pipe))
+
+        models[models B] .-> real-data
+
+        models & gammaps --> DL2-GammaTest[DL2 gamma testing\n- node 1\n- node  2\n...]
+
+
+.. image:: https://mermaid.ink/img/pako:eNqVUstqwzAQ_BUhSEkgNsRHHwoNLr2ktKS5WaFsrI0jsB7IMqWE_HvXUkwa6KXyQePVzKxG0pk3ViIv-bGzX80JfGCbrTDCMBr9cGg9uBNzUL9v36rsaZ8WxiGVxyYoa9hufau2oDXU1WZ1YBp9izJVWPCgjDLtLwPnbbDmjptKf5Gji-sT-2qJfSCSECZjhkKw1Q2ygnCe51cHNHLKlLQPU6sse2SaFF0_EWYzRk3iFuJqRJ9OOZzPb3ixmPhJXaeJrfcsJ5FH6DIJAe5Z1PcaJFpXmyJ7Gf93FIWiFf9LNn58yensNChJd3geewkeTqhR8JKgxCMMXRBcmAtRB0d7wmepgvW8PELX45LDEOzHt2l4GfyAE6lSQHevpyJGzWt6K_HJXH4Ay4i5SA?type=png
+    :target: https://mermaid.live/edit#pako:eNqVUstqwzAQ_BUhSEkgNsRHHwoNLr2ktKS5WaFsrI0jsB7IMqWE_HvXUkwa6KXyQePVzKxG0pk3ViIv-bGzX80JfGCbrTDCMBr9cGg9uBNzUL9v36rsaZ8WxiGVxyYoa9hufau2oDXU1WZ1YBp9izJVWPCgjDLtLwPnbbDmjptKf5Gji-sT-2qJfSCSECZjhkKw1Q2ygnCe51cHNHLKlLQPU6sse2SaFF0_EWYzRk3iFuJqRJ9OOZzPb3ixmPhJXaeJrfcsJ5FH6DIJAe5Z1PcaJFpXmyJ7Gf93FIWiFf9LNn58yensNChJd3geewkeTqhR8JKgxCMMXRBcmAtRB0d7wmepgvW8PELX45LDEOzHt2l4GfyAE6lSQHevpyJGzWt6K_HJXH4Ay4i5SA
+
+The workflow starts from an existing PROD A with merged DL1b datasets, trains a new set of models and applies them to create a new set of DL2.
+
+Example of command to generate such a config:
+
+.. code-block::
+
+    lstmcpipe_generate_config PathConfigAllTrainTestDL1b --dec_list dec_2276 dec_931 --prod_id MY_NEW_PROD --kwargs source_prod_id=PROD-A
+
diff --git a/lstmcpipe/config/paths_config.py b/lstmcpipe/config/paths_config.py
@@ -1043,3 +1043,68 @@ def dl1ab(self):
         # we do only one DL1 test for one dec (dec does not matter, so we take the first one)
         paths.extend(self.test_configs[self.dec_list[0]].dl1ab)
         return paths
+
+
+class PathConfigAllTrainTestDL1b(PathConfigAllSkyFullDL1ab):
+    def __init__(self, prod_id, source_prod_id, dec_list, run_checker=True):
+        """
+        Config for an allsky train-test analysis from an existing source prod.
+        It runs:
+            - train_pipe from existing merged DL1 files (source_prod_id)
+            - dl1_to_dl2 from existing merged DL1 files (source_prod_id)
+        Note that in of source-dependent analysis,
+        missing src-dep parameters are recomputed on the fly during the train stage by lstchain.
+        """
+        super().__init__(prod_id, source_prod_id, dec_list)
+        self.dec_list = dec_list
+        self.source_prod_id = source_prod_id
+        self.source_configs = PathConfigAllSkyFullDL1ab(
+            source_prod_id, source_prod_id, dec_list, run_checker=run_checker
+        )
+        self.target_configs = PathConfigAllSkyFullDL1ab(prod_id, source_prod_id, dec_list, run_checker=run_checker)
+        self.stages = ['train_pipe', 'dl1_to_dl2']
+        if run_checker:
+            self.check_source_prod()
+
+    @property
+    def dl1_to_dl2(self):
+        paths = []
+        for dec in self.dec_list:
+            src_paths = self.source_configs.test_configs[dec].dl1_to_dl2
+            target_paths = self.target_configs.test_configs[dec].dl1_to_dl2
+            new_path = src_paths.copy()
+            for ii, p in enumerate(new_path):
+                new_path[ii]['path_model'] = target_paths[ii]['path_model']
+                new_path[ii]['output'] = target_paths[ii]['output']
+            paths.extend(new_path)
+        return paths
+
+    @property
+    def train_pipe(self):
+        paths = []
+        for dec in self.dec_list:
+            src_paths = self.source_configs.train_configs[dec].train_pipe
+            # print(src_paths)
+            target_paths = self.target_configs.train_configs[dec].train_pipe
+            new_path = src_paths.copy()
+            for ii, _ in enumerate(new_path):
+                new_path[ii]['output'] = target_paths[ii]['output']
+            paths.extend(new_path)
+        return paths
+
+    def check_source_prod(self):
+        """
+        Check that merged dl1 files for training exist in the source prod.
+        Otherwise, remove the dec from the list of decs to be processed and warn the user.
+        """
+
+        dec_to_remove = []
+        for dec in self.dec_list:
+            for path in self.source_configs.train_configs[dec].merge_dl1:
+                # the output from merging must exist to train  the model
+                source_dl1 = Path(path['output'])
+                if not source_dl1.exists():
+                    warnings.warn(f"{source_dl1} does not exist" f"This training will be removed from production.")
+                    dec_to_remove.append(dec)
+
+        self.dec_list = list(set(self.dec_list) - set(dec_to_remove))