model and dataset logging for raytune using CMF (#212)

HewlettPackard · Oct 14, 2024 · e559799 · e559799
1 parent 883fc3e
commit e559799
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 4 deletions.
diff --git a/cmflib/cmf_ray_logger.py b/cmflib/cmf_ray_logger.py
@@ -5,7 +5,7 @@
 class CmfRayLogger(Callback):
     #id_count = 1
 
-    def __init__(self, pipeline_name, file_path, pipeline_stage):
+    def __init__(self, pipeline_name, file_path, pipeline_stage, data_dir = None):
         """
         pipeline_name: The name of the CMF Pipeline
         file_path: The path to metadata file
@@ -16,6 +16,7 @@ def __init__(self, pipeline_name, file_path, pipeline_stage):
         self.pipeline_stage = pipeline_stage
         self.cmf_obj = {}
         self.cmf_run = {}
+        self.data_dir = data_dir
 
     def on_trial_start(self, iteration, trials, trial, **info):
         trial_id = trial.trial_id
@@ -28,6 +29,8 @@ def on_trial_start(self, iteration, trials, trial, **info):
                                             custom_properties = {'Configuration': trial_config})
         #self.execution_id[trial_id] = CmfRayLogger.id_count
         #CmfRayLogger.id_count+=1
+        if self.data_dir:
+            _ = self.cmf_obj[trial_id].log_dataset(url = str(self.data_dir), event = "input")
 
     def on_trial_result(self, iteration, trials, trial, result, **info):
         trial_id = trial.trial_id
@@ -58,6 +61,11 @@ def on_trial_complete(self, iteration, trials, trial, **info):
         _ = self.cmf_obj[trial_id].log_execution_metrics(metrics_name = f"Trial_{trial_id}_Result",
                                       custom_properties = {'Result': trial_result})
 
+        if 'model_path' in trial_result:
+            _ = self.cmf_obj[trial_id].log_model(path = trial_result['model_path'], 
+                                                 event = 'input', 
+                                                 model_name = f"{trial_id}_model")
+
     def on_trial_error(self, iteration, trials, trial, **info):
         trial_id = trial.trial_id
         trial_config = trial.config

diff --git a/docs/api/public/cmf_ray_logger.md b/docs/api/public/cmf_ray_logger.md
@@ -25,11 +25,13 @@ Create an instance of CmfRayLogger by providing the following parameters:
 * pipeline_name: A string representing the name of the CMF pipeline.
 * file_path: The file path to the metadata file associated with the CMF pipeline.
 * pipeline_stage: The name of the current stage of the CMF pipeline.
+* data_dir (optional): Path to the dataset directory that is logged as an input at the start of each trial. If the path is within the CMF directory, it should be relative. If it is outside, it must be an absolute path. Default value is `None`.
 
 Example of instantiation:
 ```python
-logger = cmf_ray_logger.CmfRayLogger(pipeline_name, file_path, pipeline_stage)
+logger = cmf_ray_logger.CmfRayLogger(pipeline_name, file_path, pipeline_stage, data_dir)
 ```
+Here, the `data_dir` argument is used to log the dataset at the start of each trial. Ensure that this path is relative if within the CMF directory and absolute if external to the CMF directory.
 
 ## Integration with Ray Tune
 
@@ -50,8 +52,22 @@ tune.run(
 )
 ```
 
+## Model Logging
+`CmfRayLogger` can now log the model during trials. To enable this, the `train.report` method must include a special key: `"model_path"`. The value of `"model_path"` should be a relative path pointing to the saved model within the CMF directory.
+
+Important: Ensure that the `"model_path"` is relative, as the DVC wrapper expects all paths nested within the CMF directory to be relative.
+```Python
+train.report({
+    "accuracy": 0.95,
+    "loss": 0.05,
+    "model_path": "models/example_model.pth"
+})
+```
+
+
+
 ## Output
-During each trial, `CmfRayLogger` will automatically create a CMF object with attributes set as `pipeline_name`, `pipeline_stage`, and the CMF execution as `trial_id`. It captures the trial's output and logs it under the metric key `'Output'`.
+During each trial, `CmfRayLogger` will automatically create a CMF object with attributes set as `pipeline_name`, `pipeline_stage`, and the CMF execution as `trial_id`. It captures the trial's output and logs it under the metric key `'Output'`. Additionally, it logs the dataset at the start of each trial (if `data_dir` is specified) and logs the model based on the `"model_path"` key in `train.report`.
 
 ## Example
 Here is a complete example of how to use `CmfRayLogger` with Ray Tune:
@@ -61,7 +77,7 @@ from cmf import cmf_ray_logger
 from ray import tune
 
 # Initialize the logger
-logger = cmf_ray_logger.CmfRayLogger("ExamplePipeline", "/path/to/metadata.json", "Stage1")
+logger = cmf_ray_logger.CmfRayLogger("ExamplePipeline", "/path/to/metadata.json", "Stage1", "path/to/data_dir")
 
 # Configuration for tuning
 config = {
@@ -74,4 +90,11 @@ tune.run(
     config=config,
     callbacks=[logger]
 )
+
+# Reporting within your trainable function
+train.report({
+    "accuracy": 0.95,
+    "loss": 0.05,
+    "model_path": "path/to/models/example_model.pth"
+})
 ```