refactor(components): De-hardcoded local output paths. (kubeflow#580)

* Components - De-hardcoded local output paths.
* pip install pathlib2
* Added component.yaml changes
* The Dataflow components have been deleted
Bobgy · Sep 4, 2020 · 4c9a8c0 · 4c9a8c0
1 parent e275548
commit 4c9a8c0
Show file tree

Hide file tree

Showing 15 changed files with 85 additions and 162 deletions.
diff --git a/components/deprecated/dataflow/predict/component.yaml b/components/deprecated/dataflow/predict/component.yaml
diff --git a/components/deprecated/dataflow/tfdv/component.yaml b/components/deprecated/dataflow/tfdv/component.yaml
diff --git a/components/deprecated/dataflow/tfma/component.yaml b/components/deprecated/dataflow/tfma/component.yaml
diff --git a/components/deprecated/dataflow/tft/component.yaml b/components/deprecated/dataflow/tft/component.yaml
diff --git a/components/deprecated/dataproc/analyze/src/analyze.py b/components/deprecated/dataproc/analyze/src/analyze.py
@@ -25,6 +25,7 @@
 
 import argparse
 import os
+from pathlib import Path
 
 from common import _utils
 
@@ -37,6 +38,10 @@ def main(argv=None):
   parser.add_argument('--output', type=str, help='GCS path to use for output.')
   parser.add_argument('--train', type=str, help='GCS path of the training csv file.')
   parser.add_argument('--schema', type=str, help='GCS path of the json schema file.')
+  parser.add_argument('--output-dir-uri-output-path',
+                      type=str,
+                      default='/output.txt',
+                      help='Local output path for the file containing the output dir URI.')
   args = parser.parse_args()
 
   code_path = os.path.dirname(os.path.realpath(__file__))
@@ -50,8 +55,8 @@ def main(argv=None):
         api, args.project, args.region, args.cluster, dest_files[0], spark_args)
     print('Job request submitted. Waiting for completion...')
     _utils.wait_for_job(api, args.project, args.region, job_id)
-    with open('/output.txt', 'w') as f:
-      f.write(args.output)
+    Path(args.output_dir_uri_output_path).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.output_dir_uri_output_path).write_text(args.output)
 
     print('Job completed.')
   finally:

diff --git a/components/deprecated/dataproc/base/Dockerfile b/components/deprecated/dataproc/base/Dockerfile
@@ -21,7 +21,7 @@ RUN easy_install pip
 
 RUN pip install google-api-python-client==1.6.2
 
-RUN pip install tensorflow==1.6.0
+RUN pip install tensorflow==1.6.0 pathlib2
 
 RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.zip && \
     unzip -qq google-cloud-sdk.zip -d tools && \

diff --git a/components/deprecated/dataproc/create_cluster/src/create_cluster.py b/components/deprecated/dataproc/create_cluster/src/create_cluster.py
@@ -22,6 +22,7 @@
 
 import argparse
 import os
+from pathlib import Path
 
 from common import _utils
 
@@ -32,6 +33,10 @@ def main(argv=None):
   parser.add_argument('--region', type=str, help='Which zone for GCE VMs.')
   parser.add_argument('--name', type=str, help='The name of the cluster to create.')
   parser.add_argument('--staging', type=str, help='GCS path to use for staging.')
+  parser.add_argument('--output-dir-uri-output-path',
+                      type=str,
+                      default='/output.txt',
+                      help='Local output path for the file containing the cluster name.')
   args = parser.parse_args()
 
   code_path = os.path.dirname(os.path.realpath(__file__))
@@ -44,8 +49,8 @@ def main(argv=None):
     create_response = _utils.create_cluster(api, args.project, args.region, args.name, dest_files[0])
     print('Cluster creation request submitted. Waiting for completion...')
     _utils.wait_for_operation(api, create_response['name'])
-    with open('/output.txt', 'w') as f:
-      f.write(args.name)
+    Path(args.output_dir_uri_output_path).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.output_dir_uri_output_path).write_text(args.name)
     print('Cluster created.')
   finally:
     _utils.remove_resources_from_gcs(dest_files)

diff --git a/components/deprecated/dataproc/predict/src/predict.py b/components/deprecated/dataproc/predict/src/predict.py
@@ -32,6 +32,7 @@
 import argparse
 import json
 import os
+from pathlib import Path
 
 from common import _utils
 import logging
@@ -50,6 +51,15 @@ def main(argv=None):
   parser.add_argument('--predict', type=str, help='GCS path of prediction libsvm file.')
   parser.add_argument('--analysis', type=str, help='GCS path of the analysis input.')
   parser.add_argument('--target', type=str, help='Target column name.')
+  parser.add_argument('--prediction-results-uri-pattern-output-path',
+                      type=str,
+                      default='/output.txt',
+                      help='Local output path for the file containing prediction results URI pattern.')
+  parser.add_argument('--ui-metadata-output-path',
+                      type=str,
+                      default='/mlpipeline-ui-metadata.json',
+                      help='Local output path for the file containing UI metadata JSON structure.')
+
   args = parser.parse_args()
 
   logging.getLogger().setLevel(logging.INFO)
@@ -61,9 +71,9 @@ def main(argv=None):
       'ml.dmlc.xgboost4j.scala.example.spark.XGBoostPredictor', spark_args)
   logging.info('Job request submitted. Waiting for completion...')
   _utils.wait_for_job(api, args.project, args.region, job_id)
-  prediction_results = os.path.join(args.output, 'part-*.csv')
-  with open('/output.txt', 'w') as f:
-    f.write(prediction_results)
+  prediction_results_uri_pattern = os.path.join(args.output, 'part-*.csv')
+  Path(args.prediction_results_uri_pattern_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.prediction_results_uri_pattern_output_path).write_text(prediction_results_uri_pattern)
 
   with file_io.FileIO(os.path.join(args.output, 'schema.json'), 'r') as f:
     schema = json.load(f)
@@ -74,11 +84,11 @@ def main(argv=None):
       'storage': 'gcs',
       'format': 'csv',
       'header': [x['name'] for x in schema],
-      'source': prediction_results
+      'source': prediction_results_uri_pattern
     }]
   }
-  with open('/mlpipeline-ui-metadata.json', 'w') as f:
-    json.dump(metadata, f)
+  Path(args.ui_metadata_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.ui_metadata_output_path).write_text(json.dumps(metadata))
   logging.info('Job completed.')
 
 

diff --git a/components/deprecated/dataproc/train/src/train.py b/components/deprecated/dataproc/train/src/train.py
@@ -32,6 +32,7 @@
 
 import argparse
 import logging
+from pathlib import Path
 
 from common import _utils
 
@@ -51,6 +52,11 @@ def main(argv=None):
   parser.add_argument('--eval', type=str, help='GCS path of the eval libsvm file pattern.')
   parser.add_argument('--analysis', type=str, help='GCS path of the analysis input.')
   parser.add_argument('--target', type=str, help='Target column name.')
+  parser.add_argument('--output-dir-uri-output-path',
+                      type=str,
+                      default='/output.txt',
+                      help='Local output path for the file containing the output dir URI.')
+
   args = parser.parse_args()
 
   logging.getLogger().setLevel(logging.INFO)
@@ -63,8 +69,8 @@ def main(argv=None):
       'ml.dmlc.xgboost4j.scala.example.spark.XGBoostTrainer', spark_args)
   logging.info('Job request submitted. Waiting for completion...')
   _utils.wait_for_job(api, args.project, args.region, job_id)
-  with open('/output.txt', 'w') as f:
-    f.write(args.output)
+  Path(args.output_dir_uri_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.output_dir_uri_output_path).write_text(args.output)
 
   logging.info('Job completed.')
 

diff --git a/components/kubeflow/dnntrainer/component.yaml b/components/kubeflow/dnntrainer/component.yaml
@@ -29,7 +29,6 @@ implementation:
       --target, {inputValue: Target},
       --preprocessing-module, {inputValue: Preprocessing module},
       --job-dir, {inputValue: Training output dir},
+      --exported-model-dir-uri-output-path, {outputPath: Training output dir},
+      --ui-metadata-output-path, {outputPath: MLPipeline UI metadata},
     ]
-    fileOutputs:
-      Training output dir: /output.txt
-      MLPipeline UI metadata:  /mlpipeline-ui-metadata.json
diff --git a/components/kubeflow/dnntrainer/src/trainer/task.py b/components/kubeflow/dnntrainer/src/trainer/task.py
@@ -16,6 +16,7 @@
 import argparse
 import json
 import os
+from pathlib import Path
 import tensorflow as tf
 import tensorflow_transform as tft
 import tensorflow_model_analysis as tfma
@@ -80,6 +81,14 @@ def parse_arguments():
                       required=False,
                       help=('GCS path to a python file defining '
                             '"preprocess" and "get_feature_columns" functions.'))
+  parser.add_argument('--exported-model-dir-uri-output-path',
+                      type=str,
+                      default='/output.txt',
+                      help='Local output path for the file containing exported model directory URI.')
+  parser.add_argument('--ui-metadata-output-path',
+                      type=str,
+                      default='/mlpipeline-ui-metadata.json',
+                      help='Local output path for the file containing UI metadata JSON structure.')
 
   args = parser.parse_args()
   args.hidden_layer_size = [int(x.strip()) for x in args.hidden_layer_size.split(',')]
@@ -341,11 +350,11 @@ def main():
       'source': args.job_dir,
     }]
   }
-  with open('/mlpipeline-ui-metadata.json', 'w') as f:
-    json.dump(metadata, f)
+  Path(args.ui_metadata_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.ui_metadata_output_path).write_text(json.dumps(metadata))
 
-  with open('/output.txt', 'w') as f:
-    f.write(args.job_dir)
+  Path(args.exported_model_dir_uri_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.exported_model_dir_uri_output_path).write_text(args.job_dir)
 
 if __name__ == '__main__':
   main()
diff --git a/components/local/confusion_matrix/component.yaml b/components/local/confusion_matrix/component.yaml
@@ -15,7 +15,6 @@ implementation:
       --predictions, {inputValue: Predictions},
       --target_lambda, {inputValue: Target lambda},
       --output,      {inputValue: Output dir},
+      --ui-metadata-output-path, {outputPath: MLPipeline UI metadata},
+      --metrics-output-path, {outputPath: MLPipeline Metrics},
     ]
-    fileOutputs:
-      MLPipeline UI metadata: /mlpipeline-ui-metadata.json
-      MLPipeline Metrics:     /mlpipeline-metrics.json
diff --git a/components/local/confusion_matrix/src/confusion_matrix.py b/components/local/confusion_matrix/src/confusion_matrix.py
@@ -27,6 +27,7 @@
 import os
 import urlparse
 import pandas as pd
+from pathlib import Path
 from sklearn.metrics import confusion_matrix, accuracy_score
 from tensorflow.python.lib.io import file_io
 
@@ -39,6 +40,15 @@ def main(argv=None):
                       help='a lambda function as a string to compute target.' +
                            'For example, "lambda x: x[\'a\'] + x[\'b\']"' +
                            'If not set, the input must include a "target" column.')
+  parser.add_argument('--ui-metadata-output-path',
+                      type=str,
+                      default='/mlpipeline-ui-metadata.json',
+                      help='Local output path for the file containing UI metadata JSON structure.')
+  parser.add_argument('--metrics-output-path',
+                      type=str,
+                      default='/mlpipeline-metrics.json',
+                      help='Local output path for the file containing metrics JSON structure.')
+
   args = parser.parse_args()
 
   storage_service_scheme = urlparse.urlparse(args.output).scheme
@@ -85,8 +95,8 @@ def main(argv=None):
       'labels': list(map(str, vocab)),
     }]
   }
-  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
-    json.dump(metadata, f)
+  Path(args.ui_metadata_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.ui_metadata_output_path).write_text(json.dumps(metadata))
 
   accuracy = accuracy_score(df['target'], df['predicted'])
   metrics = {
@@ -96,8 +106,8 @@ def main(argv=None):
       'format': "PERCENTAGE",
     }]
   }
-  with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
-    json.dump(metrics, f)
+  Path(args.metrics_output_path).parent.mkdir(parents=True, exist_ok=True)
+  Path(args.metrics_output_path).write_text(json.dumps(metrics))
 
 if __name__== "__main__":
   main()