Add WatchFilePattern #25393

Merged · 17 commits · Feb 13, 2023
Changes from 4 commits
@@ -26,8 +26,11 @@
This pipeline follows the pattern from
https://beam.apache.org/documentation/patterns/side-inputs/

This pipeline expects a PubSub topic as source, which emits an image
path(UTF-8 encoded) that is accessible by the pipeline.
To use a Pub/Sub topic as the pipeline source, publish a path to the
model (resnet152, loaded in the pipeline via torchvision.models.resnet152)
to that topic, then pass the topic via the command line arg --topic.
The published path (str) must be UTF-8 encoded.
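
A minimal sketch of the publishing step (assuming the google-cloud-pubsub
client; the project, topic, and path below are placeholders):

  from google.cloud import pubsub_v1

  publisher = pubsub_v1.PublisherClient()
  topic_path = publisher.topic_path('my-project', 'my-topic')
  # The pipeline expects the published path as a UTF-8 encoded string.
  future = publisher.publish(
      topic_path, data='gs://my-bucket/resnet152_v2.pth'.encode('utf-8'))
  future.result()  # Blocks until the message has been published.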

To run the example on DataflowRunner,

@@ -43,6 +46,16 @@
--requirements_file=apache_beam/ml/inference/torch_tests_requirements.txt
--topic=<pubsub_topic>
--file_pattern=<glob_pattern>

file_pattern is a path (which can contain glob characters) that is passed
to the WatchFilePattern transform for model updates. WatchFilePattern
watches the file_pattern and emits the latest file path by timestamp.
Files that were already read and are later updated under the same name
are ignored as updates.

The pipeline expects at least one file matching the file_pattern to be
present before pipeline startup; typically this would be the
`initial_model_path`. If no file matches before pipeline startup, the
pipeline will fail.
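
A sketch of how the watched pattern feeds RunInference as a side input
(illustrative wiring only; preprocess and model_handler stand in for the
example's own preprocessing step and keyed handler defined below):

  import apache_beam as beam
  from apache_beam.ml.inference.base import RunInference
  from apache_beam.ml.inference.utils import WatchFilePattern
  from apache_beam.options.pipeline_options import PipelineOptions

  with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
    model_updates = (
        p
        | 'WatchModelUpdates' >> WatchFilePattern(
            file_pattern='gs://my-bucket/models/*.pth', interval=600))
    _ = (
        p
        | 'ReadPaths' >> beam.io.ReadFromPubSub(
            topic='projects/my-project/topics/my-topic')
        | 'Preprocess' >> beam.Map(preprocess)
        | 'RunInference' >> RunInference(
            model_handler, model_metadata_pcoll=model_updates))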
"""

import argparse
@@ -119,9 +132,11 @@ def parse_known_args(argv):
'Path must be accessible by the pipeline.')
parser.add_argument(
'--model_path',
'--initial_model_path',
dest='model_path',
default='gs://apache-beam-samples/run_inference/resnet152.pth',
help="Path to the model's state_dict.")
help="Path to the initial model's state_dict. "
"This will be used until the first model update occurs.")
parser.add_argument(
'--file_pattern', help='Glob pattern to watch for an update.')
parser.add_argument(
@@ -159,18 +174,16 @@ def run(
model_class = models.resnet152
model_params = {'num_classes': 1000}

class PytorchModelHandlerTensorWithBatchSize(PytorchModelHandlerTensor):
def batch_elements_kwargs(self):
return {'min_batch_size': 10, 'max_batch_size': 100}

# In this example we pass keyed inputs to RunInference transform.
# Therefore, we use KeyedModelHandler wrapper over PytorchModelHandler.
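# The batching bounds are passed to the handler directly:
# PytorchModelHandlerTensor accepts min_batch_size and max_batch_size
# kwargs, so no subclass overriding batch_elements_kwargs is needed.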
model_handler = KeyedModelHandler(
PytorchModelHandlerTensorWithBatchSize(
PytorchModelHandlerTensor(
state_dict_path=known_args.model_path,
model_class=model_class,
model_params=model_params,
device=device))
device=device,
min_batch_size=10,
max_batch_size=100))

pipeline = test_pipeline
if not test_pipeline:
Expand Down
20 changes: 12 additions & 8 deletions sdks/python/apache_beam/ml/inference/utils.py
@@ -94,7 +94,7 @@ class _GetLatestFileByTimeStamp(beam.DoFn):
started. If no such files are found, it returns a default file as fallback.
"""
TIME_STATE = CombiningValueStateSpec(
'count', combine_fn=partial(max, default=_START_TIME_STAMP))
'max', combine_fn=partial(max, default=_START_TIME_STAMP))
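# State id 'max' reflects the combine_fn semantics: this state tracks
# the maximum last-updated timestamp seen so far, not a count.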

def process(
self, element, time_state=beam.DoFn.StateParam(TIME_STATE)
@@ -103,7 +103,7 @@ def process(
new_ts = file_metadata.last_updated_in_seconds
old_ts = time_state.read()
if new_ts > old_ts:
# time_state.clear()
time_state.clear()
time_state.add(new_ts)
model_path = file_metadata.path
else:
@@ -125,17 +125,21 @@ def __init__(
"""
Watches a directory for updates to files matching a given file pattern.

**Note**: The start timestamp defaults to the time at which the pipeline
was started. All files matching file_pattern that were uploaded before
the pipeline started are discarded.

Args:
file_pattern: A glob pattern used to watch a directory for model
updates.
file_pattern: The file path to read from as a local file path or a
GCS ``gs://`` path. The path can contain glob characters
(``*``, ``?``, and ``[...]`` sets).
interval: Interval, in seconds, at which to check for files matching
file_pattern.
stop_timestamp: Timestamp after which no more files will be checked.

Constraints:
1. If a file has been read and is later updated under the same name,
this transform ignores that update. Always upload an updated file
under a unique name.
2. WatchFilePattern expects at least one file matching the file_pattern
to be present before pipeline startup.

**Note**: This transform is supported in streaming mode, since
MatchContinuously produces an unbounded source. Running in batch
mode can lead to undesired results or leave the pipeline stuck.
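
A usage sketch consistent with these constraints (bucket and file names
are placeholders; each update is uploaded under a new, unique name so the
transform sees it as a new file):

  model_updates = (
      p
      | 'WatchForModels' >> WatchFilePattern(
          file_pattern='gs://my-bucket/models/model-*.pth',
          interval=600))
  # Upload updates as, e.g., model-v2.pth rather than overwriting
  # model-v1.pth in place.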
8 changes: 6 additions & 2 deletions sdks/python/apache_beam/ml/inference/utils_test.py
@@ -80,7 +80,11 @@ def test_emitting_singleton_output(self):
FileMetadata(
'path3.py',
10,
last_updated_in_seconds=utils._START_TIME_STAMP + 10)
last_updated_in_seconds=utils._START_TIME_STAMP + 10),
FileMetadata(
'path4.py',
10,
last_updated_in_seconds=utils._START_TIME_STAMP + 20)
]
# returns path3.py, then path4.py

@@ -92,4 +96,4 @@
| beam.ParDo(utils._GetLatestFileByTimeStamp())
| beam.ParDo(utils._ConvertIterToSingleton())
| beam.Map(lambda x: x[0]))
assert_that(files_pc, equal_to(['', 'path3.py']))
assert_that(files_pc, equal_to(['', 'path3.py', 'path4.py']))