add comment for excluding aggregation nodes in previous node run infos

microsoft · Mar 28, 2024 · 95bcad8 · 95bcad8
1 parent 5c6d708
commit 95bcad8
Showing 1 changed file with 14 additions and 1 deletion.
diff --git a/src/promptflow/promptflow/batch/_batch_engine.py b/src/promptflow/promptflow/batch/_batch_engine.py
@@ -250,8 +250,21 @@ def _copy_previous_run_result(
                     previous_run_info.root_run_id = run_id
                     previous_run_info.parent_run_id = run_id
 
-                    # Load previous node run info and remove aggregation nodes in case it is loaded into node run info
+                    # Load previous node run info
                     previous_node_run_infos = resume_from_run_storage.load_node_run_info_for_line(i)
+
+                    # In storage, aggregation nodes are persisted with filenames similar to regular nodes.
+                    # Currently we read regular node run records by filename in the node artifacts folder,
+                    # which may lead to load records of aggregation nodes at the same time, which is not intended.
+                    # E.g, aggregation-node/000000000.jsonl will be treated as the node_run_info of the first line:
+                    # node_artifacts/
+                    # ├─ non-aggregation-node/
+                    # │  ├─ 000000000.jsonl
+                    # │  ├─ 000000001.jsonl
+                    # │  ├─ 000000002.jsonl
+                    # ├─ aggregation-node/
+                    # │  ├─ 000000000.jsonl
+                    # So we filter out aggregation nodes since line records should not contain any info about them.
                     previous_node_run_infos = [
                         run_info for run_info in previous_node_run_infos if run_info.node not in aggregation_nodes
                     ]