From 95bcad8e455fc257754c68a332b5722329f580cc Mon Sep 17 00:00:00 2001
From: Min Shi <minshi@microsoft.com>
Date: Wed, 27 Mar 2024 14:48:32 +0800
Subject: [PATCH] add comment for excluding aggregation nodes in previous node
 run infos

---
 src/promptflow/promptflow/batch/_batch_engine.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/promptflow/promptflow/batch/_batch_engine.py b/src/promptflow/promptflow/batch/_batch_engine.py
index c84c6a9653e..6c52fbc00af 100644
--- a/src/promptflow/promptflow/batch/_batch_engine.py
+++ b/src/promptflow/promptflow/batch/_batch_engine.py
@@ -250,8 +250,21 @@ def _copy_previous_run_result(
                     previous_run_info.root_run_id = run_id
                     previous_run_info.parent_run_id = run_id
 
-                    # Load previous node run info and remove aggregation nodes in case it is loaded into node run info
+                    # Load previous node run info
                     previous_node_run_infos = resume_from_run_storage.load_node_run_info_for_line(i)
+
+                    # In storage, aggregation nodes are persisted with filenames similar to regular nodes.
+                    # Currently we read regular node run records by filename in the node artifacts folder,
+                    # which may unintentionally load the records of aggregation nodes as well.
+                    # E.g., aggregation-node/000000000.jsonl will be treated as the node_run_info of the first line:
+                    # node_artifacts/
+                    # ├─ non-aggregation-node/
+                    # │  ├─ 000000000.jsonl
+                    # │  ├─ 000000001.jsonl
+                    # │  ├─ 000000002.jsonl
+                    # ├─ aggregation-node/
+                    # │  ├─ 000000000.jsonl
+                    # So we filter out aggregation nodes since line records should not contain any info about them.
                     previous_node_run_infos = [
                         run_info for run_info in previous_node_run_infos if run_info.node not in aggregation_nodes
                     ]