[Executor] Refine openai metrics calculator to adapt openai v1 (#1338)
# Description

Refine openai metrics calculator to adapt openai v1.
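
For context: openai v1 replaced the module-level API with client objects, so the fully qualified names recorded in traces change from patterns like openai.ChatCompletion.create to openai.resources.chat.completions.Completions.create, and the Azure deployment moves from the engine argument to model. A minimal sketch of the two call shapes gated by the same version check this PR uses (connection values are placeholders, not from this PR; the legacy branch assumes credentials are already configured):

from importlib.metadata import version

IS_LEGACY_OPENAI = version("openai").startswith("0.")

if IS_LEGACY_OPENAI:
    import openai
    # openai<1.0: module-level call; Azure deployments are selected via `engine`.
    response = openai.ChatCompletion.create(
        engine="gpt-35-turbo",
        messages=[{"role": "user", "content": "Please tell me a joke."}],
    )
else:
    from openai import AzureOpenAI
    # openai>=1.0: client-object call; the deployment is passed as `model`.
    client = AzureOpenAI(api_key="<key>", api_version="<version>", azure_endpoint="<endpoint>")
    response = client.chat.completions.create(
        model="gpt-35-turbo",
        messages=[{"role": "user", "content": "Please tell me a joke."}],
    )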

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.

Co-authored-by: Lina Tang <[email protected]>
(cherry picked from commit a14f55a)
lumoslnt authored and elliotzh committed Dec 1, 2023
1 parent 17ba53b commit b599428
Showing 12 changed files with 96 additions and 43 deletions.
2 changes: 1 addition & 1 deletion src/promptflow/promptflow/_core/run_tracker.py
@@ -337,7 +337,7 @@ def _collect_traces_from_nodes(self, run_id):
             traces.extend(node_run_info.api_calls or [])
         return traces

-    OPENAI_AGGREGATE_METRICS = ["total_tokens"]
+    OPENAI_AGGREGATE_METRICS = ["prompt_tokens", "completion_tokens", "total_tokens"]

     def collect_metrics(self, run_infos: List[RunInfo], aggregate_metrics: List[str] = []):
         if not aggregate_metrics:
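
With prompt_tokens and completion_tokens added to OPENAI_AGGREGATE_METRICS, all three counters now survive aggregation instead of only total_tokens. merge_metrics_dict itself is not shown in this diff; a rough sketch of the kind of additive merge it implies:

def merge_token_metrics(total: dict, metrics: dict) -> None:
    # Accumulate each token counter into the running totals in place.
    for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
        if key in metrics:
            total[key] = total.get(key, 0) + metrics[key]

totals = {}
merge_token_metrics(totals, {"prompt_tokens": 12, "completion_tokens": 8, "total_tokens": 20})
merge_token_metrics(totals, {"prompt_tokens": 5, "completion_tokens": 3, "total_tokens": 8})
assert totals == {"prompt_tokens": 17, "completion_tokens": 11, "total_tokens": 28}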
30 changes: 21 additions & 9 deletions src/promptflow/promptflow/_utils/openai_metrics_calculator.py
@@ -1,7 +1,10 @@
 import tiktoken
+from importlib.metadata import version

 from promptflow.exceptions import UserErrorException

+IS_LEGACY_OPENAI = version("openai").startswith("0.")
+

 class OpenAIMetricsCalculator:
     def __init__(self, logger=None) -> None:
@@ -47,19 +50,22 @@ def _get_openai_metrics_for_signal_api(self, api_call: dict):
         )

         name = api_call.get("name")
-        if name.split(".")[-2] == "ChatCompletion":
+        if name.split(".")[-2] == "ChatCompletion" or name == "openai.resources.chat.completions.Completions.create":
             return self._get_openai_metrics_for_chat_api(api_call)
-        elif name.split(".")[-2] == "Completion":
+        elif name.split(".")[-2] == "Completion" or name == "openai.resources.completions.Completions.create":
             return self._get_openai_metrics_for_completion_api(api_call)
         else:
             raise CalculatingMetricsError(f"Calculating metrics for api {name} is not supported.")

     def _try_get_model(self, inputs):
-        api_type = inputs.get("api_type")
-        if not api_type:
-            raise CalculatingMetricsError("Cannot calculate metrics for none or empty api_type.")
-        if api_type == "azure":
-            model = inputs.get("engine")
+        if IS_LEGACY_OPENAI:
+            api_type = inputs.get("api_type")
+            if not api_type:
+                raise CalculatingMetricsError("Cannot calculate metrics for none or empty api_type.")
+            if api_type == "azure":
+                model = inputs.get("engine")
+            else:
+                model = inputs.get("model")
         else:
             model = inputs.get("model")
         if not model:
@@ -81,7 +87,10 @@ def _get_openai_metrics_for_chat_api(self, api_call):
                 tokens_per_name
             )
         if isinstance(output, list):
-            metrics["completion_tokens"] = len(output)
+            if IS_LEGACY_OPENAI:
+                metrics["completion_tokens"] = len(output)
+            else:
+                metrics["completion_tokens"] = len([chunk for chunk in output if chunk.choices[0].delta.content])
         else:
             metrics["completion_tokens"] = self._get_completion_tokens_for_chat_api(output, enc)
         metrics["total_tokens"] = metrics["prompt_tokens"] + metrics["completion_tokens"]
@@ -139,7 +148,10 @@ def _get_openai_metrics_for_completion_api(self, api_call: dict):
         for pro in prompt:
             metrics["prompt_tokens"] += len(enc.encode(pro))
         if isinstance(output, list):
-            metrics["completion_tokens"] = len(output)
+            if IS_LEGACY_OPENAI:
+                metrics["completion_tokens"] = len(output)
+            else:
+                metrics["completion_tokens"] = len([chunk for chunk in output if chunk.choices[0].text])
         else:
             metrics["completion_tokens"] = self._get_completion_tokens_for_completion_api(output, enc)
         metrics["total_tokens"] = metrics["prompt_tokens"] + metrics["completion_tokens"]
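
When output is a list, the run was streamed and each chunk carries at most one sampled token, so the calculator counts chunks; under openai v1 it must additionally skip chunks whose delta/text is empty (for example the leading role-only chunk of a chat stream). A small illustration built on the same openai.types models the updated tests use (all field values are dummies):

from openai.types.completion import Completion, CompletionChoice

def make_chunk(text):
    # Build a dummy streamed completion chunk in the openai>=1.0 shape.
    return Completion(
        choices=[CompletionChoice(text=text, finish_reason="stop", index=0, logprobs=None)],
        id="id", created=0, model="model", object="text_completion",
    )

output = [make_chunk("Hello"), make_chunk(""), make_chunk("!")]
# Mirrors the calculator's v1 rule: one streamed chunk with content ~ one completion token.
assert len([chunk for chunk in output if chunk.choices[0].text]) == 2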
19 changes: 16 additions & 3 deletions src/promptflow/promptflow/batch/_result.py
@@ -96,12 +96,25 @@ def _get_openai_metrics(line_results: List[LineResult], aggr_results: Aggregatio
         total_metrics = {}
         calculator = OpenAIMetricsCalculator()
         for run_info in node_run_infos:
-            api_calls = run_info.api_calls or []
-            for call in api_calls:
-                metrics = calculator.get_openai_metrics_from_api_call(call)
+            metrics = SystemMetrics._try_get_openai_metrics(run_info)
+            if metrics:
                 calculator.merge_metrics_dict(total_metrics, metrics)
+            else:
+                api_calls = run_info.api_calls or []
+                for call in api_calls:
+                    metrics = calculator.get_openai_metrics_from_api_call(call)
+                    calculator.merge_metrics_dict(total_metrics, metrics)
         return total_metrics

+    def _try_get_openai_metrics(run_info):
+        openai_metrics = {}
+        if run_info.system_metrics:
+            for metric in ["total_tokens", "prompt_tokens", "completion_tokens"]:
+                if metric not in run_info.system_metrics:
+                    return False
+                openai_metrics[metric] = run_info.system_metrics[metric]
+        return openai_metrics
+
     def to_dict(self):
         return {
             "total_tokens": self.total_tokens,
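
Note the helper's contract: it returns False as soon as one of the three counters is missing, and an empty dict when no system metrics were recorded at all; both are falsy, so _get_openai_metrics falls back to replaying api_calls through the calculator in either case. A quick illustration (FakeRunInfo is a hypothetical stand-in for the real RunInfo):

from promptflow.batch._result import SystemMetrics

class FakeRunInfo:
    def __init__(self, system_metrics):
        self.system_metrics = system_metrics

full = FakeRunInfo({"total_tokens": 20, "prompt_tokens": 12, "completion_tokens": 8})
partial = FakeRunInfo({"total_tokens": 20})   # a counter is missing
empty = FakeRunInfo(None)                     # no system metrics recorded

assert SystemMetrics._try_get_openai_metrics(full) == full.system_metrics
assert SystemMetrics._try_get_openai_metrics(partial) is False
assert not SystemMetrics._try_get_openai_metrics(empty)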
5 changes: 0 additions & 5 deletions src/promptflow/tests/executor/e2etests/test_langchain.py
@@ -1,4 +1,3 @@
-from importlib.metadata import version
 from pathlib import Path
 from tempfile import mkdtemp

@@ -10,10 +9,6 @@
 from ..utils import get_flow_folder, get_flow_inputs_file, get_yaml_file


-@pytest.mark.skipif(
-    version("openai").startswith("1."),
-    reason="test needs to be upgraded to adapt to openai>=1.0.0",
-)
 @pytest.mark.usefixtures("use_secrets_config_file", "dev_connections")
 @pytest.mark.e2etest
 class TestLangchain:
27 changes: 18 additions & 9 deletions src/promptflow/tests/executor/unittests/batch/test_result.py
@@ -10,7 +10,7 @@
 def get_api_call(type, name, inputs={}, output={}, children=None):
-    return {"type": type, "name": f"_._.{name}._", "inputs": inputs, "output": output, "children": children}
+    return {"type": type, "name": name, "inputs": inputs, "output": output, "children": children}


 @pytest.mark.unittest
@@ -98,32 +98,41 @@ def test_node_status(self):
         }

     def test_system_metrics(self):
+        from openai.types.completion import Completion, CompletionChoice
+
         line_dict = {0: {"node_0": Status.Completed}}
         aggr_dict = {"aggr_0": Status.Completed}

         api_call_1 = get_api_call(
             "LLM",
-            "Completion",
-            inputs={"prompt": "Please tell me a joke.", "api_type": "azure", "engine": "text-davinci-003"},
+            "openai.resources.completions.Completions.create",
+            inputs={"prompt": "Please tell me a joke.", "model": "text-davinci-003"},
             output={"choices": [{"text": "text"}]},
         )
         api_call_2 = get_api_call(
             "LLM",
-            "Completion",
+            "openai.resources.completions.Completions.create",
             inputs={
                 "prompt": ["Please tell me a joke.", "Please tell me a joke about fruit."],
-                "api_type": "azure",
-                "engine": "text-davinci-003",
+                "model": "text-davinci-003",
             },
-            output=[{"choices": [{"text": "text"}]}, {"choices": [{"text": "text"}]}],
+            output=[
+                Completion(
+                    choices=[CompletionChoice(text="text", finish_reason="stop", index=0, logprobs=None)],
+                    id="id", created=0, model="model", object="text_completion"
+                ),
+                Completion(
+                    choices=[CompletionChoice(text="text", finish_reason="stop", index=0, logprobs=None)],
+                    id="id", created=0, model="model", object="text_completion"
+                ),
+            ],
         )
         line_api_calls = get_api_call("Chain", "Chain", children=[api_call_1, api_call_2])
         aggr_api_call = get_api_call(
             "LLM",
-            "ChatCompletion",
+            "openai.resources.chat.completions.Completions.create",
             inputs={
                 "messages": [{"system": "You are a helpful assistant.", "user": "Please tell me a joke."}],
-                "api_type": "openai",
                 "model": "gpt-35-turbo",
             },
             output={"choices": [{"message": {"content": "content"}}]},
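
The helper no longer wraps names in _._.{name}._ because dispatch in the calculator now matches the full v1 resource path verbatim, while the legacy rule keyed off the second-to-last dotted segment. Both rules side by side:

# Legacy (openai<1.0) dispatch: the second-to-last dotted segment names the API class.
assert "_._.ChatCompletion._".split(".")[-2] == "ChatCompletion"

name = "openai.resources.chat.completions.Completions.create"
# v1 names don't fit the legacy rule ("Completions" matches neither branch)...
assert name.split(".")[-2] == "Completions"
# ...so the calculator compares the fully qualified resource method name exactly.
assert name == "openai.resources.chat.completions.Completions.create"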
(Diff for another changed file; the filename is not shown in this view.)

@@ -12,10 +12,9 @@
 @tool
 def test_langchain_traces(question: str, conn: AzureOpenAIConnection):
-    os.environ["OPENAI_API_KEY"] = conn.api_key
+    os.environ["AZURE_OPENAI_API_KEY"] = conn.api_key
     os.environ["OPENAI_API_VERSION"] = conn.api_version
-    os.environ["OPENAI_API_BASE"] = conn.api_base
-    os.environ["OPENAI_API_TYPE"] = conn.api_type
+    os.environ["AZURE_OPENAI_ENDPOINT"] = conn.api_base

     llm = AzureOpenAI(
         temperature=0.7,
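
The renames track the openai v1 Azure convention: AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT replace OPENAI_API_KEY and OPENAI_API_BASE, and OPENAI_API_TYPE disappears because choosing the AzureOpenAI client class already encodes the API type. A sketch of the equivalent explicit-client construction under openai>=1.0 (placeholder values, not from this PR):

from openai import AzureOpenAI

# Equivalent to exporting AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT / OPENAI_API_VERSION.
client = AzureOpenAI(
    api_key="<api-key>",
    azure_endpoint="https://<resource>.openai.azure.com",
    api_version="2023-07-01-preview",  # any supported Azure OpenAI API version
)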
(Diff for another changed file; the filename is not shown in this view.)

@@ -5,6 +5,9 @@
 from promptflow import tool
 from promptflow.connections import AzureOpenAIConnection

+IS_LEGACY_OPENAI = OPENAI_VERSION.startswith("0.")
+
+
 def get_client(connection: AzureOpenAIConnection):
     api_key = connection.api_key
     conn = dict(
@@ -20,6 +23,7 @@ def get_client(connection: AzureOpenAIConnection):
     )
     return Client(**conn)

+
 def create_messages(question, chat_history):
     yield {"role": "system", "content": "You are a helpful assistant."}
     for chat in chat_history:
@@ -29,9 +33,8 @@ def create_messages(question, chat_history):


 @tool
-def chat(connection: AzureOpenAIConnection, question: str, chat_history: List) -> str:
-    stream = True
-    if OPENAI_VERSION.startswith("0."):
+def chat(connection: AzureOpenAIConnection, question: str, chat_history: List, stream: bool) -> str:
+    if IS_LEGACY_OPENAI:
         completion = openai.ChatCompletion.create(
             engine="gpt-35-turbo",
             messages=list(create_messages(question, chat_history)),
@@ -59,12 +62,18 @@ def chat(connection: AzureOpenAIConnection, question: str, chat_history: List) -
         def generator():
             for chunk in completion:
                 if chunk.choices:
-                    yield getattr(chunk.choices[0]["delta"], "content", "")
+                    if IS_LEGACY_OPENAI:
+                        yield getattr(chunk.choices[0]["delta"], "content", "")
+                    else:
+                        yield chunk.choices[0].delta.content or ""

         # We must return the generator object, not using yield directly here.
         # Otherwise, the function itself will become a generator, despite whether stream is True or False.
         # return generator()
         return "".join(generator())
     else:
         # chat api may return message with no content.
-        return getattr(completion.choices[0].message, "content", "")
+        if IS_LEGACY_OPENAI:
+            return getattr(completion.choices[0].message, "content", "")
+        else:
+            return completion.choices[0].message.content or ""
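
In the v1 streaming branch, delta.content is None on role-only or terminal chunks, so the or "" keeps the join safe. A sketch of the same consumption pattern against a raw openai>=1.0 client (assumes a configured AzureOpenAI client and a deployed gpt-35-turbo model):

# Sketch: consume an openai>=1.0 chat stream, tolerating chunks with no content.
stream = client.chat.completions.create(
    model="gpt-35-turbo",
    messages=[{"role": "user", "content": "Please tell me a joke."}],
    stream=True,
)
answer = "".join(
    chunk.choices[0].delta.content or ""
    for chunk in stream
    if chunk.choices
)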
(Diff for another changed file; the filename is not shown in this view.)

@@ -3,6 +3,8 @@ inputs:
     type: string
   chat_history:
     type: list
+  stream:
+    type: bool
 outputs:
   answer:
     type: string
@@ -17,3 +19,4 @@ nodes:
     question: ${inputs.question}
     chat_history: ${inputs.chat_history}
     connection: azure_open_ai_connection
+    stream: ${inputs.stream}
(Diff for another changed file; the filename is not shown in this view.)

@@ -1,2 +1,2 @@
-{"question": "What is the capital of the United States of America?", "chat_history": []}
-{"question": "What is the capital of the United States of America?", "chat_history": []}
+{"question": "What is the capital of the United States of America?", "chat_history": [], "stream": true}
+{"question": "What is the capital of the United States of America?", "chat_history": [], "stream": false}
(Diff for another changed file; the filename is not shown in this view.)

@@ -4,6 +4,9 @@
 from promptflow import tool
 from promptflow.connections import AzureOpenAIConnection

+IS_LEGACY_OPENAI = OPENAI_VERSION.startswith("0.")
+
+
 def get_client(connection: AzureOpenAIConnection):
     api_key = connection.api_key
     conn = dict(
@@ -19,10 +22,10 @@ def get_client(connection: AzureOpenAIConnection):
     )
     return Client(**conn)

+
 @tool
-def completion(connection: AzureOpenAIConnection, prompt: str) -> str:
-    stream = True
-    if OPENAI_VERSION.startswith("0."):
+def completion(connection: AzureOpenAIConnection, prompt: str, stream: bool) -> str:
+    if IS_LEGACY_OPENAI:
         completion = openai.Completion.create(
             prompt=prompt,
             engine="text-davinci-003",
@@ -50,8 +53,14 @@ def completion(connection: AzureOpenAIConnection, prompt: str) -> str:
         def generator():
             for chunk in completion:
                 if chunk.choices:
-                    yield getattr(chunk.choices[0], "text", "")
+                    if IS_LEGACY_OPENAI:
+                        yield getattr(chunk.choices[0], "text", "")
+                    else:
+                        yield chunk.choices[0].text or ""

         return "".join(generator())
     else:
-        return getattr(completion.choices[0], "text", "")
+        if IS_LEGACY_OPENAI:
+            return getattr(completion.choices[0], "text", "")
+        else:
+            return completion.choices[0].text or ""
(Diff for another changed file; the filename is not shown in this view.)

@@ -1,6 +1,8 @@
 inputs:
   prompt:
     type: string
+  stream:
+    type: bool
 outputs:
   output:
     type: string
@@ -14,3 +16,4 @@ nodes:
   inputs:
     prompt: ${inputs.prompt}
     connection: azure_open_ai_connection
+    stream: ${inputs.stream}
(Diff for another changed file; the filename is not shown in this view.)

@@ -1 +1,2 @@
-{"prompt": "What is the capital of the United States of America?"}
+{"prompt": "What is the capital of the United States of America?", "stream": true}
+{"prompt": "What is the capital of the United States of America?", "stream": false}
