From bd82aa650b8fd6833881bb0171067ecf9d17d7e3 Mon Sep 17 00:00:00 2001 From: Xingzhi Zhang <37076709+elliotzh@users.noreply.github.com> Date: Tue, 30 Apr 2024 16:31:56 +0800 Subject: [PATCH 01/21] release: update dev version (#3050) # Description update dev version in `pyproject.toml` after `1.10.0` is released. # All Promptflow Contribution checklist: - [x] **The pull request does not introduce [breaking changes].** - [x] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [x] Title of the pull request is clear and informative. - [x] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Co-authored-by: Zhengfei Wang <38847871+zhengfeiwang@users.noreply.github.com> Co-authored-by: Philip Gao --- src/promptflow-azure/pyproject.toml | 2 +- src/promptflow-core/pyproject.toml | 2 +- src/promptflow-devkit/pyproject.toml | 2 +- src/promptflow-tracing/pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/promptflow-azure/pyproject.toml b/src/promptflow-azure/pyproject.toml index a64e9fc1b8f..9d9b3fa5ff3 100644 --- a/src/promptflow-azure/pyproject.toml +++ b/src/promptflow-azure/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptflow-azure" -version = "1.10.0.dev0" +version = "1.11.0.dev0" description = "Prompt flow azure" include = [ "promptflow/azure/resources/*" diff --git a/src/promptflow-core/pyproject.toml b/src/promptflow-core/pyproject.toml index e0c82a39e3a..f41219ebd1a 100644 --- a/src/promptflow-core/pyproject.toml +++ b/src/promptflow-core/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptflow-core" -version = "1.10.0.dev0" +version = "1.11.0.dev0" description = "Prompt flow core" include = [ "promptflow/core/_serving/static/*", diff --git a/src/promptflow-devkit/pyproject.toml b/src/promptflow-devkit/pyproject.toml index b4169508782..7045655caff 100644 --- a/src/promptflow-devkit/pyproject.toml +++ b/src/promptflow-devkit/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "promptflow-devkit" -version = "1.10.0.dev0" +version = "1.11.0.dev0" description = "Prompt flow devkit" include = [ "promptflow/_sdk/_service/static/*", diff --git a/src/promptflow-tracing/pyproject.toml b/src/promptflow-tracing/pyproject.toml index 30a84d08c62..636da68c7b0 100644 --- a/src/promptflow-tracing/pyproject.toml +++ b/src/promptflow-tracing/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" # poetry [tool.poetry] name = "promptflow-tracing" -version = "1.10.0.dev0" +version = "1.11.0.dev0" description = "Prompt flow tracing" license = "MIT" authors = [ From 35ed6c5072e47e05febe75dab92317f64fd1d5da Mon Sep 17 00:00:00 2001 From: Heyi Tang Date: Tue, 30 Apr 2024 17:47:08 +0800 Subject: [PATCH 02/21] [Executor] Refine executor logic to support async generator in flow test (#3083) # Description Refine exec_line logic to 
support async generator in flow test. This pull request primarily focuses on improving the handling of generators and asynchronous generators in the `promptflow` package. The changes ensure that the code can handle both synchronous and asynchronous generators, improve the handling of generator outputs, and refactor the way nodes are executed in the flow. Here are the key changes: Improved handling of generators and asynchronous generators: * [`src/promptflow-core/promptflow/_core/run_tracker.py`](diffhunk://#diff-a5027d19a24cb28a68ead16dfe6c54492c78d6e0e7640e80533928808cdb3422R6-L9): The `inspect` module was imported to replace the use of `GeneratorType` for checking if a value is a generator. The method `update_and_persist_generator_node_runs` was introduced to replace `persist_selected_node_runs`, and it now updates the output of the node run with the output in the trace before persisting it. [[1]](diffhunk://#diff-a5027d19a24cb28a68ead16dfe6c54492c78d6e0e7640e80533928808cdb3422R6-L9) [[2]](diffhunk://#diff-a5027d19a24cb28a68ead16dfe6c54492c78d6e0e7640e80533928808cdb3422L299-R299) [[3]](diffhunk://#diff-a5027d19a24cb28a68ead16dfe6c54492c78d6e0e7640e80533928808cdb3422L433-R450) * [`src/promptflow-core/promptflow/_utils/run_tracker_utils.py`](diffhunk://#diff-cc2845177424c6393b16b91ff5a7753eaf73aa52c4c52c53c8f83eb68746815cR4-R7): The `inspect` module was imported, and the method `_deep_copy_and_extract_items_from_generator_proxy` was updated to handle `AsyncGeneratorProxy` and to convert generators to strings to avoid deepcopy errors. [[1]](diffhunk://#diff-cc2845177424c6393b16b91ff5a7753eaf73aa52c4c52c53c8f83eb68746815cR4-R7) [[2]](diffhunk://#diff-cc2845177424c6393b16b91ff5a7753eaf73aa52c4c52c53c8f83eb68746815cL21-R25) Refactoring of node execution: * [`src/promptflow-core/promptflow/executor/flow_executor.py`](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR63): The `ThreadPoolExecutorWithContext` was imported from `promptflow.tracing`. In the `exec_line` method, a check was added to use `exec_line_async` when the tools are async. The `exec_line_async` method was updated to include a `line_timeout_sec` parameter. The `_exec_inner_with_trace_async` method was updated to use `_stringify_generator_output_async` to handle async generator output. The `_exec_post_process` method was updated to use `update_and_persist_generator_node_runs` instead of `persist_selected_node_runs`. The `_should_use_async` method was updated to check if any tool is async. The `_traverse_nodes_async` method was updated to use an async scheduler. The methods `_merge_async_generator`, `_stringify_generator_output_async`, and `_merge_generator` were added to handle generator outputs. The `_submit_to_scheduler` method was updated to only use the thread pool mode. 
[[1]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR63) [[2]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR693-R699) [[3]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR731) [[4]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR751-R752) [[5]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fL869-R880) [[6]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fL913-R926) [[7]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fR1158-R1212) [[8]](diffhunk://#diff-bec06607cb28fd791b8ed11bb488979344ca342be5f1c67ba6dd663d5e12240fL1186-L1190) # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Co-authored-by: Heyi --- .../promptflow/_core/run_tracker.py | 18 ++-- .../promptflow/_utils/async_utils.py | 5 +- .../promptflow/_utils/run_tracker_utils.py | 7 +- .../promptflow/executor/flow_executor.py | 99 ++++++++++++------- 4 files changed, 82 insertions(+), 47 deletions(-) diff --git a/src/promptflow-core/promptflow/_core/run_tracker.py b/src/promptflow-core/promptflow/_core/run_tracker.py index c10a20dd2cb..ac2abd26f7d 100644 --- a/src/promptflow-core/promptflow/_core/run_tracker.py +++ b/src/promptflow-core/promptflow/_core/run_tracker.py @@ -3,10 +3,10 @@ # --------------------------------------------------------- import asyncio +import inspect import json from contextvars import ContextVar from datetime import datetime, timezone -from types import GeneratorType from typing import Any, Dict, List, Mapping, Optional, Union from promptflow._constants import MessageFormatType @@ -296,7 +296,7 @@ def end_run( def _ensure_serializable_value(self, val, warning_msg: Optional[str] = None): if ConnectionType.is_connection_value(val): return ConnectionType.serialize_conn(val) - if self.allow_generator_types and isinstance(val, GeneratorType): + if inspect.isgenerator(val) or inspect.isasyncgen(val): return str(val) try: json.dumps(val, default=default_json_encoder) @@ -430,22 +430,24 @@ def get_run(self, run_id): def persist_node_run(self, run_info: RunInfo): self._storage.persist_node_run(run_info) - def persist_selected_node_runs(self, run_info: FlowRunInfo, node_names: List[str]): + def update_and_persist_generator_node_runs(self, run_id: str, node_names: List[str]): """ - Persists the node runs for the specified node names. + Persists the node runs for nodes producing generators. - :param run_info: The flow run information. 
-        :type run_info: FlowRunInfo
+        :param run_id: The ID of the flow run.
+        :type run_id: str
         :param node_names: The names of the nodes to persist.
         :type node_names: List[str]
         :returns: None
         """
-        run_id = run_info.run_id
-
         selected_node_run_info = (
             run_info for run_info in self.collect_child_node_runs(run_id) if run_info.node in node_names
         )
         for node_run_info in selected_node_run_info:
+            # Update the output of the node run with the output in the trace.
+            # This is because the output in the trace would include the generated items.
+            output_in_trace = node_run_info.api_calls[0]["output"]
+            node_run_info.output = output_in_trace
             self.persist_node_run(node_run_info)

     def persist_flow_run(self, run_info: FlowRunInfo):
diff --git a/src/promptflow-core/promptflow/_utils/async_utils.py b/src/promptflow-core/promptflow/_utils/async_utils.py
index dd019233bf8..41454e0d0f2 100644
--- a/src/promptflow-core/promptflow/_utils/async_utils.py
+++ b/src/promptflow-core/promptflow/_utils/async_utils.py
@@ -3,11 +3,8 @@
 # ---------------------------------------------------------

 import asyncio
-import contextvars
 import functools
-from concurrent.futures import ThreadPoolExecutor

-from promptflow._utils.utils import set_context
 from promptflow.tracing import ThreadPoolExecutorWithContext
@@ -36,7 +33,7 @@ def async_run_allowing_running_loop(async_func, *args, **kwargs):
     event loop, we run _exec_batch in a new thread; otherwise, we run it in the current thread.
     """
     if _has_running_loop():
-        with ThreadPoolExecutor(1, initializer=set_context, initargs=(contextvars.copy_context(),)) as executor:
+        with ThreadPoolExecutorWithContext() as executor:
             return executor.submit(lambda: asyncio.run(async_func(*args, **kwargs))).result()
     else:
         return asyncio.run(async_func(*args, **kwargs))
diff --git a/src/promptflow-core/promptflow/_utils/run_tracker_utils.py b/src/promptflow-core/promptflow/_utils/run_tracker_utils.py
index 1f44535db36..fca2be0da1d 100644
--- a/src/promptflow-core/promptflow/_utils/run_tracker_utils.py
+++ b/src/promptflow-core/promptflow/_utils/run_tracker_utils.py
@@ -1,9 +1,10 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
# --------------------------------------------------------- +import inspect from copy import deepcopy -from promptflow.tracing.contracts.generator_proxy import GeneratorProxy +from promptflow.tracing.contracts.generator_proxy import AsyncGeneratorProxy, GeneratorProxy def _deep_copy_and_extract_items_from_generator_proxy(value: object) -> object: @@ -18,6 +19,8 @@ def _deep_copy_and_extract_items_from_generator_proxy(value: object) -> object: return [_deep_copy_and_extract_items_from_generator_proxy(v) for v in value] elif isinstance(value, dict): return {k: _deep_copy_and_extract_items_from_generator_proxy(v) for k, v in value.items()} - elif isinstance(value, GeneratorProxy): + elif isinstance(value, (GeneratorProxy, AsyncGeneratorProxy)): return deepcopy(value.items) + elif inspect.isgenerator(value) or inspect.isasyncgen(value): + return str(value) # Convert generator to string to avoid deepcopy error return deepcopy(value) diff --git a/src/promptflow-core/promptflow/executor/flow_executor.py b/src/promptflow-core/promptflow/executor/flow_executor.py index a5a1374b081..90cb95bb0e1 100644 --- a/src/promptflow-core/promptflow/executor/flow_executor.py +++ b/src/promptflow-core/promptflow/executor/flow_executor.py @@ -29,6 +29,7 @@ from promptflow._core.run_tracker import RunTracker from promptflow._core.tool import STREAMING_OPTION_PARAMETER_ATTR from promptflow._core.tools_manager import ToolsManager +from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow._utils.context_utils import _change_working_dir from promptflow._utils.execution_utils import ( apply_default_value_for_input, @@ -60,6 +61,7 @@ from promptflow.executor.flow_validator import FlowValidator from promptflow.storage import AbstractRunStorage from promptflow.storage._run_storage import DefaultRunStorage +from promptflow.tracing import ThreadPoolExecutorWithContext from promptflow.tracing._integrations._openai_injector import inject_openai_api from promptflow.tracing._operation_context import OperationContext from promptflow.tracing._start_trace import setup_exporter_from_environ @@ -693,24 +695,33 @@ def exec_line( :return: The result of executing the line. :rtype: ~promptflow.executor._result.LineResult """ + if self._should_use_async(): + # Use async exec_line when the tools are async + return async_run_allowing_running_loop( + self.exec_line_async, + inputs, + index, + run_id, + validate_inputs, + node_concurrency, + allow_generator_output, + line_timeout_sec, + ) # TODO: Call exec_line_async in exec_line when async is mature. self._node_concurrency = node_concurrency # TODO: Pass line_timeout_sec to flow node scheduler instead of updating self._line_timeout_sec self._line_timeout_sec = line_timeout_sec or self._line_timeout_sec inputs = apply_default_value_for_input(self._flow.inputs, inputs) # For flow run, validate inputs as default - with self._run_tracker.node_log_manager: - # exec_line interface may be called when executing a batch run, so we only set run_mode as flow run when - # it is not set. 
- run_id = run_id or str(uuid.uuid4()) - with self._update_operation_context(run_id, index): - line_result = self._exec( - inputs, - run_id=run_id, - line_number=index, - validate_inputs=validate_inputs, - allow_generator_output=allow_generator_output, - ) + run_id = run_id or str(uuid.uuid4()) + with self._run_tracker.node_log_manager, self._update_operation_context(run_id, index): + line_result = self._exec( + inputs, + run_id=run_id, + line_number=index, + validate_inputs=validate_inputs, + allow_generator_output=allow_generator_output, + ) # Return line result with index if index is not None and isinstance(line_result.output, dict): line_result.output[LINE_NUMBER_KEY] = index @@ -724,6 +735,7 @@ async def exec_line_async( validate_inputs: bool = True, node_concurrency=DEFAULT_CONCURRENCY_FLOW, allow_generator_output: bool = False, + line_timeout_sec: Optional[int] = None, ) -> LineResult: """Execute a single line of the flow. @@ -743,13 +755,12 @@ async def exec_line_async( :rtype: ~promptflow.executor._result.LineResult """ self._node_concurrency = node_concurrency + # TODO: Pass line_timeout_sec to flow node scheduler instead of updating self._line_timeout_sec + self._line_timeout_sec = line_timeout_sec or self._line_timeout_sec inputs = apply_default_value_for_input(self._flow.inputs, inputs) # For flow run, validate inputs as default - with self._run_tracker.node_log_manager: - # exec_line interface may be called when executing a batch run, so we only set run_mode as flow run when - # it is not set. - operation_context = OperationContext.get_instance() - operation_context.run_mode = operation_context.get("run_mode", None) or RunMode.Test.name + run_id = run_id or str(uuid.uuid4()) + with self._run_tracker.node_log_manager, self._update_operation_context(run_id, index): line_result = await self._exec_async( inputs, run_id=run_id, @@ -869,8 +880,7 @@ async def _exec_inner_with_trace_async( ): with self._start_flow_span(inputs) as span, self._record_cancellation_exceptions_to_span(span): output, nodes_outputs = await self._traverse_nodes_async(inputs, context) - # TODO: Also stringify async generator output - output = self._stringify_generator_output(output) if not stream else output + output = await self._stringify_generator_output_async(output) if not stream else output self._exec_post_process(inputs, output, nodes_outputs, run_info, run_tracker, span, stream) return output, extract_aggregation_inputs(self._flow, nodes_outputs) @@ -914,9 +924,9 @@ def _exec_post_process( for nodename, output in nodes_outputs.items() if isinstance(output, GeneratorType) or isinstance(output, AsyncGeneratorType) ] - run_tracker.persist_selected_node_runs(run_info, generator_output_nodes) # When stream is True, we allow generator output in the flow output run_tracker.allow_generator_types = stream + run_tracker.update_and_persist_generator_node_runs(run_info.run_id, generator_output_nodes) run_tracker.end_run(run_info.run_id, result=output) enrich_span_with_trace_type(span, inputs, output, trace_type=TraceType.FLOW) span.set_status(StatusCode.OK) @@ -1148,33 +1158,61 @@ def _extract_outputs(self, nodes_outputs, bypassed_nodes, flow_inputs): return outputs def _should_use_async(self): + def is_async(f): + # Here we check the original function since currently asyncgenfunction would be converted to sync func + # TODO: Improve @trace logic to make sure wrapped asyncgen is still an asyncgen + original_func = getattr(f, "__original_function", f) + return inspect.iscoroutinefunction(original_func) or 
inspect.isasyncgenfunction(original_func) + return ( - all(inspect.iscoroutinefunction(f) for f in self._tools_manager._tools.values()) + any(is_async(f) for f in self._tools_manager._tools.values()) or os.environ.get("PF_USE_ASYNC", "false").lower() == "true" ) def _traverse_nodes(self, inputs, context: FlowExecutionContext) -> Tuple[dict, dict]: batch_nodes = [node for node in self._flow.nodes if not node.aggregation] outputs = {} - # TODO: Use a mixed scheduler to support both async and thread pool mode. nodes_outputs, bypassed_nodes = self._submit_to_scheduler(context, inputs, batch_nodes) outputs = self._extract_outputs(nodes_outputs, bypassed_nodes, inputs) return outputs, nodes_outputs async def _traverse_nodes_async(self, inputs, context: FlowExecutionContext) -> Tuple[dict, dict]: batch_nodes = [node for node in self._flow.nodes if not node.aggregation] - outputs = {} - # Always use async scheduler when calling from async function. flow_logger.info("Start executing nodes in async mode.") scheduler = AsyncNodesScheduler(self._tools_manager, self._node_concurrency) nodes_outputs, bypassed_nodes = await scheduler.execute(batch_nodes, inputs, context) outputs = self._extract_outputs(nodes_outputs, bypassed_nodes, inputs) return outputs, nodes_outputs + @staticmethod + async def _merge_async_generator(async_gen: AsyncGeneratorType, outputs: dict, key: str): + items = [] + async for item in async_gen: + items.append(item) + outputs[key] = "".join(str(item) for item in items) + + async def _stringify_generator_output_async(self, outputs: dict): + pool = ThreadPoolExecutorWithContext() + tasks = [] + for k, v in outputs.items(): + if isinstance(v, AsyncGeneratorType): + tasks.append(asyncio.create_task(self._merge_async_generator(v, outputs, k))) + elif isinstance(v, GeneratorType): + loop = asyncio.get_event_loop() + task = loop.run_in_executor(pool, self._merge_generator, v, outputs, k) + tasks.append(task) + if tasks: + await asyncio.wait(tasks) + return outputs + + @staticmethod + def _merge_generator(gen: GeneratorType, outputs: dict, key: str): + outputs[key] = "".join(str(item) for item in gen) + def _stringify_generator_output(self, outputs: dict): for k, v in outputs.items(): if isinstance(v, GeneratorType): - outputs[k] = "".join(str(chuck) for chuck in v) + self._merge_generator(v, outputs, k) return outputs @@ -1187,14 +1225,9 @@ def _submit_to_scheduler(self, context: FlowExecutionContext, inputs, nodes: Lis ), current_value=self._node_concurrency, ) - if self._should_use_async(): - flow_logger.info("Start executing nodes in async mode.") - scheduler = AsyncNodesScheduler(self._tools_manager, self._node_concurrency) - return asyncio.run(scheduler.execute(nodes, inputs, context)) - else: - flow_logger.info("Start executing nodes in thread pool mode.") - scheduler = FlowNodesScheduler(self._tools_manager, inputs, nodes, self._node_concurrency, context) - return scheduler.execute(self._line_timeout_sec) + flow_logger.info("Start executing nodes in thread pool mode.") + scheduler = FlowNodesScheduler(self._tools_manager, inputs, nodes, self._node_concurrency, context) + return scheduler.execute(self._line_timeout_sec) @staticmethod def apply_inputs_mapping( From 43ef9297116e03cb4db90fbf131e6cdbdf94563b Mon Sep 17 00:00:00 2001 From: Robben Wang <350053002@qq.com> Date: Wed, 1 May 2024 16:43:48 +0800 Subject: [PATCH 03/21] Add execution target to distinguish dag or non-dag flow (#3086) # Description Add execution target to distinguish dag or non dag execution. 
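In concept (a minimal, self-contained sketch using simplified stand-in names and string literals in place of the real `FlowType` constants — the actual implementation is in the diff below): each executor class declares its target as a class attribute, and the operation context records that value for tracking and telemetry.

```python
# Minimal sketch of the execution-target pattern; class names mirror the
# promptflow executors, but the context logic here is deliberately simplified.


class OperationContext(dict):
    EXECUTION_TARGET = "execution_target"

    def set_execution_target(self, execution_target: str):
        # Record the target in the context for tracking info; the real code
        # also mirrors the value into OTel span attributes for telemetry.
        self[self.EXECUTION_TARGET] = execution_target


class FlowExecutor:
    _execution_target = "dag"  # DAG (flow.dag.yaml) flows


class ScriptExecutor(FlowExecutor):
    _execution_target = "flex"  # flex (python entry) flows


class PromptyExecutor(ScriptExecutor):
    _execution_target = "prompty"  # prompty files


if __name__ == "__main__":
    context = OperationContext()
    for executor_cls in (FlowExecutor, ScriptExecutor, PromptyExecutor):
        context.set_execution_target(executor_cls._execution_target)
        print(executor_cls.__name__, "->", context[OperationContext.EXECUTION_TARGET])
```

Because the target is a plain class attribute, each executor subclass only overrides the value, and the shared `_update_operation_context` code paths can stamp `self._execution_target` without per-executor branching.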
Will set a different value for each executor:
- FlowExecutor: dag
- ScriptExecutor: flex
- PromptyExecutor: prompty

For now, when customers run the `pf flow test` command against a python file, we will create a flow.flex.yaml file for the customer and report it as flex.

The execution target field will be used in:
1. PowerBI AOAI token page
2. Trace telemetry

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.

Co-authored-by: robbenwang
---
 .../promptflow/executor/_prompty_executor.py  |  3 +++
 .../promptflow/executor/_script_executor.py   |  4 +++-
 .../promptflow/executor/flow_executor.py      | 17 +++++++++++++----
 .../promptflow/tracing/_operation_context.py  |  7 +++++++
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/promptflow-core/promptflow/executor/_prompty_executor.py b/src/promptflow-core/promptflow/executor/_prompty_executor.py
index c5d5987106e..9db90b314aa 100644
--- a/src/promptflow-core/promptflow/executor/_prompty_executor.py
+++ b/src/promptflow-core/promptflow/executor/_prompty_executor.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Any, Dict, Optional

+from promptflow._constants import FlowType
 from promptflow._utils.logger_utils import logger
 from promptflow.contracts.flow import PromptyFlow
 from promptflow.contracts.tool import InputDefinition
@@ -31,6 +32,8 @@ def __init__(
         self.prompty = Prompty.load(source=flow_file, **self._init_kwargs)
         super().__init__(flow_file=flow_file, connections=connections, working_dir=working_dir, storage=storage)

+    _execution_target = FlowType.PROMPTY
+
     @property
     def has_aggregation_node(self):
         return False
diff --git a/src/promptflow-core/promptflow/executor/_script_executor.py b/src/promptflow-core/promptflow/executor/_script_executor.py
index 36011040829..4407f0854b0 100644
--- a/src/promptflow-core/promptflow/executor/_script_executor.py
+++ b/src/promptflow-core/promptflow/executor/_script_executor.py
@@ -9,7 +9,7 @@
 from types import GeneratorType
 from typing import Any, Callable, Dict, List, Mapping, Optional, Union

-from promptflow._constants import LINE_NUMBER_KEY, MessageFormatType
+from promptflow._constants import LINE_NUMBER_KEY, FlowType, MessageFormatType
 from promptflow._core.log_manager import NodeLogManager
 from promptflow._core.run_tracker import RunTracker
 from promptflow._core.tool_meta_generator import PythonLoadError
@@ -97,6 +97,8 @@ def _exec_line_context(self, run_id, line_number):
         with log_manager, self._update_operation_context(run_id, line_number):
             yield

+    _execution_target = FlowType.FLEX_FLOW
+
     def exec_line(
         self,
         inputs: Mapping[str, Any],
diff --git a/src/promptflow-core/promptflow/executor/flow_executor.py b/src/promptflow-core/promptflow/executor/flow_executor.py
index 90cb95bb0e1..d5c7fd1d23c 100644
--- a/src/promptflow-core/promptflow/executor/flow_executor.py
+++ b/src/promptflow-core/promptflow/executor/flow_executor.py
@@ -21,7 +21,7 @@
 from opentelemetry.trace.span import Span, format_trace_id
 from opentelemetry.trace.status import StatusCode

-from promptflow._constants import LINE_NUMBER_KEY
+from promptflow._constants import LINE_NUMBER_KEY, FlowType
 from promptflow._core._errors import NotSupported, UnexpectedError
 from promptflow._core.cache_manager import AbstractCacheManager
 from promptflow._core.flow_execution_context import FlowExecutionContext
@@ -68,6 +68,8 @@
 from promptflow.tracing._trace import enrich_span_with_context, enrich_span_with_input, enrich_span_with_trace_type
 from promptflow.tracing.contracts.trace import TraceType

+DEFAULT_TRACING_KEYS = {"run_mode", "root_run_id", "flow_id", "batch_input_source", "execution_target"}
+

 class FlowExecutor:
     """This class is used to execute a single flow for different inputs.
@@ -165,6 +167,10 @@ def __init__(
         self._message_format = flow.message_format
         self._multimedia_processor = MultimediaProcessor.create(flow.message_format)

+    # This field is used to distinguish the execution target of the flow.
+    # Candidate values for executors are dag, flex and prompty.
+    _execution_target = FlowType.DAG_FLOW
+
     @classmethod
     def create(
         cls,
@@ -348,7 +354,8 @@ def update_operation_context():
         original_context = operation_context.copy()
         try:
             append_promptflow_package_ua(operation_context)
-            operation_context.set_default_tracing_keys({"run_mode", "root_run_id", "flow_id", "batch_input_source"})
+            operation_context.set_execution_target(cls._execution_target)
+            operation_context.set_default_tracing_keys(DEFAULT_TRACING_KEYS)
             operation_context["run_mode"] = RunMode.SingleNode.name
             # Inject OpenAI API to make sure traces and headers injection works and
             # update OpenAI API configs from environment variables.
@@ -788,7 +795,8 @@ def _update_operation_context(self, run_id: str, line_number: int): values_for_otel = {"line_run_id": run_id} try: append_promptflow_package_ua(operation_context) - operation_context.set_default_tracing_keys({"run_mode", "root_run_id", "flow_id", "batch_input_source"}) + operation_context.set_execution_target(execution_target=self._execution_target) + operation_context.set_default_tracing_keys(DEFAULT_TRACING_KEYS) operation_context.run_mode = original_mode operation_context.update(values_for_context) for k, v in values_for_otel.items(): @@ -817,7 +825,8 @@ def _update_operation_context_for_aggregation(self, run_id: str): ) try: append_promptflow_package_ua(operation_context) - operation_context.set_default_tracing_keys({"run_mode", "root_run_id", "flow_id", "batch_input_source"}) + operation_context.set_execution_target(self._execution_target) + operation_context.set_default_tracing_keys(DEFAULT_TRACING_KEYS) operation_context.run_mode = original_mode operation_context.update(values_for_context) for k, v in values_for_otel.items(): diff --git a/src/promptflow-tracing/promptflow/tracing/_operation_context.py b/src/promptflow-tracing/promptflow/tracing/_operation_context.py index ecde2f094cb..e49e1a96787 100644 --- a/src/promptflow-tracing/promptflow/tracing/_operation_context.py +++ b/src/promptflow-tracing/promptflow/tracing/_operation_context.py @@ -21,6 +21,7 @@ class OperationContext(Dict): _current_context = ContextVar(_CONTEXT_KEY, default=None) USER_AGENT_KEY = "user_agent" REQUEST_ID_KEY = "request_id" + EXECUTION_TARGET = "execution_target" _DEFAULT_TRACING_KEYS = "_default_tracing_keys" _OTEL_ATTRIBUTES = "_otel_attributes" _TRACKING_KEYS = "_tracking_keys" @@ -175,6 +176,12 @@ def set_default_tracing_keys(self, keys: set): if key not in self[self._TRACKING_KEYS]: self[self._TRACKING_KEYS].add(key) + def set_execution_target(self, execution_target: str): + # Set in the context for getting tracking info + # Set in otel attributes for telemetry + self[OperationContext.EXECUTION_TARGET] = execution_target + self._add_otel_attributes(OperationContext.EXECUTION_TARGET, execution_target) + def get_context_dict(self): """Get the context dictionary. From 12a0f98a7389e017cba0806ff39921c8c3c834e0 Mon Sep 17 00:00:00 2001 From: Ankit Singhal <30610298+singankit@users.noreply.github.com> Date: Thu, 2 May 2024 22:35:46 -0700 Subject: [PATCH 04/21] Local to remote tracking for evaluation (#3104) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). 
### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Co-authored-by: Billy Hu --- .../workflows/promptflow-evals-e2e-test.yml | 1 + .../workflows/promptflow-evals-unit-test.yml | 1 + .../promptflow/evals/_user_agent.py | 6 + .../promptflow/evals/evaluate/_evaluate.py | 37 ++++- .../promptflow/evals/evaluate/_utils.py | 142 ++++++++++++++++++ src/promptflow-evals/pyproject.toml | 3 + src/promptflow-evals/tests/evals/conftest.py | 36 +++++ .../tests/evals/e2etests/test_evaluate.py | 107 +++++++++++++ .../tests/evals/unittests/test_evaluate.py | 2 +- 9 files changed, 326 insertions(+), 9 deletions(-) create mode 100644 src/promptflow-evals/promptflow/evals/_user_agent.py diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test.yml index 29967d30810..422b7531f0a 100644 --- a/.github/workflows/promptflow-evals-e2e-test.yml +++ b/.github/workflows/promptflow-evals-e2e-test.yml @@ -60,6 +60,7 @@ jobs: poetry run pip install -e ../promptflow-devkit poetry run pip install -e ../promptflow-tracing poetry run pip install -e ../promptflow-tools + poetry run pip install -e ../promptflow-azure working-directory: ${{ env.WORKING_DIRECTORY }} - name: install promptflow-evals from wheel # wildcard expansion (*) does not work in Windows, so leverage python to find and install diff --git a/.github/workflows/promptflow-evals-unit-test.yml b/.github/workflows/promptflow-evals-unit-test.yml index 68c31094300..63ca21266e7 100644 --- a/.github/workflows/promptflow-evals-unit-test.yml +++ b/.github/workflows/promptflow-evals-unit-test.yml @@ -56,6 +56,7 @@ jobs: poetry run pip install -e ../promptflow-devkit poetry run pip install -e ../promptflow-tracing poetry run pip install -e ../promptflow-tools + poetry run pip install -e ../promptflow-azure working-directory: ${{ env.WORKING_DIRECTORY }} - name: install promptflow-evals from wheel # wildcard expansion (*) does not work in Windows, so leverage python to find and install diff --git a/src/promptflow-evals/promptflow/evals/_user_agent.py b/src/promptflow-evals/promptflow/evals/_user_agent.py new file mode 100644 index 00000000000..92e5222bfac --- /dev/null +++ b/src/promptflow-evals/promptflow/evals/_user_agent.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +from promptflow.evals._version import VERSION + +USER_AGENT = "{}/{}".format("promptflow-evals", VERSION) diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py index cb7e4e29ebd..aade5ebcf57 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py @@ -5,13 +5,14 @@ import os import re import tempfile -import uuid from typing import Any, Callable, Dict, Optional, Set, Tuple import pandas as pd from promptflow._sdk._constants import LINE_NUMBER from promptflow.client import PFClient +from ._utils import _log_metrics_and_instance_results +from .._user_agent import USER_AGENT def _calculate_mean(df) -> Dict[str, float]: @@ -104,7 +105,8 @@ def _validate_columns( def _apply_target_to_data( - target: Callable, data: str, pf_client: PFClient, initial_data: pd.DataFrame + target: Callable, data: str, pf_client: PFClient, initial_data: pd.DataFrame, + evaluation_name: Optional[str] = None ) -> Tuple[pd.DataFrame, Set[str]]: """ Apply the target function to the data set and return updated data and generated columns. @@ -123,11 +125,17 @@ def _apply_target_to_data( # We are manually creating the temporary directory for the flow # because the way tempdir remove temporary directories will # hang the debugger, because promptflow will keep flow directory. - run = pf_client.run(flow=target, data=data, name=f"preprocess_{uuid.uuid1()}", stream=True) + run = pf_client.run( + flow=target, + display_name=evaluation_name, + data=data, + properties={"runType": "eval_run"}, + stream=True + ) target_output = pf_client.runs.get_details(run, all_results=True) # Remove input and output prefix prefix = "outputs." - rename_dict = {col: col[len(prefix) :] for col in target_output.columns if col.startswith(prefix)} + rename_dict = {col: col[len(prefix):] for col in target_output.columns if col.startswith(prefix)} # Sort output by line numbers target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True) target_output.sort_index(inplace=True) @@ -140,7 +148,7 @@ def _apply_target_to_data( target_output.rename(columns=rename_dict, inplace=True) # Concatenate output to input target_output = pd.concat([target_output, initial_data], axis=1) - return target_output, set(rename_dict.values()) + return target_output, set(rename_dict.values()), run def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False): @@ -230,11 +238,19 @@ def evaluate( evaluator_config = _process_evaluator_config(evaluator_config) _validate_columns(input_data_df, evaluators, target, evaluator_config) - pf_client = PFClient() + pf_client = PFClient( + config={ + "trace.destination": tracking_uri + } if tracking_uri else None, + user_agent=USER_AGENT, + + ) + target_run = None target_generated_columns = set() if data is not None and target is not None: - input_data_df, target_generated_columns = _apply_target_to_data(target, data, pf_client, input_data_df) + input_data_df, target_generated_columns, target_run = _apply_target_to_data(target, data, pf_client, + input_data_df, evaluation_name) # After we have generated all columns we can check if we have # everything we need for evaluators. 
_validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config) @@ -251,6 +267,7 @@ def evaluate( evaluator_info[evaluator_name] = {} evaluator_info[evaluator_name]["run"] = pf_client.run( flow=evaluator, + run=target_run, column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)), data=data_file, stream=True, @@ -290,5 +307,9 @@ def evaluate( ) result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True) + metrics = _calculate_mean(evaluators_result_df) + + studio_url = _log_metrics_and_instance_results( + metrics, result_df, tracking_uri, target_run, pf_client, data, evaluation_name) - return {"rows": result_df.to_dict("records"), "metrics": _calculate_mean(evaluators_result_df), "traces": {}} + return {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url} diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py index 38f71421bac..66baa8f03ec 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py @@ -1,9 +1,151 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +import logging import json +import os +import tempfile +from pathlib import Path + +import mlflow + +from promptflow._sdk._constants import Local2Cloud +from promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider +from promptflow._utils.async_utils import async_run_allowing_running_loop +from promptflow.azure.operations._async_run_uploader import AsyncRunUploader + +LOGGER = logging.getLogger(__name__) def load_jsonl(path): with open(path, "r", encoding="utf-8") as f: return [json.loads(line) for line in f.readlines()] + + +def _write_properties_to_run_history(properties: dict) -> None: + from mlflow.tracking import MlflowClient + from mlflow.utils.rest_utils import http_request + + # get mlflow run + run = mlflow.active_run() + if run is None: + run = mlflow.start_run() + # get auth from client + client = MlflowClient() + try: + cred = client._tracking_client.store.get_host_creds() # pylint: disable=protected-access + # update host to run history and request PATCH API + cred.host = cred.host.replace("mlflow/v2.0", "mlflow/v1.0").replace("mlflow/v1.0", "history/v1.0") + response = http_request( + host_creds=cred, + endpoint=f"/experimentids/{run.info.experiment_id}/runs/{run.info.run_id}", + method="PATCH", + json={"runId": run.info.run_id, "properties": properties}, + ) + if response.status_code != 200: + LOGGER.error("Fail writing properties '%s' to run history: %s", properties, response.text) + response.raise_for_status() + except AttributeError as e: + LOGGER.error("Fail writing properties '%s' to run history: %s", properties, e) + + +def _azure_pf_client(trace_destination): + from promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider + from promptflow.azure._cli._utils import _get_azure_pf_client + + ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) + azure_pf_client = _get_azure_pf_client( + subscription_id=ws_triad.subscription_id, + resource_group=ws_triad.resource_group_name, + workspace_name=ws_triad.workspace_name, + ) + + return azure_pf_client + + +def _get_mlflow_tracking_uri(trace_destination): + from 
promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider + + azure_pf_client = _azure_pf_client(trace_destination) + ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) + + ws = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name) + return ws.mlflow_tracking_uri + + +def _get_trace_destination_config(tracking_uri): + from promptflow._sdk._configuration import Configuration + pf_config = Configuration(overrides={ + "trace.destination": tracking_uri + } if tracking_uri is not None else {} + ) + + trace_destination = pf_config.get_trace_destination() + + return trace_destination + + +def _log_metrics_and_instance_results(metrics, instance_results, tracking_uri, run, pf_client, data, + evaluation_name=None) -> str: + run_id = None + trace_destination = _get_trace_destination_config(tracking_uri=tracking_uri) + + if trace_destination is None: + return None + + tracking_uri = _get_mlflow_tracking_uri(trace_destination=trace_destination) + + # Adding line_number as index column this is needed by UI to form link to individual instance run + instance_results["line_number"] = instance_results.index + + if run is None: + mlflow.set_tracking_uri(tracking_uri) + + with tempfile.TemporaryDirectory() as tmpdir: + with mlflow.start_run(run_name=evaluation_name) as run: + tmp_path = os.path.join(tmpdir, "eval_results.jsonl") + + with open(tmp_path, "w", encoding="utf-8") as f: + f.write(instance_results.to_json(orient="records", lines=True)) + + mlflow.log_artifact(tmp_path) + + # Using mlflow to create a dummy run since once created via PF show traces of dummy run in UI. + # Those traces can be confusing. + # adding these properties to avoid showing traces if a dummy run is created + _write_properties_to_run_history( + properties={ + "_azureml.evaluation_run": "azure-ai-generative-parent", + "_azureml.evaluate_artifacts": json.dumps([{"path": "eval_results.jsonl", "type": "table"}]) + }) + run_id = run.info.run_id + else: + azure_pf_client = _azure_pf_client(trace_destination=trace_destination) + with tempfile.TemporaryDirectory() as temp_dir: + file_name = Local2Cloud.FLOW_INSTANCE_RESULTS_FILE_NAME + local_file = Path(temp_dir) / file_name + instance_results.to_json(local_file, orient="records", lines=True) + + # overriding instance_results.jsonl file + async_uploader = AsyncRunUploader._from_run_operations(run, azure_pf_client.runs) + remote_file = (f"{Local2Cloud.BLOB_ROOT_PROMPTFLOW}" + f"/{Local2Cloud.BLOB_ARTIFACTS}/{run.name}/{Local2Cloud.FLOW_INSTANCE_RESULTS_FILE_NAME}") + async_run_allowing_running_loop(async_uploader._upload_local_file_to_blob, local_file, remote_file) + run_id = run.name + + client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri) + for metric_name, metric_value in metrics.items(): + client.log_metric(run_id, metric_name, metric_value) + + return _get_ai_studio_url(trace_destination=trace_destination, evaluation_id=run_id) + + +def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str: + ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) + studio_base_url = os.getenv("AI_STUDIO_BASE_URL", "https://ai.azure.com") + + studio_url = f"{studio_base_url}/build/evaluation/{evaluation_id}?wsid=/subscriptions/{ws_triad.subscription_id}" \ + f"/resourceGroups/{ws_triad.resource_group_name}/providers/Microsoft.MachineLearningServices/" \ + f"workspaces/{ws_triad.workspace_name}" + + return studio_url diff --git a/src/promptflow-evals/pyproject.toml 
b/src/promptflow-evals/pyproject.toml index 1d79338ab34..1331e653ee9 100644 --- a/src/promptflow-evals/pyproject.toml +++ b/src/promptflow-evals/pyproject.toml @@ -43,6 +43,9 @@ azure-ai-ml = ">=1.14.0" promptflow-devkit = "<2.0.0" promptflow-core = "<2.0.0" promptflow-tools = "<2.0.0" +promptflow-azure = "<2.0.0" # Needed for remote tracking +mlflow = "<3.0.0" # Needed for remote tracking to log metrics +azureml-mlflow = "<2.0.0" # Needed for remote tracking to log metrics [tool.poetry.group.dev.dependencies] pre-commit = "*" diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py index 80671fca63c..cf8adbd3465 100644 --- a/src/promptflow-evals/tests/evals/conftest.py +++ b/src/promptflow-evals/tests/evals/conftest.py @@ -1,7 +1,9 @@ import json import multiprocessing import os +import subprocess from pathlib import Path +from typing import Dict from unittest.mock import patch import pytest @@ -12,6 +14,8 @@ from promptflow.executor._line_execution_process_pool import _process_wrapper from promptflow.executor._process_manager import create_spawned_fork_process_manager from promptflow.tracing._integrations._openai_injector import inject_openai_api +from promptflow.azure import PFClient as AzurePFClient +from azure.identity import DefaultAzureCredential try: from promptflow.recording.local import recording_array_reset @@ -54,6 +58,11 @@ def configure_default_azure_credential(): creds = dev_connections["pf-evals-sp"]["value"] for key, value in creds.items(): os.environ[key] = value + login_output = subprocess.check_output( + ["az", "login", "--service-principal", "-u", creds["AZURE_CLIENT_ID"], + "-p", creds["AZURE_CLIENT_SECRET"], "--tenant", creds["AZURE_TENANT_ID"]], shell=True) + print("loging_output") + print(login_output) def pytest_configure(): @@ -115,6 +124,33 @@ def project_scope() -> dict: return dev_connections[conn_name]["value"] +@pytest.fixture +def mock_trace_destination_to_cloud(project_scope: dict): + """Mock trace destination to cloud.""" + + subscription_id = project_scope["subscription_id"] + resource_group_name = project_scope["resource_group_name"] + workspace_name = project_scope["project_name"] + + trace_destination = ( + f"azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/" + f"providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}" + ) + with patch("promptflow._sdk._configuration.Configuration.get_trace_destination", return_value=trace_destination): + yield + + +@pytest.fixture +def azure_pf_client(project_scope: Dict): + """The fixture, returning AzurePFClient""" + return AzurePFClient( + subscription_id=project_scope["subscription_id"], + resource_group_name=project_scope["resource_group_name"], + workspace_name=project_scope["project_name"], + credential=DefaultAzureCredential() + ) + + @pytest.fixture def pf_client() -> PFClient: """The fixture, returning PRClient""" diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py index f1f30245271..dcb496d3522 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py @@ -4,9 +4,11 @@ import numpy as np import pandas as pd import pytest +import requests from promptflow.evals.evaluate import evaluate from promptflow.evals.evaluators import F1ScoreEvaluator, GroundednessEvaluator +from azure.identity import AzureCliCredential @pytest.fixture @@ -25,6 +27,35 @@ def 
answer_evaluator(answer): return {"length": len(answer)} +def _get_run_from_run_history(flow_run_id, runs_operation): + """Get run info from run history""" + token = "Bearer " + AzureCliCredential().get_token("https://management.azure.com/.default").token + headers = { + "Authorization": token, + "Content-Type": "application/json", + } + url = runs_operation._run_history_endpoint_url + "/rundata" + + payload = { + "runId": flow_run_id, + "selectRunMetadata": True, + "selectRunDefinition": True, + "selectJobSpecification": True, + } + + response = requests.post(url, headers=headers, json=payload) + if response.status_code == 200: + run = response.json() + # if original_form is True, return the original run data from run history, mainly for test use + return run + elif response.status_code == 404: + raise Exception(f"Run {flow_run_id!r} not found.") + else: + raise Exception( + f"Failed to get run from service. Code: {response.status_code}, text: {response.text}" + ) + + @pytest.mark.usefixtures("model_config", "recording_injection", "data_file") @pytest.mark.e2etest class TestEvaluate: @@ -62,6 +93,7 @@ def test_groundedness_evaluator(self, model_config, data_file): assert row_result_df["outputs.grounded.gpt_groundedness"][2] in [4, 5] assert row_result_df["outputs.f1_score.f1_score"][2] == 1 + assert result["studio_url"] is None def test_evaluate_python_function(self, data_file): # data @@ -158,3 +190,78 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config): assert "answer.length" in metrics.keys() assert "f1_score.f1_score" in metrics.keys() + + @pytest.mark.skip(reason="az login in fixture is not working on ubuntu and mac.Works on windows") + def test_evaluate_track_in_cloud(self, questions_file, azure_pf_client, mock_trace_destination_to_cloud, + configure_default_azure_credential): + """Test evaluation with target function.""" + # We cannot define target in this file as pytest will load + # all modules in test folder and target_fn will be imported from the first + # module named test_evaluate and it will be a different module in unit test + # folder. By keeping function in separate file we guarantee, it will be loaded + # from there. 
+ from .target_fn import target_fn + + f1_score_eval = F1ScoreEvaluator() + evaluation_name = "test_evaluate_track_in_cloud" + # run the evaluation with targets + result = evaluate( + evaluation_name=evaluation_name, + data=questions_file, + target=target_fn, + evaluators={"answer": answer_evaluator, "f1": f1_score_eval}, + ) + row_result_df = pd.DataFrame(result["rows"]) + + assert "outputs.answer" in row_result_df.columns + assert "outputs.answer.length" in row_result_df.columns + assert list(row_result_df["outputs.answer.length"]) == [28, 76, 22] + assert "outputs.f1.f1_score" in row_result_df.columns + assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"]) + assert result["studio_url"] is not None + + # get remote run and validate if it exists + run_id = result["studio_url"].split("?")[0].split("/")[5] + remote_run = azure_pf_client.runs.get(run_id) + + assert remote_run is not None + assert remote_run.properties["azureml.promptflow.local_to_cloud"] == "true" + assert remote_run.properties["runType"] == "eval_run" + assert remote_run.display_name == evaluation_name + + @pytest.mark.skip(reason="az login in fixture is not working on ubuntu and mac.Works on windows") + def test_evaluate_track_in_cloud_no_target(self, data_file, azure_pf_client, mock_trace_destination_to_cloud, + configure_default_azure_credential): + # data + input_data = pd.read_json(data_file, lines=True) + + f1_score_eval = F1ScoreEvaluator() + evaluation_name = "test_evaluate_track_in_cloud_no_target" + + # run the evaluation + result = evaluate( + evaluation_name=evaluation_name, + data=data_file, + evaluators={"f1_score": f1_score_eval}, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + assert "outputs.f1_score.f1_score" in row_result_df.columns.to_list() + assert "f1_score.f1_score" in metrics.keys() + assert metrics.get("f1_score.f1_score") == np.nanmean(row_result_df["outputs.f1_score.f1_score"]) + assert row_result_df["outputs.f1_score.f1_score"][2] == 1 + assert result["studio_url"] is not None + + # get remote run and validate if it exists + run_id = result["studio_url"].split("?")[0].split("/")[5] + remote_run = _get_run_from_run_history(run_id, azure_pf_client.runs) + + assert remote_run is not None + assert remote_run["runMetadata"]["properties"]["_azureml.evaluation_run"] == "azure-ai-generative-parent" + assert remote_run["runMetadata"]["displayName"] == evaluation_name diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py index 7f668aa8b70..913f6a7f73c 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py @@ -122,7 +122,7 @@ def test_wrong_target(self, questions_file): def test_apply_target_to_data(self, pf_client, questions_file, questions_answers_file): """Test that target was applied correctly.""" initial_data = pd.read_json(questions_file, lines=True) - qa_df, columns = _apply_target_to_data(_target_fn, questions_file, pf_client, initial_data) + qa_df, columns, target_run = _apply_target_to_data(_target_fn, questions_file, pf_client, initial_data) assert columns == {"answer"} ground_truth = pd.read_json(questions_answers_file, lines=True) assert_frame_equal(qa_df, ground_truth, check_like=True) From f9a7a851f758db52fcc8660ef00d3ce53ab2328d 
Mon Sep 17 00:00:00 2001 From: chenyang Date: Mon, 6 May 2024 11:26:14 +0800 Subject: [PATCH 05/21] fix config override tracing.destination (#3057) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. 1. azureml full ![image](https://github.com/microsoft/promptflow/assets/23182548/33ab9606-478c-4782-909f-bb36311bdc09) ![image](https://github.com/microsoft/promptflow/assets/23182548/2dbebc6c-1b18-40ed-9e19-968e8fdbd8eb) 2. azureml ![image](https://github.com/microsoft/promptflow/assets/23182548/44ffee1b-0cb7-4adf-ade5-d5b1d36189f4) # All Promptflow Contribution checklist: - [x] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [x] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [x] Pull request includes test coverage for the included changes. --- .../_sdk/_orchestrator/run_submitter.py | 9 ++++----- .../promptflow/_sdk/_pf_client.py | 6 +++--- .../promptflow/_sdk/_tracing.py | 19 +++++++++++++------ .../promptflow/_sdk/entities/_run.py | 8 +++----- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/promptflow-devkit/promptflow/_sdk/_orchestrator/run_submitter.py b/src/promptflow-devkit/promptflow/_sdk/_orchestrator/run_submitter.py index 982489a7122..5df99665b18 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_orchestrator/run_submitter.py +++ b/src/promptflow-devkit/promptflow/_sdk/_orchestrator/run_submitter.py @@ -23,7 +23,6 @@ from promptflow.tracing._operation_context import OperationContext from promptflow.tracing._start_trace import is_collection_writeable, start_trace -from .._configuration import Configuration from .._load_functions import load_flow from ..entities._flows import FlexFlow from .utils import SubmitterHelper, variant_overwrite_context @@ -36,7 +35,7 @@ class RunSubmitter: def __init__(self, client): self._client = client - self._config = Configuration(overrides=self._client._config) + self._config = self._client._config self.run_operations = self._client.runs def submit(self, run: Run, stream=False, **kwargs): @@ -96,13 +95,13 @@ def _run_bulk(self, run: Run, stream=False, **kwargs): # pass with internal parameter `_collection` start_trace( attributes=attributes, - run=run.name, + run=run, _collection=collection_for_run, path=flow_path, ) else: logger.debug("trace collection is protected, will honor existing collection.") - start_trace(attributes=attributes, run=run.name, path=flow_path) + start_trace(attributes=attributes, run=run, path=flow_path) self._validate_inputs(run=run) @@ -276,7 +275,7 @@ def _upload_run_to_cloud(cls, run: Run): from promptflow._sdk._tracing import _get_ws_triad_from_pf_config from promptflow.azure._cli._utils import _get_azure_pf_client - ws_triad = 
_get_ws_triad_from_pf_config(path=run._get_flow_dir().resolve()) + ws_triad = _get_ws_triad_from_pf_config(path=run._get_flow_dir().resolve(), config=run._config) pf = _get_azure_pf_client( subscription_id=ws_triad.subscription_id, resource_group=ws_triad.resource_group_name, diff --git a/src/promptflow-devkit/promptflow/_sdk/_pf_client.py b/src/promptflow-devkit/promptflow/_sdk/_pf_client.py index bb5c63f5af4..8da79af330f 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_pf_client.py +++ b/src/promptflow-devkit/promptflow/_sdk/_pf_client.py @@ -44,7 +44,7 @@ def __init__(self, **kwargs): # when this is set, telemetry from this client will use this user agent and ignore the one from OperationContext self._user_agent_override = kwargs.pop(USER_AGENT_OVERRIDE_KEY, None) self._connection_provider = kwargs.pop("connection_provider", None) - self._config = kwargs.get("config", None) or {} + self._config = Configuration(overrides=kwargs.get("config", None) or {}) # The credential is used as an option to override # DefaultAzureCredential when using workspace connection provider self._credential = kwargs.get("credential", None) @@ -212,7 +212,7 @@ def _run( connections=connections, environment_variables=environment_variables, properties=properties, - config=Configuration(overrides=self._config), + config=self._config, init=init, dynamic_callable=dynamic_callable, ) @@ -372,7 +372,7 @@ def tools(self) -> ToolOperations: def _ensure_connection_provider(self) -> str: if not self._connection_provider: # Get a copy with config override instead of the config instance - self._connection_provider = Configuration(overrides=self._config).get_connection_provider() + self._connection_provider = self._config.get_connection_provider() logger.debug("PFClient connection provider: %s, setting to env.", self._connection_provider) from promptflow.core._connection_provider._connection_provider import ConnectionProvider diff --git a/src/promptflow-devkit/promptflow/_sdk/_tracing.py b/src/promptflow-devkit/promptflow/_sdk/_tracing.py index a6fa980fdb1..0e21ceec725 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_tracing.py +++ b/src/promptflow-devkit/promptflow/_sdk/_tracing.py @@ -56,6 +56,7 @@ extract_workspace_triad_from_trace_provider, ) from promptflow._sdk._utilities.tracing_utils import get_workspace_kind, parse_kv_from_pb_attribute, parse_protobuf_span +from promptflow._sdk.entities import Run from promptflow._utils.logger_utils import get_cli_sdk_logger from promptflow._utils.thread_utils import ThreadWithContextVars from promptflow.tracing._integrations._openai_injector import inject_openai_api @@ -195,14 +196,15 @@ def _invoke_pf_svc() -> str: return port -def _get_ws_triad_from_pf_config(path: typing.Optional[Path]) -> typing.Optional[AzureMLWorkspaceTriad]: +def _get_ws_triad_from_pf_config(path: typing.Optional[Path], config=None) -> typing.Optional[AzureMLWorkspaceTriad]: from promptflow._sdk._configuration import Configuration - config = Configuration.get_instance().get_trace_destination(path=path) - _logger.info("resolved tracing.trace.destination: %s", config) - if not TraceDestinationConfig.need_to_export_to_azure(config): + config = config or Configuration.get_instance() + trace_destination = config.get_trace_destination(path=path) + _logger.info("resolved tracing.trace.destination: %s", trace_destination) + if not TraceDestinationConfig.need_to_export_to_azure(trace_destination): return None - return extract_workspace_triad_from_trace_provider(config) + return 
extract_workspace_triad_from_trace_provider(trace_destination) # priority: run > experiment > collection @@ -368,6 +370,11 @@ def start_trace_with_devkit(collection: str, **kwargs: typing.Any) -> None: _logger.debug("kwargs: %s", kwargs) attrs = kwargs.get("attributes", None) run = kwargs.get("run", None) + if isinstance(run, Run): + run_config = run._config + run = run.name + else: + run_config = None path = kwargs.get("path", None) # honor and set attributes if user has specified @@ -395,7 +402,7 @@ def start_trace_with_devkit(collection: str, **kwargs: typing.Any) -> None: # local to cloud feature _logger.debug("start_trace_with_devkit.path(from kwargs): %s", path) - ws_triad = _get_ws_triad_from_pf_config(path=path) + ws_triad = _get_ws_triad_from_pf_config(path=path, config=run_config) is_azure_ext_installed = _is_azure_ext_installed() if ws_triad is not None and not is_azure_ext_installed: warning_msg = ( diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_run.py b/src/promptflow-devkit/promptflow/_sdk/entities/_run.py index 2bf42c29934..8196634f1e7 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_run.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_run.py @@ -172,15 +172,14 @@ def __init__( # default run name: flow directory name + timestamp self.name = name or self._generate_run_name() experiment_name = kwargs.get("experiment_name", None) + self._config: Configuration = kwargs.get("config", Configuration.get_instance()) if self._run_source == RunInfoSources.LOCAL and not self._use_remote_flow: self.flow = Path(str(flow)).resolve().absolute() flow_dir = self._get_flow_dir() # sanitize flow_dir to avoid invalid experiment name self._experiment_name = _sanitize_python_variable_name(flow_dir.name) self._lineage_id = get_flow_lineage_id(flow_dir=flow_dir) - self._output_path = Path( - kwargs.get("output_path", self._generate_output_path(config=kwargs.get("config", None))) - ) + self._output_path = Path(kwargs.get("output_path", self._generate_output_path(config=self._config))) if is_prompty_flow(self.flow): self._flow_name = Path(self.flow).stem else: @@ -786,8 +785,7 @@ def _validate_for_run_create_operation(self): if not self.run and not self.data: raise UserErrorException("at least one of data or run must be provided") - def _generate_output_path(self, config: Optional[Configuration]) -> Path: - config = config or Configuration.get_instance() + def _generate_output_path(self, config: Configuration) -> Path: path = config.get_run_output_path() if path is None: path = HOME_PROMPT_FLOW_DIR / ".runs" From f8c7738199e358c34ad2021836ee2cab21b0a778 Mon Sep 17 00:00:00 2001 From: Ge Gao <49388944+dorisjoy@users.noreply.github.com> Date: Mon, 6 May 2024 11:44:28 +0800 Subject: [PATCH 06/21] Add environment into flow.dag.yaml for eval-chat-math flow (#3082) Add environment into flow.dag.yaml for eval-chat-math flow: run link in portal: https://ml.azure.com/prompts/bulkrun/eval_chat_math_variant_0_20240430_143934_923359/details?wsid=/subscriptions/96aede12-2f73-41cb-b983-6d11a904839b/resourcegroups/promptflow/providers/Microsoft.MachineLearningServices/workspaces/promptflow-eastus-dev&tid=72f988bf-86f1-41af-91ab-2d7cd011db47 Co-authored-by: Ge Gao --- examples/flows/evaluation/eval-chat-math/flow.dag.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/flows/evaluation/eval-chat-math/flow.dag.yaml b/examples/flows/evaluation/eval-chat-math/flow.dag.yaml index d7e9a591ae3..ff302d88b49 100644 --- 
a/examples/flows/evaluation/eval-chat-math/flow.dag.yaml +++ b/examples/flows/evaluation/eval-chat-math/flow.dag.yaml @@ -1,3 +1,4 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json inputs: groundtruth: type: string @@ -31,4 +32,5 @@ nodes: aggregation: true use_variants: false node_variants: {} -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json +environment: + python_requirements_txt: requirements.txt From c8f0e8c6b412b04660d7bd73004357ee1840bb53 Mon Sep 17 00:00:00 2001 From: Honglin Date: Mon, 6 May 2024 12:51:49 +0800 Subject: [PATCH 07/21] [CI] Update link check white list (#3111) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --- scripts/docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/docs/conf.py b/scripts/docs/conf.py index cb1be38a695..80b8e2a4865 100644 --- a/scripts/docs/conf.py +++ b/scripts/docs/conf.py @@ -59,6 +59,8 @@ # Options for the linkcheck builder linkcheck_ignore = [ + # openai related sites blocks the IP of the CI server. + r"https://openai\.com/", r"https://platform\.openai\.com/", r"https://help\.openai\.com/", # These are used in card links, for example 'xx.html', .md can't be resolved. From 1607cc71fe8ce603d20723758ffcc3e2045281b9 Mon Sep 17 00:00:00 2001 From: riddle xu Date: Mon, 6 May 2024 14:02:02 +0800 Subject: [PATCH 08/21] [Internal] Make runtime cloud trace also async (#3081) # Description Currently, the cloud trace in runtime is in sync mode. This seems unnecessary. The local and runtime should use the same async pattern. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. 
For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. Co-authored-by: Yangtong Xu --- .../promptflow/_sdk/_tracing.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/promptflow-devkit/promptflow/_sdk/_tracing.py b/src/promptflow-devkit/promptflow/_sdk/_tracing.py index 0e21ceec725..2a1a0b69f37 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_tracing.py +++ b/src/promptflow-devkit/promptflow/_sdk/_tracing.py @@ -609,17 +609,11 @@ def process_otlp_trace_request( else: all_spans.append(span) - if cloud_trace_only: - # If we only trace to cloud, we should make sure the data writing is success before return. - _try_write_trace_to_cosmosdb( - all_spans, get_created_by_info_with_cache, logger, get_credential, is_cloud_trace=True - ) - else: - # Create a new thread to write trace to cosmosdb to avoid blocking the main thread - ThreadWithContextVars( - target=_try_write_trace_to_cosmosdb, - args=(all_spans, get_created_by_info_with_cache, logger, get_credential, False), - ).start() + # Create a new thread to write trace to cosmosdb to avoid blocking the main thread + ThreadWithContextVars( + target=_try_write_trace_to_cosmosdb, + args=(all_spans, get_created_by_info_with_cache, logger, get_credential, cloud_trace_only), + ).start() return From 637ad8a57a9ad50513adb03b186aa969b163c3a5 Mon Sep 17 00:00:00 2001 From: Ying Chen Date: Mon, 6 May 2024 14:58:27 +0800 Subject: [PATCH 09/21] Fix executable test since streamlit upgrade (#3108) # Description Root cause: streamlit latest package doesn't allow import streamlit_quill when runtime environment is not created. Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. 
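For reference, a minimal sketch of the deferred-import pattern this fix applies (the wrapper function here is hypothetical; the actual change moves the import inside the input-rendering branch of `main.py`):

```python
import streamlit as st


def render_list_input(flow_input: str):
    """Hypothetical helper showing the fix: import streamlit_quill lazily."""
    st.text(flow_input)
    # Deferred import: executed only once the streamlit runtime environment
    # exists, instead of at module import time where it used to fail.
    from streamlit_quill import st_quill

    return st_quill(html=True, toolbar=["image"])
```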
--------- Co-authored-by: Ying Chen <2601502859@qq.com> --- .../promptflow/_sdk/data/executable/main.py | 3 +- .../_sdk/operations/_flow_operations.py | 1 - .../sdk_cli_test/e2etests/test_executable.py | 48 +++++++++---------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/src/promptflow-devkit/promptflow/_sdk/data/executable/main.py b/src/promptflow-devkit/promptflow/_sdk/data/executable/main.py index 6fd10f2c7e6..2a8a3e1c385 100644 --- a/src/promptflow-devkit/promptflow/_sdk/data/executable/main.py +++ b/src/promptflow-devkit/promptflow/_sdk/data/executable/main.py @@ -7,7 +7,6 @@ import streamlit as st from PIL import Image -from streamlit_quill import st_quill from utils import dict_iter_render_message, parse_image_content, parse_list_from_html, render_single_dict_message from promptflow._constants import STREAMING_ANIMATION_TIME @@ -156,6 +155,8 @@ def submit(**kwargs) -> None: for flow_input, (default_value, value_type) in flow_inputs.items(): if value_type == "list": st.text(flow_input) + from streamlit_quill import st_quill + input = st_quill( html=True, toolbar=["image"], diff --git a/src/promptflow-devkit/promptflow/_sdk/operations/_flow_operations.py b/src/promptflow-devkit/promptflow/_sdk/operations/_flow_operations.py index d498aa2c72c..62cb207f50d 100644 --- a/src/promptflow-devkit/promptflow/_sdk/operations/_flow_operations.py +++ b/src/promptflow-devkit/promptflow/_sdk/operations/_flow_operations.py @@ -589,7 +589,6 @@ def _build_as_executable( import bs4 # noqa: F401 import PyInstaller # noqa: F401 import streamlit - import streamlit_quill # noqa: F401 except ImportError as ex: raise UserErrorException( f"Please try 'pip install promptflow[executable]' to install dependency, {ex.msg}." diff --git a/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_executable.py b/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_executable.py index 97a3107b878..7486aedb883 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_executable.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_executable.py @@ -1,4 +1,3 @@ -import shutil import tempfile from pathlib import Path @@ -19,28 +18,25 @@ class TestExecutable: def test_flow_build_executable(self): source = f"{FLOWS_DIR}/web_classification/flow.dag.yaml" - try: - with tempfile.TemporaryDirectory() as temp_dir: - run_pf_command( - "flow", - "build", - "--source", - source, - "--output", - temp_dir, - "--format", - "executable", - ) - check_path_list = [ - "flow/flow.dag.yaml", - "connections/azure_open_ai_connection.yaml", - "pf.bat", - "pf", - "start_pfs.vbs", - ] - output_path = Path(temp_dir).resolve() - for check_path in check_path_list: - check_path = output_path / check_path - assert check_path.exists() - finally: - shutil.rmtree(output_path, ignore_errors=True) + with tempfile.TemporaryDirectory() as temp_dir: + run_pf_command( + "flow", + "build", + "--source", + source, + "--output", + temp_dir, + "--format", + "executable", + ) + check_path_list = [ + "flow/flow.dag.yaml", + "connections/azure_open_ai_connection.yaml", + "pf.bat", + "pf", + "start_pfs.vbs", + ] + output_path = Path(temp_dir).resolve() + for check_path in check_path_list: + check_path = output_path / check_path + assert check_path.exists() From e392e8776fa466c76b565f2f99e42d6f67f06144 Mon Sep 17 00:00:00 2001 From: Ge Gao <49388944+dorisjoy@users.noreply.github.com> Date: Mon, 6 May 2024 15:47:35 +0800 Subject: [PATCH 10/21] Change tools-tests workflow to use federated credential instead of client secret 
(#3077) Change tools-tests workflow to use federated credential instead of client secret: 1. In tools_tests.yml file to add permission section for id_token and content, this setting allows the JWT to be requested from GitHub's OIDC provider. 2. Add azure/login@v1 to az login using federated credential; 3. Using AzureCliCredential instead of ClientSecretCredential to init SecretClient. workflow test link: https://github.com/microsoft/promptflow/actions/runs/8888730637 Delete the tools_secret_upload workflow since it is useless now. --------- Co-authored-by: Ge Gao --- .github/workflows/tools_secret_upload.yml | 70 ---------------------- .github/workflows/tools_tests.yml | 20 ++++++- scripts/tool/generate_connection_config.py | 5 +- scripts/tool/upload_tool_secret.py | 41 ------------- scripts/tool/utils/secret_manager.py | 59 +----------------- scripts/tool/validate_tool_secret.py | 39 ------------ 6 files changed, 21 insertions(+), 213 deletions(-) delete mode 100644 .github/workflows/tools_secret_upload.yml delete mode 100644 scripts/tool/upload_tool_secret.py delete mode 100644 scripts/tool/validate_tool_secret.py diff --git a/.github/workflows/tools_secret_upload.yml b/.github/workflows/tools_secret_upload.yml deleted file mode 100644 index 205b958b816..00000000000 --- a/.github/workflows/tools_secret_upload.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: tools_secret_upload -# Triggers the workflow manually -on: - workflow_dispatch: - inputs: - secret_name: - description: 'Name of the secret' - required: true - type: string - secret_value: - description: 'Value of the secret' - required: true - type: string -jobs: - approve_upload_tool_secrets_job: - runs-on: ubuntu-latest - name: wait for approval - timeout-minutes: 60 - - permissions: - issues: write - - steps: - - name: Echo inputs - run: echo "Secret name:${{ github.event.inputs.secret_name }}" - - - name: Wait for approval - uses: trstringer/manual-approval@v1 - timeout-minutes: 60 - with: - secret: ${{ github.TOKEN }} - approvers: 16oeahr,chjinche,DaweiCai - minimum-approvals: 1 - issue-title: "Request to upload secret to key vault for e2e test." 
- upload_secret_job: - name: upload secret - runs-on: ubuntu-latest - needs: approve_upload_tool_secrets_job - timeout-minutes: 60 - - steps: - - name: Add Mask - run: | - SECRET_VALUE=$(jq -r '.inputs.secret_value' $GITHUB_EVENT_PATH) - echo "::add-mask::$SECRET_VALUE" - - - name: Check for dockerenv file - run: (ls /.dockerenv && echo Found dockerenv) || (echo No dockerenv) - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python 3.9 environment - uses: actions/setup-python@v4 - with: - python-version: "3.9" - - run: | - python -m pip install --upgrade pip - pip install azure-identity==1.12.0 - pip install azure-keyvault-secrets==4.6.0 - pip install azure-core==1.26.4 - - - name: Validate - run: | - python scripts/tool/validate_tool_secret.py --tenant_id ${{ secrets.TENANT_ID }} --client_id ${{ secrets.CLIENT_ID }} --client_secret ${{ secrets.CLIENT_SECRET }} --secret_name ${{ github.event.inputs.secret_name }} - - - name: Start upload - run: | - python scripts/tool/upload_tool_secret.py --tenant_id ${{ secrets.TENANT_ID }} --client_id ${{ secrets.CLIENT_ID }} --client_secret ${{ secrets.CLIENT_SECRET }} --secret_name ${{ github.event.inputs.secret_name }} --secret_value ${{ github.event.inputs.secret_value }} - diff --git a/.github/workflows/tools_tests.yml b/.github/workflows/tools_tests.yml index 1c34f0684d1..5eb00032f6e 100644 --- a/.github/workflows/tools_tests.yml +++ b/.github/workflows/tools_tests.yml @@ -1,10 +1,16 @@ name: tools_tests +permissions: + # This is required for requesting the JWT + id-token: write + # This is required for actions/checkout + contents: read on: + workflow_dispatch: pull_request_target: paths: - src/promptflow-tools/** - - '**tools_tests.yml' - workflow_dispatch: + - scripts/tool/** + - .github/workflows/tools_tests.yml jobs: authorize: environment: @@ -53,9 +59,17 @@ jobs: pip install azure-mgmt-cognitiveservices==13.5.0 fi pip list + + - name: Azure login + uses: azure/login@v1 + with: + client-id: ${{ secrets.CLIENT_ID }} + tenant-id: ${{ secrets.TENANT_ID }} + subscription-id: ${{ secrets.TEST_WORKSPACE_SUB_ID }} + - name: Generate configs run: | - python ./scripts/tool/generate_connection_config.py --tenant_id ${{ secrets.TENANT_ID }} --client_id ${{ secrets.CLIENT_ID }} --client_secret ${{ secrets.CLIENT_SECRET }} + python ./scripts/tool/generate_connection_config.py - name: Run tests run: | diff --git a/scripts/tool/generate_connection_config.py b/scripts/tool/generate_connection_config.py index 0a2a9b23837..622fe787574 100644 --- a/scripts/tool/generate_connection_config.py +++ b/scripts/tool/generate_connection_config.py @@ -21,9 +21,6 @@ def fill_key_to_dict(template_dict, keys_dict): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--tenant_id", type=str, help="The tenant id of the service principal") - parser.add_argument("--client_id", type=str, help="The client id of the service principal") - parser.add_argument("--client_secret", type=str, help="The client secret of the service principal") parser.add_argument("--local", action='store_true', help="local debug mode") args = parser.parse_args() @@ -32,7 +29,7 @@ def fill_key_to_dict(template_dict, keys_dict): print(f"file_path: {file_path}") if not args.local: - client = get_secret_client(tenant_id=args.tenant_id, client_id=args.client_id, client_secret=args.client_secret) + client = get_secret_client() all_secret_names = list_secret_names(client) 
data = {secret_name: get_secret(secret_name, client) for secret_name in all_secret_names} diff --git a/scripts/tool/upload_tool_secret.py b/scripts/tool/upload_tool_secret.py deleted file mode 100644 index bf893b983a2..00000000000 --- a/scripts/tool/upload_tool_secret.py +++ /dev/null @@ -1,41 +0,0 @@ -import argparse -from utils.secret_manager import get_secret_client, upload_secret - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant_id", - type=str, - required=True, - help="The tenant id of the service principal", - ) - parser.add_argument( - "--client_id", - type=str, - required=True, - help="The client id of the service principal", - ) - parser.add_argument( - "--client_secret", - type=str, - required=True, - help="The client secret of the service principal", - ) - parser.add_argument( - "--secret_name", - type=str, - required=True, - ) - parser.add_argument( - "--secret_value", - type=str, - required=True, - ) - args = parser.parse_args() - - secret_client = get_secret_client( - args.tenant_id, args.client_id, args.client_secret - ) - - upload_secret(secret_client, args.secret_name, args.secret_value) diff --git a/scripts/tool/utils/secret_manager.py b/scripts/tool/utils/secret_manager.py index 597ba1a230e..97bdccac2d7 100644 --- a/scripts/tool/utils/secret_manager.py +++ b/scripts/tool/utils/secret_manager.py @@ -1,36 +1,17 @@ -import re - -from azure.core.exceptions import HttpResponseError, ResourceExistsError -from azure.identity import ClientSecretCredential +from azure.identity import AzureCliCredential from azure.keyvault.secrets import SecretClient -from exceptions import ( - SecretNameAlreadyExistsException, - SecretNameInvalidException, - SecretNoSetPermissionException, -) key_vault_name = "github-promptflow" -container_name = "tools" KVUri = f"https://{key_vault_name}.vault.azure.net" -def init_used_secret_names(client: SecretClient): - global reserved_secret_names - reserved_secret_names = list_secret_names(client) - - -def get_secret_client( - tenant_id: str, client_id: str, client_secret: str -) -> SecretClient: - credential = ClientSecretCredential(tenant_id, client_id, client_secret) +def get_secret_client() -> SecretClient: + credential = AzureCliCredential() client = SecretClient(vault_url=KVUri, credential=credential) return client -reserved_secret_names = [] - - def get_secret(secret_name: str, client: SecretClient): secret = client.get_secret(secret_name) @@ -41,37 +22,3 @@ def list_secret_names(client: SecretClient) -> list: secret_properties = client.list_properties_of_secrets() return [secret.name for secret in secret_properties] - - -def validate_secret_name(secret_name: str): - # Check if secret name is valid. Secret name can only contain alphanumeric characters and dashes. 
- pattern = "^[a-zA-Z0-9-]+$" - if not re.match(pattern, secret_name): - raise SecretNameInvalidException( - "Secret name can only contain alphanumeric characters and dashes" - ) - # Check if secret name is one of the reserved names - if secret_name in reserved_secret_names: - raise SecretNameAlreadyExistsException( - f"Secret name {secret_name} already exists" - ) - - -def upload_secret(client: SecretClient, secret_name: str, secret_value: str): - try: - client.set_secret(secret_name, secret_value) - except ResourceExistsError as ex: - if "in a deleted but recoverable state" in str(ex): - raise SecretNameAlreadyExistsException( - f"Secret name {secret_name} is deleted but recoverable, and its name cannot be reused" - ) - except HttpResponseError as ex: - if ( - ex.status_code == 403 - and "does not have secrets set permission on key vault" in str(ex) - ): - raise SecretNoSetPermissionException( - f"No set permission on key vault {key_vault_name}" - ) - - print("Done.") diff --git a/scripts/tool/validate_tool_secret.py b/scripts/tool/validate_tool_secret.py deleted file mode 100644 index 9c50140a4c2..00000000000 --- a/scripts/tool/validate_tool_secret.py +++ /dev/null @@ -1,39 +0,0 @@ -import argparse -from utils.secret_manager import ( - get_secret_client, - init_used_secret_names, - validate_secret_name, -) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant_id", - type=str, - required=True, - help="The tenant id of the service principal", - ) - parser.add_argument( - "--client_id", - type=str, - required=True, - help="The client id of the service principal", - ) - parser.add_argument( - "--client_secret", - type=str, - required=True, - help="The client secret of the service principal", - ) - parser.add_argument( - "--secret_name", - type=str, - required=True, - ) - args = parser.parse_args() - - secret_client = get_secret_client( - args.tenant_id, args.client_id, args.client_secret - ) - init_used_secret_names(secret_client) - validate_secret_name(args.secret_name) From 83f2c995198921b9d9699386cc5b8c5cac662c2c Mon Sep 17 00:00:00 2001 From: Zhengfei Wang <38847871+zhengfeiwang@users.noreply.github.com> Date: Mon, 6 May 2024 17:36:18 +0800 Subject: [PATCH 11/21] [promptflow] Bump pf version in main branch (#3115) # Description Otherwise, when install from local, the version will become 1.10.0. # All Promptflow Contribution checklist: - [x] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [x] Title of the pull request is clear and informative. - [x] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. 
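As a quick sanity check (a sketch, not part of the change), a local editable install from `main` should now report the bumped dev version:

```python
# After e.g. `pip install -e src/promptflow`, the in-repo version constant
# should be the new dev version rather than the released 1.10.0.
from promptflow._version import VERSION

assert VERSION == "1.11.0.dev0", VERSION
```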
--- src/promptflow/promptflow/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/promptflow/promptflow/_version.py b/src/promptflow/promptflow/_version.py index 344d35d0a6e..c21091e23dc 100644 --- a/src/promptflow/promptflow/_version.py +++ b/src/promptflow/promptflow/_version.py @@ -2,4 +2,4 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -VERSION = "1.10.0" +VERSION = "1.11.0.dev0" From 57c632a395a4ff58a85e343e5e52fdab09c33748 Mon Sep 17 00:00:00 2001 From: Xiaopeng Wang Date: Mon, 6 May 2024 18:07:03 +0800 Subject: [PATCH 12/21] [pfserving] Improve log and add OTLP exporter aad auth support (#3112) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. 
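A usage sketch for the new AAD auth support: it is driven entirely by two environment variables (names taken from the change below); when they are unset, the exporters behave as before:

```python
import os

# Enable AAD auth for the OTLP trace/metric exporters (new in this change).
os.environ["OTEL_EXPORTER_OTLP_AAD_AUTH_ENABLE"] = "true"
# Optional; defaults to the ARM scope when unset.
os.environ["OTEL_EXPORTER_OTLP_AAD_AUTH_SCOPE"] = "https://management.azure.com/.default"

# With these set, each export acquires a token via DefaultAzureCredential
# (requires azure-identity) and attaches it as "Authorization: Bearer <token>".
```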
--------- Co-authored-by: xiaopwan --- .../_serving/extension/azureml_extension.py | 2 +- .../otel_exporter_provider_factory.py | 71 ++++++++++++++++++- .../promptflow/core/_serving/flow_invoker.py | 15 ++-- 3 files changed, 81 insertions(+), 7 deletions(-) diff --git a/src/promptflow-core/promptflow/core/_serving/extension/azureml_extension.py b/src/promptflow-core/promptflow/core/_serving/extension/azureml_extension.py index 3cfb2be4989..ade6ec5222e 100644 --- a/src/promptflow-core/promptflow/core/_serving/extension/azureml_extension.py +++ b/src/promptflow-core/promptflow/core/_serving/extension/azureml_extension.py @@ -97,7 +97,7 @@ def get_override_connections(self, flow: Flow) -> Tuple[dict, dict]: conn = WorkspaceConnectionProvider._convert_to_connection_dict(connection_name, conn_data) connections[connection_name] = conn except Exception as e: - self.logger.warn(f"Failed to convert connection data to connection: {e}") + self.logger.warning(f"Failed to convert connection data to connection: {e}") raise InvalidConnectionData(connection_name) if len(connections_name_overrides) > 0: self.logger.info(f"Connection name overrides: {connections_name_overrides}") diff --git a/src/promptflow-core/promptflow/core/_serving/extension/otel_exporter_provider_factory.py b/src/promptflow-core/promptflow/core/_serving/extension/otel_exporter_provider_factory.py index f463642cbfc..3e6ade8f04a 100644 --- a/src/promptflow-core/promptflow/core/_serving/extension/otel_exporter_provider_factory.py +++ b/src/promptflow-core/promptflow/core/_serving/extension/otel_exporter_provider_factory.py @@ -58,6 +58,10 @@ def get_exporter(self, **kwargs): return AzureMonitorTraceExporter.from_connection_string(self.app_insight_connection_string) except ImportError: + self.logger.warning( + "azure-monitor-opentelemetry-exporter is not installed, \ + AzureMonitorTraceExporter will not be enabled!" + ) return None @@ -82,9 +86,17 @@ def get_exporter(self, **kwargs): return AzureMonitorMetricExporter.from_connection_string(self.app_insight_connection_string) except ImportError: + self.logger.warning( + "azure-monitor-opentelemetry-exporter is not installed, \ + AzureMonitorMetricExporter will not be enabled!" 
+ ) return None +OTEL_EXPORTER_OTLP_AAD_AUTH_ENABLE = "OTEL_EXPORTER_OTLP_AAD_AUTH_ENABLE" +OTEL_EXPORTER_OTLP_AAD_AUTH_SCOPE = "OTEL_EXPORTER_OTLP_AAD_AUTH_SCOPE" + + class OTLPExporterProvider(OTelExporterProvider): def __init__(self, logger, exporter_type: ExporterType) -> None: super().__init__(logger, exporter_type) @@ -103,11 +115,30 @@ def __init__(self, logger) -> None: super().__init__(logger, ExporterType.TRACE) def get_exporter(self, **kwargs): + logger = self.logger try: from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter - return OTLPSpanExporter(endpoint=self.otel_exporter_endpoint) + class AADAuthOTLPSpanExporter(OTLPSpanExporter): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.aad_auth, self.aad_auth_scope, self.credential = try_parse_otlp_aad_auth_info( + logger, "OTLPSpanExporter" + ) + + def _export(self, serialized_data: str): + if self.aad_auth and self.credential: + token = self.credential.get_token(self.aad_auth_scope).token + auth_header = {"Authorization": f"Bearer {token}"} + self._session.headers.update(auth_header) + return super()._export(serialized_data) + + return AADAuthOTLPSpanExporter(endpoint=self.otel_exporter_endpoint) except ImportError: + self.logger.warning( + "opentelemetry-exporter-otlp-proto-http is not installed, \ + OTLPSpanExporter will not be enabled!" + ) return None @@ -116,11 +147,30 @@ def __init__(self, logger) -> None: super().__init__(logger, ExporterType.METRIC) def get_exporter(self, **kwargs): + logger = self.logger try: from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter - return OTLPMetricExporter(endpoint=self.otel_exporter_endpoint) + class AADAuthOTLPMetricExporter(OTLPMetricExporter): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.aad_auth, self.aad_auth_scope, self.credential = try_parse_otlp_aad_auth_info( + logger, "OTLPMetricExporter" + ) + + def _export(self, serialized_data: str): + if self.aad_auth and self.credential: + token = self.credential.get_token(self.aad_auth_scope).token + auth_header = {"Authorization": f"Bearer {token}"} + self._session.headers.update(auth_header) + return super()._export(serialized_data) + + return AADAuthOTLPMetricExporter(endpoint=self.otel_exporter_endpoint) except ImportError: + self.logger.warning( + "opentelemetry-exporter-otlp-proto-http is not installed, \ + OTLPMetricExporter will not be enabled!" + ) return None @@ -166,3 +216,20 @@ def try_get_app_insight_connection_string(): return f"InstrumentationKey={instrumentation_key}" connection_str = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING") return connection_str + + +def try_parse_otlp_aad_auth_info(logger, exporter_name): + aad_auth = os.environ.get(OTEL_EXPORTER_OTLP_AAD_AUTH_ENABLE, "false").lower() == "true" + aad_auth_scope = os.environ.get(OTEL_EXPORTER_OTLP_AAD_AUTH_SCOPE, "https://management.azure.com/.default") + credential = None + if aad_auth: + try: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential() + except ImportError: + logger.warning( + f"azure-identity is not installed, \ + AAD auth for {exporter_name} will not be enabled!" 
+ ) + return aad_auth, aad_auth_scope, credential diff --git a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py index ce4b7dd6d0b..727c3ef4cd0 100644 --- a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py +++ b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py @@ -10,6 +10,7 @@ from promptflow._utils.flow_utils import dump_flow_result, is_executable_chat_flow from promptflow._utils.logger_utils import LoggerFactory from promptflow._utils.multimedia_utils import MultimediaProcessor +from promptflow.contracts.run_info import Status from promptflow.core._connection import _Connection from promptflow.core._connection_provider._connection_provider import ConnectionProvider from promptflow.core._flow import AbstractFlowBase @@ -222,8 +223,11 @@ def invoke(self, data: dict, run_id=None, disable_input_output_logging=False): returned_non_dict_output = False resolved_outputs = self._convert_multimedia_data_to_base64(output_dict) self._dump_invoke_result(result) - log_outputs = "" if disable_input_output_logging else result.output - self.logger.info(f"Flow run result: {log_outputs}") + if result.run_info.status != Status.Completed: + self.logger.error(f"Flow run failed with error: {result.run_info.error}") + else: + log_outputs = "" if disable_input_output_logging else result.output + self.logger.info(f"Flow run result: {log_outputs}") if not self.raise_ex: # If raise_ex is False, we will return the trace flow & node run info. return FlowResult( @@ -266,8 +270,11 @@ async def invoke_async(self, data: dict, run_id=None, disable_input_output_loggi returned_non_dict_output = False resolved_outputs = self._convert_multimedia_data_to_base64(output_dict) self._dump_invoke_result(result) - log_outputs = "" if disable_input_output_logging else result.output - self.logger.info(f"Flow run result: {log_outputs}") + if result.run_info.status != Status.Completed: + self.logger.error(f"Flow run failed with error: {result.run_info.error}") + else: + log_outputs = "" if disable_input_output_logging else result.output + self.logger.info(f"Flow run result: {log_outputs}") if not self.raise_ex: # If raise_ex is False, we will return the trace flow & node run info. return FlowResult( From e53bf7f998e78c155bc95bc3579f94f602e623d2 Mon Sep 17 00:00:00 2001 From: Robben Wang <350053002@qq.com> Date: Mon, 6 May 2024 18:45:11 +0800 Subject: [PATCH 13/21] Add util method to summarize trace telemetry. (#3074) # Description Add util method to summarize trace count telemetry. For long term telemetry, we will need span count. The simplest solution is create custom event for each trace id and include span count in custom dimension. But that may generate too many customer events to affect all telemetry. So, we only record trace count first, and decide how to add span count later according to trace count's telemetry. Maybe just decide a reasonable sampling rate. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. 
- [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Co-authored-by: robbenwang --- src/promptflow-core/promptflow/_constants.py | 3 + .../promptflow/_internal/__init__.py | 1 + .../promptflow/_sdk/_tracing.py | 2 +- .../_sdk/_utilities/tracing_utils.py | 43 ++++++++ .../_sdk/_utilities/test_tracing_utils.py | 101 ++++++++++++++++++ 5 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 src/promptflow-devkit/tests/unittests/_sdk/_utilities/test_tracing_utils.py diff --git a/src/promptflow-core/promptflow/_constants.py b/src/promptflow-core/promptflow/_constants.py index f15d77271e1..32565352926 100644 --- a/src/promptflow-core/promptflow/_constants.py +++ b/src/promptflow-core/promptflow/_constants.py @@ -171,6 +171,9 @@ class SpanAttributeFieldName: COMPLETION_TOKEN_COUNT = "__computed__.cumulative_token_count.completion" PROMPT_TOKEN_COUNT = "__computed__.cumulative_token_count.prompt" TOTAL_TOKEN_COUNT = "__computed__.cumulative_token_count.total" + # Execution target, e.g. prompty, flex, dag, code. + # We may need another field to indicate the language, e.g. python, csharp. + EXECUTION_TARGET = "execution_target" SESSION_ID = "session_id" diff --git a/src/promptflow-devkit/promptflow/_internal/__init__.py b/src/promptflow-devkit/promptflow/_internal/__init__.py index 4f1dc5edceb..2e606c018e7 100644 --- a/src/promptflow-devkit/promptflow/_internal/__init__.py +++ b/src/promptflow-devkit/promptflow/_internal/__init__.py @@ -52,6 +52,7 @@ from promptflow._sdk._service.apis.collector import trace_collector from promptflow._sdk._tracing import process_otlp_trace_request from promptflow._sdk._utilities.general_utils import resolve_flow_language +from promptflow._sdk._utilities.tracing_utils import aggregate_trace_count from promptflow._sdk._version import VERSION from promptflow._utils.context_utils import _change_working_dir, inject_sys_path from promptflow._utils.credential_scrubber import CredentialScrubber diff --git a/src/promptflow-devkit/promptflow/_sdk/_tracing.py b/src/promptflow-devkit/promptflow/_sdk/_tracing.py index 2a1a0b69f37..deddb6acb8f 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_tracing.py +++ b/src/promptflow-devkit/promptflow/_sdk/_tracing.py @@ -615,7 +615,7 @@ def process_otlp_trace_request( args=(all_spans, get_created_by_info_with_cache, logger, get_credential, cloud_trace_only), ).start() - return + return all_spans def _try_write_trace_to_cosmosdb( diff --git a/src/promptflow-devkit/promptflow/_sdk/_utilities/tracing_utils.py b/src/promptflow-devkit/promptflow/_sdk/_utilities/tracing_utils.py index 9e96da8ec4e..8102f57eaf9 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_utilities/tracing_utils.py +++ b/src/promptflow-devkit/promptflow/_sdk/_utilities/tracing_utils.py @@ -6,6 +6,7 @@ import json import logging import typing +from collections import namedtuple from dataclasses import dataclass from pathlib import Path @@ -15,10 +16,13 @@ from opentelemetry.trace.span import format_trace_id as otel_format_trace_id from promptflow._constants import ( + SpanAttributeFieldName, SpanContextFieldName, SpanEventFieldName, SpanFieldName, 
SpanLinkFieldName, + SpanResourceAttributesFieldName, + SpanResourceFieldName, SpanStatusFieldName, ) from promptflow._sdk._constants import HOME_PROMPT_FLOW_DIR, AzureMLWorkspaceTriad @@ -284,3 +288,42 @@ def append_conditions( expression += f" and session_id == '{session_id}'" logger.debug("final search expression: %s", expression) return expression + + +# SCENARIO: trace count telemetry +TraceCountKey = namedtuple( + "TraceKey", ["subscription_id", "resource_group", "workspace_name", "scenario", "execution_target"] +) + + +def aggregate_trace_count(all_spans: typing.List[Span]) -> typing.Dict[TraceCountKey, int]: + """ + Aggregate the trace count based on workspace info, scenario, and execution target. + """ + trace_count_summary = {} + + if not all_spans: + return trace_count_summary + + # Iterate over all spans + for span in all_spans: + # Only count for root span, ignore span count telemetry for now. + if span.parent_id is None: + resource_attributes = span.resource.get(SpanResourceFieldName.ATTRIBUTES, {}) + subscription_id = resource_attributes.get(SpanResourceAttributesFieldName.SUBSCRIPTION_ID, None) + resource_group = resource_attributes.get(SpanResourceAttributesFieldName.RESOURCE_GROUP_NAME, None) + workspace_name = resource_attributes.get(SpanResourceAttributesFieldName.WORKSPACE_NAME, None) + # We may need another field to indicate the language in the future, e.g. python, csharp. + execution_target = span.attributes.get(SpanAttributeFieldName.EXECUTION_TARGET, "code") + + scenario = "script" + if SpanAttributeFieldName.BATCH_RUN_ID in span.attributes: + scenario = "batch" + elif SpanAttributeFieldName.LINE_RUN_ID in span.attributes: + scenario = "test" + + key = TraceCountKey(subscription_id, resource_group, workspace_name, scenario, execution_target) + + trace_count_summary[key] = trace_count_summary.get(key, 0) + 1 + + return trace_count_summary diff --git a/src/promptflow-devkit/tests/unittests/_sdk/_utilities/test_tracing_utils.py b/src/promptflow-devkit/tests/unittests/_sdk/_utilities/test_tracing_utils.py new file mode 100644 index 00000000000..d3f5ec507da --- /dev/null +++ b/src/promptflow-devkit/tests/unittests/_sdk/_utilities/test_tracing_utils.py @@ -0,0 +1,101 @@ +import pytest +from pydash import partial + +from promptflow._constants import SpanAttributeFieldName, SpanResourceAttributesFieldName, SpanResourceFieldName +from promptflow._sdk._utilities.tracing_utils import aggregate_trace_count +from promptflow._sdk.entities._trace import Span + +# Mock definitions for Span, SpanResourceFieldName, SpanResourceAttributesFieldName, and SpanAttributeFieldName +# These should match the actual implementations you're using in your application. 
+ + +@pytest.mark.unittest +class TestTraceTelemetry: + def test_empty_span_list(self): + """Test with an empty list of spans.""" + result = aggregate_trace_count([]) + assert result == {} + + def test_single_root_span(self): + + resource = { + SpanResourceFieldName.ATTRIBUTES: { + SpanResourceAttributesFieldName.SUBSCRIPTION_ID: "sub", + SpanResourceAttributesFieldName.RESOURCE_GROUP_NAME: "rg", + SpanResourceAttributesFieldName.WORKSPACE_NAME: "ws", + } + } + create_span = partial( + Span, + trace_id=None, + span_id=None, + name=None, + context=None, + kind=None, + start_time=None, + end_time=None, + status=None, + parent_id=None, + resource=resource, + ) + + batch_root_span = create_span( + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "code", + SpanAttributeFieldName.BATCH_RUN_ID: "batch_run_id", + }, + ) + line_root_span = create_span( + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "code", + SpanAttributeFieldName.LINE_RUN_ID: "line_run_id", + }, + ) + + flex_root_span = create_span( + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "flex", + }, + ) + prompty_root_span = create_span( + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "prompty", + }, + ) + script_root_span = create_span( + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "code", + }, + ) + none_ws_root_span = create_span( + resource={}, + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "prompty", + }, + ) + non_root_span = create_span( + parent_id=1, + attributes={ + SpanAttributeFieldName.EXECUTION_TARGET: "code", + }, + ) + result = aggregate_trace_count( + [ + batch_root_span, + line_root_span, + script_root_span, + flex_root_span, + prompty_root_span, + non_root_span, + none_ws_root_span, + ] + ) + expected_result = { + ("sub", "rg", "ws", "batch", "code"): 1, + ("sub", "rg", "ws", "script", "code"): 1, + ("sub", "rg", "ws", "script", "flex"): 1, + ("sub", "rg", "ws", "script", "prompty"): 1, + ("sub", "rg", "ws", "test", "code"): 1, + (None, None, None, "script", "prompty"): 1, + } + assert result == expected_result From 39abb837a9a94bf315f703ccb677eedb388bf2a8 Mon Sep 17 00:00:00 2001 From: Ankit Singhal <30610298+singankit@users.noreply.github.com> Date: Mon, 6 May 2024 11:25:35 -0700 Subject: [PATCH 14/21] Removing private util depenency (#3121) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. 
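A usage sketch of the now-local helper (defined in the diff below); the subscription, resource group, and workspace values are illustrative placeholders:

```python
from promptflow.evals.evaluate._utils import extract_workspace_triad_from_trace_provider

trace_provider = (
    "azureml://subscriptions/00000000-0000-0000-0000-000000000000"
    "/resourceGroups/my-rg"
    "/providers/Microsoft.MachineLearningServices/workspaces/my-ws"
)
triad = extract_workspace_triad_from_trace_provider(trace_provider)
# -> AzureMLWorkspace(subscription_id='00000000-...', resource_group_name='my-rg',
#                     workspace_name='my-ws')
print(triad.subscription_id, triad.resource_group_name, triad.workspace_name)
```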
--- .../promptflow/evals/evaluate/_utils.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py index 66baa8f03ec..41613b14ff5 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py @@ -4,18 +4,40 @@ import logging import json import os +import re import tempfile +from collections import namedtuple from pathlib import Path import mlflow from promptflow._sdk._constants import Local2Cloud -from promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider from promptflow._utils.async_utils import async_run_allowing_running_loop from promptflow.azure.operations._async_run_uploader import AsyncRunUploader LOGGER = logging.getLogger(__name__) +AZURE_WORKSPACE_REGEX_FORMAT = ( + "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)" + "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$" +) + +AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"]) + + +def extract_workspace_triad_from_trace_provider(trace_provider: str): + match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider) + if not match or len(match.groups()) != 5: + raise ValueError( + "Malformed trace provider string, expected azureml://subscriptions//" + "resourceGroups//providers/Microsoft.MachineLearningServices/" + f"workspaces/, got {trace_provider}" + ) + subscription_id = match.group(1) + resource_group_name = match.group(3) + workspace_name = match.group(5) + return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name) + def load_jsonl(path): with open(path, "r", encoding="utf-8") as f: @@ -50,7 +72,6 @@ def _write_properties_to_run_history(properties: dict) -> None: def _azure_pf_client(trace_destination): - from promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider from promptflow.azure._cli._utils import _get_azure_pf_client ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) @@ -64,8 +85,6 @@ def _azure_pf_client(trace_destination): def _get_mlflow_tracking_uri(trace_destination): - from promptflow._sdk._utilities.general_utils import extract_workspace_triad_from_trace_provider - azure_pf_client = _azure_pf_client(trace_destination) ws_triad = extract_workspace_triad_from_trace_provider(trace_destination) From 61f23df60afb7967109e9d3cd9c63af20e06ac2e Mon Sep 17 00:00:00 2001 From: Ying Chen Date: Tue, 7 May 2024 11:04:08 +0800 Subject: [PATCH 15/21] Update promptflow evals msi package (#3109) # Description ![image](https://github.com/microsoft/promptflow/assets/26239730/eb5a6bd1-44cf-4c9a-8ce7-b9c5f0fa8ec4) ![image](https://github.com/microsoft/promptflow/assets/26239730/b0bb41a0-5067-4bd7-88de-9314ac382d66) # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. 
- [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Co-authored-by: Ying Chen <2601502859@qq.com> Co-authored-by: Ge Gao <49388944+dorisjoy@users.noreply.github.com> Co-authored-by: Ge Gao Co-authored-by: Honglin Co-authored-by: riddle xu Co-authored-by: Yangtong Xu --- .../installer/windows/scripts/promptflow.spec.jinja2 | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/installer/windows/scripts/promptflow.spec.jinja2 b/scripts/installer/windows/scripts/promptflow.spec.jinja2 index a7d1cd63664..f418c9186bd 100644 --- a/scripts/installer/windows/scripts/promptflow.spec.jinja2 +++ b/scripts/installer/windows/scripts/promptflow.spec.jinja2 @@ -17,17 +17,19 @@ for package in meta_packages: datas += copy_metadata(package) opentelemetry_datas, opentelemetry_binaries, opentelemetry_hiddenimports = collect_all('opentelemetry') +promptflow_datas, promptflow_binaries, promptflow_hiddenimports = collect_all('promptflow') datas += opentelemetry_datas +datas += promptflow_datas datas += collect_data_files('streamlit_quill') -datas += collect_data_files('promptflow') -datas += copy_metadata('promptflow') -datas += collect_data_files('promptflow-evals') -datas += copy_metadata('promptflow-evals') + hidden_imports = ['win32timezone', 'promptflow', 'opentelemetry.context.contextvars_context', 'streamlit.runtime.scriptrunner.magic_funcs'] + {{hidden_imports}} hidden_imports += opentelemetry_hiddenimports +hidden_imports += promptflow_hiddenimports + binaries = [] binaries += opentelemetry_binaries +binaries += promptflow_binaries block_cipher = None From 964e2bc06723e72854acdd1aae69450ef68fe082 Mon Sep 17 00:00:00 2001 From: Billy Hu Date: Mon, 6 May 2024 20:32:50 -0700 Subject: [PATCH 16/21] [promptflow-evals] Convert prompt based evaluators to prompty based (#3043) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. 
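The calling convention is unchanged by the prompty conversion; a sketch of the pattern (endpoint, key, and deployment values are placeholders, and the import paths assume the public package layout):

```python
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluators import CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                   # placeholder
    azure_deployment="<deployment>",                       # placeholder
)

eval_fn = CoherenceEvaluator(model_config)
# Internally this now loads coherence.prompty via load_flow(...) and parses
# the first digit of the LLM output into {"gpt_coherence": <float>}.
result = eval_fn(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
)
```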
--- .../evals/evaluators/coherence/_coherence.py | 41 ++++--- .../{flow/prompt.jinja2 => coherence.prompty} | 98 +++++++++------ .../evaluators/coherence/flow/flow.dag.yaml | 49 -------- .../evaluators/coherence/flow/parse_score.py | 14 --- .../coherence/flow/requirements.txt | 2 - .../coherence/flow/validate_inputs.py | 12 -- .../evals/evaluators/fluency/_fluency.py | 40 +++--- .../evaluators/fluency/flow/flow.dag.yaml | 49 -------- .../evaluators/fluency/flow/parse_score.py | 14 --- .../evaluators/fluency/flow/requirements.txt | 2 - .../fluency/flow/validate_inputs.py | 12 -- .../{flow/prompt.jinja2 => fluency.prompty} | 96 +++++++++------ .../evaluators/groundedness/_groundedness.py | 40 +++--- .../groundedness/flow/flow.dag.yaml | 49 -------- .../groundedness/flow/parse_score.py | 14 --- .../groundedness/flow/requirements.txt | 2 - .../groundedness/flow/validate_inputs.py | 10 -- .../prompt.jinja2 => groundedness.prompty} | 82 ++++++++----- .../evals/evaluators/relevance/_relevance.py | 40 +++--- .../evaluators/relevance/flow/flow.dag.yaml | 53 -------- .../evaluators/relevance/flow/parse_score.py | 14 --- .../relevance/flow/requirements.txt | 2 - .../relevance/flow/validate_inputs.py | 14 --- .../{flow/prompt.jinja2 => relevance.prompty} | 110 ++++++++++------- .../evaluators/similarity/_similarity.py | 44 ++++--- .../evaluators/similarity/flow/flow.dag.yaml | 53 -------- .../evaluators/similarity/flow/parse_score.py | 14 --- .../similarity/flow/requirements.txt | 2 - .../similarity/flow/validate_inputs.py | 14 --- .../prompt.jinja2 => similarity.prompty} | 114 +++++++++++------- src/promptflow-evals/pyproject.toml | 1 - .../local/evals.node_cache.shelve.bak | 16 +++ .../local/evals.node_cache.shelve.dat | Bin 72111 -> 142798 bytes .../local/evals.node_cache.shelve.dir | 16 +++ 34 files changed, 474 insertions(+), 659 deletions(-) rename src/promptflow-evals/promptflow/evals/evaluators/coherence/{flow/prompt.jinja2 => coherence.prompty} (80%) delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/flow.dag.yaml delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/requirements.txt delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/validate_inputs.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/flow.dag.yaml delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/requirements.txt delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/validate_inputs.py rename src/promptflow-evals/promptflow/evals/evaluators/fluency/{flow/prompt.jinja2 => fluency.prompty} (79%) delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/flow.dag.yaml delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/requirements.txt delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/validate_inputs.py rename src/promptflow-evals/promptflow/evals/evaluators/groundedness/{flow/prompt.jinja2 => groundedness.prompty} (82%) delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/flow.dag.yaml delete mode 100644 
src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/requirements.txt delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/validate_inputs.py rename src/promptflow-evals/promptflow/evals/evaluators/relevance/{flow/prompt.jinja2 => relevance.prompty} (85%) delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/flow.dag.yaml delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/requirements.txt delete mode 100644 src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/validate_inputs.py rename src/promptflow-evals/promptflow/evals/evaluators/similarity/{flow/prompt.jinja2 => similarity.prompty} (88%) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/_coherence.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/_coherence.py index 8e227885ca7..125a3361f3d 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/_coherence.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/_coherence.py @@ -2,10 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from pathlib import Path +import os +import re + +import numpy as np from promptflow.client import load_flow -from promptflow.core._prompty_utils import convert_model_configuration_to_connection class CoherenceEvaluator: @@ -25,20 +27,15 @@ def __init__(self, model_config): question="What is the capital of Japan?", answer="The capital of Japan is Tokyo.") """ + # TODO: Remove this block once the bug is fixed + # https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324 + if model_config.api_version is None: + model_config.api_version = "2024-02-15-preview" - # Load the flow as function - current_dir = Path(__file__).resolve().parent - flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) - - # Override the connection - connection = convert_model_configuration_to_connection(model_config) - self._flow.context.connections = { - "query_llm": { - "connection": connection, - "deployment_name": model_config.azure_deployment, - } - } + prompty_model_config = {"configuration": model_config} + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, "coherence.prompty") + self._flow = load_flow(source=prompty_path, model=prompty_model_config) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluate coherence. 
@@ -50,5 +47,17 @@ def __call__(self, *, question: str, answer: str, **kwargs): :rtype: dict """ + # Validate input parameters + if not (question and question.strip()) or not (answer and answer.strip()): + raise ValueError("Both 'question' and 'answer' must be non-empty strings.") + # Run the evaluation flow - return self._flow(question=question, answer=answer) + llm_output = self._flow(question=question, answer=answer) + + score = np.nan + if llm_output: + match = re.search(r"\d", llm_output) + if match: + score = float(match.group()) + + return {"gpt_coherence": float(score)} diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/prompt.jinja2 b/src/promptflow-evals/promptflow/evals/evaluators/coherence/coherence.prompty similarity index 80% rename from src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/prompt.jinja2 rename to src/promptflow-evals/promptflow/evals/evaluators/coherence/coherence.prompty index 9d36f82f0d1..9a1f47bb528 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/prompt.jinja2 +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/coherence.prompty @@ -1,36 +1,62 @@ -system: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. - -user: -Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: -One star: the answer completely lacks coherence -Two stars: the answer mostly lacks coherence -Three stars: the answer is partially coherent -Four stars: the answer is mostly coherent -Five stars: the answer has perfect coherency - -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. - -question: What is your favorite indoor activity and why do you enjoy it? -answer: I like pizza. The sun is shining. -stars: 1 - -question: Can you describe your favorite movie without giving away any spoilers? -answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. -stars: 2 - -question: What are some benefits of regular exercise? -answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. -stars: 3 - -question: How do you cope with stress in your daily life? -answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. -stars: 4 - -question: What can you tell me about climate change and its effects on the environment? -answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 
-stars: 5 - -question: {{question}} -answer: {{answer}} -stars: \ No newline at end of file +--- +name: Coherence +description: Evaluates coherence score for QA scenario +model: + api: chat + configuration: + type: azure_openai + azure_deployment: ${env:AZURE_DEPLOYMENT} + api_key: ${env:AZURE_OPENAI_API_KEY} + azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} + parameters: + temperature: 0.0 + max_tokens: 1 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: text + +inputs: + question: + type: string + answer: + type: string + +--- +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. + +user: +Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: +One star: the answer completely lacks coherence +Two stars: the answer mostly lacks coherence +Three stars: the answer is partially coherent +Four stars: the answer is mostly coherent +Five stars: the answer has perfect coherency + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +question: What is your favorite indoor activity and why do you enjoy it? +answer: I like pizza. The sun is shining. +stars: 1 + +question: Can you describe your favorite movie without giving away any spoilers? +answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. +stars: 2 + +question: What are some benefits of regular exercise? +answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. +stars: 3 + +question: How do you cope with stress in your daily life? +answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. +stars: 4 + +question: What can you tell me about climate change and its effects on the environment? +answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. 
+stars: 5 + +question: {{question}} +answer: {{answer}} +stars: diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/flow.dag.yaml b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/flow.dag.yaml deleted file mode 100644 index 76776d133db..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/flow.dag.yaml +++ /dev/null @@ -1,49 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json -environment: - python_requirements_txt: requirements.txt -inputs: - question: - type: string - answer: - type: string -outputs: - gpt_coherence: - type: string - reference: ${parse_score.output} -nodes: -- name: validate_inputs - type: python - source: - type: code - path: validate_inputs.py - inputs: - answer: ${inputs.answer} - question: ${inputs.question} -- name: query_llm - type: llm - source: - type: code - path: prompt.jinja2 - inputs: - deployment_name: gpt-4 - temperature: 0 - top_p: 1 - max_tokens: 1 - presence_penalty: 0 - frequency_penalty: 0 - question: ${inputs.question} - answer: ${inputs.answer} - connection: open_ai_connection - api: chat - use_variants: false - activate: - when: ${validate_inputs.output} - is: true -- name: parse_score - type: python - source: - type: code - path: parse_score.py - inputs: - llm_output: ${query_llm.output} - use_variants: false diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py deleted file mode 100644 index e4157b4d22c..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool -import numpy as np -import re - - -@tool -def parse_score(llm_output: str = None): - score = np.nan - if llm_output: - match = re.search(r'\d', llm_output) - if match: - score = float(match.group()) - - return score diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/requirements.txt b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/requirements.txt deleted file mode 100644 index 687aa3599e9..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -promptflow -promptflow-tools \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/validate_inputs.py deleted file mode 100644 index 517736474de..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/validate_inputs.py +++ /dev/null @@ -1,12 +0,0 @@ -from promptflow.core import tool - - -@tool -def validate_inputs(question: str, answer: str): - # Validate input parameters - if not (question and question.strip() and question != "None") or not ( - answer and answer.strip() and answer != "None" - ): - raise ValueError("Both 'question' and 'answer' must be non-empty strings.") - - return True diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/_fluency.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/_fluency.py index dce506427a0..45b8e13d2c0 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/_fluency.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/_fluency.py @@ -2,10 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from pathlib import Path +import os +import re + +import numpy as np from promptflow.client import load_flow -from promptflow.core._prompty_utils import convert_model_configuration_to_connection class FluencyEvaluator: @@ -25,20 +27,15 @@ def __init__(self, model_config): question="What is the capital of Japan?", answer="The capital of Japan is Tokyo.") """ + # TODO: Remove this block once the bug is fixed + # https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324 + if model_config.api_version is None: + model_config.api_version = "2024-02-15-preview" - # Load the flow as function - current_dir = Path(__file__).resolve().parent - flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) - - # Override the connection - connection = convert_model_configuration_to_connection(model_config) - self._flow.context.connections = { - "query_llm": { - "connection": connection, - "deployment_name": model_config.azure_deployment, - } - } + prompty_model_config = {"configuration": model_config} + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, "fluency.prompty") + self._flow = load_flow(source=prompty_path, model=prompty_model_config) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluate fluency. @@ -49,6 +46,17 @@ def __call__(self, *, question: str, answer: str, **kwargs): :return: The fluency score. :rtype: dict """ + # Validate input parameters + if not (question and question.strip()) or not (answer and answer.strip()): + raise ValueError("Both 'question' and 'answer' must be non-empty strings.") # Run the evaluation flow - return self._flow(question=question, answer=answer) + llm_output = self._flow(question=question, answer=answer) + + score = np.nan + if llm_output: + match = re.search(r"\d", llm_output) + if match: + score = float(match.group()) + + return {"gpt_fluency": float(score)} diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/flow.dag.yaml b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/flow.dag.yaml deleted file mode 100644 index 5a707e18bf7..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/flow.dag.yaml +++ /dev/null @@ -1,49 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json -environment: - python_requirements_txt: requirements.txt -inputs: - question: - type: string - answer: - type: string -outputs: - gpt_fluency: - type: string - reference: ${parse_score.output} -nodes: -- name: validate_inputs - type: python - source: - type: code - path: validate_inputs.py - inputs: - answer: ${inputs.answer} - question: ${inputs.question} -- name: query_llm - type: llm - source: - type: code - path: prompt.jinja2 - inputs: - deployment_name: gpt-4 - temperature: 0 - top_p: 1 - max_tokens: 1 - presence_penalty: 0 - frequency_penalty: 0 - question: ${inputs.question} - answer: ${inputs.answer} - connection: open_ai_connection - api: chat - use_variants: false - activate: - when: ${validate_inputs.output} - is: true -- name: parse_score - type: python - source: - type: code - path: parse_score.py - inputs: - llm_output: ${query_llm.output} - use_variants: false diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py deleted file mode 100644 index e4157b4d22c..00000000000 --- 
a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool -import numpy as np -import re - - -@tool -def parse_score(llm_output: str = None): - score = np.nan - if llm_output: - match = re.search(r'\d', llm_output) - if match: - score = float(match.group()) - - return score diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/requirements.txt b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/requirements.txt deleted file mode 100644 index 687aa3599e9..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -promptflow -promptflow-tools \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/validate_inputs.py deleted file mode 100644 index 517736474de..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/validate_inputs.py +++ /dev/null @@ -1,12 +0,0 @@ -from promptflow.core import tool - - -@tool -def validate_inputs(question: str, answer: str): - # Validate input parameters - if not (question and question.strip() and question != "None") or not ( - answer and answer.strip() and answer != "None" - ): - raise ValueError("Both 'question' and 'answer' must be non-empty strings.") - - return True diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/prompt.jinja2 b/src/promptflow-evals/promptflow/evals/evaluators/fluency/fluency.prompty similarity index 79% rename from src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/prompt.jinja2 rename to src/promptflow-evals/promptflow/evals/evaluators/fluency/fluency.prompty index 5c115ff0492..deaab2f19df 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/prompt.jinja2 +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/fluency.prompty @@ -1,35 +1,61 @@ -system: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -user: -Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: -One star: the answer completely lacks fluency -Two stars: the answer mostly lacks fluency -Three stars: the answer is partially fluent -Four stars: the answer is mostly fluent -Five stars: the answer has perfect fluency - -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. - -question: What did you have for breakfast today? -answer: Breakfast today, me eating cereal and orange juice very good. -stars: 1 - -question: How do you feel when you travel alone? -answer: Alone travel, nervous, but excited also. I feel adventure and like its time. -stars: 2 - -question: When was the last time you went on a family vacation? -answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun. -stars: 3 - -question: What is your favorite thing about your job? 
-answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories. -stars: 4 - -question: Can you describe your morning routine? -answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. -stars: 5 - -question: {{question}} -answer: {{answer}} -stars: \ No newline at end of file +--- +name: Fluency +description: Evaluates fluency score for QA scenario +model: + api: chat + configuration: + type: azure_openai + azure_deployment: ${env:AZURE_DEPLOYMENT} + api_key: ${env:AZURE_OPENAI_API_KEY} + azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} + parameters: + temperature: 0.0 + max_tokens: 1 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: text + +inputs: + question: + type: string + answer: + type: string + +--- +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +user: +Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks fluency +Two stars: the answer mostly lacks fluency +Three stars: the answer is partially fluent +Four stars: the answer is mostly fluent +Five stars: the answer has perfect fluency + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +question: What did you have for breakfast today? +answer: Breakfast today, me eating cereal and orange juice very good. +stars: 1 + +question: How do you feel when you travel alone? +answer: Alone travel, nervous, but excited also. I feel adventure and like its time. +stars: 2 + +question: When was the last time you went on a family vacation? +answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun. +stars: 3 + +question: What is your favorite thing about your job? +answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories. +stars: 4 + +question: Can you describe your morning routine? +answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. +stars: 5 + +question: {{question}} +answer: {{answer}} +stars: diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/_groundedness.py index 919262ff468..3ab047d80c1 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/_groundedness.py @@ -2,10 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- -from pathlib import Path +import os +import re + +import numpy as np from promptflow.client import load_flow -from promptflow.core._prompty_utils import convert_model_configuration_to_connection class GroundednessEvaluator: @@ -26,20 +28,15 @@ def __init__(self, model_config): context="Tokyo is Japan's capital, known for its blend of traditional culture \ and technological advancements.") """ + # TODO: Remove this block once the bug is fixed + # https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324 + if model_config.api_version is None: + model_config.api_version = "2024-02-15-preview" - # Load the flow as function - current_dir = Path(__file__).resolve().parent - flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) - - # Override the connection - connection = convert_model_configuration_to_connection(model_config) - self._flow.context.connections = { - "query_llm": { - "connection": connection, - "deployment_name": model_config.azure_deployment, - } - } + prompty_model_config = {"configuration": model_config} + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, "groundedness.prompty") + self._flow = load_flow(source=prompty_path, model=prompty_model_config) def __call__(self, *, answer: str, context: str, **kwargs): """Evaluate groundedness of the answer in the context. @@ -51,6 +48,17 @@ def __call__(self, *, answer: str, context: str, **kwargs): :return: The groundedness score. :rtype: dict """ + # Validate input parameters + if not (answer and answer.strip()) or not (context and context.strip()): + raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - return self._flow(answer=answer, context=context) + llm_output = self._flow(answer=answer, context=context) + + score = np.nan + if llm_output: + match = re.search(r"\d", llm_output) + if match: + score = float(match.group()) + + return {"gpt_groundedness": float(score)} diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/flow.dag.yaml b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/flow.dag.yaml deleted file mode 100644 index 3d901123cec..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/flow.dag.yaml +++ /dev/null @@ -1,49 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json -environment: - python_requirements_txt: requirements.txt -inputs: - answer: - type: string - context: - type: string -outputs: - gpt_groundedness: - type: string - reference: ${parse_score.output} -nodes: -- name: validate_inputs - type: python - source: - type: code - path: validate_inputs.py - inputs: - answer: ${inputs.answer} - context: ${inputs.context} -- name: query_llm - type: llm - source: - type: code - path: prompt.jinja2 - inputs: - deployment_name: gpt-4 - temperature: 0 - top_p: 1 - max_tokens: 1 - presence_penalty: 0 - frequency_penalty: 0 - answer: ${inputs.answer} - context: ${inputs.context} - connection: open_ai_connection - api: chat - use_variants: false - activate: - when: ${validate_inputs.output} - is: true -- name: parse_score - type: python - source: - type: code - path: parse_score.py - inputs: - llm_output: ${query_llm.output} - use_variants: false diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py deleted file mode 100644 
index e4157b4d22c..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool -import numpy as np -import re - - -@tool -def parse_score(llm_output: str = None): - score = np.nan - if llm_output: - match = re.search(r'\d', llm_output) - if match: - score = float(match.group()) - - return score diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/requirements.txt b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/requirements.txt deleted file mode 100644 index 687aa3599e9..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -promptflow -promptflow-tools \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/validate_inputs.py deleted file mode 100644 index 6cb0dc2cdaa..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/validate_inputs.py +++ /dev/null @@ -1,10 +0,0 @@ -from promptflow.core import tool - - -@tool -def validate_inputs(answer: str, context: str): - # Validate input parameters - if not (answer and answer.strip() and answer != "None") or not (context and context.strip() and context != "None"): - raise ValueError("Both 'answer' and 'context' must be non-empty strings.") - - return True diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/prompt.jinja2 b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/groundedness.prompty similarity index 82% rename from src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/prompt.jinja2 rename to src/promptflow-evals/promptflow/evals/evaluators/groundedness/groundedness.prompty index a60afdf57dc..97f02fd3b21 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/prompt.jinja2 +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/groundedness.prompty @@ -1,28 +1,54 @@ -system: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -user: -You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: -1. 5: The ANSWER follows logically from the information contained in the CONTEXT. -2. 1: The ANSWER is logically false from the information contained in the CONTEXT. -3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 
-Independent Examples: -## Example Task #1 Input: -{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} -## Example Task #1 Output: -1 -## Example Task #2 Input: -{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} -## Example Task #2 Output: -5 -## Example Task #3 Input: -{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} -## Example Task #3 Output: -5 -## Example Task #4 Input: -{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} -## Example Task #4 Output: -1 -## Actual Task Input: -{"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} -Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. -Actual Task Output: \ No newline at end of file +--- +name: Groundedness +description: Evaluates groundedness score for QA scenario +model: + api: chat + configuration: + type: azure_openai + azure_deployment: ${env:AZURE_DEPLOYMENT} + api_key: ${env:AZURE_OPENAI_API_KEY} + azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} + parameters: + temperature: 0.0 + max_tokens: 1 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: text + +inputs: + answer: + type: string + context: + type: string + +--- +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +user: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. +Independent Examples: +## Example Task #1 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #1 Output: +1 +## Example Task #2 Input: +{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. 
By January, only seven of these new shows were still on the air. Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} +## Example Task #2 Output: +5 +## Example Task #3 Input: +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +## Example Task #3 Output: +5 +## Example Task #4 Input: +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} +## Example Task #4 Output: +1 +## Actual Task Input: +{"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. +Actual Task Output: diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/_relevance.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/_relevance.py index de11466be01..2816717caac 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/_relevance.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/_relevance.py @@ -2,10 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from pathlib import Path +import os +import re + +import numpy as np from promptflow.client import load_flow -from promptflow.core._prompty_utils import convert_model_configuration_to_connection class RelevanceEvaluator: @@ -27,20 +29,15 @@ def __init__(self, model_config): context="Tokyo is Japan's capital, known for its blend of traditional culture \ and technological advancements.") """ + # TODO: Remove this block once the bug is fixed + # https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324 + if model_config.api_version is None: + model_config.api_version = "2024-02-15-preview" - # Load the flow as function - current_dir = Path(__file__).resolve().parent - flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) - - # Override the connection - connection = convert_model_configuration_to_connection(model_config) - self._flow.context.connections = { - "query_llm": { - "connection": connection, - "deployment_name": model_config.azure_deployment, - } - } + prompty_model_config = {"configuration": model_config} + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, "relevance.prompty") + self._flow = load_flow(source=prompty_path, model=prompty_model_config) def __call__(self, *, question: str, answer: str, context: str, **kwargs): """Evaluate relevance. @@ -54,6 +51,17 @@ def __call__(self, *, question: str, answer: str, context: str, **kwargs): :return: The relevance score. 
:rtype: dict """ + # Validate input parameters + if not (question and question.strip()) or not (answer and answer.strip()) or not (context and context.strip()): + raise ValueError("'question', 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow - return self._flow(question=question, answer=answer, context=context) + llm_output = self._flow(question=question, answer=answer, context=context) + + score = np.nan + if llm_output: + match = re.search(r"\d", llm_output) + if match: + score = float(match.group()) + + return {"gpt_relevance": float(score)} diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/flow.dag.yaml b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/flow.dag.yaml deleted file mode 100644 index 795db73c714..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/flow.dag.yaml +++ /dev/null @@ -1,53 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json -environment: - python_requirements_txt: requirements.txt -inputs: - question: - type: string - answer: - type: string - context: - type: string -outputs: - gpt_relevance: - type: string - reference: ${parse_score.output} -nodes: -- name: validate_inputs - type: python - source: - type: code - path: validate_inputs.py - inputs: - answer: ${inputs.answer} - context: ${inputs.context} - question: ${inputs.question} -- name: query_llm - type: llm - source: - type: code - path: prompt.jinja2 - inputs: - deployment_name: gpt-4 - temperature: 0 - top_p: 1 - max_tokens: 1 - presence_penalty: 0 - frequency_penalty: 0 - question: ${inputs.question} - answer: ${inputs.answer} - context: ${inputs.context} - connection: open_ai_connection - api: chat - use_variants: false - activate: - when: ${validate_inputs.output} - is: true -- name: parse_score - type: python - source: - type: code - path: parse_score.py - inputs: - llm_output: ${query_llm.output} - use_variants: false diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py deleted file mode 100644 index e4157b4d22c..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool -import numpy as np -import re - - -@tool -def parse_score(llm_output: str = None): - score = np.nan - if llm_output: - match = re.search(r'\d', llm_output) - if match: - score = float(match.group()) - - return score diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/requirements.txt b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/requirements.txt deleted file mode 100644 index 687aa3599e9..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -promptflow -promptflow-tools \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/validate_inputs.py deleted file mode 100644 index 55c0680938b..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/validate_inputs.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool - - -@tool -def validate_inputs(question: str, answer: str, context: str): - # Validate input parameters - if ( - not (question and question.strip() and question != "None") - or 
not (answer and answer.strip() and answer != "None") - or not (context and context.strip() and context != "None") - ): - raise ValueError("'question', 'answer' and 'context' must be non-empty strings.") - - return True diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/prompt.jinja2 b/src/promptflow-evals/promptflow/evals/evaluators/relevance/relevance.prompty similarity index 85% rename from src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/prompt.jinja2 rename to src/promptflow-evals/promptflow/evals/evaluators/relevance/relevance.prompty index 41f269cf5bd..9f87118b925 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/prompt.jinja2 +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/relevance.prompty @@ -1,41 +1,69 @@ -system: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -user: -Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: -One star: the answer completely lacks relevance -Two stars: the answer mostly lacks relevance -Three stars: the answer is partially relevant -Four stars: the answer is mostly relevant -Five stars: the answer has perfect relevance - -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. - -context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. -question: What field did Marie Curie excel in? -answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. -stars: 1 - -context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. -question: Where were The Beatles formed? -answer: The band The Beatles began their journey in London, England, and they changed the history of music. -stars: 2 - -context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. -question: What are the main goals of Perseverance Mars rover mission? -answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. -stars: 3 - -context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. -question: What are the main components of the Mediterranean diet? -answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. -stars: 4 - -context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. 
-question: What are the main attractions of the Queen's Royal Castle? -answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. -stars: 5 - -context: {{context}} -question: {{question}} -answer: {{answer}} -stars: \ No newline at end of file +--- +name: Relevance +description: Evaluates relevance score for QA scenario +model: + api: chat + configuration: + type: azure_openai + azure_deployment: ${env:AZURE_DEPLOYMENT} + api_key: ${env:AZURE_OPENAI_API_KEY} + azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} + parameters: + temperature: 0.0 + max_tokens: 1 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: text + +inputs: + question: + type: string + answer: + type: string + context: + type: string + +--- +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +user: +Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks relevance +Two stars: the answer mostly lacks relevance +Three stars: the answer is partially relevant +Four stars: the answer is mostly relevant +Five stars: the answer has perfect relevance + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. +question: What field did Marie Curie excel in? +answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. +stars: 1 + +context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. +question: Where were The Beatles formed? +answer: The band The Beatles began their journey in London, England, and they changed the history of music. +stars: 2 + +context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. +question: What are the main goals of Perseverance Mars rover mission? +answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. +stars: 3 + +context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. +question: What are the main components of the Mediterranean diet? +answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. +stars: 4 + +context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. 
It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. +question: What are the main attractions of the Queen's Royal Castle? +answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. +stars: 5 + +context: {{context}} +question: {{question}} +answer: {{answer}} +stars: diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/_similarity.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/_similarity.py index 22a9acd9fbf..76e8124d1a9 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/_similarity.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/_similarity.py @@ -2,10 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- -from pathlib import Path +import os +import re + +import numpy as np from promptflow.client import load_flow -from promptflow.core._prompty_utils import convert_model_configuration_to_connection class SimilarityEvaluator: @@ -26,20 +28,15 @@ def __init__(self, model_config): answer="The capital of Japan is Tokyo.", ground_truth="Tokyo is Japan's capital.") """ + # TODO: Remove this block once the bug is fixed + # https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324 + if model_config.api_version is None: + model_config.api_version = "2024-02-15-preview" - # Load the flow as function - current_dir = Path(__file__).resolve().parent - flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) - - # Override the connection - connection = convert_model_configuration_to_connection(model_config) - self._flow.context.connections = { - "query_llm": { - "connection": connection, - "deployment_name": model_config.azure_deployment, - } - } + prompty_model_config = {"configuration": model_config} + current_dir = os.path.dirname(__file__) + prompty_path = os.path.join(current_dir, "similarity.prompty") + self._flow = load_flow(source=prompty_path, model=prompty_model_config) def __call__(self, *, question: str, answer: str, ground_truth: str, **kwargs): """Evaluate similarity. @@ -53,6 +50,21 @@ def __call__(self, *, question: str, answer: str, ground_truth: str, **kwargs): :return: The similarity score. 
:rtype: dict """ + # Validate input parameters + if ( + not (question and question.strip()) + or not (answer and answer.strip()) + or not (ground_truth and ground_truth.strip()) + ): + raise ValueError("'question', 'answer' and 'ground_truth' must be non-empty strings.") # Run the evaluation flow - return self._flow(question=question, answer=answer, ground_truth=ground_truth) + llm_output = self._flow(question=question, answer=answer, ground_truth=ground_truth) + + score = np.nan + if llm_output: + match = re.search(r"\d", llm_output) + if match: + score = float(match.group()) + + return {"gpt_similarity": float(score)} diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/flow.dag.yaml b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/flow.dag.yaml deleted file mode 100644 index e2687defcc0..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/flow.dag.yaml +++ /dev/null @@ -1,53 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json -environment: - python_requirements_txt: requirements.txt -inputs: - question: - type: string - answer: - type: string - ground_truth: - type: string -outputs: - gpt_similarity: - type: string - reference: ${parse_score.output} -nodes: -- name: validate_inputs - type: python - source: - type: code - path: validate_inputs.py - inputs: - answer: ${inputs.answer} - question: ${inputs.question} - ground_truth: ${inputs.ground_truth} -- name: query_llm - type: llm - source: - type: code - path: prompt.jinja2 - inputs: - deployment_name: gpt-4 - temperature: 0 - top_p: 1 - max_tokens: 1 - presence_penalty: 0 - frequency_penalty: 0 - question: ${inputs.question} - answer: ${inputs.answer} - ground_truth: ${inputs.ground_truth} - connection: open_ai_connection - api: chat - use_variants: false - activate: - when: ${validate_inputs.output} - is: true -- name: parse_score - type: python - source: - type: code - path: parse_score.py - inputs: - llm_output: ${query_llm.output} - use_variants: false diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py deleted file mode 100644 index e4157b4d22c..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool -import numpy as np -import re - - -@tool -def parse_score(llm_output: str = None): - score = np.nan - if llm_output: - match = re.search(r'\d', llm_output) - if match: - score = float(match.group()) - - return score diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/requirements.txt b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/requirements.txt deleted file mode 100644 index 687aa3599e9..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -promptflow -promptflow-tools \ No newline at end of file diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/validate_inputs.py deleted file mode 100644 index d58fec0174e..00000000000 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/validate_inputs.py +++ /dev/null @@ -1,14 +0,0 @@ -from promptflow.core import tool - - -@tool -def validate_inputs(question: str, answer: str, ground_truth: str): - # Validate 
input parameters - if ( - not (question and question.strip() and question != "None") - or not (answer and answer.strip() and answer != "None") - or not (ground_truth and ground_truth.strip() and ground_truth != "None") - ): - raise ValueError("'question', 'answer' and 'ground_truth' must be non-empty strings.") - - return True diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/prompt.jinja2 b/src/promptflow-evals/promptflow/evals/evaluators/similarity/similarity.prompty similarity index 88% rename from src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/prompt.jinja2 rename to src/promptflow-evals/promptflow/evals/evaluators/similarity/similarity.prompty index 28f090701cb..a07ab311b75 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/prompt.jinja2 +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/similarity.prompty @@ -1,43 +1,71 @@ -system: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -user: -Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: -One star: the predicted answer is not at all similar to the correct answer -Two stars: the predicted answer is mostly not similar to the correct answer -Three stars: the predicted answer is somewhat similar to the correct answer -Four stars: the predicted answer is mostly similar to the correct answer -Five stars: the predicted answer is completely similar to the correct answer - -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. - -The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. - -question: What is the role of ribosomes? -correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. -predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. -stars: 1 - -question: Why did the Titanic sink? -correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. -predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. -stars: 2 - -question: What causes seasons on Earth? -correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. 
-predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. -stars: 3 - -question: How does photosynthesis work? -correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. -predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. -stars: 4 - -question: What are the health benefits of regular exercise? -correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. -predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. -stars: 5 - -question: {{question}} -correct answer:{{ground_truth}} -predicted answer: {{answer}} -stars: \ No newline at end of file +--- +name: Similarity +description: Evaluates similarity score for QA scenario +model: + api: chat + configuration: + type: azure_openai + azure_deployment: ${env:AZURE_DEPLOYMENT} + api_key: ${env:AZURE_OPENAI_API_KEY} + azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT} + parameters: + temperature: 0.0 + max_tokens: 1 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: text + +inputs: + question: + type: string + answer: + type: string + ground_truth: + type: string + +--- +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +user: +Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +One star: the predicted answer is not at all similar to the correct answer +Two stars: the predicted answer is mostly not similar to the correct answer +Three stars: the predicted answer is somewhat similar to the correct answer +Four stars: the predicted answer is mostly similar to the correct answer +Five stars: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. + +question: What is the role of ribosomes? +correct answer: Ribosomes are cellular structures responsible for protein synthesis. 
They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. +predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. +stars: 1 + +question: Why did the Titanic sink? +correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. +predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. +stars: 2 + +question: What causes seasons on Earth? +correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. +predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. +stars: 3 + +question: How does photosynthesis work? +correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. +predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. +stars: 4 + +question: What are the health benefits of regular exercise? +correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. +predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. 
+stars: 5 + +question: {{question}} +correct answer:{{ground_truth}} +predicted answer: {{answer}} +stars: diff --git a/src/promptflow-evals/pyproject.toml b/src/promptflow-evals/pyproject.toml index 1331e653ee9..bbeb2f7adbb 100644 --- a/src/promptflow-evals/pyproject.toml +++ b/src/promptflow-evals/pyproject.toml @@ -42,7 +42,6 @@ python = "<4.0,>=3.8" azure-ai-ml = ">=1.14.0" promptflow-devkit = "<2.0.0" promptflow-core = "<2.0.0" -promptflow-tools = "<2.0.0" promptflow-azure = "<2.0.0" # Needed for remote tracking mlflow = "<3.0.0" # Needed for remote tracking to log metrics azureml-mlflow = "<2.0.0" # Needed for remote tracking to log metrics diff --git a/src/promptflow-recording/recordings/local/evals.node_cache.shelve.bak b/src/promptflow-recording/recordings/local/evals.node_cache.shelve.bak index b905e33e5e6..87f0cf9d720 100644 --- a/src/promptflow-recording/recordings/local/evals.node_cache.shelve.bak +++ b/src/promptflow-recording/recordings/local/evals.node_cache.shelve.bak @@ -15,3 +15,19 @@ '0658d24d96833aa5acf52b87d34ab8220a5b2669', (59904, 4551) '5f17c1fae1329c6d6823c0c59e4a66fd6ee42691', (64512, 3274) '79ea9bacda0ffb42811f24c5b0293a9744824daf', (68096, 4015) +'6795248808bf0d7f77d611dbe9dd672dd52b4dce', (72192, 3449) +'60bcfb921bb2cc3bb56f007c8c3453a839194bc8', (75776, 4031) +'30fc92e8759d7d571143873f446534e6af0bf6c5', (79872, 4576) +'1acc36f1fb93f2b60555845a727b6eb56d33034e', (84480, 3574) +'877cc9fe86f6d1b2087caafc6279323887a216cb', (88064, 3433) +'3e7451f97e7a18866d21c0edad5a79e8cdab2196', (91648, 5574) +'e7a394b585fe9aa7b4684c4e246f8f64523c1479', (97280, 3567) +'096ceeedaae85c7e91eba98720a26e7ae53b5376', (100864, 3426) +'0e3791838c4058f5e4fd2aa482fbf266dc3b8610', (104448, 4087) +'cb84532697ba61a34f44b77c31578375e38c35ac', (108544, 4621) +'0fbf84f9640d60c5b9589227fb1cfcaa9c3cddc7', (113664, 3590) +'3d4b1a738cf9b1b78fd01fdfe1a0f7162c45936f', (117760, 4190) +'98fd571c22b4a4ec567c41d8aa48431b63390dc9', (122368, 4726) +'d49c368393001c4b3ba40be944e58444882b46ae', (127488, 4989) +'567c9a9f13867aa1fab26ff3eaf3a07d7d1d7f66', (132608, 5659) +'1038794dadb9460aca8471c5b763f25f56099d0f', (138752, 4046) diff --git a/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dat b/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dat index 0e0c2f7a8e9d93f0feebc4298fa53a77bb3125f9..54657d2a498bf1770c13f4898bd3b4dd798ae01d 100644 GIT binary patch delta 10828 zcmeI2TWs4@7{{9~Ysb>jj%gc1?4@*ECRCCpPBJ0NoJ-rpiPJRArIkkJ#7>TvICgwV zoNir;&>kvT8k&uU5QsMr6JwW73^5^r#M7i9@c`=hu#WTmyE9!cJz(!VoKaq`_0yLu zlQ<6)WSg`-$*@FPK@3uVl1g%9wop};r5);Vp1^Rkg7iwpBq1^isx}g<_o3>=MD1d= z*_0FtIKgshS{)QBq@5&Fmf@A0cMeOR4X;s7OY>!l=_=Ii`S7sn1%N3K*8L32fUi7?F=>4jO{8`H8qO zz6=ub^xWS1!O}PTEn16Y#J1jZA%;xAaA&47dQ zmAO*}t9<#r!{>kRH|*OVTvY0Jv48=^`dtDD%Abx{su688QSEQ-W5E1|mSY=Qj<>c5 z0=7Cl5&+-;I5-F(kf}dVGSLFBvX`t)G2v=62BK3A z$_`C>P{>Z^C?qdha53YU2&eJ2!z;_<+A63XxDm2Y{Yl59nd}$E@BpRMs@~#@=q2Z#T?W z9vf_Es9(e!48o#BAx_ffiG+E^?2QM@7FY*}R$CDrm-LRPT|@Mzj;QE^wV=P~ZiIPp z%AVBfTTKOG7l?MYQcq^z9EDfnK!Yso*>u zqo!#71e%`Criu$GoQdHH9UyvfJ)*kKMAdTB8ycR!bjwX`<9Q2dl$%5wIqN~=u*;uz zrK0H*aFDakpK#}9ayr2CgY|eGmg#SGt)TVAlFDfBBOOUZtMyWR$IAAmTHXFGt(9(I zje3&^kqdYzQnD`;kIaUkfZglyC!%P|oYeuNms%M;b5qxhu3W$1Gsr(U-_f-6uWqPs zTT8#6Z)T%*XDI=vnP3UchMDY?&F-3WxO`(yS_hn7Zsqj!&-VqV7teictPXWVPMd04 z?cgq`rl!4Z?}A>BnG9E(G2gM^gRMLjgHR}skp496ELG_8v^%c@POr3Zdj9K2^}P*R z7u2OgQB6>DoeBI8)PDW#o6W%QjKZ8Poh~q9EEdVlcwiyNAW>*0tpiZ6wty=C`saNh zsFw$88=~x<&ME4H;wyRT%YoHOsTigihb=*-{N;q`f`xM4Uvcrd@XS5kZ5r2{B^!Mu 
zpSN=Qhhq(=MW7l>?uD%+quNP8%~s+7%NE#5HXtgGtU;7f5xpnQCk;m>mq`UMpL diff --git a/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dir b/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dir index b905e33e5e6..87f0cf9d720 100644 --- a/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dir +++ b/src/promptflow-recording/recordings/local/evals.node_cache.shelve.dir @@ -15,3 +15,19 @@ '0658d24d96833aa5acf52b87d34ab8220a5b2669', (59904, 4551) '5f17c1fae1329c6d6823c0c59e4a66fd6ee42691', (64512, 3274) '79ea9bacda0ffb42811f24c5b0293a9744824daf', (68096, 4015) +'6795248808bf0d7f77d611dbe9dd672dd52b4dce', (72192, 3449) +'60bcfb921bb2cc3bb56f007c8c3453a839194bc8', (75776, 4031) +'30fc92e8759d7d571143873f446534e6af0bf6c5', (79872, 4576) +'1acc36f1fb93f2b60555845a727b6eb56d33034e', (84480, 3574) +'877cc9fe86f6d1b2087caafc6279323887a216cb', (88064, 3433) +'3e7451f97e7a18866d21c0edad5a79e8cdab2196', (91648, 5574) +'e7a394b585fe9aa7b4684c4e246f8f64523c1479', (97280, 3567) +'096ceeedaae85c7e91eba98720a26e7ae53b5376', (100864, 3426) +'0e3791838c4058f5e4fd2aa482fbf266dc3b8610', (104448, 4087) +'cb84532697ba61a34f44b77c31578375e38c35ac', (108544, 4621) +'0fbf84f9640d60c5b9589227fb1cfcaa9c3cddc7', (113664, 3590) +'3d4b1a738cf9b1b78fd01fdfe1a0f7162c45936f', (117760, 4190) +'98fd571c22b4a4ec567c41d8aa48431b63390dc9', (122368, 4726) +'d49c368393001c4b3ba40be944e58444882b46ae', (127488, 4989) +'567c9a9f13867aa1fab26ff3eaf3a07d7d1d7f66', (132608, 5659) +'1038794dadb9460aca8471c5b763f25f56099d0f', (138752, 4046) From 28ba415f740169ad2ac5c26eea551a90fceffac2 Mon Sep 17 00:00:00 2001 From: Zhengfei Wang <38847871+zhengfeiwang@users.noreply.github.com> Date: Tue, 7 May 2024 11:40:36 +0800 Subject: [PATCH 17/21] [trace][bugfix] Refine persist logic to handle aggregation node in eval run (#3048) # Description For traces from aggregation node in an eval run, the attributes will have `referenced.batch_run_id` and no `line_number`, which will result in a `KeyError` during trace persistent. This pull request targets to fix this issue, and add a test to guard this. # All Promptflow Contribution checklist: - [x] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [x] Title of the pull request is clear and informative. - [x] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [x] Pull request includes test coverage for the included changes. 
--- .../promptflow/_sdk/entities/_trace.py | 6 +++++- .../tests/sdk_cli_test/e2etests/test_trace.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_trace.py b/src/promptflow-devkit/promptflow/_sdk/entities/_trace.py index 15bacac5a6f..c14df2a57df 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_trace.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_trace.py @@ -235,10 +235,14 @@ def _determine_line_run_id(span: Span) -> str: def _determine_parent_id(span: Span) -> typing.Optional[str]: # for test, `attributes.referenced.line_run_id` should be the parent id # for batch run, we need to query line run with run name and line number + # however, one exception is aggregation node, which does not have line number attribute # otherwise, there will be no parent id if SpanAttributeFieldName.REFERENCED_LINE_RUN_ID in span.attributes: return span.attributes[SpanAttributeFieldName.REFERENCED_LINE_RUN_ID] - elif SpanAttributeFieldName.REFERENCED_BATCH_RUN_ID in span.attributes: + elif ( + SpanAttributeFieldName.REFERENCED_BATCH_RUN_ID in span.attributes + and SpanAttributeFieldName.LINE_NUMBER in span.attributes + ): line_run = ORMLineRun._get_with_run_and_line_number( run=span.attributes[SpanAttributeFieldName.REFERENCED_BATCH_RUN_ID], line_number=span.attributes[SpanAttributeFieldName.LINE_NUMBER], diff --git a/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_trace.py b/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_trace.py index ffe8a446e83..866de791b9c 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_trace.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/e2etests/test_trace.py @@ -212,6 +212,24 @@ def test_span_persist_and_gets(self, pf: PFClient) -> None: expected_span_dict["events"][i]["attributes"] = dict() assert_span_equals(lazy_load_span, expected_span_dict) + def test_aggregation_node_in_eval_run(self, pf: PFClient) -> None: + # mock a span generated from an aggregation node in an eval run + # whose attributes has `referenced.batch_run_id`, no `line_number` + span = mock_span( + trace_id=str(uuid.uuid4()), + span_id=str(uuid.uuid4()), + parent_id=None, + line_run_id=str(uuid.uuid4()), + ) + batch_run_id = str(uuid.uuid4()) + span.attributes.pop(SpanAttributeFieldName.LINE_RUN_ID) + span.attributes[SpanAttributeFieldName.BATCH_RUN_ID] = batch_run_id + span.attributes[SpanAttributeFieldName.REFERENCED_BATCH_RUN_ID] = str(uuid.uuid4()) + span._persist() + # list and assert to ensure the persist is successful + line_runs = pf.traces.list_line_runs(runs=[batch_run_id]) + assert len(line_runs) == 1 + def test_spans_persist_and_line_run_gets(self, pf: PFClient) -> None: trace_id = str(uuid.uuid4()) non_root_span_id = str(uuid.uuid4()) From 91c358c8e6672b366b8febf75c54214324b804c6 Mon Sep 17 00:00:00 2001 From: Brynn Yin <24237253+brynn-code@users.noreply.github.com> Date: Tue, 7 May 2024 11:46:05 +0800 Subject: [PATCH 18/21] [Connection] Add AI services connection & AI search AAD (#3117) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. - Add AzureAIServicesConnection. - Add auth_mode for CognitiveSearchConnection (Azure AI Search). 
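For illustration, a minimal usage sketch based on the constructor signatures added in this change (the connection names and endpoints below are placeholders; in AAD mode no api key is stored):

```python
from promptflow.connections import AzureAIServicesConnection, CognitiveSearchConnection

# Key-based auth: auth_mode defaults to "key" and the api key is stored in secrets.
ai_services = AzureAIServicesConnection(
    name="my_ai_services_connection",
    endpoint="https://my-ai-services.cognitiveservices.azure.com/",
    api_key="<api-key>",
)

# AAD (Microsoft Entra ID) auth: no api_key; a token is fetched on demand.
search = CognitiveSearchConnection(
    name="my_cognitive_search_connection",
    api_base="https://my-search.search.windows.net",
    auth_mode="meid_token",
)
# token = search.get_token()  # lazily creates an AzureTokenProvider; requires an Azure credential
```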
# All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --------- Signed-off-by: Brynn Yin --- scripts/json_schema/gen_json_schema.py | 4 +- src/promptflow-core/promptflow/_constants.py | 1 + .../promptflow/connections/__init__.py | 2 + .../promptflow/core/_connection.py | 109 +++++++++++++----- .../_workspace_connection_provider.py | 8 ++ .../test_workspace_connection_provider.py | 75 ++++++++++++ .../promptflow/_sdk/entities/_connection.py | 18 ++- .../promptflow/_sdk/schemas/_connection.py | 32 +++-- .../sdk_cli_test/unittests/test_connection.py | 43 +++++++ .../azure_ai_services_aad_connection.yaml | 5 + .../azure_ai_services_connection.yaml | 5 + .../cognitive_search_aad_connection.yaml | 6 + 12 files changed, 266 insertions(+), 42 deletions(-) create mode 100644 src/promptflow/tests/test_configs/connections/azure_ai_services_aad_connection.yaml create mode 100644 src/promptflow/tests/test_configs/connections/azure_ai_services_connection.yaml create mode 100644 src/promptflow/tests/test_configs/connections/cognitive_search_aad_connection.yaml diff --git a/scripts/json_schema/gen_json_schema.py b/scripts/json_schema/gen_json_schema.py index d776fe93dc0..6f06025894d 100644 --- a/scripts/json_schema/gen_json_schema.py +++ b/scripts/json_schema/gen_json_schema.py @@ -142,7 +142,7 @@ def get_required(self, obj): from promptflow._sdk.schemas._connection import AzureOpenAIConnectionSchema, OpenAIConnectionSchema, \ QdrantConnectionSchema, CognitiveSearchConnectionSchema, SerpConnectionSchema, AzureContentSafetyConnectionSchema, \ FormRecognizerConnectionSchema, CustomConnectionSchema, WeaviateConnectionSchema, ServerlessConnectionSchema, \ -CustomStrongTypeConnectionSchema +CustomStrongTypeConnectionSchema, AzureAIServicesConnectionSchema from promptflow._sdk.schemas._run import RunSchema from promptflow._sdk.schemas._flow import FlowSchema, FlexFlowSchema @@ -163,7 +163,7 @@ def get_required(self, obj): args.output_file = ["Run", "Flow", "AzureOpenAIConnection", "OpenAIConnection", "QdrantConnection", "CognitiveSearchConnection", "SerpConnection", "AzureContentSafetyConnection", "FormRecognizerConnection", "CustomConnection", "WeaviateConnection", "ServerlessConnection", - "CustomStrongTypeConnection"] + "CustomStrongTypeConnection", "AzureAIServicesConnection"] # Special case for Flow and EagerFlow if "Flow" in args.output_file: diff --git a/src/promptflow-core/promptflow/_constants.py b/src/promptflow-core/promptflow/_constants.py index 32565352926..2caa3ad32a3 100644 --- a/src/promptflow-core/promptflow/_constants.py +++ b/src/promptflow-core/promptflow/_constants.py 
@@ -233,6 +233,7 @@ class ConnectionType(str, Enum): COGNITIVE_SEARCH = "CognitiveSearch" SERP = "Serp" AZURE_CONTENT_SAFETY = "AzureContentSafety" + AZURE_AI_SERVICES = "AzureAIServices" FORM_RECOGNIZER = "FormRecognizer" WEAVIATE = "Weaviate" SERVERLESS = "Serverless" diff --git a/src/promptflow-core/promptflow/connections/__init__.py b/src/promptflow-core/promptflow/connections/__init__.py index 2e9cfa33bb8..d26521f5b72 100644 --- a/src/promptflow-core/promptflow/connections/__init__.py +++ b/src/promptflow-core/promptflow/connections/__init__.py @@ -6,6 +6,7 @@ from promptflow._core.tools_manager import register_connections from promptflow.contracts.types import Secret from promptflow.core._connection import ( + AzureAIServicesConnection, AzureContentSafetyConnection, AzureOpenAIConnection, CognitiveSearchConnection, @@ -40,6 +41,7 @@ class BingConnection: "CustomConnection", "CustomStrongTypeConnection", "ServerlessConnection", + "AzureAIServicesConnection", "ConnectionProvider", ] diff --git a/src/promptflow-core/promptflow/core/_connection.py b/src/promptflow-core/promptflow/core/_connection.py index b5cd6d0a1eb..bbf76b4b098 100644 --- a/src/promptflow-core/promptflow/core/_connection.py +++ b/src/promptflow-core/promptflow/core/_connection.py @@ -142,7 +142,37 @@ def api_key(self, value): self.secrets["api_key"] = value -class AzureOpenAIConnection(_StrongTypeConnection): +class _StrongTypeAADSupportedConnection(_StrongTypeConnection): + """Base class for strong type connection that supports AAD token.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._token_provider = None + + @property + def _has_api_key(self): + """Return if the connection has api key.""" + return self.auth_mode == ConnectionAuthMode.KEY + + @property + def auth_mode(self): + """Return the connection auth mode.""" + return self.configs.get("auth_mode", ConnectionAuthMode.KEY) + + @auth_mode.setter + def auth_mode(self, value): + """Set the connection auth mode.""" + self.configs["auth_mode"] = value + + def get_token(self): + """Return the connection token.""" + if not self._token_provider: + self._token_provider = AzureTokenProvider() + + return self._token_provider.get_token() + + +class AzureOpenAIConnection(_StrongTypeAADSupportedConnection): """Azure Open AI connection. :param api_key: The api key. @@ -181,8 +211,9 @@ def __init__( "resource_id": resource_id, } secrets = {"api_key": api_key} if auth_mode == ConnectionAuthMode.KEY else {} - self._token_provider = kwargs.get("token_provider") super().__init__(configs=configs, secrets=secrets, **kwargs) + # Leave this line to ensure backward compatibility. 
+ self._token_provider = kwargs.get("token_provider") @property def api_base(self): @@ -214,16 +245,6 @@ def api_version(self, value): """Set the connection api version.""" self.configs["api_version"] = value - @property - def auth_mode(self): - """Return the connection auth mode.""" - return self.configs.get("auth_mode", ConnectionAuthMode.KEY) - - @auth_mode.setter - def auth_mode(self, value): - """Set the connection auth mode.""" - self.configs["auth_mode"] = value - @property def resource_id(self): """Return the connection resource id.""" @@ -234,18 +255,6 @@ def resource_id(self, value): """Set the resource id.""" self.configs["resource_id"] = value - @property - def _has_api_key(self): - """Return if the connection has api key.""" - return self.auth_mode == ConnectionAuthMode.KEY - - def get_token(self): - """Return the connection token.""" - if not self._token_provider: - self._token_provider = AzureTokenProvider() - - return self._token_provider.get_token() - @classmethod def from_env(cls, name=None): """ @@ -425,7 +434,7 @@ class WeaviateConnection(_EmbeddingStoreConnection): TYPE = ConnectionType.WEAVIATE.value -class CognitiveSearchConnection(_StrongTypeConnection): +class CognitiveSearchConnection(_StrongTypeAADSupportedConnection): """Cognitive Search connection. :param api_key: The api key. @@ -434,6 +443,8 @@ class CognitiveSearchConnection(_StrongTypeConnection): :type api_base: str :param api_version: The api version, default "2023-11-01". :type api_version: str + :param auth_mode: The auth mode, supported values see: :class:`~.constants.ConnectionAuthMode`. + :type auth_mode: str :param name: Connection name. :type name: str """ @@ -441,10 +452,15 @@ class CognitiveSearchConnection(_StrongTypeConnection): TYPE = ConnectionType.COGNITIVE_SEARCH.value def __init__( - self, api_key: str, api_base: str, api_version: str = ConnectionDefaultApiVersion.COGNITIVE_SEARCH, **kwargs + self, + api_base: str, + api_key: str = None, + api_version: str = ConnectionDefaultApiVersion.COGNITIVE_SEARCH, + auth_mode: str = ConnectionAuthMode.KEY, + **kwargs, ): - configs = {"api_base": api_base, "api_version": api_version} - secrets = {"api_key": api_key} + configs = {"api_base": api_base, "api_version": api_version, "auth_mode": auth_mode} + secrets = {"api_key": api_key} if auth_mode == ConnectionAuthMode.KEY else {} super().__init__(configs=configs, secrets=secrets, **kwargs) @property @@ -468,6 +484,43 @@ def api_version(self, value): self.configs["api_version"] = value +class AzureAIServicesConnection(_StrongTypeAADSupportedConnection): + """Azure AI Services connection. + + :param api_key: The api key. + :type api_key: str + :param endpoint: The api endpoint. + :type endpoint: str + :param auth_mode: The auth mode, supported values see: :class:`~.constants.ConnectionAuthMode`. + :type auth_mode: str + :param name: Connection name. 
+ :type name: str + """ + + TYPE = ConnectionType.AZURE_AI_SERVICES.value + + def __init__( + self, + endpoint: str, + api_key: str = None, + auth_mode: str = ConnectionAuthMode.KEY, + **kwargs, + ): + configs = {"endpoint": endpoint, "auth_mode": auth_mode} + secrets = {"api_key": api_key} if auth_mode == ConnectionAuthMode.KEY else {} + super().__init__(configs=configs, secrets=secrets, **kwargs) + + @property + def endpoint(self): + """Return the connection endpoint.""" + return self.configs.get("endpoint") + + @endpoint.setter + def endpoint(self, value): + """Set the connection endpoint.""" + self.configs["endpoint"] = value + + class AzureContentSafetyConnection(_StrongTypeConnection): """Azure Content Safety connection. diff --git a/src/promptflow-core/promptflow/core/_connection_provider/_workspace_connection_provider.py b/src/promptflow-core/promptflow/core/_connection_provider/_workspace_connection_provider.py index b97f048616a..f2df646be04 100644 --- a/src/promptflow-core/promptflow/core/_connection_provider/_workspace_connection_provider.py +++ b/src/promptflow-core/promptflow/core/_connection_provider/_workspace_connection_provider.py @@ -52,6 +52,7 @@ class ConnectionCategory: Serp = "Serp" Serverless = "Serverless" BingLLMSearch = "BingLLMSearch" + AIServices = "AIServices" class ConnectionAuthType: @@ -175,6 +176,8 @@ def validate_and_fallback_connection_type(cls, name, type_name, category, metada ConnectionCategory.Serverless, ]: return category + if category == ConnectionCategory.AIServices: + return "AzureAIServices" if category == ConnectionCategory.CustomKeys: return CustomConnection.__name__ if category == ConnectionCategory.CognitiveService: @@ -271,6 +274,11 @@ def get_auth_config(props, support_aad=False): "endpoint": properties.target, "api_version": get_case_insensitive_key(properties.metadata, "ApiVersion"), } + elif properties.category == ConnectionCategory.AIServices: + value = { + **get_auth_config(properties, support_aad=True), + "endpoint": properties.target, + } elif properties.category == ConnectionCategory.OpenAI: value = { **get_auth_config(properties), diff --git a/src/promptflow-core/tests/azureml-serving/unittests/test_workspace_connection_provider.py b/src/promptflow-core/tests/azureml-serving/unittests/test_workspace_connection_provider.py index 89147d5f611..8cb56a839bc 100644 --- a/src/promptflow-core/tests/azureml-serving/unittests/test_workspace_connection_provider.py +++ b/src/promptflow-core/tests/azureml-serving/unittests/test_workspace_connection_provider.py @@ -285,6 +285,36 @@ def test_build_cognitive_search_connection_from_rest_object(self): } build_from_data_and_assert(data, expected) + def test_build_cognitive_search_aad_connection_from_rest_object(self): + # Test on AAD type with CognitiveSearch category + data = { + "tags": None, + "location": None, + "id": "mock_id", + "name": "test", + "type": "Microsoft.MachineLearningServices/workspaces/connections", + "properties": { + "authType": "AAD", + "category": "CognitiveSearch", + "expiryTime": None, + "target": "mock_target", + "metadata": { + "ApiVersion": "2023-07-01-Preview", + }, + }, + } + expected = { + "type": "CognitiveSearchConnection", + "module": "promptflow.connections", + "name": "test", + "value": { + "api_base": "mock_target", + "api_version": "2023-07-01-Preview", + "auth_mode": "meid_token", + }, + } + build_from_data_and_assert(data, expected) + def test_build_cognitive_service_category_connection_from_rest_object(self): # Test on Api type with CognitiveService category 
data = { @@ -386,6 +416,51 @@ def test_build_serverless_category_connection_from_rest_object(self): } build_from_data_and_assert(data, expected) + def test_build_ai_services_connection_from_rest_object(self): + data = { + "id": "mock_id", + "name": "test", + "type": "Microsoft.MachineLearningServices/workspaces/connections", + "properties": { + "authType": "ApiKey", + "credentials": {"key": "***"}, + "group": "AzureAI", + "category": "AIServices", + "target": "mock_base", + "sharedUserList": [], + "metadata": {}, + }, + } + expected = { + "type": "AzureAIServicesConnection", + "module": "promptflow.connections", + "name": "test", + "value": {"api_key": "***", "endpoint": "mock_base", "auth_mode": "key"}, + } + build_from_data_and_assert(data, expected) + + def test_build_ai_services_aad_connection_from_rest_object(self): + data = { + "id": "mock_id", + "name": "test", + "type": "Microsoft.MachineLearningServices/workspaces/connections", + "properties": { + "authType": "AAD", + "group": "AzureAI", + "category": "AIServices", + "target": "mock_base", + "sharedUserList": [], + "metadata": {}, + }, + } + expected = { + "type": "AzureAIServicesConnection", + "module": "promptflow.connections", + "name": "test", + "value": {"endpoint": "mock_base", "auth_mode": "meid_token"}, + } + build_from_data_and_assert(data, expected) + def test_build_connection_list(self): data = { "value": [ diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_connection.py b/src/promptflow-devkit/promptflow/_sdk/entities/_connection.py index 091881d6bf9..857bf68be19 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_connection.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_connection.py @@ -30,6 +30,7 @@ ) from promptflow._sdk.entities._yaml_translatable import YAMLTranslatableMixin from promptflow._sdk.schemas._connection import ( + AzureAIServicesConnectionSchema, AzureContentSafetyConnectionSchema, AzureOpenAIConnectionSchema, CognitiveSearchConnectionSchema, @@ -45,6 +46,7 @@ from promptflow._utils.logger_utils import LoggerFactory from promptflow._utils.utils import snake_to_camel from promptflow.contracts.types import Secret +from promptflow.core._connection import AzureAIServicesConnection as _CoreAzureAIServicesConnection from promptflow.core._connection import AzureContentSafetyConnection as _CoreAzureContentSafetyConnection from promptflow.core._connection import AzureOpenAIConnection as _CoreAzureOpenAIConnection from promptflow.core._connection import CognitiveSearchConnection as _CoreCognitiveSearchConnection @@ -72,6 +74,7 @@ def _casting_type(cls, typ): type_dict = { "azure_open_ai": ConnectionType.AZURE_OPEN_AI.value, "open_ai": ConnectionType.OPEN_AI.value, + "azure_ai_services": ConnectionType.AZURE_AI_SERVICES.value, } if typ in type_dict: @@ -218,8 +221,10 @@ def _load( data=data, context=context, unknown=INCLUDE, - additional_message=f"If you are trying to configure a job that is not of type {type_str}, please specify " - f"the correct connection type in the 'type' property.", + additional_message=( + f"If you are trying to configure a connection that is not of type {type_str}, please specify " + "the correct connection type in the 'type' property." 
+ ), **kwargs, ) return connection @@ -335,6 +340,15 @@ def _get_schema_cls(cls): return CognitiveSearchConnectionSchema +class AzureAIServicesConnection(_CoreAzureAIServicesConnection, _StrongTypeConnection): + __doc__ = _CoreAzureAIServicesConnection.__doc__ + DATA_CLASS = _CoreAzureAIServicesConnection + + @classmethod + def _get_schema_cls(cls): + return AzureAIServicesConnectionSchema + + class AzureContentSafetyConnection(_CoreAzureContentSafetyConnection, _StrongTypeConnection): __doc__ = _CoreAzureContentSafetyConnection.__doc__ DATA_CLASS = _CoreAzureContentSafetyConnection diff --git a/src/promptflow-devkit/promptflow/_sdk/schemas/_connection.py b/src/promptflow-devkit/promptflow/_sdk/schemas/_connection.py index c1623595883..b943ccf761f 100644 --- a/src/promptflow-devkit/promptflow/_sdk/schemas/_connection.py +++ b/src/promptflow-devkit/promptflow/_sdk/schemas/_connection.py @@ -7,7 +7,7 @@ from promptflow._constants import ConnectionType, CustomStrongTypeConnectionConfigs from promptflow._sdk._constants import SCHEMA_KEYS_CONTEXT_CONFIG_KEY, SCHEMA_KEYS_CONTEXT_SECRET_KEY -from promptflow._sdk.schemas._base import YamlFileSchema +from promptflow._sdk.schemas._base import PatchedSchemaMeta, YamlFileSchema from promptflow._sdk.schemas._fields import StringTransformedEnum from promptflow._utils.utils import camel_to_snake from promptflow.constants import ConnectionAuthMode, ConnectionDefaultApiVersion @@ -32,13 +32,7 @@ def _pre_dump(self, data, **kwargs): return copied -class AzureOpenAIConnectionSchema(ConnectionSchema): - type = StringTransformedEnum(allowed_values="azure_open_ai", required=True) - api_key = fields.Str() - api_base = fields.Str(required=True) - api_type = fields.Str(dump_default="azure") - api_version = fields.Str(dump_default=ConnectionDefaultApiVersion.AZURE_OPEN_AI) - resource_id = fields.Str() +class AADSupportedSchemaMixin(metaclass=PatchedSchemaMeta): auth_mode = StringTransformedEnum( allowed_values=[ConnectionAuthMode.MEID_TOKEN, ConnectionAuthMode.KEY], allow_none=True, @@ -53,6 +47,15 @@ def _validate(self, data, **kwargs): return data +class AzureOpenAIConnectionSchema(ConnectionSchema, AADSupportedSchemaMixin): + type = StringTransformedEnum(allowed_values="azure_open_ai", required=True) + api_key = fields.Str() + api_base = fields.Str(required=True) + api_type = fields.Str(dump_default="azure") + api_version = fields.Str(dump_default=ConnectionDefaultApiVersion.AZURE_OPEN_AI) + resource_id = fields.Str() + + class OpenAIConnectionSchema(ConnectionSchema): type = StringTransformedEnum(allowed_values="open_ai", required=True) api_key = fields.Str(required=True) @@ -80,12 +83,12 @@ class WeaviateConnectionSchema(EmbeddingStoreConnectionSchema): type = StringTransformedEnum(allowed_values=camel_to_snake(ConnectionType.WEAVIATE), required=True) -class CognitiveSearchConnectionSchema(ConnectionSchema): +class CognitiveSearchConnectionSchema(ConnectionSchema, AADSupportedSchemaMixin): type = StringTransformedEnum( allowed_values=camel_to_snake(ConnectionType.COGNITIVE_SEARCH), required=True, ) - api_key = fields.Str(required=True) + api_key = fields.Str() api_base = fields.Str(required=True) api_version = fields.Str(dump_default=ConnectionDefaultApiVersion.COGNITIVE_SEARCH) @@ -95,6 +98,15 @@ class SerpConnectionSchema(ConnectionSchema): api_key = fields.Str(required=True) +class AzureAIServicesConnectionSchema(ConnectionSchema, AADSupportedSchemaMixin): + type = StringTransformedEnum( + allowed_values=camel_to_snake(ConnectionType.AZURE_AI_SERVICES), + 
required=True, + ) + api_key = fields.Str() + endpoint = fields.Str(required=True) + + class AzureContentSafetyConnectionSchema(ConnectionSchema): type = StringTransformedEnum( allowed_values=camel_to_snake(ConnectionType.AZURE_CONTENT_SAFETY), diff --git a/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_connection.py b/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_connection.py index 98cf6305759..d8ace1f6052 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_connection.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_connection.py @@ -13,6 +13,7 @@ from promptflow._sdk._errors import ConnectionClassNotFoundError, SDKError from promptflow._sdk._load_functions import _load_env_to_connection from promptflow._sdk.entities._connection import ( + AzureAIServicesConnection, AzureContentSafetyConnection, AzureOpenAIConnection, CognitiveSearchConnection, @@ -135,6 +136,21 @@ class TestConnection: "api_base": "endpoint", "api_version": "2023-07-01-Preview", }, + { + "module": "promptflow.connections", + "type": "cognitive_search", + "auth_mode": "key", + }, + ), + ( + "cognitive_search_aad_connection.yaml", + CognitiveSearchConnection, + { + "name": "my_cognitive_search_connection", + "api_base": "endpoint", + "auth_mode": "meid_token", + "api_version": "2023-07-01-Preview", + }, { "module": "promptflow.connections", "type": "cognitive_search", @@ -206,6 +222,33 @@ class TestConnection: "type": "serverless", }, ), + ( + "azure_ai_services_connection.yaml", + AzureAIServicesConnection, + { + "name": "my_ai_services_connection", + "api_key": "", + "endpoint": "endpoint", + }, + { + "module": "promptflow.connections", + "type": "azure_ai_services", + "auth_mode": "key", + }, + ), + ( + "azure_ai_services_aad_connection.yaml", + AzureAIServicesConnection, + { + "name": "my_ai_services_connection", + "endpoint": "endpoint", + "auth_mode": "meid_token", + }, + { + "module": "promptflow.connections", + "type": "azure_ai_services", + }, + ), ], ) def test_connection_load_dump(self, file_name, class_name, init_param, expected): diff --git a/src/promptflow/tests/test_configs/connections/azure_ai_services_aad_connection.yaml b/src/promptflow/tests/test_configs/connections/azure_ai_services_aad_connection.yaml new file mode 100644 index 00000000000..0920f74485c --- /dev/null +++ b/src/promptflow/tests/test_configs/connections/azure_ai_services_aad_connection.yaml @@ -0,0 +1,5 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureAIServicesConnection.schema.json +name: my_ai_services_connection +type: azure_ai_services # snake case +endpoint: "endpoint" +auth_mode: meid_token diff --git a/src/promptflow/tests/test_configs/connections/azure_ai_services_connection.yaml b/src/promptflow/tests/test_configs/connections/azure_ai_services_connection.yaml new file mode 100644 index 00000000000..cbee34a364f --- /dev/null +++ b/src/promptflow/tests/test_configs/connections/azure_ai_services_connection.yaml @@ -0,0 +1,5 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/AzureAIServicesConnection.schema.json +name: my_ai_services_connection +type: azure_ai_services # snake case +api_key: "" +endpoint: "endpoint" diff --git a/src/promptflow/tests/test_configs/connections/cognitive_search_aad_connection.yaml b/src/promptflow/tests/test_configs/connections/cognitive_search_aad_connection.yaml new file mode 100644 index 00000000000..7ff94a90578 --- /dev/null +++ 
b/src/promptflow/tests/test_configs/connections/cognitive_search_aad_connection.yaml @@ -0,0 +1,6 @@ +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/CognitiveSearchConnection.schema.json +name: my_cognitive_search_connection +type: cognitive_search # snake case +api_base: "endpoint" +api_version: "2023-07-01-Preview" +auth_mode: meid_token From f4834912234213e481a2e7be59c296819f09150c Mon Sep 17 00:00:00 2001 From: zhen Date: Tue, 7 May 2024 12:06:53 +0800 Subject: [PATCH 19/21] [utils] resolve_flow_language returns python when flow path is prompty (#3124) # Description Please add an informative description that covers that changes made by the pull request and link all relevant issues. # All Promptflow Contribution checklist: - [ ] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [ ] Title of the pull request is clear and informative. - [ ] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md). ### Testing Guidelines - [ ] Pull request includes test coverage for the included changes. --- .../_sdk/_utilities/general_utils.py | 15 ++++++- .../sdk_cli_test/unittests/test_utils.py | 39 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/promptflow-devkit/promptflow/_sdk/_utilities/general_utils.py b/src/promptflow-devkit/promptflow/_sdk/_utilities/general_utils.py index bf9eeb4500b..a272ea2b09c 100644 --- a/src/promptflow-devkit/promptflow/_sdk/_utilities/general_utils.py +++ b/src/promptflow-devkit/promptflow/_sdk/_utilities/general_utils.py @@ -32,7 +32,14 @@ from keyring.errors import NoKeyringError from marshmallow import ValidationError -from promptflow._constants import ENABLE_MULTI_CONTAINER_KEY, EXTENSION_UA, FLOW_FLEX_YAML, LANGUAGE_KEY, FlowLanguage +from promptflow._constants import ( + ENABLE_MULTI_CONTAINER_KEY, + EXTENSION_UA, + FLOW_FLEX_YAML, + LANGUAGE_KEY, + PROMPTY_EXTENSION, + FlowLanguage, +) from promptflow._sdk._constants import ( AZURE_WORKSPACE_REGEX_FORMAT, DEFAULT_ENCODING, @@ -1113,6 +1120,10 @@ def resolve_flow_language( file_path = flow_path / flow_file if file_path.is_file() and file_path.suffix.lower() in (".yaml", ".yml"): yaml_dict = load_yaml(file_path) + elif file_path.is_file() and file_path.suffix.lower() == PROMPTY_EXTENSION: + return FlowLanguage.Python else: - raise UserErrorException(f"Invalid flow path {file_path.as_posix()}, must exist and of suffix yaml or yml.") + raise UserErrorException( + f"Invalid flow path {file_path.as_posix()}, must exist and of suffix yaml, yml or prompty." 
+ ) return yaml_dict.get(LANGUAGE_KEY, FlowLanguage.Python) diff --git a/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_utils.py b/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_utils.py index 99d15843cb4..6a4254c3a1c 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_utils.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/unittests/test_utils.py @@ -42,6 +42,7 @@ get_mac_address, get_system_info, refresh_connections_dir, + resolve_flow_language, ) from promptflow._sdk._version_hint_utils import check_latest_version from promptflow._utils.load_data import load_data @@ -51,6 +52,7 @@ override_connection_config_with_environment_variable, resolve_connections_environment_variable_reference, ) +from promptflow.exceptions import UserErrorException TEST_ROOT = PROMPTFLOW_ROOT / "tests" CONNECTION_ROOT = TEST_ROOT / "test_configs/connections" @@ -336,6 +338,43 @@ def test_configure_pf_home_dir_with_invalid_path(self) -> None: assert _constants.HOME_PROMPT_FLOW_DIR.as_posix() == (Path.home() / ".promptflow").resolve().as_posix() importlib.reload(_constants) + def test_resolve_flow_language(self): + # dag flow + lan = resolve_flow_language(flow_path=TEST_ROOT / "test_configs" / "flows" / "csharp_flow") + assert lan == "csharp" + + lan = resolve_flow_language(flow_path=TEST_ROOT / "test_configs" / "flows" / "chat_flow") + assert lan == "python" + + # flex flow + lan = resolve_flow_language(flow_path=TEST_ROOT / "test_configs" / "eager_flows" / "basic_callable_class") + assert lan == "python" + + lan = resolve_flow_language( + flow_path=TEST_ROOT / "test_configs" / "eager_flows" / "basic_dummy_csharp_flex_flow" + ) + assert lan == "csharp" + + # prompty + lan = resolve_flow_language(flow_path=TEST_ROOT / "test_configs" / "prompty" / "prompty_example.prompty") + assert lan == "python" + + with pytest.raises(UserErrorException) as ex: + resolve_flow_language() + assert "Either flow_path or yaml_dict should be provided." in ex.value.message + + with pytest.raises(UserErrorException) as ex: + resolve_flow_language() + assert "Either flow_path or yaml_dict should be provided." in ex.value.message + + with pytest.raises(UserErrorException) as ex: + resolve_flow_language(flow_path="mock_path", yaml_dict="mock_dict") + assert "Only one of flow_path and yaml_dict should be provided." in ex.value.message + + with pytest.raises(UserErrorException) as ex: + resolve_flow_language(flow_path="mock_path") + assert "must exist and of suffix yaml, yml or prompty." in ex.value.message + @pytest.mark.unittest class TestCLIUtils: From fbcc37994eba721d3ba248fdb6094d55de52f075 Mon Sep 17 00:00:00 2001 From: Ge Gao <49388944+dorisjoy@users.noreply.github.com> Date: Tue, 7 May 2024 16:09:20 +0800 Subject: [PATCH 20/21] Change openai chat api presence_penalty & frequency_penalty default value to None (#3125) Change the openai chat api presence_penalty & frequency_penalty default values to None, so that when the user doesn't set these values, we can skip passing these parameters in the api call.
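A minimal sketch of the pattern this change adopts (an illustrative helper, not the tool code itself): optional penalties default to None and are only added to the request payload when explicitly set, so unset values are no longer sent as an explicit `0`.

```python
from typing import Optional

def build_chat_params(
    model: str,
    temperature: float = 1.0,
    presence_penalty: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
) -> dict:
    params = {"model": model, "temperature": temperature}
    # Penalties are omitted entirely when unset, letting the service
    # apply its own defaults instead of receiving an explicit 0.
    if presence_penalty is not None:
        params["presence_penalty"] = float(presence_penalty)
    if frequency_penalty is not None:
        params["frequency_penalty"] = float(frequency_penalty)
    return params

assert "presence_penalty" not in build_chat_params("gpt-35-turbo")
assert build_chat_params("gpt-35-turbo", presence_penalty=0.5)["presence_penalty"] == 0.5
```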
--------- Co-authored-by: Ge Gao --- src/promptflow-tools/promptflow/tools/aoai.py | 26 +++++++------- src/promptflow-tools/promptflow/tools/llm.py | 7 ++-- .../promptflow/tools/openai.py | 34 +++++++++---------- .../promptflow/tools/yamls/llm.yaml | 4 +-- 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/src/promptflow-tools/promptflow/tools/aoai.py b/src/promptflow-tools/promptflow/tools/aoai.py index 26be7ed117e..45817c2bc97 100644 --- a/src/promptflow-tools/promptflow/tools/aoai.py +++ b/src/promptflow-tools/promptflow/tools/aoai.py @@ -43,8 +43,8 @@ def completion( logprobs: int = None, echo: bool = False, stop: list = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, best_of: int = 1, logit_bias: dict = {}, user: str = "", @@ -71,8 +71,8 @@ def completion( echo=echo, # fix bug "[] is not valid under any of the given schemas-'stop'" stop=stop if stop else None, - presence_penalty=float(presence_penalty), - frequency_penalty=float(frequency_penalty), + presence_penalty=float(presence_penalty) if presence_penalty is not None else None, + frequency_penalty=float(frequency_penalty) if frequency_penalty is not None else None, best_of=int(best_of), # Logit bias must be a dict if we passed it to openai api. logit_bias=logit_bias if logit_bias else {}, @@ -107,8 +107,8 @@ def chat( stream: bool = False, stop: list = None, max_tokens: int = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, logit_bias: dict = {}, user: str = "", # function_call can be of type str or dict. @@ -130,8 +130,6 @@ def chat( "top_p": top_p, "n": n, "stream": stream, - "presence_penalty": presence_penalty, - "frequency_penalty": frequency_penalty, "user": user, "extra_headers": {"ms-azure-ai-promptflow-called-from": "aoai-tool"} } @@ -159,6 +157,10 @@ def chat( params["response_format"] = response_format if seed is not None: params["seed"] = seed + if presence_penalty is not None: + params["presence_penalty"] = presence_penalty + if frequency_penalty is not None: + params["frequency_penalty"] = frequency_penalty completion = self._client.chat.completions.create(**params) return post_process_chat_api_response(completion, stream, functions, tools) @@ -181,8 +183,8 @@ def completion( logprobs: int = None, echo: bool = False, stop: list = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, best_of: int = 1, logit_bias: dict = {}, user: str = "", @@ -220,8 +222,8 @@ def chat( stream: bool = False, stop: list = None, max_tokens: int = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, logit_bias: dict = {}, user: str = "", function_call: object = None, diff --git a/src/promptflow-tools/promptflow/tools/llm.py b/src/promptflow-tools/promptflow/tools/llm.py index 30ec8c0d4b5..1c3f8728ec9 100644 --- a/src/promptflow-tools/promptflow/tools/llm.py +++ b/src/promptflow-tools/promptflow/tools/llm.py @@ -27,15 +27,16 @@ def llm( connection, prompt: PromptTemplate, api: str = "chat", - deployment_name: str = "", model: str = "", + deployment_name: str = "", + model: str = "", temperature: float = 1.0, top_p: float = 1.0, # stream is a hidden to the end user, it is only supposed to be set by the executor. 
stream: bool = False, stop: list = None, max_tokens: int = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, logit_bias: dict = {}, # tool_choice can be of type str or dict. tool_choice: object = None, diff --git a/src/promptflow-tools/promptflow/tools/openai.py b/src/promptflow-tools/promptflow/tools/openai.py index 3ab90a9025a..a1352b7673a 100644 --- a/src/promptflow-tools/promptflow/tools/openai.py +++ b/src/promptflow-tools/promptflow/tools/openai.py @@ -42,8 +42,8 @@ def completion( logprobs: int = None, echo: bool = False, stop: list = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, best_of: int = 1, logit_bias: dict = {}, user: str = "", @@ -66,8 +66,8 @@ def completion( logprobs=int(logprobs) if logprobs else None, echo=echo, stop=stop if stop else None, - presence_penalty=float(presence_penalty), - frequency_penalty=float(frequency_penalty), + presence_penalty=float(presence_penalty) if presence_penalty is not None else None, + frequency_penalty=float(frequency_penalty) if frequency_penalty is not None else None, best_of=int(best_of), # Logit bias must be a dict if we passed it to openai api. logit_bias=logit_bias if logit_bias else {}, @@ -92,16 +92,15 @@ def generator(): def chat( self, prompt: PromptTemplate, - model: str = "gpt-3.5-turbo", + model: str = "", temperature: float = 1.0, top_p: float = 1.0, - n: int = 1, # stream is a hidden to the end user, it is only supposed to be set by the executor. stream: bool = False, stop: list = None, max_tokens: int = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, logit_bias: dict = {}, user: str = "", # function_call can be of type str or dict. 
@@ -121,10 +120,7 @@ def chat( "messages": messages, "temperature": temperature, "top_p": top_p, - "n": n, "stream": stream, - "presence_penalty": presence_penalty, - "frequency_penalty": frequency_penalty, "user": user, } @@ -152,6 +148,10 @@ params["response_format"] = response_format if seed is not None: params["seed"] = seed + if presence_penalty is not None: + params["presence_penalty"] = presence_penalty + if frequency_penalty is not None: + params["frequency_penalty"] = frequency_penalty completion = self._client.chat.completions.create(**params) return post_process_chat_api_response(completion, stream, functions, tools) @@ -174,8 +174,8 @@ def completion( logprobs: int = None, echo: bool = False, stop: list = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, best_of: int = 1, logit_bias: dict = {}, user: str = "", @@ -206,15 +206,14 @@ def chat( connection: OpenAIConnection, prompt: PromptTemplate, - model: str = "gpt-3.5-turbo", + model: str = "", temperature: float = 1, top_p: float = 1, - n: int = 1, stream: bool = False, stop: list = None, max_tokens: int = None, - presence_penalty: float = 0, - frequency_penalty: float = 0, + presence_penalty: float = None, + frequency_penalty: float = None, logit_bias: dict = {}, user: str = "", function_call: object = None, @@ -230,7 +229,6 @@ def chat( model=model, temperature=temperature, top_p=top_p, - n=n, stream=stream, stop=stop if stop else None, max_tokens=max_tokens, diff --git a/src/promptflow-tools/promptflow/tools/yamls/llm.yaml b/src/promptflow-tools/promptflow/tools/yamls/llm.yaml index 9e40893d17d..28d12140663 100644 --- a/src/promptflow-tools/promptflow/tools/yamls/llm.yaml +++ b/src/promptflow-tools/promptflow/tools/yamls/llm.yaml @@ -134,7 +134,7 @@ promptflow.tools.llm.llm: ui_hints: text_box_size: xs presence_penalty: - default: 0 + default: "" type: - double advanced: true @@ -155,7 +155,7 @@ promptflow.tools.llm.llm: frequency_penalty: type: - int - default: 0 + default: "" advanced: true ui_hints: text_box_size: xs From e376e864f0575819415d8de3b7c6b86a55b0c0ed Mon Sep 17 00:00:00 2001 From: chjinche <49483542+chjinche@users.noreply.github.com> Date: Tue, 7 May 2024 17:32:43 +0800 Subject: [PATCH 21/21] Smaller retry count for UnprocessableEntityError(422) (#3127) # Description Issue: OpenAI suggests retrying on UnprocessableEntityError (422), see https://platform.openai.com/docs/guides/error-codes/python-library-error-types, while MaaS returns 422 with a non-retriable error. Solution: use a smaller retry count for 422 (a simplified sketch appears below). # All Promptflow Contribution checklist: - [x] **The pull request does not introduce [breaking changes].** - [ ] **CHANGELOG is updated for new features, bug fixes or other significant changes.** - [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).** - [ ] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).** ## General Guidelines and Best Practices - [x] Title of the pull request is clear and informative. - [x] There are a small number of commits, each of which have an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).
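A simplified, self-contained sketch of the retry policy described above (a hypothetical shape, not the actual `handle_openai_error` decorator): 422 gets its own small budget that only counts consecutive occurrences, while any other retriable error resets the counter and keeps the large overall budget.

```python
class RetriableError(Exception):
    def __init__(self, status_code: int):
        super().__init__(f"HTTP {status_code}")
        self.status_code = status_code

def call_with_retry(func, tries: int = 100, unprocessable_entity_tries: int = 3):
    consecutive_422 = 0
    for attempt in range(tries + 1):
        try:
            return func()
        except RetriableError as exc:
            if exc.status_code == 422:
                consecutive_422 += 1  # only consecutive 422s accumulate
            else:
                consecutive_422 = 0  # any other retriable error resets the 422 budget
            if attempt == tries or consecutive_422 == unprocessable_entity_tries:
                raise  # budget exhausted; surface the error to the caller
            # a real implementation would sleep with exponential backoff here
```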
### Testing Guidelines - [x] Pull request includes test coverage for the included changes. --- .../promptflow/tools/common.py | 36 ++++++++++++++----- .../tests/test_handle_openai_error.py | 26 +++++++++----- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/promptflow-tools/promptflow/tools/common.py b/src/promptflow-tools/promptflow/tools/common.py index 4fd95a08a6e..d5b99397c82 100644 --- a/src/promptflow-tools/promptflow/tools/common.py +++ b/src/promptflow-tools/promptflow/tools/common.py @@ -519,19 +519,25 @@ def is_retriable_api_connection_error(e: APIConnectionError): # TODO(2971352): revisit this tries=100 when there is any change to the 10min timeout logic -def handle_openai_error(tries: int = 100): +def handle_openai_error(tries: int = 100, unprocessable_entity_error_tries: int = 3): """ - A decorator function that used to handle OpenAI error. - OpenAI Error falls into retriable vs non-retriable ones. + A decorator function for handling OpenAI errors. - For retriable error, the decorator use below parameters to control its retry activity with exponential backoff: - `tries` : max times for the function invocation, type is int - 'delay': base delay seconds for exponential delay, type is float - """ + OpenAI errors are categorized into retriable and non-retriable. + + For retriable errors, the decorator uses the following parameters to control its retry behavior: + `tries`: max times for the function invocation, type is int + `unprocessable_entity_error_tries`: max times for the function invocation when consecutive + 422 error occurs, type is int + Note: + - The retry policy for UnprocessableEntityError is different because retrying may not be beneficial, + so small threshold and requiring consecutive errors. + """ def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): + consecutive_422_error_count = 0 for i in range(tries + 1): try: return func(*args, **kwargs) @@ -542,6 +548,7 @@ def wrapper(*args, **kwargs): # Handle retriable exception, please refer to # https://platform.openai.com/docs/guides/error-codes/api-errors print(f"Exception occurs: {type(e).__name__}: {str(e)}", file=sys.stderr) + # Firstly, exclude some non-retriable errors. # Vision model does not support all chat api parameters, e.g. response_format and function_call. # Recommend user to use vision model in vision tools, rather than LLM tool. # Related issue https://github.com/microsoft/promptflow/issues/1683 @@ -558,7 +565,11 @@ def wrapper(*args, **kwargs): if isinstance(e, APIConnectionError) and not isinstance(e, APITimeoutError) \ and not is_retriable_api_connection_error(e): raise WrappedOpenAIError(e) + # Retry InternalServerError(>=500), RateLimitError(429), UnprocessableEntityError(422) + # Solution references: + # https://platform.openai.com/docs/guides/error-codes/api-errors + # https://platform.openai.com/docs/guides/error-codes/python-library-error-types if isinstance(e, APIStatusError): status_code = e.response.status_code if status_code < 500 and status_code not in [429, 422]: @@ -567,7 +578,16 @@ def wrapper(*args, **kwargs): # Exit retry if this is quota insufficient error print(f"{type(e).__name__} with insufficient quota. Throw user error.", file=sys.stderr) raise WrappedOpenAIError(e) - if i == tries: + + # Retriable errors. + # To fix issue #2296, retry on api connection error, but with a separate retry policy. 
+ if isinstance(e, APIStatusError) and e.response.status_code == 422: + consecutive_422_error_count += 1 + else: + # If other retriable errors, reset consecutive_422_error_count. + consecutive_422_error_count = 0 + + if i == tries or consecutive_422_error_count == unprocessable_entity_error_tries: # Exit retry if max retry reached print(f"{type(e).__name__} reached max retry. Exit retry with user error.", file=sys.stderr) raise ExceedMaxRetryTimes(e) diff --git a/src/promptflow-tools/tests/test_handle_openai_error.py b/src/promptflow-tools/tests/test_handle_openai_error.py index cfad9281161..1f23f31ec00 100644 --- a/src/promptflow-tools/tests/test_handle_openai_error.py +++ b/src/promptflow-tools/tests/test_handle_openai_error.py @@ -12,7 +12,7 @@ from promptflow.tools.aoai import chat, completion from promptflow.tools.common import handle_openai_error from promptflow.tools.exception import ChatAPIInvalidRole, WrappedOpenAIError, to_openai_error_message, \ - JinjaTemplateError, LLMError, ChatAPIFunctionRoleInvalidFormat + JinjaTemplateError, LLMError, ChatAPIFunctionRoleInvalidFormat, ExceedMaxRetryTimes from promptflow.tools.openai import chat as openai_chat from promptflow.tools.aoai_gpt4v import AzureOpenAI as AzureOpenAIVision from pytest_mock import MockerFixture @@ -115,8 +115,6 @@ def create_api_connection_error_with_cause(): create_api_connection_error_with_cause(), InternalServerError("Something went wrong", response=httpx.Response( 503, request=httpx.Request('GET', 'https://www.example.com')), body=None), - UnprocessableEntityError("Something went wrong", response=httpx.Response( - 422, request=httpx.Request('GET', 'https://www.example.com')), body=None) ] ), ], @@ -155,9 +153,6 @@ def test_retriable_openai_error_handle(self, mocker: MockerFixture, dummyExcepti InternalServerError("Something went wrong", response=httpx.Response( 503, request=httpx.Request('GET', 'https://www.example.com'), headers={"retry-after": "0.3"}), body=None), - UnprocessableEntityError("Something went wrong", response=httpx.Response( - 422, request=httpx.Request('GET', 'https://www.example.com'), headers={"retry-after": "0.3"}), - body=None) ] ), ], @@ -188,6 +183,23 @@ def test_retriable_openai_error_handle_with_header( ] mock_sleep.assert_has_calls(expected_calls) + def test_unprocessable_entity_error(self, mocker: MockerFixture): + unprocessable_entity_error = UnprocessableEntityError( + "Something went wrong", response=httpx.Response( + 422, request=httpx.Request('GET', 'https://www.example.com')), body=None) + rate_limit_error = RateLimitError("Something went wrong", response=httpx.Response( + 429, request=httpx.Request('GET', 'https://www.example.com'), headers={"retry-after": "0.3"}), + body=None) + # for below exception sequence, "consecutive_422_error_count" changes: 0 -> 1 -> 0 -> 1 -> 2. + exception_sequence = [ + unprocessable_entity_error, rate_limit_error, unprocessable_entity_error, unprocessable_entity_error] + patched_test_method = mocker.patch("promptflow.tools.aoai.AzureOpenAI.chat", side_effect=exception_sequence) + # limit api connection error retry threshold to 2. 
+ decorated_test_method = handle_openai_error(unprocessable_entity_error_tries=2)(patched_test_method) + with pytest.raises(ExceedMaxRetryTimes): + decorated_test_method() + assert patched_test_method.call_count == 4 + @pytest.mark.parametrize( "dummyExceptionList", [ @@ -197,8 +209,6 @@ def test_retriable_openai_error_handle_with_header( body=None), BadRequestError("Something went wrong", response=httpx.get('https://www.example.com'), body=None), - APIConnectionError(message="Something went wrong", - request=httpx.Request('GET', 'https://www.example.com')), ] ), ],