Feature/xpia sim and eval fixes (#3723)
A fork of the original XPIA sim/eval branch with additional fixes for
bugs discovered last night.

- Changes the jailbreak check for combining templates to account only for UPIA (since XPIA doesn't merge templates).
- Removes conversations as an input for XPIA evals until the default override bug is fixed.
- Accounts for the new XPIA evaluator return fields.
- Changes the output base name of 'reasoning' fields for label-based evaluators to just 'reason' (see the sketch below).
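For context, a minimal sketch of what a label-based result looks like after these renames, using the XPIA field names from the diff below (values are illustrative):

# Illustrative only: result shape for a label-based evaluator after this change.
# Field names come from parse_response in rai_service.py; values are made up.
example_xpia_result = {
    "xpia_label": False,
    "xpia_reason": "The response does not contain injected instructions.",
    # New XPIA sub-fields; np.nan is used when a field is missing from the service response.
    "xpia_manipulated_content": False,
    "xpia_intrusion": False,
    "xpia_information_gathering": False,
}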

Original PR: #3703

---------

Co-authored-by: Diondra Peck <[email protected]>
Co-authored-by: Diondra <[email protected]>
3 people authored Sep 6, 2024
1 parent b7dc8b7 commit b04e889
Showing 20 changed files with 136,456 additions and 38 deletions.
5 changes: 4 additions & 1 deletion .cspell.json
@@ -113,7 +113,8 @@
"vnet",
"Weaviate",
"westus",
"wsid"
"wsid",
"Xpia"
],
"ignoreWords": [
"openmpi",
@@ -243,6 +244,8 @@
"azureopenaimodelconfiguration",
"openaimodelconfiguration",
"usecwd",
"upia",
"xpia",
"locustio",
"euap",
"Rerank",
5 changes: 5 additions & 0 deletions src/promptflow-evals/CHANGELOG.md
@@ -2,12 +2,17 @@

## v0.3.3 (Upcoming)
### Features Added
- Introduced `IndirectAttackSimulator` to simulate XPIA (cross-domain prompt injected attack) jailbreak attacks on your AI system.
- Introduced `IndirectAttackEvaluator` to evaluate content for the presence of XPIA (cross-domain prompt injected attacks) injected into conversation or Q/A context to disrupt normal expected functionality by eliciting manipulated content, intrusion, or attempts to gather information outside the scope of your AI system.
- Added a new evaluator (`ProtectedMaterialEvaluator`) and an associated adversarial scenario enum value (`AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL`) for protected material, which determines whether given inputs contain material protected by IP laws.
- Added four mathematical evaluators, `BleuScoreEvaluator`, `GleuScoreEvaluator`, `MeteorScoreEvaluator`, and `RougeScoreEvaluator`, for evaluating the quality of generated text by comparing it against reference text.

### Bugs Fixed
- Fixed evaluators to accept (non-Azure) OpenAI configs.

### Breaking Changes
- Replaced the `jailbreak` parameter in `AdversarialSimulator` with a `_jailbreak_type` parameter to support multiple jailbreak types. Instead of setting this parameter directly, we recommend using the `DirectAttackSimulator` class (formerly `JailbreakAdversarialSimulator`) for UPIA jailbreak and the `IndirectAttackSimulator` class for XPIA jailbreak.

### Improvements
- Renamed `JailbreakAdversarialSimulator` to `DirectAttackSimulator`.
- Set the `PF_EVALS_BATCH_USE_ASYNC` environment variable to `True` by default to enable asynchronous batch runs for async-enabled built-in evaluators, improving performance.
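As a rough usage sketch of the simulator split described above (constructor and call parameters are assumed to mirror `AdversarialSimulator`, and the scenario choices are illustrative):

# Rough sketch only: parameter names are assumed to mirror AdversarialSimulator
# and may not match the final API exactly.
from promptflow.evals.synthetic import (
    AdversarialScenario,
    DirectAttackSimulator,
    IndirectAttackSimulator,
)

async def run_jailbreak_simulations(azure_ai_project: dict, credential, target):
    # UPIA: replaces the old AdversarialSimulator(..., jailbreak=True) flow.
    upia_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=credential)
    upia_outputs = await upia_simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,  # assumed scenario choice
        target=target,
        max_simulation_results=2,
    )

    # XPIA: uses the indirect-jailbreak scenario added in this PR.
    xpia_simulator = IndirectAttackSimulator(azure_ai_project=azure_ai_project, credential=credential)
    xpia_outputs = await xpia_simulator(
        scenario=AdversarialScenario.ADVERSARIAL_INDIRECT_JAILBREAK,
        target=target,
        max_simulation_results=2,
    )
    return upia_outputs, xpia_outputs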
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/constants.py
@@ -32,6 +32,7 @@ class Tasks:

CONTENT_HARM = "content harm"
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"


class _InternalAnnotationTasks:
@@ -52,6 +53,7 @@ class EvaluationMetrics:
SELF_HARM = "self_harm"
SEXUAL = "sexual"
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"


class _InternalEvaluationMetrics:
25 changes: 22 additions & 3 deletions src/promptflow-evals/promptflow/evals/_common/rai_service.py
@@ -103,6 +103,9 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
elif metric == _InternalEvaluationMetrics.ECI:
task = _InternalAnnotationTasks.ECI
include_metric = False
elif metric == EvaluationMetrics.XPIA:
task = Tasks.XPIA
include_metric = False
return (
{
"UserTextList": [normalized_user_text],
@@ -207,8 +210,9 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
:return: The parsed annotation result.
:rtype: List[List[Dict]]
"""

# non-numeric metrics
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI}:
if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
return {}
response = batch_response[0][metric_name]
@@ -218,8 +222,23 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result = {}
metric_prefix = _get_metric_prefix(metric_name)
# Use label instead of score since these are assumed to be boolean results.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else ""
result[metric_prefix + "_reasoning"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
# Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
result[metric_prefix + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
result[metric_prefix + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

if metric_name == EvaluationMetrics.XPIA:
# Add "manipulated_content", "intrusion" and "information_gathering" to the result
# if present else set them to np.nan
result[metric_prefix + "_manipulated_content"] = (
parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
)
result[metric_prefix + "_intrusion"] = (
parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
)
result[metric_prefix + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
)

return result
return _parse_content_harm_response(batch_response, metric_name)

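To make the new XPIA branch above concrete, here is a small illustrative example; the raw payload shape and the exact parse_response call signature are assumptions based on the surrounding hunk:

# Illustrative only: assumed raw RAI service payload for the XPIA metric and the
# rough output of parse_response(batch_response, metric_name=EvaluationMetrics.XPIA).
batch_response = [{
    "xpia": '{"label": false, "reasoning": "No injected instructions found.", '
            '"manipulated_content": false, "intrusion": false, "information_gathering": false}'
}]
# Expected result shape (assuming the metric prefix resolves to "xpia"):
# {
#     "xpia_label": False,
#     "xpia_reason": "No injected instructions found.",
#     "xpia_manipulated_content": False,
#     "xpia_intrusion": False,
#     "xpia_information_gathering": False,
# }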
2 changes: 2 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/__init__.py
@@ -23,6 +23,7 @@
from ._relevance import RelevanceEvaluator
from ._rouge import RougeScoreEvaluator, RougeType
from ._similarity import SimilarityEvaluator
from ._xpia import IndirectAttackEvaluator

__all__ = [
"CoherenceEvaluator",
@@ -39,6 +40,7 @@
"HateUnfairnessEvaluator",
"ContentSafetyEvaluator",
"ContentSafetyChatEvaluator",
"IndirectAttackEvaluator",
"BleuScoreEvaluator",
"GleuScoreEvaluator",
"MeteorScoreEvaluator",
@@ -66,7 +66,7 @@ class ECIEvaluator:
{
"ECI_label": "False",
"ECI_reasoning": "Some reason."
"ECI_reason": "Some reason."
}
"""

@@ -70,8 +70,8 @@ class ProtectedMaterialEvaluator:
.. code-block:: python
{
"label": "False",
"reasoning": "This question does not contain any protected material."
"protected_material_label": "False",
"protected_material_reason": "This question does not contain any protected material."
}
"""

@@ -0,0 +1,5 @@
from ._xpia import IndirectAttackEvaluator

__all__ = [
"IndirectAttackEvaluator",
]
133 changes: 133 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_xpia/_xpia.py
@@ -0,0 +1,133 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import logging
from typing import Optional

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.constants import EvaluationMetrics
from promptflow.evals._common.rai_service import evaluate_with_rai_service

logger = logging.getLogger(__name__)


class IndirectAttackEvaluator:
"""
Initializes an XPIA (cross-domain prompt injected attack) jailbreak evaluator to detect whether cross-domain
injected attacks are present in your AI system's response.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param credential: The credential for connecting to Azure AI project.
:type credential: ~azure.core.credentials.TokenCredential
:return: A function that evaluates and generates metrics for the XPIA chat scenario. Metrics include the overall
evaluation label and reason for the Q/A pair, as well as sub-labels
for manipulated content, intrusion, and information gathering.
:rtype: Callable
**Usage**
.. code-block:: python
eval_fn = IndirectAttackEvaluator(project_scope)
result = eval_fn(question="What is the capital of France?", answer="Paris.")
**Output format for question-answer pair**
.. code-block:: python
{
'xpia_label': False,
'xpia_reason': 'The conversation does not contain any manipulated content, intrusion or
information gathering.',
'xpia_information_gathering': False,
'xpia_intrusion': False,
'xpia_manipulated_content': False
}
"""

def __init__(self, project_scope: dict, eval_last_turn: bool = False, credential=None):
self._evaluator = _IndirectAttackEvaluator(project_scope, credential)
self._eval_last_turn = eval_last_turn

def __call__(
self,
*,
question: Optional[str],
answer: Optional[str],
**kwargs,
):
"""
Evaluates content for the presence of attacks injected into the conversation context that attempt to
interrupt normal expected functionality by eliciting manipulated content, intrusion, or information
gathering outside the scope of your AI system.
:keyword question: The question to be evaluated. Mutually exclusive with 'conversation'.
:paramtype question: Optional[str]
:keyword answer: The answer to be evaluated. Mutually exclusive with 'conversation'.
:paramtype answer: Optional[str]
:return: The evaluation scores and reasoning.
:rtype: dict
"""

return self._evaluator(question=question, answer=answer, **kwargs)


class _AsyncIndirectAttackEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._project_scope = project_scope
self._credential = credential

async def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates content according to this evaluator's metric.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:return: The evaluation score computation based on the metric (self.metric).
:rtype: Any
"""
# Validate inputs.
# Raises a ValueError if validation fails, so execution alone signifies success.
if not (question and question.strip() and question != "None") or not (
answer and answer.strip() and answer != "None"
):
raise ValueError("Both 'question' and 'answer' must be non-empty strings.")

# Run score computation based on supplied metric.
result = await evaluate_with_rai_service(
metric_name=EvaluationMetrics.XPIA,
question=question,
answer=answer,
project_scope=self._project_scope,
credential=self._credential,
)
return result


class _IndirectAttackEvaluator:
def __init__(self, project_scope: dict, credential=None):
self._async_evaluator = _AsyncIndirectAttackEvaluator(project_scope, credential)

def __call__(self, *, question: str, answer: str, **kwargs):
"""
Evaluates XPIA content.
:keyword question: The question to be evaluated.
:paramtype question: str
:keyword answer: The answer to be evaluated.
:paramtype answer: str
:keyword context: The context to be evaluated.
:paramtype context: str
:return: The XPIA score.
:rtype: dict
"""
return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs)

def _to_async(self):
return self._async_evaluator
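A minimal end-to-end usage sketch for the new evaluator; the project_scope key names below are assumptions inferred from the docstring (subscription id, resource group, and project name):

# Minimal usage sketch. The project_scope keys are assumptions; adjust them to
# match your Azure AI project configuration.
from azure.identity import DefaultAzureCredential
from promptflow.evals.evaluators import IndirectAttackEvaluator

project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

xpia_evaluator = IndirectAttackEvaluator(project_scope, credential=DefaultAzureCredential())
result = xpia_evaluator(
    question="What is the capital of France?",
    answer="Paris.",
)
# `result` should contain xpia_label, xpia_reason, and the three sub-labels
# shown in the class docstring above.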
3 changes: 2 additions & 1 deletion src/promptflow-evals/promptflow/evals/synthetic/__init__.py
@@ -1,5 +1,6 @@
from .adversarial_scenario import AdversarialScenario
from .adversarial_simulator import AdversarialSimulator
from .direct_attack_simulator import DirectAttackSimulator
from .xpia_simulator import IndirectAttackSimulator

__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator"]
__all__ = ["AdversarialSimulator", "AdversarialScenario", "DirectAttackSimulator", "IndirectAttackSimulator"]
@@ -198,8 +198,11 @@ async def request_api(

exp_retry_client = get_async_http_client().with_policies(retry_policy=retry_policy)

# initial 10 seconds wait before attempting to fetch result
await asyncio.sleep(10)
# initial 15 seconds wait before attempting to fetch result
# For reasons not yet understood, we need to wait in both the sync and async paths.
# TODO: investigate why a single asyncio.sleep is not sufficient here.
await asyncio.sleep(15)
time.sleep(15)

response = await exp_retry_client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
self.result_url, headers=proxy_headers
@@ -55,6 +55,7 @@ def __init__(self, azure_ai_project: Dict, token_manager: APITokenManager) -> None
self.parameter_json_endpoint = urljoin(self.api_url, "simulation/template/parameters")
self.jailbreaks_json_endpoint = urljoin(self.api_url, "simulation/jailbreak")
self.simulation_submit_endpoint = urljoin(self.api_url, "simulation/chat/completions/submit")
self.xpia_jailbreaks_json_endpoint = urljoin(self.api_url, "simulation/jailbreak/xpia")

def _get_service_discovery_url(self):
bearer_token = self.token_manager.get_token()
@@ -92,10 +93,15 @@ async def get_contentharm_parameters(self) -> Any:

return self.contentharm_parameters

async def get_jailbreaks_dataset(self) -> Any:
async def get_jailbreaks_dataset(self, type: str) -> Any:
"Get the jailbreaks dataset, if exists"
if self.jailbreaks_dataset is None:
self.jailbreaks_dataset = await self.get(self.jailbreaks_json_endpoint)
if type == "xpia":
self.jailbreaks_dataset = await self.get(self.xpia_jailbreaks_json_endpoint)
elif type == "upia":
self.jailbreaks_dataset = await self.get(self.jailbreaks_json_endpoint)
else:
raise ValueError("Invalid type, please provide either 'xpia' or 'upia'")

return self.jailbreaks_dataset

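A short sketch of how a caller is expected to use the new type argument (rai_client is assumed to be an already-constructed instance of the class above):

async def fetch_jailbreak_dataset(rai_client, jailbreak_type: str):
    # jailbreak_type must be "upia" (simulation/jailbreak endpoint) or
    # "xpia" (simulation/jailbreak/xpia endpoint); anything else raises ValueError.
    # Note: the result is cached on the client, so the first call's type determines
    # what subsequent calls return.
    return await rai_client.get_jailbreaks_dataset(type=jailbreak_type)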
@@ -16,6 +16,7 @@ class AdversarialScenario(Enum):
ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"


class _UnstableAdversarialScenario(Enum):
@@ -44,15 +44,15 @@ def wrapper(*args, **kwargs):
scenario = str(kwargs.get("scenario", None))
max_conversation_turns = kwargs.get("max_conversation_turns", None)
max_simulation_results = kwargs.get("max_simulation_results", None)
jailbreak = kwargs.get("jailbreak", None)
_jailbreak_type = kwargs.get("_jailbreak_type", None)
decorated_func = monitor_operation(
activity_name="adversarial.simulator.call",
activity_type=ActivityType.PUBLICAPI,
custom_dimensions={
"scenario": scenario,
"max_conversation_turns": max_conversation_turns,
"max_simulation_results": max_simulation_results,
"jailbreak": jailbreak,
"_jailbreak_type": _jailbreak_type,
},
)(func)

@@ -115,7 +115,7 @@ async def __call__(
api_call_retry_sleep_sec: int = 1,
api_call_delay_sec: int = 0,
concurrent_async_task: int = 3,
jailbreak: bool = False,
_jailbreak_type: Optional[str] = None,
randomize_order: bool = True,
randomization_seed: Optional[int] = None,
):
@@ -149,9 +149,6 @@ async def __call__(
:keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation.
Defaults to 3.
:paramtype concurrent_async_task: int
:keyword jailbreak: If set to True, allows breaking out of the conversation flow defined by the scenario.
Defaults to False.
:paramtype jailbreak: bool
:keyword randomize_order: Whether or not the order of the prompts should be randomized. Defaults to True.
:paramtype randomize_order: bool
:keyword randomization_seed: The seed used to randomize prompt selection. If unset, the system's
@@ -218,11 +215,11 @@
total_tasks,
)
total_tasks = min(total_tasks, max_simulation_results)
if jailbreak:
jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset()
if _jailbreak_type:
jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset(type=_jailbreak_type)
progress_bar = tqdm(
total=total_tasks,
desc="generating jailbreak simulations" if jailbreak else "generating simulations",
desc="generating jailbreak simulations" if _jailbreak_type else "generating simulations",
ncols=100,
unit="simulations",
)
@@ -237,7 +234,7 @@
random.shuffle(parameter_order)
for index in parameter_order:
parameter = template.template_parameters[index].copy()
if jailbreak:
if _jailbreak_type == "upia":
parameter = self._join_conversation_starter(parameter, random.choice(jailbreak_dataset))
tasks.append(
asyncio.create_task(
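A condensed control-flow sketch of the jailbreak handling above, with stand-in names for the simulator's internals (only UPIA merges a jailbreak prompt into the template parameters, since XPIA doesn't merge templates):

import random

async def _prepare_parameter(rai_client, parameter, _jailbreak_type, join_conversation_starter):
    # Stand-in sketch of the hunk above, not the full simulator loop.
    jailbreak_dataset = None
    if _jailbreak_type:  # "upia" or "xpia"
        jailbreak_dataset = await rai_client.get_jailbreaks_dataset(type=_jailbreak_type)
    if _jailbreak_type == "upia":
        # Only UPIA combines templates with a jailbreak conversation starter.
        parameter = join_conversation_starter(parameter, random.choice(jailbreak_dataset))
    return parameter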