Skip to content

Commit 7455f91

Browse files
slister1001 and Copilot authored
[Evaluation] Fix red team status tracking, cache key mismatch, and evaluation error handling (#45517)
* Fix red team status tracking, cache key mismatch, and evaluation error handling

  Bug 1 - Status tracking: _determine_run_status now treats 'pending' and 'running' entries as 'failed' instead of 'in_progress'. By the time this method runs the scan is finished, so leftover 'pending' entries (from skipped risk categories or Foundry execution failures) indicate failure, not ongoing work.

  Bug 2 - Cache key mismatch: _execute_attacks_with_foundry now uses get_attack_objective_from_risk_category() to build the cache lookup key, matching the caching logic in _get_attack_objectives. Previously, ungrounded_attributes objectives were cached under 'isa' but looked up under 'ungrounded_attributes', causing them to be silently skipped.

  Bug 3 - Evaluation error handling: RAIServiceScorer now detects when the RAI evaluation service returns an error response (properties.outcome == 'error', e.g. ServiceInvocationException) and raises RuntimeError. This causes PyRIT to treat the score as UNDETERMINED instead of using the erroneous passed=False to incorrectly mark the attack as successful, which was inflating ASR.

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add changelog entries for status tracking, cache key, and scoring fixes

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 2d0a9aa commit 7455f91

4 files changed

Lines changed: 25 additions & 5 deletions

File tree

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,9 @@
44

55
### Bugs Fixed
66

7+
- Fixed red team scan status stuck at `in_progress` in results.json despite the scan completing, by treating leftover `pending` entries as `failed`.
8+
- Fixed `ungrounded_attributes` risk category being silently skipped due to a cache key mismatch (`isa` vs `ungrounded_attributes`) in the Foundry execution path.
9+
- Fixed RAI evaluation service errors (`ServiceInvocationException`) incorrectly inflating attack success rate; error responses are now treated as undetermined instead of as attack success.
710
- Fixed Foundry red team double-evaluation that caused ~3x scan latency by removing redundant evaluation_processor.evaluate() call.
811
- Fixed lost red team attack details where baseline overwrites wiped evaluation results.
912
- Fixed incorrect per_testing_criteria that showed attack strategies alongside risk categories.

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -183,6 +183,8 @@ async def _score_piece_async(
183183
result_label = ""
184184
passed = None
185185

186+
matched_result_dict = None
187+
186188
if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
187189
results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])
188190
results = results or []
@@ -197,8 +199,22 @@ async def _score_piece_async(
197199
threshold = result_dict.get("threshold")
198200
passed = result_dict.get("passed")
199201
result_label = result_dict.get("label") or ""
202+
matched_result_dict = result_dict
200203
break
201204

205+
# Check for evaluation service errors (e.g. ServiceInvocationException).
206+
# These return score=0.0 and passed=False but with properties.outcome="error",
207+
# meaning the evaluation didn't actually run. Raising here lets the existing
208+
# except block re-raise so PyRIT marks the score as UNDETERMINED.
209+
if matched_result_dict is not None:
210+
result_properties = matched_result_dict.get("properties", {})
211+
if isinstance(result_properties, dict) and result_properties.get("outcome") == "error":
212+
error_detail = result_properties.get("error", "Unknown evaluation error")
213+
raise RuntimeError(
214+
f"RAI evaluation service returned an error for {metric_name}: {error_detail}. "
215+
f"Score will be treated as undetermined."
216+
)
217+
202218
if raw_score is None:
203219
self.logger.warning(f"No matching result found for metric '{metric_name}' in evaluation response.")
204220
raw_score = 0

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1750,7 +1750,7 @@ async def _execute_attacks_with_foundry(
17501750
objectives_by_risk[risk_value] = []
17511751

17521752
# Get baseline objectives for this risk category from cache
1753-
baseline_key = ((risk_value,), "baseline")
1753+
baseline_key = ((get_attack_objective_from_risk_category(risk_category).lower(),), "baseline")
17541754
self.logger.debug(f"Looking for baseline_key: {baseline_key}")
17551755
self.logger.debug(f"Available keys in attack_objectives: {list(self.attack_objectives.keys())}")
17561756
if baseline_key in self.attack_objectives:

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1493,7 +1493,10 @@ def _determine_run_status(
14931493
) -> str:
14941494
"""Determine the run-level status based on red team info status values."""
14951495

1496-
# Check if any tasks are still incomplete/failed
1496+
# Check if any tasks are incomplete/failed/were never executed.
1497+
# By the time this method is called the scan is finished, so "pending"
1498+
# (category was skipped or never ran) and "running" are also terminal
1499+
# failures rather than signs of ongoing work.
14971500
if isinstance(red_team_info, dict):
14981501
for risk_data in red_team_info.values():
14991502
if not isinstance(risk_data, dict):
@@ -1502,10 +1505,8 @@ def _determine_run_status(
15021505
if not isinstance(details, dict):
15031506
continue
15041507
status = details.get("status", "").lower()
1505-
if status in ("incomplete", "failed", "timeout"):
1508+
if status in ("incomplete", "failed", "timeout", "pending", "running"):
15061509
return "failed"
1507-
elif status in ("running", "pending"):
1508-
return "in_progress"
15091510

15101511
return "completed"
15111512

0 commit comments

Comments
 (0)