[Evaluation] Fix red team status tracking, cache key mismatch, and evaluation error handling (#45517)

slister1001 · Copilot · web-flow · commit 7455f91452ca · 2026-03-09T18:00:35.000-04:00
* Fix red team status tracking, cache key mismatch, and evaluation error handling

Bug 1 - Status tracking: _determine_run_status now treats 'pending' and
'running' entries as 'failed' instead of 'in_progress'. By the time this
method runs the scan is finished, so leftover 'pending' entries (from
skipped risk categories or Foundry execution failures) indicate failure,
not ongoing work.

Bug 2 - Cache key mismatch: _execute_attacks_with_foundry now uses
get_attack_objective_from_risk_category() to build the cache lookup key,
matching the caching logic in _get_attack_objectives. Previously,
ungrounded_attributes objectives were cached under 'isa' but looked up
under 'ungrounded_attributes', causing them to be silently skipped.

Bug 3 - Evaluation error handling: RAIServiceScorer now detects when the
RAI evaluation service returns an error response (properties.outcome ==
'error', e.g. ServiceInvocationException) and raises RuntimeError. This
causes PyRIT to treat the score as UNDETERMINED instead of using the
erroneous passed=False to incorrectly mark the attack as successful,
which was inflating the attack success rate (ASR).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add changelog entries for status tracking, cache key, and scoring fixes

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -4,6 +4,9 @@
 
 ### Bugs Fixed
 
+- Fixed red team scan status stuck at `in_progress` in results.json despite the scan completing, by treating leftover `pending` and `running` entries as `failed`.
+- Fixed `ungrounded_attributes` risk category being silently skipped due to a cache key mismatch (`isa` vs `ungrounded_attributes`) in the Foundry execution path.
+- Fixed RAI evaluation service errors (`ServiceInvocationException`) incorrectly inflating the attack success rate; error responses are now treated as undetermined instead of being counted as attack successes.
 - Fixed Foundry red team double-evaluation that caused ~3x scan latency by removing redundant evaluation_processor.evaluate() call.
 - Fixed lost red team attack details where baseline overwrites wiped evaluation results.
 - Fixed incorrect per_testing_criteria that showed attack strategies alongside risk categories.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_rai_scorer.py
@@ -183,6 +183,8 @@ async def _score_piece_async(
             result_label = ""
             passed = None
 
+            matched_result_dict = None
+
             if hasattr(eval_result, "results") or (isinstance(eval_result, dict) and "results" in eval_result):
                 results = eval_result.results if hasattr(eval_result, "results") else eval_result.get("results", [])
                 results = results or []
@@ -197,8 +199,22 @@ async def _score_piece_async(
                         threshold = result_dict.get("threshold")
                         passed = result_dict.get("passed")
                         result_label = result_dict.get("label") or ""
+                        matched_result_dict = result_dict
                         break
 
+            # Check for evaluation service errors (e.g. ServiceInvocationException).
+            # These return score=0.0 and passed=False but with properties.outcome="error",
+            # meaning the evaluation didn't actually run.  Raising here lets the existing
+            # except block re-raise so PyRIT marks the score as UNDETERMINED.
+            if matched_result_dict is not None:
+                result_properties = matched_result_dict.get("properties", {})
+                if isinstance(result_properties, dict) and result_properties.get("outcome") == "error":
+                    error_detail = result_properties.get("error", "Unknown evaluation error")
+                    raise RuntimeError(
+                        f"RAI evaluation service returned an error for {metric_name}: {error_detail}. "
+                        f"Score will be treated as undetermined."
+                    )
+
             if raw_score is None:
                 self.logger.warning(f"No matching result found for metric '{metric_name}' in evaluation response.")
                 raw_score = 0
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_red_team.py
@@ -1750,7 +1750,7 @@ async def _execute_attacks_with_foundry(
                 objectives_by_risk[risk_value] = []
 
                 # Get baseline objectives for this risk category from cache
-                baseline_key = ((risk_value,), "baseline")
+                baseline_key = ((get_attack_objective_from_risk_category(risk_category).lower(),), "baseline")
                 self.logger.debug(f"Looking for baseline_key: {baseline_key}")
                 self.logger.debug(f"Available keys in attack_objectives: {list(self.attack_objectives.keys())}")
                 if baseline_key in self.attack_objectives:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
@@ -1493,7 +1493,10 @@ def _determine_run_status(
     ) -> str:
         """Determine the run-level status based on red team info status values."""
 
-        # Check if any tasks are still incomplete/failed
+        # Check if any tasks are incomplete/failed/were never executed.
+        # By the time this method is called the scan is finished, so "pending"
+        # (category was skipped or never ran) and "running" are also terminal
+        # failures rather than signs of ongoing work.
         if isinstance(red_team_info, dict):
             for risk_data in red_team_info.values():
                 if not isinstance(risk_data, dict):
@@ -1502,10 +1505,8 @@ def _determine_run_status(
                     if not isinstance(details, dict):
                         continue
                     status = details.get("status", "").lower()
-                    if status in ("incomplete", "failed", "timeout"):
+                    if status in ("incomplete", "failed", "timeout", "pending", "running"):
                         return "failed"
-                    elif status in ("running", "pending"):
-                        return "in_progress"
 
         return "completed"