api-debug-env / scenarios.py
yadnyeshkolte's picture
chore: remove __pycache__ files
8b10144
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Scenario definitions for the API Integration Debugging Environment.
Each scenario models a realistic multi-service API ecosystem with:
- Service dependency graphs (upstream/downstream relationships)
- Cascading failures (upstream bugs propagate downstream)
- Dynamic logs that update when issues are fixed
- Expanded issue pools for seed-based random subset selection
"""
import random
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
@dataclass
class Issue:
    """One fault injected into an API integration scenario.

    The first six fields are required; the remaining fields describe how the
    issue interacts with other issues and services and all have defaults.
    """

    # Identifier other issues use in ``depends_on`` to refer to this one.
    issue_id: str
    # Name of the service this issue is attributed to.
    service: str
    # Human-readable summary of the fault.
    description: str
    # Expected corrected values, keyed by config path.
    expected_fix: Dict[str, Any]
    # The key in the config that needs fixing.
    fix_key: str
    # Log line that hints at this issue.
    log_hint: str

    # --- Fields for cascading failures ---

    # Issues that must be fixed before this one can be diagnosed.
    depends_on: List[str] = field(default_factory=list)
    # service -> error message caused by this issue being unfixed.
    cascade_effects: Dict[str, str] = field(default_factory=dict)
    # Issue category: configuration, authentication, networking, protocol.
    category: str = "configuration"
    # Severity: error, warning, critical.
    severity: str = "error"
    # Detailed explanation of why this issue occurs
    # (for grading diagnosis quality).
    root_cause_explanation: str = ""
@dataclass
class ServiceNode:
    """One service in the dependency graph of a scenario."""

    # Name of the service this node represents.
    name: str
    # Services this one calls (upstream dependencies).
    depends_on: List[str] = field(default_factory=list)
    # One of: healthy, degraded, error, unreachable.
    health_status: str = "degraded"
@dataclass
class Scenario:
    """A complete API debugging scenario with dependency graph.

    The first eight fields are required and describe the task itself; the
    remaining fields are optional metadata that default to empty values.
    """

    # Task identifier ('easy', 'medium' or 'hard' for the built-in scenarios).
    task_id: str
    # Difficulty label for the scenario.
    difficulty: str
    # Task description for the scenario.
    description: str
    # Step budget for the scenario.
    max_steps: int
    # Names of all services taking part in the scenario.
    services: List[str]
    # service name -> that service's configuration.
    configs: Dict[str, Dict[str, Any]]
    # service name -> log lines for that service.
    logs: Dict[str, List[str]]
    # Issues that are active in this scenario.
    issues: List[Issue]

    # --- Optional metadata ---

    # Service dependency graph, keyed by service name.
    service_graph: Dict[str, ServiceNode] = field(default_factory=dict)
    # issue_id -> {service: [new logs when that issue is fixed]}.
    # NOTE: the outer key is the issue id, the inner key the service name;
    # this is the shape every scenario builder in this module produces.
    dynamic_logs: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
    # Optimal order to fix issues (for strategy scoring).
    optimal_fix_order: List[str] = field(default_factory=list)
    # Additional scenario context for the agent.
    context: str = ""
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
    """
    Build the scenario registered under ``task_id``.

    Args:
        task_id: One of 'easy', 'medium', 'hard'.
        seed: Optional seed for deterministic but varied scenarios.
            When provided, selects a random subset of issues from the pool
            and randomizes log order. When None, returns the canonical
            scenario.

    Returns:
        The scenario produced by the matching builder.

    Raises:
        ValueError: If ``task_id`` is not a known task.
    """
    builders = {
        "easy": _easy_scenario,
        "medium": _medium_scenario,
        "hard": _hard_scenario,
    }
    build = builders.get(task_id)
    if build is None:
        raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(builders.keys())}")
    return build(seed=seed)
def get_all_task_ids() -> List[str]:
    """List the identifiers of every available task, easiest first."""
    task_ids = ("easy", "medium", "hard")
    # Hand out a fresh list on every call.
    return list(task_ids)
def _select_issues(pool: List[Issue], count: int, rng: random.Random) -> List[Issue]:
    """
    Select a random subset of issues from a pool, respecting dependencies.

    The result is dependency-closed: whenever an issue is selected, every
    issue it (transitively) depends on and that exists in ``pool`` is selected
    as well. A candidate whose missing dependency chain does not fit into the
    remaining slots is skipped instead of being added and truncated later,
    so a dependent can never end up in the result without its prerequisites.

    Args:
        pool: All candidate issues.
        count: Maximum number of issues to return. If it is at least
            ``len(pool)``, the whole pool is returned.
        rng: Random source; drives both the picks and the final ordering.

    Returns:
        At most ``count`` issues in random order. Fewer than ``count`` are
        returned only when no remaining candidate fits together with its
        prerequisites.
    """
    if count >= len(pool):
        selected = list(pool)
    else:
        by_id = {issue.issue_id: issue for issue in pool}
        available = list(pool)
        selected = []
        chosen_ids = set()

        def missing_chain(issue):
            """Return ``issue`` preceded by its not-yet-chosen prerequisites."""
            chain = []
            seen = set(chosen_ids)

            def visit(node):
                if node.issue_id in seen:
                    return
                seen.add(node.issue_id)
                for dep_id in node.depends_on:
                    dep = by_id.get(dep_id)
                    # Dependencies that are not part of the pool are ignored.
                    if dep is not None:
                        visit(dep)
                chain.append(node)

            visit(issue)
            return chain

        while len(selected) < count and available:
            candidate = rng.choice(available)
            available.remove(candidate)

            chain = missing_chain(candidate)
            if len(selected) + len(chain) > count:
                # Taking this candidate would force us to drop one of its
                # prerequisites afterwards; leave it out entirely.
                continue

            selected.extend(chain)
            chosen_ids.update(member.issue_id for member in chain)
            available = [
                issue for issue in available
                if issue.issue_id not in chosen_ids
            ]

    # Randomize the order in which the selected issues are presented.
    rng.shuffle(selected)
    return selected
def _randomize_scenario(scenario: Scenario, seed: int) -> Scenario:
    """
    Apply seed-based randomization to a scenario.

    Shuffles the log lines of every service and replaces the canonical dates
    ``2026-03-25`` / ``2026-03-24`` with a randomly drawn day and the day
    before it. The scenario is modified in place.

    Args:
        scenario: Scenario whose ``logs`` are randomized.
        seed: Seed for the random generator; the same seed always produces
            the same result for the same input logs.

    Returns:
        The same ``scenario`` object that was passed in.
    """
    rng = random.Random(seed)

    # Shuffle log entries for each service
    for service_logs in scenario.logs.values():
        rng.shuffle(service_logs)

    # These two draws used to seed a time-of-day offset that was never
    # applied. They are kept (here and inside the loop) purely so that
    # existing seeds keep producing the same dates as before.
    rng.randint(8, 16)
    rng.randint(0, 59)

    for service, log_list in scenario.logs.items():
        new_logs = []
        for log_line in log_list:
            rng.randint(1, 5)
            rng.randint(1, 5)
            if "2026-" in log_line:
                day = rng.randint(20, 28)
                replacements = {
                    "2026-03-25": f"2026-03-{day:02d}",
                    "2026-03-24": f"2026-03-{day - 1:02d}",
                }
                # Substitute both dates in a single pass. Chained
                # str.replace() calls would rewrite a freshly inserted
                # "2026-03-24" a second time whenever day == 24.
                log_line = re.sub(
                    r"2026-03-2[45]",
                    lambda match: replacements[match.group(0)],
                    log_line,
                )
            new_logs.append(log_line)
        scenario.logs[service] = new_logs
    return scenario
# ─── Easy Scenario ───────────────────────────────────────────────────────────
def _easy_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Easy: Payment API integration failures.

    Agent must diagnose auth + content-type issues with clear log signals.
    Issue pool has 4 possible issues; canonical scenario uses 2.

    The scenario starts from a fully healthy configuration and breaks only
    the settings that belong to the selected issues, so a seeded scenario
    never contains a fault that is not listed in ``issues``. Log lines that
    quote configuration values are rendered from the final configuration.

    Args:
        seed: When given, 2 issues are drawn from the pool with this seed and
            the logs are randomized; when None, the canonical scenario
            (auth + content type) is returned.

    Returns:
        The assembled easy scenario.
    """
    # Full issue pool (4 issues, canonical uses 2)
    issue_pool = [
        Issue(
            issue_id="easy_auth",
            service="payment_client",
            description="Missing Authorization header — payment gateway requires Bearer token authentication",
            expected_fix={"headers.Authorization": "Bearer <token>"},
            fix_key="headers.Authorization",
            log_hint="Missing or invalid Authorization header",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "The payment_client is missing the Authorization header entirely. "
                "The payment_gateway requires Bearer token auth on all /process requests. "
                "This results in HTTP 401 on every payment attempt."
            ),
            cascade_effects={
                "payment_gateway": "All requests from payment_client rejected with 401"
            },
        ),
        Issue(
            issue_id="easy_content_type",
            service="payment_client",
            description="Wrong Content-Type header (text/plain instead of application/json)",
            expected_fix={"headers.Content-Type": "application/json"},
            fix_key="headers.Content-Type",
            log_hint="Content-Type must be application/json",
            category="protocol",
            severity="error",
            root_cause_explanation=(
                "The payment_client sends Content-Type: text/plain, but the gateway "
                "only accepts application/json. This causes HTTP 415 Unsupported Media Type. "
                "The gateway cannot parse the request body."
            ),
            cascade_effects={
                "payment_gateway": "Request body parsing fails for payment_client requests"
            },
        ),
        Issue(
            issue_id="easy_timeout",
            service="payment_client",
            description="Timeout set too low (5s) for payment processing that takes 8-12s",
            expected_fix={"timeout": 30},
            fix_key="timeout",
            log_hint="Request timed out after 5s",
            category="networking",
            severity="error",
            root_cause_explanation=(
                "The payment_client has timeout=5s, but payment processing at the gateway "
                "takes 8-12s for fraud checks. Legitimate payments are timing out."
            ),
        ),
        Issue(
            issue_id="easy_base_url",
            service="payment_client",
            description="Base URL pointing to deprecated v1 endpoint instead of v2",
            expected_fix={"base_url": "https://api.paymentgateway.com/v2"},
            fix_key="base_url",
            log_hint="API v1 is deprecated",
            category="configuration",
            severity="warning",
            root_cause_explanation=(
                "The payment_client uses /v1 which is deprecated and returning 301 redirects. "
                "The gateway v2 endpoint has different request schemas, causing deserialization errors."
            ),
        ),
    ]

    # Select issues based on seed
    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 2, rng)
    else:
        issues = issue_pool[:2]  # Canonical: auth + content_type
    selected_ids = [issue.issue_id for issue in issues]

    # Healthy baseline configuration. It includes the Authorization header
    # so that the client only violates the gateway's required_headers when
    # easy_auth is actually one of the selected issues.
    configs = {
        "payment_client": {
            "base_url": "https://api.paymentgateway.com/v2",
            "headers": {
                "Authorization": "Bearer <token>",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            "timeout": 30,
            "retry_count": 3,
        },
        "payment_gateway": {
            "endpoint": "/process",
            "method": "POST",
            "required_headers": ["Authorization", "Content-Type"],
            "accepted_content_types": ["application/json"],
            "auth_scheme": "Bearer",
            "processing_time_ms": "8000-12000",
        },
    }

    # Apply broken config for each selected issue
    client_cfg = configs["payment_client"]
    if "easy_auth" in selected_ids:
        client_cfg["headers"].pop("Authorization", None)
    if "easy_content_type" in selected_ids:
        client_cfg["headers"]["Content-Type"] = "text/plain"
    if "easy_timeout" in selected_ids:
        client_cfg["timeout"] = 5
    if "easy_base_url" in selected_ids:
        client_cfg["base_url"] = "https://api.paymentgateway.com/v1"

    # Build logs based on selected issues. Values quoted in the logs come
    # from the final (possibly broken) config so logs and config agree.
    base_url = client_cfg["base_url"]
    content_type = client_cfg["headers"]["Content-Type"]
    accept = client_cfg["headers"]["Accept"]

    client_lines = {
        "easy_auth": [
            "[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
            "[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
            f"[WARN] 2026-03-25T10:15:22Z Request headers: Content-Type={content_type}, Accept={accept}",
        ],
        "easy_content_type": [
            "[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
            "[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
        ],
        "easy_timeout": [
            "[ERROR] 2026-03-25T10:15:30Z POST /process -> Request timed out after 5s",
            "[WARN] 2026-03-25T10:15:30Z Payment processing takes 8-12s for fraud verification",
        ],
        "easy_base_url": [
            "[ERROR] 2026-03-25T10:15:21Z GET /v1/status -> 301 Moved Permanently",
            "[WARN] 2026-03-25T10:15:21Z API v1 is deprecated, migrate to /v2",
        ],
    }
    gateway_lines = {
        "easy_auth": "[WARN] 2026-03-25T10:15:23Z Rejected request: no Authorization header present",
        "easy_content_type": "[WARN] 2026-03-25T10:15:24Z Rejected request: unsupported Content-Type 'text/plain'",
        "easy_timeout": "[INFO] 2026-03-25T10:15:30Z Processing payment... estimated time: 10s",
        "easy_base_url": "[WARN] 2026-03-25T10:15:21Z Deprecated v1 endpoint accessed",
    }

    client_logs = [
        f"[INFO] 2026-03-25T10:15:20Z Payment client initialized with base_url={base_url}",
    ]
    gateway_logs = [
        "[INFO] 2026-03-25T10:15:20Z Gateway ready, accepting application/json with Bearer auth",
    ]
    # Lines are appended in issue order; seeded runs shuffle them afterwards.
    for issue_id in selected_ids:
        client_logs.extend(client_lines.get(issue_id, []))
        if issue_id in gateway_lines:
            gateway_logs.append(gateway_lines[issue_id])

    # Dynamic logs: what changes after fixing each issue
    all_dynamic_logs = {
        "easy_auth": {
            "payment_client": ["[INFO] Authorization header set. Retrying request..."],
            "payment_gateway": ["[INFO] Authentication successful for payment_client"],
        },
        "easy_content_type": {
            "payment_client": ["[INFO] Content-Type set to application/json. Request body parsed."],
            "payment_gateway": ["[INFO] Request body parsed successfully as JSON"],
        },
        "easy_timeout": {
            "payment_client": ["[INFO] Timeout increased to 30s. Payment processing completing normally."],
        },
        "easy_base_url": {
            "payment_client": ["[INFO] Migrated to v2 API endpoint. Requests routing correctly."],
        },
    }
    # Only the selected issues get an entry.
    dynamic_logs = {
        issue_id: all_dynamic_logs[issue_id]
        for issue_id in selected_ids
        if issue_id in all_dynamic_logs
    }

    # Service dependency graph
    service_graph = {
        "payment_client": ServiceNode(
            name="payment_client",
            depends_on=["payment_gateway"],
            health_status="error",
        ),
        "payment_gateway": ServiceNode(
            name="payment_gateway",
            depends_on=[],
            health_status="healthy",
        ),
    }

    scenario = Scenario(
        task_id="easy",
        difficulty="easy",
        description=(
            "A payment processing API integration is failing. "
            "The client is sending requests to the payment gateway but getting error responses. "
            "Diagnose the root causes by inspecting error logs and service configurations, "
            "then submit the correct configuration fixes."
        ),
        max_steps=15,
        services=["payment_client", "payment_gateway"],
        configs=configs,
        logs={"payment_client": client_logs, "payment_gateway": gateway_logs},
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=list(selected_ids),
        context=(
            "The payment_client sends HTTP requests to payment_gateway. "
            "payment_gateway requires Bearer authentication and JSON content type."
        ),
    )
    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)
    return scenario
# ─── Medium Scenario ─────────────────────────────────────────────────────────
def _medium_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Medium: Webhook chain with cascading failures.

    Service A -> Service B -> Service C, with rate limiting, retry, and auth
    issues. Issue pool has 5 possible issues; canonical scenario uses 3.
    Issues have dependencies — fixing rate_limit reveals the real retry issue.

    Args:
        seed: When given, 3 issues are drawn from the pool with this seed and
            the logs are randomized; when None, the canonical scenario
            (rate limit, retry, signature) is returned.

    Returns:
        The assembled medium scenario.
    """
    issue_pool = [
        Issue(
            issue_id="medium_rate_limit",
            service="webhook_sender",
            description="Rate limit too high (100/s vs receiver's 10/s limit) causing 429 responses",
            expected_fix={"rate_limit.requests_per_second": 10},
            fix_key="rate_limit.requests_per_second",
            log_hint="Rate limit exceeded: 100 req/s > 10 req/s allowed",
            category="networking",
            severity="error",
            root_cause_explanation=(
                "webhook_sender fires at 100 req/s but webhook_receiver only accepts 10 req/s. "
                "The excess requests get 429 Too Many Requests, and with only 1 retry, most events are dropped."
            ),
            cascade_effects={
                "webhook_receiver": "Overwhelmed with requests, dropping 90% of events",
                "notification_service": "No events arriving downstream",
            },
        ),
        Issue(
            issue_id="medium_retry",
            service="webhook_sender",
            description="Insufficient retry config: only 1 retry, no backoff, missing 429 in retry_on_status",
            expected_fix={
                "retry.max_retries": 3,
                "retry.backoff_factor": 2,
                "retry.retry_on_status": [429, 500],
            },
            fix_key="retry",
            log_hint="Retry attempt 1/1 failed. No more retries.",
            # The retry issue is masked by the rate limit issue — even with
            # retries, 100 req/s would still overwhelm the receiver.
            depends_on=["medium_rate_limit"],
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "Even after fixing the rate limit, the sender only retries once with no backoff. "
                "Transient 429s during bursts aren't retried because 429 isn't in retry_on_status. "
                "This causes event loss on any temporary load spike."
            ),
        ),
        Issue(
            issue_id="medium_signature",
            service="webhook_sender",
            description="Webhook signature header is empty — receiver rejects unsigned events",
            expected_fix={"headers.X-Webhook-Signature": "sha256=<computed>"},
            fix_key="headers.X-Webhook-Signature",
            log_hint="Signature validation FAILED: received empty signature",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "webhook_sender has signing_secret configured but the X-Webhook-Signature header "
                "is empty string. webhook_receiver validates signatures and drops all unsigned "
                "events as potential spoofing attempts."
            ),
            cascade_effects={
                "webhook_receiver": "Dropping all events as unsigned/spoofed",
                "notification_service": "Zero events forwarded from receiver",
            },
        ),
        Issue(
            issue_id="medium_target_url",
            service="webhook_sender",
            description="Target URL pointing to wrong receiver endpoint (/webhook vs /hooks/incoming)",
            expected_fix={"target_url": "https://receiver.internal/hooks/incoming"},
            fix_key="target_url",
            log_hint="404 Not Found on /webhook endpoint",
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "webhook_sender posts to /webhook but the receiver listens on /hooks/incoming. "
                "All requests get 404 Not Found."
            ),
        ),
        Issue(
            issue_id="medium_content_encoding",
            service="webhook_sender",
            description="Payload compression enabled but receiver doesn't support gzip",
            expected_fix={"compression": "none"},
            fix_key="compression",
            log_hint="Unsupported Content-Encoding: gzip",
            category="protocol",
            severity="warning",
            root_cause_explanation=(
                "webhook_sender compresses payloads with gzip but webhook_receiver "
                "doesn't have a decompression middleware. Requests fail with 415."
            ),
        ),
    ]

    # Canonical run takes the first three: rate_limit, retry, signature.
    if seed is None:
        issues = issue_pool[:3]
    else:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 3, rng)
    issue_ids = [issue.issue_id for issue in issues]

    # Healthy baseline configuration for all three services.
    configs = {
        "webhook_sender": {
            "target_url": "https://receiver.internal/hooks/incoming",
            "headers": {
                "Content-Type": "application/json",
                "X-Webhook-Signature": "sha256=computed_hmac",
            },
            "rate_limit": {
                "requests_per_second": 10,
                "burst_size": 20,
            },
            "retry": {
                "max_retries": 3,
                "backoff_factor": 2,
                "retry_on_status": [429, 500],
            },
            "signing_secret": "whsec_abc123secret",
            "compression": "none",
        },
        "webhook_receiver": {
            "endpoint": "/hooks/incoming",
            "rate_limit": {
                "requests_per_second": 10,
                "burst_size": 20,
            },
            "signature_validation": True,
            "expected_signature_header": "X-Webhook-Signature",
            "signing_secret": "whsec_abc123secret",
            "forward_to": "https://notifications.internal/notify",
            "supported_encodings": ["identity"],
        },
        "notification_service": {
            "endpoint": "/notify",
            "accepts_from": ["webhook_receiver"],
            "status": "healthy",
        },
    }

    # Break the sender config once per selected issue. Each issue touches
    # its own keys, so the order of these checks does not matter.
    sender_cfg = configs["webhook_sender"]
    if "medium_rate_limit" in issue_ids:
        sender_cfg["rate_limit"]["requests_per_second"] = 100
        sender_cfg["rate_limit"]["burst_size"] = 200
    if "medium_retry" in issue_ids:
        sender_cfg["retry"] = {
            "max_retries": 1,
            "backoff_factor": 0,
            "retry_on_status": [500],
        }
    if "medium_signature" in issue_ids:
        sender_cfg["headers"]["X-Webhook-Signature"] = ""
    if "medium_target_url" in issue_ids:
        sender_cfg["target_url"] = "https://receiver.internal/webhook"
    if "medium_content_encoding" in issue_ids:
        sender_cfg["compression"] = "gzip"

    # Log lines each issue contributes, per service.
    sender_lines = {
        "medium_rate_limit": [
            "[ERROR] 2026-03-25T11:00:01Z POST /hooks/incoming -> 429 Too Many Requests",
            "[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
            "[WARN] 2026-03-25T11:00:00Z Sending at 100 req/s (burst=200)",
        ],
        "medium_retry": [
            "[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
            "[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
        ],
        "medium_target_url": [
            "[ERROR] 2026-03-25T11:00:01Z POST /webhook -> 404 Not Found on /webhook endpoint",
            "[WARN] 2026-03-25T11:00:01Z Receiver endpoint may have changed",
        ],
    }
    receiver_lines = {
        "medium_rate_limit": [
            "[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed",
        ],
        "medium_signature": [
            "[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
            "[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
        ],
        "medium_content_encoding": [
            "[ERROR] 2026-03-25T11:00:02Z Unsupported Content-Encoding: gzip",
            "[WARN] 2026-03-25T11:00:02Z Cannot decompress payload from webhook_sender",
        ],
    }

    sender_logs = [
        "[INFO] 2026-03-25T10:59:59Z Webhook sender started. Signature header: X-Webhook-Signature",
    ]
    receiver_logs = [
        "[INFO] 2026-03-25T10:59:59Z Receiver ready. Rate limit: 10 req/s. Signature validation: ON",
    ]
    notif_logs = [
        "[INFO] 2026-03-25T10:59:59Z Notification service healthy. Waiting for events.",
        "[WARN] 2026-03-25T11:00:05Z No events received in last 60s",
    ]
    # Lines are added in the order the issues were selected.
    for issue_id in issue_ids:
        sender_logs.extend(sender_lines.get(issue_id, []))
        receiver_logs.extend(receiver_lines.get(issue_id, []))

    # Logs that appear once an issue is fixed (covers the entire pool).
    dynamic_logs = {
        "medium_rate_limit": {
            "webhook_sender": ["[INFO] Rate limit adjusted to 10 req/s. 429 errors resolved."],
            "webhook_receiver": ["[INFO] Incoming request rate normalized. Processing events."],
        },
        "medium_retry": {
            "webhook_sender": ["[INFO] Retry config updated: 3 retries with backoff. 429 now retried."],
        },
        "medium_signature": {
            "webhook_sender": ["[INFO] Webhook signature computed and attached to requests."],
            "webhook_receiver": ["[INFO] Signature validation passed for incoming events."],
        },
        "medium_target_url": {
            "webhook_sender": ["[INFO] Target URL corrected to /hooks/incoming. Requests routing OK."],
        },
        "medium_content_encoding": {
            "webhook_sender": ["[INFO] Compression disabled. Receiver parsing payloads correctly."],
        },
    }

    service_graph = {
        "webhook_sender": ServiceNode(
            name="webhook_sender",
            depends_on=["webhook_receiver"],
            health_status="error",
        ),
        "webhook_receiver": ServiceNode(
            name="webhook_receiver",
            depends_on=["notification_service"],
            health_status="degraded",
        ),
        "notification_service": ServiceNode(
            name="notification_service",
            depends_on=[],
            health_status="healthy",
        ),
    }

    # Optimal fix order: rate limit before retry (dependency), then the
    # remaining issues in selection order.
    optimal_order = [
        issue_id
        for issue_id in ("medium_rate_limit", "medium_retry")
        if issue_id in issue_ids
    ]
    for issue_id in issue_ids:
        if issue_id not in optimal_order:
            optimal_order.append(issue_id)

    scenario = Scenario(
        task_id="medium",
        difficulty="medium",
        description=(
            "A webhook-based notification system is dropping events. "
            "webhook_sender sends webhooks to webhook_receiver, which forwards to notification_service. "
            "Events are being lost due to multiple cascading failures in the webhook chain. "
            "Fix the webhook_sender configuration to restore event delivery."
        ),
        max_steps=25,
        services=["webhook_sender", "webhook_receiver", "notification_service"],
        configs=configs,
        logs={
            "webhook_sender": sender_logs,
            "webhook_receiver": receiver_logs,
            "notification_service": notif_logs,
        },
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=optimal_order,
        context=(
            "Event flow: webhook_sender -> webhook_receiver -> notification_service. "
            "webhook_receiver validates signatures and enforces rate limits. "
            "Fixing upstream issues may reveal additional downstream problems."
        ),
    )
    if seed is None:
        return scenario
    return _randomize_scenario(scenario, seed)
# ─── Hard Scenario ────────────────────────────────────────────────────────────
def _hard_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Hard: E-commerce order processing pipeline with cascading failures.

    order_service -> inventory_service -> shipping_service
    Plus api_gateway and auth_service.
    Issue pool has 7 possible issues; canonical scenario uses 5.
    Multiple dependency chains make this genuinely challenging.

    The scenario starts from a fully healthy configuration and breaks only
    the settings that belong to the selected issues. In particular, the
    healthy order_service carries an Idempotency-Key header that is removed
    only when hard_idempotency is selected, so the broken and healthy states
    of that issue differ in the config.

    Args:
        seed: When given, 5 issues are drawn from the pool with this seed and
            the logs are randomized; when None, the canonical scenario (the
            first 5 issues of the pool) is returned.

    Returns:
        The assembled hard scenario.
    """
    issue_pool = [
        Issue(
            issue_id="hard_wrong_url",
            service="order_service",
            description="Order service calling deprecated /v1/check instead of /v2/reserve",
            expected_fix={"inventory_url": "https://inventory.internal/v2/reserve"},
            fix_key="inventory_url",
            log_hint="Endpoint deprecated. Use /v2/reserve",
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "order_service calls /v1/check which was deprecated. The API gateway returns "
                "301 Moved Permanently. The redirect goes to /v2/check (read-only) instead of "
                "/v2/reserve (write). Inventory is never actually reserved."
            ),
            cascade_effects={
                "inventory_service": "Receiving read-only check requests instead of reservation requests",
                "api_gateway": "Generating 301 redirect responses for deprecated endpoints",
            },
        ),
        Issue(
            issue_id="hard_timeout",
            service="order_service",
            description="Timeout too short (2s) for inventory service that takes ~4s to process",
            expected_fix={"timeout": 10},
            fix_key="timeout",
            log_hint="Timeout after 2s waiting for inventory response",
            # Timeout issue is masked by wrong URL — fix URL first to see
            # the real timeout.
            depends_on=["hard_wrong_url"],
            category="networking",
            severity="error",
            root_cause_explanation=(
                "order_service has timeout=2s but inventory_service takes ~4s for reservation "
                "(including DB lock + stock validation). After fixing the URL, requests now reach "
                "inventory but time out before completion."
            ),
            cascade_effects={
                "inventory_service": "Connections killed mid-processing, leaving orphaned DB locks",
            },
        ),
        Issue(
            issue_id="hard_async",
            service="order_service",
            description="Synchronous mode causes race conditions between concurrent orders",
            expected_fix={"async_mode": True},
            fix_key="async_mode",
            log_hint="Race condition: order ord_998 processed before ord_997 completed",
            category="configuration",
            severity="critical",
            root_cause_explanation=(
                "order_service runs in sync mode, blocking the main thread on each inventory call. "
                "Concurrent orders queue up and when timeouts occur, orders are processed out of "
                "order, causing double-reservation and stock inconsistencies."
            ),
        ),
        Issue(
            issue_id="hard_expired_token",
            service="inventory_service",
            description="Expired auth token used for shipping service requests",
            expected_fix={"headers.Authorization": "Bearer valid_token_789"},
            fix_key="headers.Authorization",
            log_hint="Auth token expired_token_456 is no longer valid",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "inventory_service uses Bearer expired_token_456 to authenticate with "
                "shipping_service. This token expired on 2026-03-24. All shipment creation "
                "requests fail with 401, so reserved inventory is never shipped."
            ),
            cascade_effects={
                "shipping_service": "Rejecting all requests from inventory_service",
                "auth_service": "Logging repeated failed token validations",
            },
        ),
        Issue(
            issue_id="hard_token_refresh",
            service="inventory_service",
            description="No automatic token refresh mechanism configured",
            expected_fix={"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True},
            fix_key="token_refresh_url",
            log_hint="Token validation failed: expired_token_456 expired",
            # Token refresh is only relevant after fixing the expired token.
            depends_on=["hard_expired_token"],
            category="configuration",
            severity="error",
            root_cause_explanation=(
                "Even after replacing the expired token, there's no auto-refresh mechanism. "
                "Tokens expire every 24h, so without auto_refresh=True and a refresh URL, "
                "the same issue will recur tomorrow."
            ),
        ),
        Issue(
            issue_id="hard_circuit_breaker",
            service="order_service",
            description="No circuit breaker — failed requests keep hammering inventory_service",
            expected_fix={"circuit_breaker.enabled": True, "circuit_breaker.failure_threshold": 5},
            fix_key="circuit_breaker",
            log_hint="Circuit breaker not configured",
            category="configuration",
            severity="warning",
            root_cause_explanation=(
                "Without a circuit breaker, order_service keeps sending requests to "
                "inventory_service even when it's consistently failing. This wastes resources "
                "and can cause a cascading overload."
            ),
        ),
        Issue(
            issue_id="hard_idempotency",
            service="order_service",
            description="Missing idempotency key — retried requests create duplicate orders",
            expected_fix={"headers.Idempotency-Key": "order-{order_id}"},
            fix_key="headers.Idempotency-Key",
            log_hint="Duplicate order detected: ord_997 submitted twice",
            depends_on=["hard_async"],
            category="protocol",
            severity="error",
            root_cause_explanation=(
                "When async retries fire, there's no Idempotency-Key header to deduplicate "
                "requests. inventory_service creates duplicate reservations for the same order."
            ),
        ),
    ]

    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 5, rng)
    else:
        issues = issue_pool[:5]  # Canonical: first 5
    issue_ids = [issue.issue_id for issue in issues]

    # Healthy baseline configuration.
    configs = {
        "order_service": {
            "name": "order_service",
            "inventory_url": "https://inventory.internal/v2/reserve",
            "headers": {
                "Content-Type": "application/json",
                "Authorization": "Bearer valid_token_123",
                # Present in the healthy state so that hard_idempotency has
                # something to remove.
                "Idempotency-Key": "order-{order_id}",
            },
            "timeout": 10,
            "async_mode": True,
            "callback_url": "https://orders.internal/callback",
            "circuit_breaker": {
                "enabled": True,
                "failure_threshold": 5,
            },
        },
        "inventory_service": {
            "name": "inventory_service",
            "endpoint_version": "v2",
            "reserve_path": "/v2/reserve",
            "check_path": "/v2/check",
            "shipping_url": "https://shipping.internal/v1/create",
            "headers": {
                "Content-Type": "application/json",
                "Authorization": "Bearer valid_token_789",
            },
            "timeout": 10,
            "processing_time_avg": 4,
            "token_refresh_url": "https://auth.internal/refresh",
            "auto_refresh": True,
        },
        "shipping_service": {
            "name": "shipping_service",
            "create_path": "/v1/create",
            "requires_auth": True,
            "accepted_auth": ["Bearer"],
            "token_validation_url": "https://auth.internal/validate",
            "status": "healthy",
        },
        "api_gateway": {
            "routes": {
                "/v1/check": "DEPRECATED — use /v2/check",
                "/v2/reserve": "inventory_service",
                "/v2/check": "inventory_service",
                "/v1/create": "shipping_service",
            },
            "timeout": 30,
        },
        "auth_service": {
            "valid_tokens": ["valid_token_123", "valid_token_789"],
            "expired_tokens": ["expired_token_456"],
            "token_refresh_endpoint": "/refresh",
            "token_ttl_hours": 24,
        },
    }

    # Apply broken config for each selected issue. Every issue touches its
    # own keys, so the order of these checks does not matter.
    order_cfg = configs["order_service"]
    inventory_cfg = configs["inventory_service"]
    if "hard_wrong_url" in issue_ids:
        order_cfg["inventory_url"] = "https://inventory.internal/v1/check"
    if "hard_timeout" in issue_ids:
        order_cfg["timeout"] = 2
    if "hard_async" in issue_ids:
        order_cfg["async_mode"] = False
    if "hard_expired_token" in issue_ids:
        inventory_cfg["headers"]["Authorization"] = "Bearer expired_token_456"
    if "hard_token_refresh" in issue_ids:
        inventory_cfg.pop("token_refresh_url", None)
        inventory_cfg["auto_refresh"] = False
    if "hard_circuit_breaker" in issue_ids:
        order_cfg["circuit_breaker"] = {"enabled": False}
    if "hard_idempotency" in issue_ids:
        order_cfg["headers"].pop("Idempotency-Key", None)

    # Log lines each issue contributes: issue_id -> {service: [lines]}.
    issue_log_lines = {
        "hard_wrong_url": {
            "order_service": [
                "[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently",
                "[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}",
            ],
            "inventory_service": [
                "[INFO] 2026-03-25T12:00:05Z Received request on /v1/check -> redirecting to /v2/check",
            ],
            "api_gateway": [
                "[WARN] 2026-03-25T12:00:05Z Deprecated endpoint /v1/check accessed by order_service",
                "[INFO] 2026-03-25T12:00:05Z Redirecting /v1/check -> /v2/check (301)",
            ],
        },
        "hard_timeout": {
            "order_service": [
                "[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response",
                "[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out",
            ],
            "inventory_service": [
                "[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s",
            ],
        },
        "hard_async": {
            "order_service": [
                "[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response",
                "[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed",
            ],
        },
        "hard_expired_token": {
            "inventory_service": [
                "[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized",
                "[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid",
                "[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed",
            ],
            "shipping_service": [
                "[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired",
            ],
            "auth_service": [
                "[WARN] 2026-03-25T12:00:10Z Token validation failed: expired_token_456 expired at 2026-03-24T00:00:00Z",
            ],
        },
        "hard_token_refresh": {
            "auth_service": [
                "[WARN] 2026-03-25T12:00:11Z Token validation failed: expired_token_456 expired. No refresh configured.",
            ],
        },
        "hard_circuit_breaker": {
            "order_service": [
                "[WARN] 2026-03-25T12:00:12Z Circuit breaker not configured, continuing to send requests after 10 failures",
                "[ERROR] 2026-03-25T12:00:12Z System overload: 50 pending requests to inventory_service",
            ],
        },
        "hard_idempotency": {
            "order_service": [
                "[ERROR] 2026-03-25T12:00:13Z Duplicate order detected: ord_997 submitted twice",
            ],
            "inventory_service": [
                "[WARN] 2026-03-25T12:00:13Z Duplicate reservation request for order ord_997",
            ],
        },
    }

    # Build logs. The key order of this dict is the order in which seeded
    # randomization walks the services, so it must stay fixed.
    logs = {
        "order_service": [],
        "inventory_service": [],
        "shipping_service": [],
        "api_gateway": [],
        "auth_service": [
            "[INFO] 2026-03-25T12:00:00Z Auth service ready. Valid tokens: 2, Expired: 1",
        ],
    }
    # Lines are added in the order the issues were selected.
    for issue_id in issue_ids:
        for service, lines in issue_log_lines.get(issue_id, {}).items():
            logs[service].extend(lines)
    if not logs["shipping_service"]:
        logs["shipping_service"].append(
            "[INFO] 2026-03-25T12:00:00Z Shipping service healthy, awaiting authenticated requests"
        )

    # Logs that appear once an issue is fixed (covers the entire pool).
    dynamic_logs = {
        "hard_wrong_url": {
            "order_service": ["[INFO] URL corrected to /v2/reserve. Inventory requests routing correctly."],
            "api_gateway": ["[INFO] order_service now using correct /v2/reserve endpoint."],
        },
        "hard_timeout": {
            "order_service": ["[INFO] Timeout increased to 10s. Inventory responses completing."],
            "inventory_service": ["[INFO] Reservations completing successfully within timeout."],
        },
        "hard_async": {
            "order_service": ["[INFO] Async mode enabled. Orders processing concurrently without blocking."],
        },
        "hard_expired_token": {
            "inventory_service": ["[INFO] Auth token refreshed. Shipping service requests authenticated."],
            "shipping_service": ["[INFO] Authentication successful for inventory_service."],
        },
        "hard_token_refresh": {
            "inventory_service": ["[INFO] Auto token refresh configured. Tokens will be refreshed before expiry."],
        },
        "hard_circuit_breaker": {
            "order_service": ["[INFO] Circuit breaker enabled. Will stop sending after 5 consecutive failures."],
        },
        "hard_idempotency": {
            "order_service": ["[INFO] Idempotency keys set. Duplicate requests will be safely deduplicated."],
        },
    }

    service_graph = {
        "order_service": ServiceNode(
            name="order_service",
            depends_on=["inventory_service", "api_gateway"],
            health_status="error",
        ),
        "inventory_service": ServiceNode(
            name="inventory_service",
            depends_on=["shipping_service", "auth_service"],
            health_status="degraded",
        ),
        "shipping_service": ServiceNode(
            name="shipping_service",
            depends_on=[],
            health_status="healthy",
        ),
        "api_gateway": ServiceNode(
            name="api_gateway",
            depends_on=[],
            health_status="healthy",
        ),
        "auth_service": ServiceNode(
            name="auth_service",
            depends_on=[],
            health_status="healthy",
        ),
    }

    # Build optimal fix order respecting dependencies: every prerequisite
    # appears before the issue that depends on it.
    ordered_preference = [
        "hard_wrong_url", "hard_timeout", "hard_async",
        "hard_expired_token", "hard_token_refresh",
        "hard_circuit_breaker", "hard_idempotency",
    ]
    optimal_order = [iid for iid in ordered_preference if iid in issue_ids]
    # Safety net for issue ids that are missing from the preference list.
    for iid in issue_ids:
        if iid not in optimal_order:
            optimal_order.append(iid)

    scenario = Scenario(
        task_id="hard",
        difficulty="hard",
        description=(
            "An e-commerce order processing pipeline is failing with cascading errors. "
            "Order Service calls Inventory Service, which calls Shipping Service. "
            "Multiple issues span the pipeline: wrong endpoints, timeouts, race conditions, "
            "expired authentication tokens, and missing resilience patterns. "
            "Some issues are masked by upstream failures — you must fix issues in the right "
            "order to diagnose the full chain."
        ),
        max_steps=40,
        services=["order_service", "inventory_service", "shipping_service", "api_gateway", "auth_service"],
        configs=configs,
        logs=logs,
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=optimal_order,
        context=(
            "Request flow: order_service -> api_gateway -> inventory_service -> shipping_service. "
            "auth_service provides token validation for all inter-service calls. "
            "Some issues are masked by upstream failures — fixing upstream issues may reveal "
            "new errors downstream. Pay attention to service dependencies."
        ),
    )
    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)
    return scenario