Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """ | |
| Scenario definitions for the API Integration Debugging Environment. | |
| Each scenario models a realistic multi-service API ecosystem with: | |
| - Service dependency graphs (upstream/downstream relationships) | |
| - Cascading failures (upstream bugs propagate downstream) | |
| - Dynamic logs that update when issues are fixed | |
| - Expanded issue pools for seed-based random subset selection | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, List, Optional, Tuple | |
| import random | |
@dataclass
class Issue:
    """A single issue in an API integration scenario.

    Declared as a dataclass so that the annotated fields below become
    keyword arguments of the generated ``__init__`` and every instance
    gets its own fresh ``depends_on`` / ``cascade_effects`` containers.
    """

    issue_id: str
    service: str
    description: str
    expected_fix: Dict[str, Any]
    fix_key: str  # The key in the config that needs fixing
    log_hint: str  # Log line that hints at this issue

    # --- Fields for cascading failures ---
    # Issues that must be fixed before this one can be diagnosed
    depends_on: List[str] = field(default_factory=list)
    # service -> error message caused by this issue being unfixed
    cascade_effects: Dict[str, str] = field(default_factory=dict)
    # Issue category: configuration, authentication, networking, protocol
    category: str = "configuration"
    # Severity: error, warning, critical
    severity: str = "error"
    # Detailed explanation of why this issue occurs (for grading diagnosis quality)
    root_cause_explanation: str = ""
@dataclass
class ServiceNode:
    """A node in the service dependency graph.

    Declared as a dataclass so it can be built with keyword arguments
    (``name=...``, ``depends_on=...``, ``health_status=...``) and so each
    node owns its own ``depends_on`` list.
    """

    name: str
    # Services this one calls (upstream dependencies)
    depends_on: List[str] = field(default_factory=list)
    # healthy, degraded, error, unreachable
    health_status: str = "degraded"
@dataclass
class Scenario:
    """A complete API debugging scenario with dependency graph.

    Declared as a dataclass so the scenario builders can construct it with
    keyword arguments and so the optional container fields default to fresh,
    per-instance objects.
    """

    task_id: str
    difficulty: str
    description: str
    max_steps: int
    services: List[str]
    configs: Dict[str, Dict[str, Any]]
    logs: Dict[str, List[str]]
    issues: List["Issue"]

    # --- Fields for dependency graphs and dynamic behaviour ---
    # Service dependency graph, keyed by service name
    service_graph: Dict[str, "ServiceNode"] = field(default_factory=dict)
    # issue_id -> {service: [new log lines emitted once that issue is fixed]}
    dynamic_logs: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
    # Optimal order to fix issues (for strategy scoring)
    optimal_fix_order: List[str] = field(default_factory=list)
    # Additional scenario context for the agent
    context: str = ""
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
    """
    Build the scenario registered under ``task_id``.

    Args:
        task_id: One of 'easy', 'medium', 'hard'.
        seed: Optional seed forwarded to the scenario builder. With a seed the
            builder picks a random subset of its issue pool and randomizes the
            logs; with ``None`` it returns the canonical scenario.

    Returns:
        The scenario produced by the matching builder.

    Raises:
        ValueError: If ``task_id`` is not a known task.
    """
    builders_by_task = dict(
        easy=_easy_scenario,
        medium=_medium_scenario,
        hard=_hard_scenario,
    )
    build = builders_by_task.get(task_id)
    if build is None:
        raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(builders_by_task.keys())}")
    return build(seed=seed)
def get_all_task_ids() -> List[str]:
    """Return a new list with every available task ID: easy, medium, hard."""
    known_ids = ("easy", "medium", "hard")
    return list(known_ids)
def _missing_prerequisites(issue, by_id, selected_ids):
    """Return the not-yet-selected transitive dependencies of *issue*, prerequisites first."""
    ordered = []
    seen = {issue.issue_id}  # guards against dependency cycles

    def visit(dep_ids):
        for dep_id in dep_ids:
            if dep_id in seen or dep_id in selected_ids:
                continue
            seen.add(dep_id)
            dep = by_id.get(dep_id)
            if dep is None:
                # Dependency ids that are not in the pool are ignored.
                continue
            visit(dep.depends_on)
            ordered.append(dep)

    visit(issue.depends_on)
    return ordered


def _select_issues(pool: List["Issue"], count: int, rng: random.Random) -> List["Issue"]:
    """Select a random subset of issues from a pool, respecting dependencies.

    The result is dependency-closed: whenever an issue is returned, every
    issue it (transitively) depends on that exists in ``pool`` is returned
    too. An issue together with its missing prerequisites is only taken when
    the whole group fits into the remaining room, so the result never has to
    be truncated (truncation after shuffling could drop a prerequisite while
    keeping its dependent).

    Args:
        pool: Candidate issues. Only ``issue_id`` and ``depends_on`` are read.
        count: Maximum number of issues to return.
        rng: Random source used for picking and for the final shuffle.

    Returns:
        At most ``count`` issues in shuffled order; the whole pool (shuffled)
        when ``count >= len(pool)``. The order of the returned list does not
        encode the dependency order.
    """
    if count >= len(pool):
        selected = list(pool)
    else:
        by_id = {issue.issue_id: issue for issue in pool}
        available = list(pool)
        selected = []
        selected_ids = set()
        while len(selected) < count and available:
            issue = rng.choice(available)
            available.remove(issue)
            group = _missing_prerequisites(issue, by_id, selected_ids) + [issue]
            if len(selected) + len(group) > count:
                # Not enough room for this issue plus its prerequisites.
                # Room only shrinks from here on, so it can be dropped for good.
                continue
            for member in group:
                selected.append(member)
                selected_ids.add(member.issue_id)
                if member in available:
                    available.remove(member)
    # Shuffle so the order of the result does not reveal the pool order
    rng.shuffle(selected)
    return selected
def _randomize_scenario(scenario: "Scenario", seed: int) -> "Scenario":
    """Apply seed-based randomization to a scenario.

    The log lines of every service are shuffled, and in each line containing
    "2026-" the dates 2026-03-25 / 2026-03-24 are replaced by a randomly drawn
    day (20-28) of March 2026 and the day before it. Each line gets its own
    draw. Only the date is varied; the time of day is left untouched.

    Args:
        scenario: Scenario whose ``logs`` mapping is randomized in place.
        seed: Seed for the private random generator, so equal seeds give
            equal results.

    Returns:
        The same ``scenario`` object, for call chaining.
    """
    import re  # local import: only this helper needs it

    rng = random.Random(seed)

    # Shuffle log entries for each service (in place)
    for service_logs in scenario.logs.values():
        rng.shuffle(service_logs)

    # These two values, and the two per-line values drawn below, were
    # historically used for an hour/minute that was never applied to the logs.
    # The draws are kept so the random stream, and therefore the day chosen
    # for every line, stays the same for a given seed.
    rng.randint(8, 16)
    rng.randint(0, 59)

    date_pattern = re.compile(r"2026-03-2[45]")
    for service, log_list in scenario.logs.items():
        new_logs = []
        for log_line in log_list:
            rng.randint(1, 5)
            rng.randint(1, 5)
            new_log = log_line
            if "2026-" in new_log:
                day = rng.randint(20, 28)
                replacements = {
                    "2026-03-25": f"2026-03-{day:02d}",
                    "2026-03-24": f"2026-03-{day-1:02d}",
                }
                # Substitute both dates in ONE pass. Chaining two str.replace
                # calls rewrote a freshly inserted "2026-03-24" (day == 24)
                # to "2026-03-23".
                new_log = date_pattern.sub(
                    lambda match: replacements[match.group(0)], new_log
                )
            new_logs.append(new_log)
        scenario.logs[service] = new_logs
    return scenario
| # ─── Easy Scenario ─────────────────────────────────────────────────────────── | |
def _easy_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Easy: Payment API integration failures.

    Agent must diagnose auth + content-type issues with clear log signals.
    Issue pool has 4 possible issues; canonical scenario uses 2.

    The client config starts from a fully working baseline (including the
    Authorization header) and is then broken once per selected issue, so an
    issue that was NOT selected never shows up in the config. Log lines that
    quote config values are rendered from the generated config so they cannot
    contradict it.

    Args:
        seed: When given, two issues are drawn from the pool with this seed
            and the finished scenario is passed through
            ``_randomize_scenario``. When None, the canonical scenario
            (auth + content type) is returned.

    Returns:
        The assembled easy scenario.
    """
    # Full issue pool (4 issues, canonical uses 2)
    issue_pool = [
        Issue(
            issue_id="easy_auth",
            service="payment_client",
            description="Missing Authorization header — payment gateway requires Bearer token authentication",
            expected_fix={"headers.Authorization": "Bearer <token>"},
            fix_key="headers.Authorization",
            log_hint="Missing or invalid Authorization header",
            category="authentication",
            severity="critical",
            root_cause_explanation=(
                "The payment_client is missing the Authorization header entirely. "
                "The payment_gateway requires Bearer token auth on all /process requests. "
                "This results in HTTP 401 on every payment attempt."
            ),
            cascade_effects={
                "payment_gateway": "All requests from payment_client rejected with 401"
            },
        ),
        Issue(
            issue_id="easy_content_type",
            service="payment_client",
            description="Wrong Content-Type header (text/plain instead of application/json)",
            expected_fix={"headers.Content-Type": "application/json"},
            fix_key="headers.Content-Type",
            log_hint="Content-Type must be application/json",
            category="protocol",
            severity="error",
            root_cause_explanation=(
                "The payment_client sends Content-Type: text/plain, but the gateway "
                "only accepts application/json. This causes HTTP 415 Unsupported Media Type. "
                "The gateway cannot parse the request body."
            ),
            cascade_effects={
                "payment_gateway": "Request body parsing fails for payment_client requests"
            },
        ),
        Issue(
            issue_id="easy_timeout",
            service="payment_client",
            description="Timeout set too low (5s) for payment processing that takes 8-12s",
            expected_fix={"timeout": 30},
            fix_key="timeout",
            log_hint="Request timed out after 5s",
            category="networking",
            severity="error",
            root_cause_explanation=(
                "The payment_client has timeout=5s, but payment processing at the gateway "
                "takes 8-12s for fraud checks. Legitimate payments are timing out."
            ),
        ),
        Issue(
            issue_id="easy_base_url",
            service="payment_client",
            description="Base URL pointing to deprecated v1 endpoint instead of v2",
            expected_fix={"base_url": "https://api.paymentgateway.com/v2"},
            fix_key="base_url",
            log_hint="API v1 is deprecated",
            category="configuration",
            severity="warning",
            root_cause_explanation=(
                "The payment_client uses /v1 which is deprecated and returning 301 redirects. "
                "The gateway v2 endpoint has different request schemas, causing deserialization errors."
            ),
        ),
    ]

    # Select issues based on seed
    if seed is not None:
        rng = random.Random(seed)
        issues = _select_issues(issue_pool, 2, rng)
    else:
        issues = issue_pool[:2]  # Canonical: auth + content_type

    # Healthy baseline config; each selected issue breaks exactly one setting.
    configs = {
        "payment_client": {
            "base_url": "https://api.paymentgateway.com/v2",
            "headers": {
                "Content-Type": "application/json",
                "Accept": "application/json",
                # Present in the baseline so that only the easy_auth issue
                # produces a client without credentials.
                "Authorization": "Bearer <token>",
            },
            "timeout": 30,
            "retry_count": 3,
        },
        "payment_gateway": {
            "endpoint": "/process",
            "method": "POST",
            "required_headers": ["Authorization", "Content-Type"],
            "accepted_content_types": ["application/json"],
            "auth_scheme": "Bearer",
            "processing_time_ms": "8000-12000",
        },
    }
    client_config = configs["payment_client"]
    client_headers = client_config["headers"]

    # Apply broken config for each selected issue
    for issue in issues:
        if issue.issue_id == "easy_auth":
            client_headers.pop("Authorization", None)
        elif issue.issue_id == "easy_content_type":
            client_headers["Content-Type"] = "text/plain"
        elif issue.issue_id == "easy_timeout":
            client_config["timeout"] = 5
        elif issue.issue_id == "easy_base_url":
            client_config["base_url"] = "https://api.paymentgateway.com/v1"

    # Values quoted inside log lines come from the (possibly broken) config.
    base_url = client_config["base_url"]
    content_type = client_headers["Content-Type"]
    accept = client_headers["Accept"]

    # Build logs based on selected issues
    client_logs = [
        f"[INFO] 2026-03-25T10:15:20Z Payment client initialized with base_url={base_url}",
    ]
    gateway_logs = [
        "[INFO] 2026-03-25T10:15:20Z Gateway ready, accepting application/json with Bearer auth",
    ]
    for issue in issues:
        if issue.issue_id == "easy_auth":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
                "[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
                f"[WARN] 2026-03-25T10:15:22Z Request headers: Content-Type={content_type}, Accept={accept}",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:23Z Rejected request: no Authorization header present"
            )
        elif issue.issue_id == "easy_content_type":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
                "[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:24Z Rejected request: unsupported Content-Type 'text/plain'"
            )
        elif issue.issue_id == "easy_timeout":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:30Z POST /process -> Request timed out after 5s",
                "[WARN] 2026-03-25T10:15:30Z Payment processing takes 8-12s for fraud verification",
            ])
            gateway_logs.append(
                "[INFO] 2026-03-25T10:15:30Z Processing payment... estimated time: 10s"
            )
        elif issue.issue_id == "easy_base_url":
            client_logs.extend([
                "[ERROR] 2026-03-25T10:15:21Z GET /v1/status -> 301 Moved Permanently",
                "[WARN] 2026-03-25T10:15:21Z API v1 is deprecated, migrate to /v2",
            ])
            gateway_logs.append(
                "[WARN] 2026-03-25T10:15:21Z Deprecated v1 endpoint accessed"
            )

    # Dynamic logs: issue_id -> {service: [lines that appear after the fix]}
    dynamic_logs = {}
    for issue in issues:
        if issue.issue_id == "easy_auth":
            dynamic_logs["easy_auth"] = {
                "payment_client": ["[INFO] Authorization header set. Retrying request..."],
                "payment_gateway": ["[INFO] Authentication successful for payment_client"],
            }
        elif issue.issue_id == "easy_content_type":
            dynamic_logs["easy_content_type"] = {
                "payment_client": ["[INFO] Content-Type set to application/json. Request body parsed."],
                "payment_gateway": ["[INFO] Request body parsed successfully as JSON"],
            }
        elif issue.issue_id == "easy_timeout":
            dynamic_logs["easy_timeout"] = {
                "payment_client": ["[INFO] Timeout increased to 30s. Payment processing completing normally."],
            }
        elif issue.issue_id == "easy_base_url":
            dynamic_logs["easy_base_url"] = {
                "payment_client": ["[INFO] Migrated to v2 API endpoint. Requests routing correctly."],
            }

    # Service dependency graph
    service_graph = {
        "payment_client": ServiceNode(
            name="payment_client",
            depends_on=["payment_gateway"],
            health_status="error",
        ),
        "payment_gateway": ServiceNode(
            name="payment_gateway",
            depends_on=[],
            health_status="healthy",
        ),
    }

    scenario = Scenario(
        task_id="easy",
        difficulty="easy",
        description=(
            "A payment processing API integration is failing. "
            "The client is sending requests to the payment gateway but getting error responses. "
            "Diagnose the root causes by inspecting error logs and service configurations, "
            "then submit the correct configuration fixes."
        ),
        max_steps=15,
        services=["payment_client", "payment_gateway"],
        configs=configs,
        logs={"payment_client": client_logs, "payment_gateway": gateway_logs},
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=[i.issue_id for i in issues],
        context=(
            "The payment_client sends HTTP requests to payment_gateway. "
            "payment_gateway requires Bearer authentication and JSON content type."
        ),
    )
    if seed is not None:
        scenario = _randomize_scenario(scenario, seed)
    return scenario
| # ─── Medium Scenario ───────────────────────────────────────────────────────── | |
def _medium_scenario(seed: Optional[int] = None) -> Scenario:
    """
    Medium: Webhook chain with cascading failures.

    Event flow is webhook_sender -> webhook_receiver -> notification_service,
    with rate limiting, retry, and auth issues on the sender side.
    The pool holds 5 issues; the canonical scenario uses the first 3.
    The retry issue depends on the rate-limit issue.

    Args:
        seed: When given, three issues are drawn from the pool with this seed
            and the finished scenario is passed through
            ``_randomize_scenario``. When None, the canonical scenario is
            returned.

    Returns:
        The assembled medium scenario.
    """
    issue_pool = [
        Issue(
            issue_id="medium_rate_limit",
            service="webhook_sender",
            category="networking",
            severity="error",
            description="Rate limit too high (100/s vs receiver's 10/s limit) causing 429 responses",
            expected_fix={"rate_limit.requests_per_second": 10},
            fix_key="rate_limit.requests_per_second",
            log_hint="Rate limit exceeded: 100 req/s > 10 req/s allowed",
            root_cause_explanation=(
                "webhook_sender fires at 100 req/s but webhook_receiver only accepts 10 req/s. "
                "The excess requests get 429 Too Many Requests, and with only 1 retry, most events are dropped."
            ),
            cascade_effects={
                "webhook_receiver": "Overwhelmed with requests, dropping 90% of events",
                "notification_service": "No events arriving downstream",
            },
        ),
        Issue(
            issue_id="medium_retry",
            service="webhook_sender",
            category="configuration",
            severity="error",
            description="Insufficient retry config: only 1 retry, no backoff, missing 429 in retry_on_status",
            expected_fix={
                "retry.max_retries": 3,
                "retry.backoff_factor": 2,
                "retry.retry_on_status": [429, 500],
            },
            fix_key="retry",
            log_hint="Retry attempt 1/1 failed. No more retries.",
            # Masked by the rate limit issue: even with retries, 100 req/s
            # would still overwhelm the receiver.
            depends_on=["medium_rate_limit"],
            root_cause_explanation=(
                "Even after fixing the rate limit, the sender only retries once with no backoff. "
                "Transient 429s during bursts aren't retried because 429 isn't in retry_on_status. "
                "This causes event loss on any temporary load spike."
            ),
        ),
        Issue(
            issue_id="medium_signature",
            service="webhook_sender",
            category="authentication",
            severity="critical",
            description="Webhook signature header is empty — receiver rejects unsigned events",
            expected_fix={"headers.X-Webhook-Signature": "sha256=<computed>"},
            fix_key="headers.X-Webhook-Signature",
            log_hint="Signature validation FAILED: received empty signature",
            root_cause_explanation=(
                "webhook_sender has signing_secret configured but the X-Webhook-Signature header "
                "is empty string. webhook_receiver validates signatures and drops all unsigned "
                "events as potential spoofing attempts."
            ),
            cascade_effects={
                "webhook_receiver": "Dropping all events as unsigned/spoofed",
                "notification_service": "Zero events forwarded from receiver",
            },
        ),
        Issue(
            issue_id="medium_target_url",
            service="webhook_sender",
            category="configuration",
            severity="error",
            description="Target URL pointing to wrong receiver endpoint (/webhook vs /hooks/incoming)",
            expected_fix={"target_url": "https://receiver.internal/hooks/incoming"},
            fix_key="target_url",
            log_hint="404 Not Found on /webhook endpoint",
            root_cause_explanation=(
                "webhook_sender posts to /webhook but the receiver listens on /hooks/incoming. "
                "All requests get 404 Not Found."
            ),
        ),
        Issue(
            issue_id="medium_content_encoding",
            service="webhook_sender",
            category="protocol",
            severity="warning",
            description="Payload compression enabled but receiver doesn't support gzip",
            expected_fix={"compression": "none"},
            fix_key="compression",
            log_hint="Unsupported Content-Encoding: gzip",
            root_cause_explanation=(
                "webhook_sender compresses payloads with gzip but webhook_receiver "
                "doesn't have a decompression middleware. Requests fail with 415."
            ),
        ),
    ]

    # Canonical run: rate_limit, retry, signature. Seeded run: random trio.
    if seed is None:
        issues = issue_pool[:3]
    else:
        issues = _select_issues(issue_pool, 3, random.Random(seed))

    # Healthy baseline configuration of the three services.
    sender_cfg = {
        "target_url": "https://receiver.internal/hooks/incoming",
        "headers": {
            "Content-Type": "application/json",
            "X-Webhook-Signature": "sha256=computed_hmac",
        },
        "rate_limit": {
            "requests_per_second": 10,
            "burst_size": 20,
        },
        "retry": {
            "max_retries": 3,
            "backoff_factor": 2,
            "retry_on_status": [429, 500],
        },
        "signing_secret": "whsec_abc123secret",
        "compression": "none",
    }
    receiver_cfg = {
        "endpoint": "/hooks/incoming",
        "rate_limit": {
            "requests_per_second": 10,
            "burst_size": 20,
        },
        "signature_validation": True,
        "expected_signature_header": "X-Webhook-Signature",
        "signing_secret": "whsec_abc123secret",
        "forward_to": "https://notifications.internal/notify",
        "supported_encodings": ["identity"],
    }
    notification_cfg = {
        "endpoint": "/notify",
        "accepts_from": ["webhook_receiver"],
        "status": "healthy",
    }
    configs = {
        "webhook_sender": sender_cfg,
        "webhook_receiver": receiver_cfg,
        "notification_service": notification_cfg,
    }

    # One small mutator per issue id; each breaks exactly the sender setting
    # that the issue describes.
    def break_rate_limit():
        sender_cfg["rate_limit"]["requests_per_second"] = 100
        sender_cfg["rate_limit"]["burst_size"] = 200

    def break_retry():
        sender_cfg["retry"] = {
            "max_retries": 1,
            "backoff_factor": 0,
            "retry_on_status": [500],
        }

    def break_signature():
        sender_cfg["headers"]["X-Webhook-Signature"] = ""

    def break_target_url():
        sender_cfg["target_url"] = "https://receiver.internal/webhook"

    def break_content_encoding():
        sender_cfg["compression"] = "gzip"

    breakers = {
        "medium_rate_limit": break_rate_limit,
        "medium_retry": break_retry,
        "medium_signature": break_signature,
        "medium_target_url": break_target_url,
        "medium_content_encoding": break_content_encoding,
    }

    # issue_id -> (sender log lines, receiver log lines) shown while unfixed
    failure_logs = {
        "medium_rate_limit": (
            [
                "[ERROR] 2026-03-25T11:00:01Z POST /hooks/incoming -> 429 Too Many Requests",
                "[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
                "[WARN] 2026-03-25T11:00:00Z Sending at 100 req/s (burst=200)",
            ],
            [
                "[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed",
            ],
        ),
        "medium_retry": (
            [
                "[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
                "[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
            ],
            [],
        ),
        "medium_signature": (
            [],
            [
                "[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
                "[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
            ],
        ),
        "medium_target_url": (
            [
                "[ERROR] 2026-03-25T11:00:01Z POST /webhook -> 404 Not Found on /webhook endpoint",
                "[WARN] 2026-03-25T11:00:01Z Receiver endpoint may have changed",
            ],
            [],
        ),
        "medium_content_encoding": (
            [],
            [
                "[ERROR] 2026-03-25T11:00:02Z Unsupported Content-Encoding: gzip",
                "[WARN] 2026-03-25T11:00:02Z Cannot decompress payload from webhook_sender",
            ],
        ),
    }

    sender_logs = [
        "[INFO] 2026-03-25T10:59:59Z Webhook sender started. Signature header: X-Webhook-Signature",
    ]
    receiver_logs = [
        "[INFO] 2026-03-25T10:59:59Z Receiver ready. Rate limit: 10 req/s. Signature validation: ON",
    ]
    notif_logs = [
        "[INFO] 2026-03-25T10:59:59Z Notification service healthy. Waiting for events.",
    ]

    # Break the config first, then emit the matching log lines, both in the
    # order in which the issues were selected.
    for picked in issues:
        breaker = breakers.get(picked.issue_id)
        if breaker is not None:
            breaker()
    for picked in issues:
        sender_lines, receiver_lines = failure_logs.get(picked.issue_id, ((), ()))
        sender_logs.extend(sender_lines)
        receiver_logs.extend(receiver_lines)
    # NOTE(review): the source's indentation was lost; this line is assumed
    # to run once, after the per-issue loop — confirm against the original.
    notif_logs.append("[WARN] 2026-03-25T11:00:05Z No events received in last 60s")

    # Dynamic logs: issue_id -> {service: [lines that appear after the fix]}
    dynamic_logs = {
        "medium_rate_limit": {
            "webhook_sender": ["[INFO] Rate limit adjusted to 10 req/s. 429 errors resolved."],
            "webhook_receiver": ["[INFO] Incoming request rate normalized. Processing events."],
        },
        "medium_retry": {
            "webhook_sender": ["[INFO] Retry config updated: 3 retries with backoff. 429 now retried."],
        },
        "medium_signature": {
            "webhook_sender": ["[INFO] Webhook signature computed and attached to requests."],
            "webhook_receiver": ["[INFO] Signature validation passed for incoming events."],
        },
        "medium_target_url": {
            "webhook_sender": ["[INFO] Target URL corrected to /hooks/incoming. Requests routing OK."],
        },
        "medium_content_encoding": {
            "webhook_sender": ["[INFO] Compression disabled. Receiver parsing payloads correctly."],
        },
    }

    service_graph = {
        "webhook_sender": ServiceNode(
            name="webhook_sender",
            depends_on=["webhook_receiver"],
            health_status="error",
        ),
        "webhook_receiver": ServiceNode(
            name="webhook_receiver",
            depends_on=["notification_service"],
            health_status="degraded",
        ),
        "notification_service": ServiceNode(
            name="notification_service",
            depends_on=[],
            health_status="healthy",
        ),
    }

    # Optimal fix order: rate limit before retry (dependency), then the
    # remaining issues in selection order.
    issue_ids = [picked.issue_id for picked in issues]
    optimal_order = []
    for prioritized in ("medium_rate_limit", "medium_retry"):
        if prioritized in issue_ids:
            optimal_order.append(prioritized)
    for candidate in issue_ids:
        if candidate not in optimal_order:
            optimal_order.append(candidate)

    description = (
        "A webhook-based notification system is dropping events. "
        "webhook_sender sends webhooks to webhook_receiver, which forwards to notification_service. "
        "Events are being lost due to multiple cascading failures in the webhook chain. "
        "Fix the webhook_sender configuration to restore event delivery."
    )
    context = (
        "Event flow: webhook_sender -> webhook_receiver -> notification_service. "
        "webhook_receiver validates signatures and enforces rate limits. "
        "Fixing upstream issues may reveal additional downstream problems."
    )
    scenario = Scenario(
        task_id="medium",
        difficulty="medium",
        description=description,
        max_steps=25,
        services=["webhook_sender", "webhook_receiver", "notification_service"],
        configs=configs,
        logs={
            "webhook_sender": sender_logs,
            "webhook_receiver": receiver_logs,
            "notification_service": notif_logs,
        },
        issues=issues,
        service_graph=service_graph,
        dynamic_logs=dynamic_logs,
        optimal_fix_order=optimal_order,
        context=context,
    )
    if seed is None:
        return scenario
    return _randomize_scenario(scenario, seed)
| # ─── Hard Scenario ──────────────────────────────────────────────────────────── | |
| def _hard_scenario(seed: Optional[int] = None) -> Scenario: | |
| """ | |
| Hard: E-commerce order processing pipeline with cascading failures. | |
| order_service -> inventory_service -> shipping_service | |
| Plus api_gateway and auth_service. | |
| Issue pool has 7 possible issues; canonical scenario uses 5. | |
| Multiple dependency chains make this genuinely challenging. | |
| """ | |
| issue_pool = [ | |
| Issue( | |
| issue_id="hard_wrong_url", | |
| service="order_service", | |
| description="Order service calling deprecated /v1/check instead of /v2/reserve", | |
| expected_fix={"inventory_url": "https://inventory.internal/v2/reserve"}, | |
| fix_key="inventory_url", | |
| log_hint="Endpoint deprecated. Use /v2/reserve", | |
| category="configuration", | |
| severity="error", | |
| root_cause_explanation=( | |
| "order_service calls /v1/check which was deprecated. The API gateway returns " | |
| "301 Moved Permanently. The redirect goes to /v2/check (read-only) instead of " | |
| "/v2/reserve (write). Inventory is never actually reserved." | |
| ), | |
| cascade_effects={ | |
| "inventory_service": "Receiving read-only check requests instead of reservation requests", | |
| "api_gateway": "Generating 301 redirect responses for deprecated endpoints", | |
| }, | |
| ), | |
| Issue( | |
| issue_id="hard_timeout", | |
| service="order_service", | |
| description="Timeout too short (2s) for inventory service that takes ~4s to process", | |
| expected_fix={"timeout": 10}, | |
| fix_key="timeout", | |
| log_hint="Timeout after 2s waiting for inventory response", | |
| depends_on=["hard_wrong_url"], | |
| # Timeout issue is masked by wrong URL — fix URL first to see real timeout | |
| category="networking", | |
| severity="error", | |
| root_cause_explanation=( | |
| "order_service has timeout=2s but inventory_service takes ~4s for reservation " | |
| "(including DB lock + stock validation). After fixing the URL, requests now reach " | |
| "inventory but time out before completion." | |
| ), | |
| cascade_effects={ | |
| "inventory_service": "Connections killed mid-processing, leaving orphaned DB locks", | |
| }, | |
| ), | |
| Issue( | |
| issue_id="hard_async", | |
| service="order_service", | |
| description="Synchronous mode causes race conditions between concurrent orders", | |
| expected_fix={"async_mode": True}, | |
| fix_key="async_mode", | |
| log_hint="Race condition: order ord_998 processed before ord_997 completed", | |
| category="configuration", | |
| severity="critical", | |
| root_cause_explanation=( | |
| "order_service runs in sync mode, blocking the main thread on each inventory call. " | |
| "Concurrent orders queue up and when timeouts occur, orders are processed out of " | |
| "order, causing double-reservation and stock inconsistencies." | |
| ), | |
| ), | |
| Issue( | |
| issue_id="hard_expired_token", | |
| service="inventory_service", | |
| description="Expired auth token used for shipping service requests", | |
| expected_fix={"headers.Authorization": "Bearer valid_token_789"}, | |
| fix_key="headers.Authorization", | |
| log_hint="Auth token expired_token_456 is no longer valid", | |
| category="authentication", | |
| severity="critical", | |
| root_cause_explanation=( | |
| "inventory_service uses Bearer expired_token_456 to authenticate with " | |
| "shipping_service. This token expired on 2026-03-24. All shipment creation " | |
| "requests fail with 401, so reserved inventory is never shipped." | |
| ), | |
| cascade_effects={ | |
| "shipping_service": "Rejecting all requests from inventory_service", | |
| "auth_service": "Logging repeated failed token validations", | |
| }, | |
| ), | |
| Issue( | |
| issue_id="hard_token_refresh", | |
| service="inventory_service", | |
| description="No automatic token refresh mechanism configured", | |
| expected_fix={"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True}, | |
| fix_key="token_refresh_url", | |
| log_hint="Token validation failed: expired_token_456 expired", | |
| depends_on=["hard_expired_token"], | |
| # Token refresh is only relevant after fixing the expired token | |
| category="configuration", | |
| severity="error", | |
| root_cause_explanation=( | |
| "Even after replacing the expired token, there's no auto-refresh mechanism. " | |
| "Tokens expire every 24h, so without auto_refresh=True and a refresh URL, " | |
| "the same issue will recur tomorrow." | |
| ), | |
| ), | |
| Issue( | |
| issue_id="hard_circuit_breaker", | |
| service="order_service", | |
| description="No circuit breaker — failed requests keep hammering inventory_service", | |
| expected_fix={"circuit_breaker.enabled": True, "circuit_breaker.failure_threshold": 5}, | |
| fix_key="circuit_breaker", | |
| log_hint="Circuit breaker not configured", | |
| category="configuration", | |
| severity="warning", | |
| root_cause_explanation=( | |
| "Without a circuit breaker, order_service keeps sending requests to " | |
| "inventory_service even when it's consistently failing. This wastes resources " | |
| "and can cause a cascading overload." | |
| ), | |
| ), | |
| Issue( | |
| issue_id="hard_idempotency", | |
| service="order_service", | |
| description="Missing idempotency key — retried requests create duplicate orders", | |
| expected_fix={"headers.Idempotency-Key": "order-{order_id}"}, | |
| fix_key="headers.Idempotency-Key", | |
| log_hint="Duplicate order detected: ord_997 submitted twice", | |
| depends_on=["hard_async"], | |
| category="protocol", | |
| severity="error", | |
| root_cause_explanation=( | |
| "When async retries fire, there's no Idempotency-Key header to deduplicate " | |
| "requests. inventory_service creates duplicate reservations for the same order." | |
| ), | |
| ), | |
| ] | |
| if seed is not None: | |
| rng = random.Random(seed) | |
| issues = _select_issues(issue_pool, 5, rng) | |
| else: | |
| issues = issue_pool[:5] # Canonical: first 5 | |
| configs = { | |
| "order_service": { | |
| "name": "order_service", | |
| "inventory_url": "https://inventory.internal/v2/reserve", | |
| "headers": { | |
| "Content-Type": "application/json", | |
| "Authorization": "Bearer valid_token_123", | |
| }, | |
| "timeout": 10, | |
| "async_mode": True, | |
| "callback_url": "https://orders.internal/callback", | |
| "circuit_breaker": { | |
| "enabled": True, | |
| "failure_threshold": 5, | |
| }, | |
| }, | |
| "inventory_service": { | |
| "name": "inventory_service", | |
| "endpoint_version": "v2", | |
| "reserve_path": "/v2/reserve", | |
| "check_path": "/v2/check", | |
| "shipping_url": "https://shipping.internal/v1/create", | |
| "headers": { | |
| "Content-Type": "application/json", | |
| "Authorization": "Bearer valid_token_789", | |
| }, | |
| "timeout": 10, | |
| "processing_time_avg": 4, | |
| "token_refresh_url": "https://auth.internal/refresh", | |
| "auto_refresh": True, | |
| }, | |
| "shipping_service": { | |
| "name": "shipping_service", | |
| "create_path": "/v1/create", | |
| "requires_auth": True, | |
| "accepted_auth": ["Bearer"], | |
| "token_validation_url": "https://auth.internal/validate", | |
| "status": "healthy", | |
| }, | |
| "api_gateway": { | |
| "routes": { | |
| "/v1/check": "DEPRECATED — use /v2/check", | |
| "/v2/reserve": "inventory_service", | |
| "/v2/check": "inventory_service", | |
| "/v1/create": "shipping_service", | |
| }, | |
| "timeout": 30, | |
| }, | |
| "auth_service": { | |
| "valid_tokens": ["valid_token_123", "valid_token_789"], | |
| "expired_tokens": ["expired_token_456"], | |
| "token_refresh_endpoint": "/refresh", | |
| "token_ttl_hours": 24, | |
| }, | |
| } | |
| # Apply broken config for each selected issue | |
| for issue in issues: | |
| if issue.issue_id == "hard_wrong_url": | |
| configs["order_service"]["inventory_url"] = "https://inventory.internal/v1/check" | |
| elif issue.issue_id == "hard_timeout": | |
| configs["order_service"]["timeout"] = 2 | |
| elif issue.issue_id == "hard_async": | |
| configs["order_service"]["async_mode"] = False | |
| elif issue.issue_id == "hard_expired_token": | |
| configs["inventory_service"]["headers"]["Authorization"] = "Bearer expired_token_456" | |
| elif issue.issue_id == "hard_token_refresh": | |
| configs["inventory_service"].pop("token_refresh_url", None) | |
| configs["inventory_service"]["auto_refresh"] = False | |
| elif issue.issue_id == "hard_circuit_breaker": | |
| configs["order_service"]["circuit_breaker"] = {"enabled": False} | |
| elif issue.issue_id == "hard_idempotency": | |
| configs["order_service"]["headers"].pop("Idempotency-Key", None) | |
| # Build logs | |
| order_logs = [] | |
| inventory_logs = [] | |
| shipping_logs = [] | |
| gateway_logs = [] | |
| auth_logs = [ | |
| "[INFO] 2026-03-25T12:00:00Z Auth service ready. Valid tokens: 2, Expired: 1", | |
| ] | |
| for issue in issues: | |
| if issue.issue_id == "hard_wrong_url": | |
| order_logs.extend([ | |
| "[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently", | |
| "[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}", | |
| ]) | |
| inventory_logs.append( | |
| "[INFO] 2026-03-25T12:00:05Z Received request on /v1/check -> redirecting to /v2/check" | |
| ) | |
| gateway_logs.extend([ | |
| "[WARN] 2026-03-25T12:00:05Z Deprecated endpoint /v1/check accessed by order_service", | |
| "[INFO] 2026-03-25T12:00:05Z Redirecting /v1/check -> /v2/check (301)", | |
| ]) | |
| elif issue.issue_id == "hard_timeout": | |
| order_logs.extend([ | |
| "[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response", | |
| "[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out", | |
| ]) | |
| inventory_logs.append( | |
| "[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s" | |
| ) | |
| elif issue.issue_id == "hard_async": | |
| order_logs.extend([ | |
| "[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response", | |
| "[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed", | |
| ]) | |
| elif issue.issue_id == "hard_expired_token": | |
| inventory_logs.extend([ | |
| "[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized", | |
| "[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid", | |
| "[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed", | |
| ]) | |
| shipping_logs.append( | |
| "[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired" | |
| ) | |
| auth_logs.append( | |
| "[WARN] 2026-03-25T12:00:10Z Token validation failed: expired_token_456 expired at 2026-03-24T00:00:00Z" | |
| ) | |
| elif issue.issue_id == "hard_token_refresh": | |
| auth_logs.append( | |
| "[WARN] 2026-03-25T12:00:11Z Token validation failed: expired_token_456 expired. No refresh configured." | |
| ) | |
| elif issue.issue_id == "hard_circuit_breaker": | |
| order_logs.extend([ | |
| "[WARN] 2026-03-25T12:00:12Z Circuit breaker not configured, continuing to send requests after 10 failures", | |
| "[ERROR] 2026-03-25T12:00:12Z System overload: 50 pending requests to inventory_service", | |
| ]) | |
| elif issue.issue_id == "hard_idempotency": | |
| order_logs.append( | |
| "[ERROR] 2026-03-25T12:00:13Z Duplicate order detected: ord_997 submitted twice" | |
| ) | |
| inventory_logs.append( | |
| "[WARN] 2026-03-25T12:00:13Z Duplicate reservation request for order ord_997" | |
| ) | |
| if not shipping_logs: | |
| shipping_logs.append( | |
| "[INFO] 2026-03-25T12:00:00Z Shipping service healthy, awaiting authenticated requests" | |
| ) | |
| dynamic_logs = { | |
| "hard_wrong_url": { | |
| "order_service": ["[INFO] URL corrected to /v2/reserve. Inventory requests routing correctly."], | |
| "api_gateway": ["[INFO] order_service now using correct /v2/reserve endpoint."], | |
| }, | |
| "hard_timeout": { | |
| "order_service": ["[INFO] Timeout increased to 10s. Inventory responses completing."], | |
| "inventory_service": ["[INFO] Reservations completing successfully within timeout."], | |
| }, | |
| "hard_async": { | |
| "order_service": ["[INFO] Async mode enabled. Orders processing concurrently without blocking."], | |
| }, | |
| "hard_expired_token": { | |
| "inventory_service": ["[INFO] Auth token refreshed. Shipping service requests authenticated."], | |
| "shipping_service": ["[INFO] Authentication successful for inventory_service."], | |
| }, | |
| "hard_token_refresh": { | |
| "inventory_service": ["[INFO] Auto token refresh configured. Tokens will be refreshed before expiry."], | |
| }, | |
| "hard_circuit_breaker": { | |
| "order_service": ["[INFO] Circuit breaker enabled. Will stop sending after 5 consecutive failures."], | |
| }, | |
| "hard_idempotency": { | |
| "order_service": ["[INFO] Idempotency keys set. Duplicate requests will be safely deduplicated."], | |
| }, | |
| } | |
| service_graph = { | |
| "order_service": ServiceNode( | |
| name="order_service", | |
| depends_on=["inventory_service", "api_gateway"], | |
| health_status="error", | |
| ), | |
| "inventory_service": ServiceNode( | |
| name="inventory_service", | |
| depends_on=["shipping_service", "auth_service"], | |
| health_status="degraded", | |
| ), | |
| "shipping_service": ServiceNode( | |
| name="shipping_service", | |
| depends_on=[], | |
| health_status="healthy", | |
| ), | |
| "api_gateway": ServiceNode( | |
| name="api_gateway", | |
| depends_on=[], | |
| health_status="healthy", | |
| ), | |
| "auth_service": ServiceNode( | |
| name="auth_service", | |
| depends_on=[], | |
| health_status="healthy", | |
| ), | |
| } | |
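| # Build the optimal fix order from the hand-maintained preference list below. | |
| # Issue.depends_on is not consulted here, so that list itself must keep prerequisites | |
| # ahead of their dependents; selected issues missing from it are appended at the end. | |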
| issue_ids = [i.issue_id for i in issues] | |
| optimal_order = [] | |
| ordered_preference = [ | |
| "hard_wrong_url", "hard_timeout", "hard_async", | |
| "hard_expired_token", "hard_token_refresh", | |
| "hard_circuit_breaker", "hard_idempotency", | |
| ] | |
| for iid in ordered_preference: | |
| if iid in issue_ids: | |
| optimal_order.append(iid) | |
| for iid in issue_ids: | |
| if iid not in optimal_order: | |
| optimal_order.append(iid) | |
| scenario = Scenario( | |
| task_id="hard", | |
| difficulty="hard", | |
| description=( | |
| "An e-commerce order processing pipeline is failing with cascading errors. " | |
| "Order Service calls Inventory Service, which calls Shipping Service. " | |
| "Multiple issues span the pipeline: wrong endpoints, timeouts, race conditions, " | |
| "expired authentication tokens, and missing resilience patterns. " | |
| "Some issues are masked by upstream failures — you must fix issues in the right " | |
| "order to diagnose the full chain." | |
| ), | |
| max_steps=40, | |
| services=["order_service", "inventory_service", "shipping_service", "api_gateway", "auth_service"], | |
| configs=configs, | |
| logs={ | |
| "order_service": order_logs, | |
| "inventory_service": inventory_logs, | |
| "shipping_service": shipping_logs, | |
| "api_gateway": gateway_logs, | |
| "auth_service": auth_logs, | |
| }, | |
| issues=issues, | |
| service_graph=service_graph, | |
| dynamic_logs=dynamic_logs, | |
| optimal_fix_order=optimal_order, | |
| context=( | |
| "Request flow: order_service -> api_gateway -> inventory_service -> shipping_service. " | |
| "auth_service provides token validation for all inter-service calls. " | |
| "Some issues are masked by upstream failures — fixing upstream issues may reveal " | |
| "new errors downstream. Pay attention to service dependencies." | |
| ), | |
| ) | |
| if seed is not None: | |
| scenario = _randomize_scenario(scenario, seed) | |
| return scenario | |