Spaces:

yadnyeshkolte
/

api-debug-env

Sleeping

App Files Files Community

api-debug-env / scenarios.py

yadnyeshkolte

chore: remove __pycache__ files

8b10144 about 1 month ago

raw

history blame contribute delete

46.3 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Scenario definitions for the API Integration Debugging Environment.

	Each scenario models a realistic multi-service API ecosystem with:
	- Service dependency graphs (upstream/downstream relationships)
	- Cascading failures (upstream bugs propagate downstream)
	- Dynamic logs that update when issues are fixed
	- Expanded issue pools for seed-based random subset selection
	"""

	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Optional, Tuple
	import random


	@dataclass
	class Issue:
	    """One injected fault inside an API integration scenario.

	    The first six fields are required and are accepted positionally in the
	    order declared below. The remaining fields carry cascading-failure
	    metadata and fall back to empty or generic defaults.
	    """

	    # --- Identity and location ---
	    issue_id: str
	    service: str
	    description: str

	    # --- Fix specification and diagnostic hint ---
	    expected_fix: Dict[str, Any]
	    # The key in the config that needs fixing.
	    fix_key: str
	    # Log line that hints at this issue.
	    log_hint: str

	    # --- Cascading-failure metadata ---
	    # Issues that must be fixed before this one can be diagnosed.
	    depends_on: List[str] = field(default_factory=list)
	    # service -> error message caused by this issue being unfixed.
	    cascade_effects: Dict[str, str] = field(default_factory=dict)
	    # Issue category: configuration, authentication, networking, protocol.
	    category: str = "configuration"
	    # Severity: error, warning, critical.
	    severity: str = "error"
	    # Detailed explanation of why this issue occurs
	    # (for grading diagnosis quality).
	    root_cause_explanation: str = ""


	@dataclass
	class ServiceNode:
	    """A single service entry in the service dependency graph."""

	    name: str
	    # Services this one calls (upstream dependencies).
	    depends_on: List[str] = field(default_factory=list)
	    # One of: healthy, degraded, error, unreachable.
	    health_status: str = "degraded"


	@dataclass
	class Scenario:
	    """A complete API debugging scenario with dependency graph.

	    The first eight fields are required and describe the task, the services
	    involved, their initial (broken) configuration, their logs and the
	    issues to be found. The remaining fields are optional extras.
	    """

	    task_id: str
	    difficulty: str
	    description: str
	    max_steps: int
	    # Names of the services taking part in the scenario.
	    services: List[str]
	    # service -> configuration dictionary for that service.
	    configs: Dict[str, Dict[str, Any]]
	    # service -> log lines for that service.
	    logs: Dict[str, List[str]]
	    issues: List[Issue]

	    # --- Optional fields ---
	    # Service dependency graph: service name -> ServiceNode.
	    service_graph: Dict[str, ServiceNode] = field(default_factory=dict)
	    # issue_id -> {service: [new log lines once that issue is fixed]}.
	    # This is the layout produced by the scenario builders in this module;
	    # the outer key is the issue id, not the service name.
	    dynamic_logs: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
	    # Optimal order to fix issues (for strategy scoring).
	    optimal_fix_order: List[str] = field(default_factory=list)
	    # Additional scenario context for the agent.
	    context: str = ""


	def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
	    """
	    Build and return the scenario registered under ``task_id``.

	    Args:
	        task_id: One of 'easy', 'medium', 'hard'.
	        seed: Optional seed forwarded to the scenario builder. With a seed
	            the builder picks a random subset of its issue pool and
	            randomizes the logs; with ``None`` it returns the canonical
	            scenario.

	    Raises:
	        ValueError: If ``task_id`` is not a known task.
	    """
	    builders = {
	        "easy": _easy_scenario,
	        "medium": _medium_scenario,
	        "hard": _hard_scenario,
	    }
	    build = builders.get(task_id)
	    if build is None:
	        raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(builders)}")
	    return build(seed=seed)


	def get_all_task_ids() -> List[str]:
	    """List every task ID that can be requested, easiest first."""
	    task_ids = ("easy", "medium", "hard")
	    # A fresh list is handed out on every call so callers may mutate it.
	    return list(task_ids)


	def _dependency_closure(
	    issue: Issue,
	    by_id: Dict[str, Issue],
	    chosen: Dict[str, Issue],
	) -> List[Issue]:
	    """Return ``issue`` plus every transitive dependency not yet in ``chosen``.

	    Dependency ids that do not exist in ``by_id`` are ignored. Cycles are
	    tolerated because each issue id is visited at most once.
	    """
	    closure: Dict[str, Issue] = {}
	    stack = [issue]
	    while stack:
	        current = stack.pop()
	        if current.issue_id in chosen or current.issue_id in closure:
	            continue
	        closure[current.issue_id] = current
	        for dep_id in current.depends_on:
	            dep = by_id.get(dep_id)
	            if dep is not None:
	                stack.append(dep)
	    return list(closure.values())


	def _select_issues(pool: List[Issue], count: int, rng: random.Random) -> List[Issue]:
	    """Select a random subset of issues from a pool, respecting dependencies.

	    An issue is only ever returned together with all of its (transitive)
	    dependencies that exist in ``pool``. A candidate whose missing
	    dependencies would not fit in the remaining budget is skipped instead of
	    being added and later truncated, which previously could return a
	    dependent issue without the issue it depends on.

	    Args:
	        pool: Candidate issues. Issue ids are expected to be unique.
	        count: Number of issues wanted. If it is at least ``len(pool)`` the
	            whole pool is returned.
	        rng: Random source; the result is deterministic for a given state.

	    Returns:
	        The selected issues in random order. The list holds ``count`` issues
	        whenever the candidates' dependency closures allow the budget to be
	        filled exactly, and never more than ``count``.
	    """
	    if count >= len(pool):
	        selected = list(pool)
	    else:
	        by_id = {issue.issue_id: issue for issue in pool}
	        chosen: Dict[str, Issue] = {}
	        available = list(pool)
	        while len(chosen) < count and available:
	            candidate = rng.choice(available)
	            available.remove(candidate)
	            closure = _dependency_closure(candidate, by_id, chosen)
	            if len(chosen) + len(closure) > count:
	                # Taking this issue would force us to drop one of its
	                # dependencies later; skip it and try another candidate.
	                continue
	            for member in closure:
	                chosen[member.issue_id] = member
	            # Dependencies pulled in above must not be drawn again.
	            available = [item for item in available if item.issue_id not in chosen]
	        selected = list(chosen.values())

	    # Randomize the order so the result does not leak the selection order.
	    rng.shuffle(selected)
	    return selected


	def _randomize_scenario(scenario: Scenario, seed: int) -> Scenario:
	    """Apply seed-based randomization to a scenario's logs, in place.

	    Two things are randomized, both deterministically for a given ``seed``:
	    the order of the log lines of every service, and the calendar day in
	    lines that mention a 2026 date. Within one line "2026-03-25" is mapped
	    to a random day D in 20..28 and "2026-03-24" to day D - 1, so the
	    one-day gap between the two dates is preserved.

	    Args:
	        scenario: Scenario whose ``logs`` mapping is rewritten.
	        seed: Seed for the private random generator.

	    Returns:
	        The same ``scenario`` object that was passed in.
	    """
	    rng = random.Random(seed)

	    # Shuffle log entries for each service.
	    for service_logs in scenario.logs.values():
	        rng.shuffle(service_logs)

	    # NOTE: earlier revisions drew a base hour/minute and two per-line
	    # offsets for a time-of-day shift that was never applied to the text.
	    # The draws are kept (and their results discarded) so that an existing
	    # seed keeps producing the same dates as before.
	    rng.randint(8, 16)
	    rng.randint(0, 59)

	    old_today = "2026-03-25"
	    old_yesterday = "2026-03-24"
	    for service, log_list in scenario.logs.items():
	        new_logs = []
	        for log_line in log_list:
	            rng.randint(1, 5)
	            rng.randint(1, 5)
	            if "2026-" in log_line:
	                day = rng.randint(20, 28)
	                new_today = f"2026-03-{day:02d}"
	                new_yesterday = f"2026-03-{day - 1:02d}"
	                # Substitute both dates in a single pass. Chaining two
	                # str.replace() calls would rewrite the freshly inserted
	                # date again when day == 24, turning "today" into the 23rd.
	                log_line = new_today.join(
	                    piece.replace(old_yesterday, new_yesterday)
	                    for piece in log_line.split(old_today)
	                )
	            new_logs.append(log_line)
	        scenario.logs[service] = new_logs

	    return scenario


	# ─── Easy Scenario ───────────────────────────────────────────────────────────

	def _easy_scenario(seed: Optional[int] = None) -> Scenario:
	    """
	    Easy: Payment API integration failures.
	    Agent must diagnose auth + content-type issues with clear log signals.

	    Issue pool has 4 possible issues; canonical scenario uses 2.

	    Args:
	        seed: When None, the canonical scenario (auth + content type) is
	            returned. Otherwise two issues are drawn from the pool with a
	            generator seeded by ``seed`` and the logs are randomized with
	            the same seed.

	    Returns:
	        The assembled scenario. Log lines that echo the client's
	        configuration (base URL, request headers) are derived from the
	        generated config, so they never contradict it in seeded variants.
	    """
	    # Full issue pool (4 issues, canonical uses 2)
	    issue_pool = [
	        Issue(
	            issue_id="easy_auth",
	            service="payment_client",
	            description="Missing Authorization header — payment gateway requires Bearer token authentication",
	            expected_fix={"headers.Authorization": "Bearer <token>"},
	            fix_key="headers.Authorization",
	            log_hint="Missing or invalid Authorization header",
	            category="authentication",
	            severity="critical",
	            root_cause_explanation=(
	                "The payment_client is missing the Authorization header entirely. "
	                "The payment_gateway requires Bearer token auth on all /process requests. "
	                "This results in HTTP 401 on every payment attempt."
	            ),
	            cascade_effects={
	                "payment_gateway": "All requests from payment_client rejected with 401"
	            },
	        ),
	        Issue(
	            issue_id="easy_content_type",
	            service="payment_client",
	            description="Wrong Content-Type header (text/plain instead of application/json)",
	            expected_fix={"headers.Content-Type": "application/json"},
	            fix_key="headers.Content-Type",
	            log_hint="Content-Type must be application/json",
	            category="protocol",
	            severity="error",
	            root_cause_explanation=(
	                "The payment_client sends Content-Type: text/plain, but the gateway "
	                "only accepts application/json. This causes HTTP 415 Unsupported Media Type. "
	                "The gateway cannot parse the request body."
	            ),
	            cascade_effects={
	                "payment_gateway": "Request body parsing fails for payment_client requests"
	            },
	        ),
	        Issue(
	            issue_id="easy_timeout",
	            service="payment_client",
	            description="Timeout set too low (5s) for payment processing that takes 8-12s",
	            expected_fix={"timeout": 30},
	            fix_key="timeout",
	            log_hint="Request timed out after 5s",
	            category="networking",
	            severity="error",
	            root_cause_explanation=(
	                "The payment_client has timeout=5s, but payment processing at the gateway "
	                "takes 8-12s for fraud checks. Legitimate payments are timing out."
	            ),
	        ),
	        Issue(
	            issue_id="easy_base_url",
	            service="payment_client",
	            description="Base URL pointing to deprecated v1 endpoint instead of v2",
	            expected_fix={"base_url": "https://api.paymentgateway.com/v2"},
	            fix_key="base_url",
	            log_hint="API v1 is deprecated",
	            category="configuration",
	            severity="warning",
	            root_cause_explanation=(
	                "The payment_client uses /v1 which is deprecated and returning 301 redirects. "
	                "The gateway v2 endpoint has different request schemas, causing deserialization errors."
	            ),
	        ),
	    ]

	    # Select issues based on seed
	    if seed is not None:
	        rng = random.Random(seed)
	        issues = _select_issues(issue_pool, 2, rng)
	    else:
	        issues = issue_pool[:2]  # Canonical: auth + content_type

	    # Start from a fully working configuration ...
	    configs = {
	        "payment_client": {
	            "base_url": "https://api.paymentgateway.com/v2",
	            "headers": {
	                "Content-Type": "application/json",
	                "Accept": "application/json",
	            },
	            "timeout": 30,
	            "retry_count": 3,
	        },
	        "payment_gateway": {
	            "endpoint": "/process",
	            "method": "POST",
	            "required_headers": ["Authorization", "Content-Type"],
	            "accepted_content_types": ["application/json"],
	            "auth_scheme": "Bearer",
	            "processing_time_ms": "8000-12000",
	        },
	    }
	    client_config = configs["payment_client"]
	    client_headers = client_config["headers"]

	    # ... and break it once for each selected issue. This happens before the
	    # logs are built so that the logs can quote the real configuration.
	    for issue in issues:
	        if issue.issue_id == "easy_auth":
	            # The header is absent from the base config already; the pop only
	            # guarantees that it stays absent.
	            client_headers.pop("Authorization", None)
	        elif issue.issue_id == "easy_content_type":
	            client_headers["Content-Type"] = "text/plain"
	        elif issue.issue_id == "easy_timeout":
	            client_config["timeout"] = 5
	        elif issue.issue_id == "easy_base_url":
	            client_config["base_url"] = "https://api.paymentgateway.com/v1"

	    # Build logs based on selected issues
	    client_logs = [
	        f"[INFO] 2026-03-25T10:15:20Z Payment client initialized with base_url={client_config['base_url']}",
	    ]
	    gateway_logs = [
	        "[INFO] 2026-03-25T10:15:20Z Gateway ready, accepting application/json with Bearer auth",
	    ]

	    for issue in issues:
	        if issue.issue_id == "easy_auth":
	            client_logs.extend([
	                "[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
	                "[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
	                # Echo the headers that are actually configured; the
	                # canonical scenario still reads Content-Type=text/plain.
	                (
	                    "[WARN] 2026-03-25T10:15:22Z Request headers: "
	                    f"Content-Type={client_headers['Content-Type']}, "
	                    f"Accept={client_headers['Accept']}"
	                ),
	            ])
	            gateway_logs.append(
	                "[WARN] 2026-03-25T10:15:23Z Rejected request: no Authorization header present"
	            )
	        elif issue.issue_id == "easy_content_type":
	            client_logs.extend([
	                "[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
	                "[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
	            ])
	            gateway_logs.append(
	                "[WARN] 2026-03-25T10:15:24Z Rejected request: unsupported Content-Type 'text/plain'"
	            )
	        elif issue.issue_id == "easy_timeout":
	            client_logs.extend([
	                "[ERROR] 2026-03-25T10:15:30Z POST /process -> Request timed out after 5s",
	                "[WARN] 2026-03-25T10:15:30Z Payment processing takes 8-12s for fraud verification",
	            ])
	            gateway_logs.append(
	                "[INFO] 2026-03-25T10:15:30Z Processing payment... estimated time: 10s"
	            )
	        elif issue.issue_id == "easy_base_url":
	            client_logs.extend([
	                "[ERROR] 2026-03-25T10:15:21Z GET /v1/status -> 301 Moved Permanently",
	                "[WARN] 2026-03-25T10:15:21Z API v1 is deprecated, migrate to /v2",
	            ])
	            gateway_logs.append(
	                "[WARN] 2026-03-25T10:15:21Z Deprecated v1 endpoint accessed"
	            )

	    # Dynamic logs: what changes after fixing each issue
	    # (issue_id -> {service: [log lines]}), only for the selected issues.
	    dynamic_logs = {}
	    for issue in issues:
	        if issue.issue_id == "easy_auth":
	            dynamic_logs["easy_auth"] = {
	                "payment_client": ["[INFO] Authorization header set. Retrying request..."],
	                "payment_gateway": ["[INFO] Authentication successful for payment_client"],
	            }
	        elif issue.issue_id == "easy_content_type":
	            dynamic_logs["easy_content_type"] = {
	                "payment_client": ["[INFO] Content-Type set to application/json. Request body parsed."],
	                "payment_gateway": ["[INFO] Request body parsed successfully as JSON"],
	            }
	        elif issue.issue_id == "easy_timeout":
	            dynamic_logs["easy_timeout"] = {
	                "payment_client": ["[INFO] Timeout increased to 30s. Payment processing completing normally."],
	            }
	        elif issue.issue_id == "easy_base_url":
	            dynamic_logs["easy_base_url"] = {
	                "payment_client": ["[INFO] Migrated to v2 API endpoint. Requests routing correctly."],
	            }

	    # Service dependency graph
	    service_graph = {
	        "payment_client": ServiceNode(
	            name="payment_client",
	            depends_on=["payment_gateway"],
	            health_status="error",
	        ),
	        "payment_gateway": ServiceNode(
	            name="payment_gateway",
	            depends_on=[],
	            health_status="healthy",
	        ),
	    }

	    scenario = Scenario(
	        task_id="easy",
	        difficulty="easy",
	        description=(
	            "A payment processing API integration is failing. "
	            "The client is sending requests to the payment gateway but getting error responses. "
	            "Diagnose the root causes by inspecting error logs and service configurations, "
	            "then submit the correct configuration fixes."
	        ),
	        max_steps=15,
	        services=["payment_client", "payment_gateway"],
	        configs=configs,
	        logs={"payment_client": client_logs, "payment_gateway": gateway_logs},
	        issues=issues,
	        service_graph=service_graph,
	        dynamic_logs=dynamic_logs,
	        # The easy issues are independent, so any order is optimal.
	        optimal_fix_order=[i.issue_id for i in issues],
	        context=(
	            "The payment_client sends HTTP requests to payment_gateway. "
	            "payment_gateway requires Bearer authentication and JSON content type."
	        ),
	    )

	    if seed is not None:
	        scenario = _randomize_scenario(scenario, seed)

	    return scenario


	# ─── Medium Scenario ─────────────────────────────────────────────────────────

	def _medium_scenario(seed: Optional[int] = None) -> Scenario:
	    """
	    Medium: webhook chain with cascading failures.

	    Events flow webhook_sender -> webhook_receiver -> notification_service and
	    are lost because of rate limiting, retry and authentication problems on
	    the sender side.

	    The pool holds 5 issues and the scenario uses 3 of them: the first three
	    when ``seed`` is None, otherwise a seeded random selection followed by
	    seeded log randomization. The retry issue depends on the rate-limit
	    issue.
	    """
	    issue_pool = [
	        Issue(
	            issue_id="medium_rate_limit",
	            service="webhook_sender",
	            category="networking",
	            severity="error",
	            description="Rate limit too high (100/s vs receiver's 10/s limit) causing 429 responses",
	            fix_key="rate_limit.requests_per_second",
	            expected_fix={"rate_limit.requests_per_second": 10},
	            log_hint="Rate limit exceeded: 100 req/s > 10 req/s allowed",
	            root_cause_explanation=(
	                "webhook_sender fires at 100 req/s but webhook_receiver only accepts 10 req/s. "
	                "The excess requests get 429 Too Many Requests, and with only 1 retry, most events are dropped."
	            ),
	            cascade_effects={
	                "webhook_receiver": "Overwhelmed with requests, dropping 90% of events",
	                "notification_service": "No events arriving downstream",
	            },
	        ),
	        Issue(
	            issue_id="medium_retry",
	            service="webhook_sender",
	            category="configuration",
	            severity="error",
	            description="Insufficient retry config: only 1 retry, no backoff, missing 429 in retry_on_status",
	            fix_key="retry",
	            expected_fix={
	                "retry.max_retries": 3,
	                "retry.backoff_factor": 2,
	                "retry.retry_on_status": [429, 500],
	            },
	            log_hint="Retry attempt 1/1 failed. No more retries.",
	            # Masked by the rate-limit issue: even with retries, 100 req/s
	            # would still overwhelm the receiver.
	            depends_on=["medium_rate_limit"],
	            root_cause_explanation=(
	                "Even after fixing the rate limit, the sender only retries once with no backoff. "
	                "Transient 429s during bursts aren't retried because 429 isn't in retry_on_status. "
	                "This causes event loss on any temporary load spike."
	            ),
	        ),
	        Issue(
	            issue_id="medium_signature",
	            service="webhook_sender",
	            category="authentication",
	            severity="critical",
	            description="Webhook signature header is empty — receiver rejects unsigned events",
	            fix_key="headers.X-Webhook-Signature",
	            expected_fix={"headers.X-Webhook-Signature": "sha256=<computed>"},
	            log_hint="Signature validation FAILED: received empty signature",
	            root_cause_explanation=(
	                "webhook_sender has signing_secret configured but the X-Webhook-Signature header "
	                "is empty string. webhook_receiver validates signatures and drops all unsigned "
	                "events as potential spoofing attempts."
	            ),
	            cascade_effects={
	                "webhook_receiver": "Dropping all events as unsigned/spoofed",
	                "notification_service": "Zero events forwarded from receiver",
	            },
	        ),
	        Issue(
	            issue_id="medium_target_url",
	            service="webhook_sender",
	            category="configuration",
	            severity="error",
	            description="Target URL pointing to wrong receiver endpoint (/webhook vs /hooks/incoming)",
	            fix_key="target_url",
	            expected_fix={"target_url": "https://receiver.internal/hooks/incoming"},
	            log_hint="404 Not Found on /webhook endpoint",
	            root_cause_explanation=(
	                "webhook_sender posts to /webhook but the receiver listens on /hooks/incoming. "
	                "All requests get 404 Not Found."
	            ),
	        ),
	        Issue(
	            issue_id="medium_content_encoding",
	            service="webhook_sender",
	            category="protocol",
	            severity="warning",
	            description="Payload compression enabled but receiver doesn't support gzip",
	            fix_key="compression",
	            expected_fix={"compression": "none"},
	            log_hint="Unsupported Content-Encoding: gzip",
	            root_cause_explanation=(
	                "webhook_sender compresses payloads with gzip but webhook_receiver "
	                "doesn't have a decompression middleware. Requests fail with 415."
	            ),
	        ),
	    ]

	    # Canonical run: rate_limit, retry, signature. Seeded run: random subset.
	    if seed is None:
	        issues = issue_pool[:3]
	    else:
	        rng = random.Random(seed)
	        issues = _select_issues(issue_pool, 3, rng)

	    # Healthy baseline configuration for all three services.
	    configs = {
	        "webhook_sender": {
	            "target_url": "https://receiver.internal/hooks/incoming",
	            "headers": {
	                "Content-Type": "application/json",
	                "X-Webhook-Signature": "sha256=computed_hmac",
	            },
	            "rate_limit": {
	                "requests_per_second": 10,
	                "burst_size": 20,
	            },
	            "retry": {
	                "max_retries": 3,
	                "backoff_factor": 2,
	                "retry_on_status": [429, 500],
	            },
	            "signing_secret": "whsec_abc123secret",
	            "compression": "none",
	        },
	        "webhook_receiver": {
	            "endpoint": "/hooks/incoming",
	            "rate_limit": {
	                "requests_per_second": 10,
	                "burst_size": 20,
	            },
	            "signature_validation": True,
	            "expected_signature_header": "X-Webhook-Signature",
	            "signing_secret": "whsec_abc123secret",
	            "forward_to": "https://notifications.internal/notify",
	            "supported_encodings": ["identity"],
	        },
	        "notification_service": {
	            "endpoint": "/notify",
	            "accepts_from": ["webhook_receiver"],
	            "status": "healthy",
	        },
	    }

	    # Per-issue breakage of the sender config, expressed as
	    # (key path inside the webhook_sender config, broken value) pairs.
	    sender_overrides = {
	        "medium_rate_limit": [
	            (("rate_limit", "requests_per_second"), 100),
	            (("rate_limit", "burst_size"), 200),
	        ],
	        "medium_retry": [
	            (
	                ("retry",),
	                {
	                    "max_retries": 1,
	                    "backoff_factor": 0,
	                    "retry_on_status": [500],
	                },
	            ),
	        ],
	        "medium_signature": [
	            (("headers", "X-Webhook-Signature"), ""),
	        ],
	        "medium_target_url": [
	            (("target_url",), "https://receiver.internal/webhook"),
	        ],
	        "medium_content_encoding": [
	            (("compression",), "gzip"),
	        ],
	    }
	    for issue in issues:
	        for key_path, broken_value in sender_overrides.get(issue.issue_id, ()):
	            node = configs["webhook_sender"]
	            for key in key_path[:-1]:
	                node = node[key]
	            node[key_path[-1]] = broken_value

	    # Every service starts with one startup line; issue-specific lines are
	    # appended below in the order of the selected issues.
	    logs = {
	        "webhook_sender": [
	            "[INFO] 2026-03-25T10:59:59Z Webhook sender started. Signature header: X-Webhook-Signature",
	        ],
	        "webhook_receiver": [
	            "[INFO] 2026-03-25T10:59:59Z Receiver ready. Rate limit: 10 req/s. Signature validation: ON",
	        ],
	        "notification_service": [
	            "[INFO] 2026-03-25T10:59:59Z Notification service healthy. Waiting for events.",
	        ],
	    }
	    issue_log_lines = {
	        "medium_rate_limit": {
	            "webhook_sender": [
	                "[ERROR] 2026-03-25T11:00:01Z POST /hooks/incoming -> 429 Too Many Requests",
	                "[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
	                "[WARN] 2026-03-25T11:00:00Z Sending at 100 req/s (burst=200)",
	            ],
	            "webhook_receiver": [
	                "[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed",
	            ],
	        },
	        "medium_retry": {
	            "webhook_sender": [
	                "[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
	                "[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
	            ],
	        },
	        "medium_signature": {
	            "webhook_receiver": [
	                "[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
	                "[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
	            ],
	        },
	        "medium_target_url": {
	            "webhook_sender": [
	                "[ERROR] 2026-03-25T11:00:01Z POST /webhook -> 404 Not Found on /webhook endpoint",
	                "[WARN] 2026-03-25T11:00:01Z Receiver endpoint may have changed",
	            ],
	        },
	        "medium_content_encoding": {
	            "webhook_receiver": [
	                "[ERROR] 2026-03-25T11:00:02Z Unsupported Content-Encoding: gzip",
	                "[WARN] 2026-03-25T11:00:02Z Cannot decompress payload from webhook_sender",
	            ],
	        },
	    }
	    for issue in issues:
	        for service_name, lines in issue_log_lines.get(issue.issue_id, {}).items():
	            logs[service_name].extend(lines)

	    # The downstream service always reports that nothing is arriving.
	    logs["notification_service"].append(
	        "[WARN] 2026-03-25T11:00:05Z No events received in last 60s"
	    )

	    # Log lines revealed once an issue is fixed: issue_id -> {service: lines}.
	    # Entries exist for the whole pool, not only for the selected issues.
	    dynamic_logs = {
	        "medium_rate_limit": {
	            "webhook_sender": ["[INFO] Rate limit adjusted to 10 req/s. 429 errors resolved."],
	            "webhook_receiver": ["[INFO] Incoming request rate normalized. Processing events."],
	        },
	        "medium_retry": {
	            "webhook_sender": ["[INFO] Retry config updated: 3 retries with backoff. 429 now retried."],
	        },
	        "medium_signature": {
	            "webhook_sender": ["[INFO] Webhook signature computed and attached to requests."],
	            "webhook_receiver": ["[INFO] Signature validation passed for incoming events."],
	        },
	        "medium_target_url": {
	            "webhook_sender": ["[INFO] Target URL corrected to /hooks/incoming. Requests routing OK."],
	        },
	        "medium_content_encoding": {
	            "webhook_sender": ["[INFO] Compression disabled. Receiver parsing payloads correctly."],
	        },
	    }

	    service_graph = {
	        "webhook_sender": ServiceNode(
	            name="webhook_sender",
	            depends_on=["webhook_receiver"],
	            health_status="error",
	        ),
	        "webhook_receiver": ServiceNode(
	            name="webhook_receiver",
	            depends_on=["notification_service"],
	            health_status="degraded",
	        ),
	        "notification_service": ServiceNode(
	            name="notification_service",
	            depends_on=[],
	            health_status="healthy",
	        ),
	    }

	    # Optimal fix order: rate limit first, then retry (which depends on it),
	    # then everything else in selection order.
	    issue_ids = [issue.issue_id for issue in issues]
	    optimal_order = [
	        iid for iid in ("medium_rate_limit", "medium_retry") if iid in issue_ids
	    ]
	    for iid in issue_ids:
	        if iid not in optimal_order:
	            optimal_order.append(iid)

	    scenario = Scenario(
	        task_id="medium",
	        difficulty="medium",
	        description=(
	            "A webhook-based notification system is dropping events. "
	            "webhook_sender sends webhooks to webhook_receiver, which forwards to notification_service. "
	            "Events are being lost due to multiple cascading failures in the webhook chain. "
	            "Fix the webhook_sender configuration to restore event delivery."
	        ),
	        max_steps=25,
	        services=["webhook_sender", "webhook_receiver", "notification_service"],
	        configs=configs,
	        logs=logs,
	        issues=issues,
	        service_graph=service_graph,
	        dynamic_logs=dynamic_logs,
	        optimal_fix_order=optimal_order,
	        context=(
	            "Event flow: webhook_sender -> webhook_receiver -> notification_service. "
	            "webhook_receiver validates signatures and enforces rate limits. "
	            "Fixing upstream issues may reveal additional downstream problems."
	        ),
	    )

	    if seed is None:
	        return scenario
	    return _randomize_scenario(scenario, seed)


	# ─── Hard Scenario ────────────────────────────────────────────────────────────

	def _hard_scenario(seed: Optional[int] = None) -> Scenario:
	"""
	Hard: E-commerce order processing pipeline with cascading failures.
	order_service -> inventory_service -> shipping_service
	Plus api_gateway and auth_service.

	Issue pool has 7 possible issues; canonical scenario uses 5.
	Multiple dependency chains make this genuinely challenging.
	"""
	issue_pool = [
	Issue(
	issue_id="hard_wrong_url",
	service="order_service",
	description="Order service calling deprecated /v1/check instead of /v2/reserve",
	expected_fix={"inventory_url": "https://inventory.internal/v2/reserve"},
	fix_key="inventory_url",
	log_hint="Endpoint deprecated. Use /v2/reserve",
	category="configuration",
	severity="error",
	root_cause_explanation=(
	"order_service calls /v1/check which was deprecated. The API gateway returns "
	"301 Moved Permanently. The redirect goes to /v2/check (read-only) instead of "
	"/v2/reserve (write). Inventory is never actually reserved."
	),
	cascade_effects={
	"inventory_service": "Receiving read-only check requests instead of reservation requests",
	"api_gateway": "Generating 301 redirect responses for deprecated endpoints",
	},
	),
	Issue(
	issue_id="hard_timeout",
	service="order_service",
	description="Timeout too short (2s) for inventory service that takes ~4s to process",
	expected_fix={"timeout": 10},
	fix_key="timeout",
	log_hint="Timeout after 2s waiting for inventory response",
	depends_on=["hard_wrong_url"],
	# Timeout issue is masked by wrong URL — fix URL first to see real timeout
	category="networking",
	severity="error",
	root_cause_explanation=(
	"order_service has timeout=2s but inventory_service takes ~4s for reservation "
	"(including DB lock + stock validation). After fixing the URL, requests now reach "
	"inventory but time out before completion."
	),
	cascade_effects={
	"inventory_service": "Connections killed mid-processing, leaving orphaned DB locks",
	},
	),
	Issue(
	issue_id="hard_async",
	service="order_service",
	description="Synchronous mode causes race conditions between concurrent orders",
	expected_fix={"async_mode": True},
	fix_key="async_mode",
	log_hint="Race condition: order ord_998 processed before ord_997 completed",
	category="configuration",
	severity="critical",
	root_cause_explanation=(
	"order_service runs in sync mode, blocking the main thread on each inventory call. "
	"Concurrent orders queue up and when timeouts occur, orders are processed out of "
	"order, causing double-reservation and stock inconsistencies."
	),
	),
	Issue(
	issue_id="hard_expired_token",
	service="inventory_service",
	description="Expired auth token used for shipping service requests",
	expected_fix={"headers.Authorization": "Bearer valid_token_789"},
	fix_key="headers.Authorization",
	log_hint="Auth token expired_token_456 is no longer valid",
	category="authentication",
	severity="critical",
	root_cause_explanation=(
	"inventory_service uses Bearer expired_token_456 to authenticate with "
	"shipping_service. This token expired on 2026-03-24. All shipment creation "
	"requests fail with 401, so reserved inventory is never shipped."
	),
	cascade_effects={
	"shipping_service": "Rejecting all requests from inventory_service",
	"auth_service": "Logging repeated failed token validations",
	},
	),
	Issue(
	issue_id="hard_token_refresh",
	service="inventory_service",
	description="No automatic token refresh mechanism configured",
	expected_fix={"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True},
	fix_key="token_refresh_url",
	log_hint="Token validation failed: expired_token_456 expired",
	depends_on=["hard_expired_token"],
	# Token refresh is only relevant after fixing the expired token
	category="configuration",
	severity="error",
	root_cause_explanation=(
	"Even after replacing the expired token, there's no auto-refresh mechanism. "
	"Tokens expire every 24h, so without auto_refresh=True and a refresh URL, "
	"the same issue will recur tomorrow."
	),
	),
	Issue(
	issue_id="hard_circuit_breaker",
	service="order_service",
	description="No circuit breaker — failed requests keep hammering inventory_service",
	expected_fix={"circuit_breaker.enabled": True, "circuit_breaker.failure_threshold": 5},
	fix_key="circuit_breaker",
	log_hint="Circuit breaker not configured",
	category="configuration",
	severity="warning",
	root_cause_explanation=(
	"Without a circuit breaker, order_service keeps sending requests to "
	"inventory_service even when it's consistently failing. This wastes resources "
	"and can cause a cascading overload."
	),
	),
	Issue(
	issue_id="hard_idempotency",
	service="order_service",
	description="Missing idempotency key — retried requests create duplicate orders",
	expected_fix={"headers.Idempotency-Key": "order-{order_id}"},
	fix_key="headers.Idempotency-Key",
	log_hint="Duplicate order detected: ord_997 submitted twice",
	depends_on=["hard_async"],
	category="protocol",
	severity="error",
	root_cause_explanation=(
	"When async retries fire, there's no Idempotency-Key header to deduplicate "
	"requests. inventory_service creates duplicate reservations for the same order."
	),
	),
	]

	if seed is not None:
	rng = random.Random(seed)
	issues = _select_issues(issue_pool, 5, rng)
	else:
	issues = issue_pool[:5] # Canonical: first 5

	configs = {
	"order_service": {
	"name": "order_service",
	"inventory_url": "https://inventory.internal/v2/reserve",
	"headers": {
	"Content-Type": "application/json",
	"Authorization": "Bearer valid_token_123",
	},
	"timeout": 10,
	"async_mode": True,
	"callback_url": "https://orders.internal/callback",
	"circuit_breaker": {
	"enabled": True,
	"failure_threshold": 5,
	},
	},
	"inventory_service": {
	"name": "inventory_service",
	"endpoint_version": "v2",
	"reserve_path": "/v2/reserve",
	"check_path": "/v2/check",
	"shipping_url": "https://shipping.internal/v1/create",
	"headers": {
	"Content-Type": "application/json",
	"Authorization": "Bearer valid_token_789",
	},
	"timeout": 10,
	"processing_time_avg": 4,
	"token_refresh_url": "https://auth.internal/refresh",
	"auto_refresh": True,
	},
	"shipping_service": {
	"name": "shipping_service",
	"create_path": "/v1/create",
	"requires_auth": True,
	"accepted_auth": ["Bearer"],
	"token_validation_url": "https://auth.internal/validate",
	"status": "healthy",
	},
	"api_gateway": {
	"routes": {
	"/v1/check": "DEPRECATED — use /v2/check",
	"/v2/reserve": "inventory_service",
	"/v2/check": "inventory_service",
	"/v1/create": "shipping_service",
	},
	"timeout": 30,
	},
	"auth_service": {
	"valid_tokens": ["valid_token_123", "valid_token_789"],
	"expired_tokens": ["expired_token_456"],
	"token_refresh_endpoint": "/refresh",
	"token_ttl_hours": 24,
	},
	}

	# Apply broken config for each selected issue
	for issue in issues:
	if issue.issue_id == "hard_wrong_url":
	configs["order_service"]["inventory_url"] = "https://inventory.internal/v1/check"
	elif issue.issue_id == "hard_timeout":
	configs["order_service"]["timeout"] = 2
	elif issue.issue_id == "hard_async":
	configs["order_service"]["async_mode"] = False
	elif issue.issue_id == "hard_expired_token":
	configs["inventory_service"]["headers"]["Authorization"] = "Bearer expired_token_456"
	elif issue.issue_id == "hard_token_refresh":
	configs["inventory_service"].pop("token_refresh_url", None)
	configs["inventory_service"]["auto_refresh"] = False
	elif issue.issue_id == "hard_circuit_breaker":
	configs["order_service"]["circuit_breaker"] = {"enabled": False}
	elif issue.issue_id == "hard_idempotency":
	configs["order_service"]["headers"].pop("Idempotency-Key", None)

	# Build logs
	order_logs = []
	inventory_logs = []
	shipping_logs = []
	gateway_logs = []
	auth_logs = [
	"[INFO] 2026-03-25T12:00:00Z Auth service ready. Valid tokens: 2, Expired: 1",
	]

	for issue in issues:
	if issue.issue_id == "hard_wrong_url":
	order_logs.extend([
	"[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently",
	"[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}",
	])
	inventory_logs.append(
	"[INFO] 2026-03-25T12:00:05Z Received request on /v1/check -> redirecting to /v2/check"
	)
	gateway_logs.extend([
	"[WARN] 2026-03-25T12:00:05Z Deprecated endpoint /v1/check accessed by order_service",
	"[INFO] 2026-03-25T12:00:05Z Redirecting /v1/check -> /v2/check (301)",
	])
	elif issue.issue_id == "hard_timeout":
	order_logs.extend([
	"[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response",
	"[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out",
	])
	inventory_logs.append(
	"[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s"
	)
	elif issue.issue_id == "hard_async":
	order_logs.extend([
	"[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response",
	"[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed",
	])
	elif issue.issue_id == "hard_expired_token":
	inventory_logs.extend([
	"[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized",
	"[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid",
	"[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed",
	])
	shipping_logs.append(
	"[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired"
	)
	auth_logs.append(
	"[WARN] 2026-03-25T12:00:10Z Token validation failed: expired_token_456 expired at 2026-03-24T00:00:00Z"
	)
	elif issue.issue_id == "hard_token_refresh":
	auth_logs.append(
	"[WARN] 2026-03-25T12:00:11Z Token validation failed: expired_token_456 expired. No refresh configured."
	)
	elif issue.issue_id == "hard_circuit_breaker":
	order_logs.extend([
	"[WARN] 2026-03-25T12:00:12Z Circuit breaker not configured, continuing to send requests after 10 failures",
	"[ERROR] 2026-03-25T12:00:12Z System overload: 50 pending requests to inventory_service",
	])
	elif issue.issue_id == "hard_idempotency":
	order_logs.append(
	"[ERROR] 2026-03-25T12:00:13Z Duplicate order detected: ord_997 submitted twice"
	)
	inventory_logs.append(
	"[WARN] 2026-03-25T12:00:13Z Duplicate reservation request for order ord_997"
	)

	if not shipping_logs:
	shipping_logs.append(
	"[INFO] 2026-03-25T12:00:00Z Shipping service healthy, awaiting authenticated requests"
	)

	dynamic_logs = {
	"hard_wrong_url": {
	"order_service": ["[INFO] URL corrected to /v2/reserve. Inventory requests routing correctly."],
	"api_gateway": ["[INFO] order_service now using correct /v2/reserve endpoint."],
	},
	"hard_timeout": {
	"order_service": ["[INFO] Timeout increased to 10s. Inventory responses completing."],
	"inventory_service": ["[INFO] Reservations completing successfully within timeout."],
	},
	"hard_async": {
	"order_service": ["[INFO] Async mode enabled. Orders processing concurrently without blocking."],
	},
	"hard_expired_token": {
	"inventory_service": ["[INFO] Auth token refreshed. Shipping service requests authenticated."],
	"shipping_service": ["[INFO] Authentication successful for inventory_service."],
	},
	"hard_token_refresh": {
	"inventory_service": ["[INFO] Auto token refresh configured. Tokens will be refreshed before expiry."],
	},
	"hard_circuit_breaker": {
	"order_service": ["[INFO] Circuit breaker enabled. Will stop sending after 5 consecutive failures."],
	},
	"hard_idempotency": {
	"order_service": ["[INFO] Idempotency keys set. Duplicate requests will be safely deduplicated."],
	},
	}

	service_graph = {
	"order_service": ServiceNode(
	name="order_service",
	depends_on=["inventory_service", "api_gateway"],
	health_status="error",
	),
	"inventory_service": ServiceNode(
	name="inventory_service",
	depends_on=["shipping_service", "auth_service"],
	health_status="degraded",
	),
	"shipping_service": ServiceNode(
	name="shipping_service",
	depends_on=[],
	health_status="healthy",
	),
	"api_gateway": ServiceNode(
	name="api_gateway",
	depends_on=[],
	health_status="healthy",
	),
	"auth_service": ServiceNode(
	name="auth_service",
	depends_on=[],
	health_status="healthy",
	),
	}

	# Build optimal fix order respecting dependencies
	issue_ids = [i.issue_id for i in issues]
	optimal_order = []
	ordered_preference = [
	"hard_wrong_url", "hard_timeout", "hard_async",
	"hard_expired_token", "hard_token_refresh",
	"hard_circuit_breaker", "hard_idempotency",
	]
	for iid in ordered_preference:
	if iid in issue_ids:
	optimal_order.append(iid)
	for iid in issue_ids:
	if iid not in optimal_order:
	optimal_order.append(iid)

	scenario = Scenario(
	task_id="hard",
	difficulty="hard",
	description=(
	"An e-commerce order processing pipeline is failing with cascading errors. "
	"Order Service calls Inventory Service, which calls Shipping Service. "
	"Multiple issues span the pipeline: wrong endpoints, timeouts, race conditions, "
	"expired authentication tokens, and missing resilience patterns. "
	"Some issues are masked by upstream failures — you must fix issues in the right "
	"order to diagnose the full chain."
	),
	max_steps=40,
	services=["order_service", "inventory_service", "shipping_service", "api_gateway", "auth_service"],
	configs=configs,
	logs={
	"order_service": order_logs,
	"inventory_service": inventory_logs,
	"shipping_service": shipping_logs,
	"api_gateway": gateway_logs,
	"auth_service": auth_logs,
	},
	issues=issues,
	service_graph=service_graph,
	dynamic_logs=dynamic_logs,
	optimal_fix_order=optimal_order,
	context=(
	"Request flow: order_service -> api_gateway -> inventory_service -> shipping_service. "
	"auth_service provides token validation for all inter-service calls. "
	"Some issues are masked by upstream failures — fixing upstream issues may reveal "
	"new errors downstream. Pay attention to service dependencies."
	),
	)

	if seed is not None:
	scenario = _randomize_scenario(scenario, seed)

	return scenario