[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-production-incident-response-fr":3,"seo:pack:production-incident-response:fr":97},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":96},"production-incident-response","🚨","#DC2626","new","Nouveau · cette semaine","Pack de Réponse aux Incidents de Production","Dix choix pour l'ingénieur on-call au milieu d'un incendie en prod. Alerte d'astreinte, recherche logs+traces via MCP, routage d'alertes, automatisation des runbooks, page de statut, agent de postmortem. Oncall-Guide + Devops Incident Responder + PagerDuty Responder + SigNoz MCP + Monoscope + Graylog + Alertmanager + Rundeck + OpenStatus + Incident Responder. Installez dans cet ordre pour que la prochaine alerte affronte un système, pas une personne.",[16,28,36,43,53,61,69,76,83,89],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2277,"1a6b17c7-03dd-4d7d-a511-def683b9c5e8","oncall-guide-incident-response-subagent-1a6b17c7","oncall-guide — Incident Response Subagent","Open-source Claude Code subagent for incident response — walks the oncall checklist autonomously: deploys, errors, rollback. 
Inspired by Boris Cherny.","Skill Factory",161,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4429,"e30c19c4-8e4a-42ba-bd07-64a18702817b","claude-code-agent-devops-incident-responder-e30c19c4","Claude Code Agent: Devops Incident Responder","Use when actively responding to production incidents, diagnosing critical service failures, or conducting incident postmortems to implement permanent fixes and preventative...","TokRepo精选",27,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":34,"view_count":42,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4402,"d3f997e8-c4d7-4b7b-a978-cc0a5d85408c","claude-code-agent-pagerduty-incident-responder-d3f997e8","Claude Code Agent: Pagerduty Incident Responder","Responds to PagerDuty incidents by analyzing incident context, identifying recent code changes, and suggesting fixes via GitHub PRs.",26,{"id":44,"uuid":45,"slug":46,"title":47,"description":48,"author_name":49,"view_count":50,"vote_count":24,"lang_type":25,"type":51,"type_label":52},3608,"818380f9-674d-5217-88ab-f393ff99a247","signoz-mcp-server-query-traces-logs-alerts","SigNoz MCP Server — Query Traces, Logs & Alerts","SigNoz MCP Server connects MCP clients to your SigNoz instance: query traces\u002Flogs, inspect alerts, and automate observability workflows using an API key.","MCP Hub",86,"mcp","MCP",{"id":54,"uuid":55,"slug":56,"title":57,"description":58,"author_name":59,"view_count":60,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3335,"a86f3430-eb78-50ab-bebe-6eef4f53ea4a","monoscope-llm-query-for-logs-traces-metrics","Monoscope — LLM Query for Logs\u002FTraces\u002FMetrics","Monoscope stores logs\u002Ftraces\u002Fmetrics in S3-compatible buckets and lets you explore them with natural-language queries plus a CLI and self-hosted UI.","Script 
Depot",65,{"id":62,"uuid":63,"slug":64,"title":65,"description":66,"author_name":67,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1923,"68045e07-3de4-11f1-9bc6-00163e2b0d79","graylog-centralized-log-management-analysis-platform-68045e07","Graylog — Centralized Log Management and Analysis Platform","Collect, index, and analyze log data from any source with a powerful search engine, real-time alerting, and customizable dashboards built for operations teams.","AI Open Source",110,{"id":70,"uuid":71,"slug":72,"title":73,"description":74,"author_name":59,"view_count":75,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2026,"51f92d7e-3f31-11f1-9bc6-00163e2b0d79","prometheus-alertmanager-alert-routing-notification-hub-51f92d7e","Prometheus Alertmanager — Alert Routing and Notification Hub","Alertmanager handles alerts sent by Prometheus, deduplicating, grouping, and routing them to the right notification channel such as email, Slack, PagerDuty, or webhooks.",133,{"id":77,"uuid":78,"slug":79,"title":80,"description":81,"author_name":67,"view_count":82,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1542,"d1bf0e61-3939-11f1-9bc6-00163e2b0d79","rundeck-open-source-runbook-automation-job-scheduler-d1bf0e61","Rundeck — Open Source Runbook Automation and Job Scheduler","Automate operations tasks with Rundeck. 
Define runbooks as jobs with steps, schedule them, delegate execution to teams via self-service, and audit every action with built-in logging.",116,{"id":84,"uuid":85,"slug":86,"title":87,"description":88,"author_name":59,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2012,"ef13d2c6-3f0f-11f1-9bc6-00163e2b0d79","openstatus-open-source-monitoring-status-page-platform-ef13d2c6","OpenStatus — Open-Source Monitoring and Status Page Platform","OpenStatus is an open-source uptime monitoring and status page platform that checks endpoints from multiple regions, tracks latency and availability, and serves beautiful public status pages for your services.",{"id":90,"uuid":91,"slug":92,"title":93,"description":94,"author_name":34,"view_count":95,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4277,"ee743381-c11a-4b8e-ac46-dac86d1fb7e7","claude-code-agent-incident-responder-ee743381","Claude Code Agent: Incident Responder","Handles production incidents with urgency and precision. Use IMMEDIATELY when production issues occur. Coordinates debugging, implements fixes, and documents post-mortems.",32,"tokrepo install pack\u002Fproduction-incident-response",{"pageType":98,"pageKey":8,"locale":25,"title":99,"metaDescription":100,"h1":101,"tldr":102,"bodyMarkdown":103,"faq":104,"schema":120,"internalLinks":126,"citations":139,"wordCount":152,"generatedAt":153},"pack","Production Incident Response Pack — 10 AI Tools for the Engineer Mid-Fire","Oncall-Guide, Devops Incident Responder, PagerDuty Responder, SigNoz MCP, Monoscope, Graylog, Alertmanager, Rundeck, OpenStatus, Incident Responder. Install in this order so the next page hits a system, not a person. 
Log+trace search, runbook execution, status page, postmortem writer.","Production Incident Response Pack — The Rig You Install Before the Next Page","Ten picks in deliberate order: paging skill first, triage agent second, then log+trace search via MCP, then alert routing + runbook automation, then customer comms via status page, then a postmortem agent. By the next outage your on-call rotation has scaffolding instead of adrenaline.","## What's in this pack\n\nIt's 2:47 AM. PagerDuty just woke you. The error budget is gone in fourteen minutes. This pack is the rig you wish you'd installed last quarter — not a 50-tool observability shopping list, but the **ten things the engineer mid-fire actually reaches for**, in the order an incident actually unfolds.\n\nEvery pick here is **open-source or has an OSS core**, runs in your own infra, and earns its keystroke during the worst ten minutes of your week. The order is not alphabetical — it tracks the lifecycle: page in → triage → search → execute → communicate → write it up.\n\n## Install in this order\n\n1. **Oncall-Guide — Incident Response Subagent** — start here. Drop-in Claude Code subagent that walks the on-call checklist autonomously (deploy correlation, error spike triage, rollback decision). Inspired by Boris Cherny's oncall playbook. This is the brain the rest of the tools plug into.\n2. **Claude Code Agent: Devops Incident Responder** — the triage agent that runs the first 90 seconds: pulls recent deploys, checks dashboards, flags suspect commits. Bind it to a slash command in your editor and you've cut MTTA in half.\n3. **Claude Code Agent: PagerDuty Incident Responder** — wires the agent into PagerDuty itself. Acknowledges, escalates, posts updates to the incident channel. Removes the \"is anyone looking at this?\" Slack noise that eats the first five minutes.\n4. **SigNoz MCP Server — Query Traces, Logs & Alerts** — gives your agent a single MCP tool to grep distributed traces and logs side-by-side. 
When the agent says \"the p99 latency spike correlates with deploy abc123 on cart-service\", this is the data source.\n5. **Monoscope — LLM Query for Logs\u002FTraces\u002FMetrics** — natural-language log search across stacks. \"Show me 5xx for \u002Fcheckout in the last 15 minutes from the new pod\" becomes one query instead of three Kibana dashboards. The agent uses it; humans use it when the agent is wrong.\n6. **Graylog — Centralized Log Management** — the log substrate if you don't already have one. SigNoz and Monoscope read from it; runbooks dump to it; the postmortem agent quotes from it. Self-hosted, no per-GB pricing trap.\n7. **Prometheus Alertmanager — Alert Routing and Notification Hub** — the routing brain that decides who gets paged, when alerts silence, and how to group flapping signals. Tune this before adding more dashboards. Most pager fatigue is an Alertmanager config problem, not a dashboard problem.\n8. **Rundeck — Open Source Runbook Automation** — the place runbooks become buttons. \"Restart the worker pool\", \"flush the cache\", \"rotate the read replica\" are jobs the on-call clicks instead of remembering. The agent can trigger them with permission gates.\n9. **OpenStatus — Open-Source Monitoring and Status Page** — public-facing status page, auto-updated from the same alerts. Saves the on-call from also being the comms lead. Customers see a yellow banner before they tweet at you.\n10. **Claude Code Agent: Incident Responder** — the postmortem-writing agent. Once mitigation is in, it scrapes the Slack channel, PagerDuty timeline, deploy history, and SigNoz queries into a five-whys draft you edit instead of write. 
Same agent type as #1, different prompt.\n\n## How they fit together\n\n```\nPagerDuty page\n   │\n   ▼\nPagerDuty Responder agent  ──── ack + first triage post\n   │\n   ▼\nDevops Incident Responder  ──── pulls deploys, dashboards, suspect commits\n   │\n   ├──► SigNoz MCP   ──► traces + log correlation\n   ├──► Monoscope    ──► natural-language log queries\n   └──► Graylog      ──► raw log substrate\n   │\n   ▼\nAlertmanager  ──── silence flapping signals, regroup\n   │\n   ▼\nRundeck  ──── execute runbook (restart \u002F flush \u002F failover)\n   │\n   ▼\nOpenStatus  ──── public status page auto-updates\n   │\n   ▼\nIncident Responder agent  ──── postmortem draft (five whys + timeline)\n```\n\nThe loop closes when the postmortem agent finds the action item that, had it shipped last week, would have prevented the page. File the ticket. Sleep.\n\n## Tradeoffs you'll hit\n\n- **SigNoz vs Datadog** — Datadog is the polished SaaS incumbent. SigNoz is the OSS bet you make when your bill goes from $4K to $40K\u002Fmonth and someone asks why. The MCP server is the bridge that makes either workable from an agent.\n- **Monoscope vs grep + jq** — for a 3-engineer team, grep + jq is fine. Past 50 services, you want natural-language search because no one remembers every service's log schema at 3 AM.\n- **Rundeck vs raw shell scripts in a repo** — raw scripts work until the on-call who wrote them is on PTO. Rundeck adds auth, audit log, and a \"click to run\" UI your future self will thank you for.\n- **One postmortem agent vs writing it yourself** — the agent's first draft is 70%. The 30% the human adds (context, intent, blameless framing) is what makes the doc useful. Don't ship the agent's draft unedited.\n\n## Common pitfalls\n\n- **Wiring the triage agent without rate limits** — first outage, the agent fires 200 SigNoz queries in 30 seconds and adds load to the system on fire. 
Set query budgets per incident.\n- **Skipping Alertmanager grouping rules** — without grouping, one upstream blip pages five teams. The Alertmanager `group_by` config is the difference between \"useful page\" and \"on-call burns out in six weeks\".\n- **Status page lying because OpenStatus uses the same monitoring that's down** — host the status page on independent infra. Different cloud, different DNS provider, different paging.\n- **Postmortem-by-LLM with no human edit** — the postmortem is the artifact that changes culture. An unedited LLM draft erodes trust in the practice. Always have a human in the loop on the final doc.\n- **Runbooks in a wiki nobody reads** — Rundeck only earns its keep if the runbooks are linked from the alerts. The Alertmanager → Rundeck link is the load-bearing wire.",[105,108,111,114,117],{"q":106,"a":107},"How long does it take to install this rig end-to-end?","Plan for a one-day spike to get the agents wired (Oncall-Guide + Devops Responder + PagerDuty Responder + Incident Responder), plus a week of background work to install the data substrate (Graylog + SigNoz + Alertmanager) if you don't already have it. Rundeck and OpenStatus each take an afternoon. The agents pay back in the first incident; the substrate pays back in the second.",{"q":109,"a":110},"Do I need all four Claude Code agents, or is one enough?","Three are load-bearing: Oncall-Guide (the playbook brain), Devops Incident Responder (first-90-seconds triage), and Incident Responder (postmortem writer). PagerDuty Responder is optional if you already have a tight PagerDuty workflow you don't want disrupted. The agents share context patterns but solve different lifecycle stages, so collapsing them into one mega-agent costs you specificity.",{"q":112,"a":113},"Why SigNoz MCP and Monoscope — aren't they overlapping?","SigNoz MCP gives the agent a structured query interface to traces and logs together (correlate a slow trace to its log lines). 
Monoscope is for humans typing natural language when the agent missed it. Different audience, different ergonomics. If your team is small and stack is simple, you can ship with just SigNoz MCP and add Monoscope later.",{"q":115,"a":116},"Can I self-host all of this, or do some pieces need SaaS?","Every tool in this pack has a fully self-hostable mode. PagerDuty itself is SaaS (the responder agent wraps the PagerDuty API); if you want OSS paging too, swap in OneUptime or Grafana OnCall — both are in the broader incident-response catalog. The other nine tools run on a laptop or a single VM for testing.",{"q":118,"a":119},"What's the minimum viable subset if I can only install three things this sprint?","Oncall-Guide + Devops Incident Responder + Alertmanager. The first two cut MTTA on the next incident; Alertmanager cuts the pager-fatigue tax that erodes everything else. Add SigNoz MCP next sprint, then Rundeck, then OpenStatus. Postmortem agent goes last because it only matters after you've had a postmortem-worthy incident.",{"@context":121,"@type":122,"name":123,"description":124,"numberOfItems":125,"inLanguage":25},"https:\u002F\u002Fschema.org","ItemList","Production Incident Response Pack","Ten AI tools for on-call engineers responding to production incidents — paging, triage, log+trace search, runbook automation, status page, postmortem writer.",10,[127,131,135],{"url":128,"anchor":129,"reason":130},"\u002Fen\u002Fai-tools-for\u002Fobservability","Observability tools for AI agents","Trace and log search underpins every incident response workflow",{"url":132,"anchor":133,"reason":134},"\u002Fen\u002Fai-tools-for\u002Fautomation","Automation tools for AI agents","Runbook automation and alert routing are the executable spine of the rig",{"url":136,"anchor":137,"reason":138},"\u002Fen\u002Ftopics","Browse other topic packs","Adjacent packs cover performance profiling, devops toolchains, and SRE 
workflows",[140,144,148],{"claim":141,"source_name":142,"source_url":143},"SigNoz is an open-source observability platform with native MCP server support for traces, logs, and alerts","SigNoz GitHub","https:\u002F\u002Fgithub.com\u002FSigNoz\u002Fsignoz",{"claim":145,"source_name":146,"source_url":147},"Prometheus Alertmanager handles alerts sent by client applications, including deduplication, grouping, and routing","Prometheus Alertmanager docs","https:\u002F\u002Fprometheus.io\u002Fdocs\u002Falerting\u002Flatest\u002Falertmanager\u002F",{"claim":149,"source_name":150,"source_url":151},"Rundeck is open-source runbook automation that turns operations procedures into executable jobs","Rundeck official site","https:\u002F\u002Fwww.rundeck.com\u002Fopen-source",920,"2026-05-22T10:00:00Z"]