[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-production-incident-response-zh":3,"seo:pack:production-incident-response:zh":96},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":95},"production-incident-response","🚨","#DC2626","new","本周新建","生产事故响应工具包","为正在救火的 on-call 工程师准备的十个工具，按事故响应流程排序：oncall-guide skill + Devops Incident Responder + PagerDuty Responder + SigNoz MCP（日志+链路）+ Monoscope（自然语言查日志）+ Graylog + Alertmanager + Rundeck（runbook 自动化）+ OpenStatus（状态页）+ Incident Responder agent（写复盘）。装完下一次告警进来面对的是系统，不是人。",[16,28,36,42,52,60,68,75,82,88],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2277,"1a6b17c7-03dd-4d7d-a511-def683b9c5e8","oncall-guide-incident-response-subagent-1a6b17c7","oncall-guide — Incident Response Subagent","Open-source Claude Code subagent for incident response — walks the oncall checklist autonomously: deploys, errors, rollback. Inspired by Boris Cherny.","Skill Factory",160,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4429,"e30c19c4-8e4a-42ba-bd07-64a18702817b","claude-code-agent-devops-incident-responder-e30c19c4","Claude Code Agent: Devops Incident Responder","Use when actively responding to production incidents, diagnosing critical service failures, or conducting incident postmortems to implement permanent fixes and preventative...","TokRepo精选",23,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4402,"d3f997e8-c4d7-4b7b-a978-cc0a5d85408c","claude-code-agent-pagerduty-incident-responder-d3f997e8","Claude Code Agent: Pagerduty Incident Responder","Responds to PagerDuty incidents by analyzing incident context, identifying recent code changes, and suggesting fixes via GitHub PRs.",{"id":43,"uuid":44,"slug":45,"title":46,"description":47,"author_name":48,"view_count":49,"vote_count":24,"lang_type":25,"type":50,"type_label":51},3608,"818380f9-674d-5217-88ab-f393ff99a247","signoz-mcp-server-query-traces-logs-alerts","SigNoz MCP Server — Query Traces, Logs & Alerts","SigNoz MCP Server connects MCP clients to your SigNoz instance: query traces\u002Flogs, inspect alerts, and automate observability workflows using an API key.","MCP Hub",84,"mcp","MCP",{"id":53,"uuid":54,"slug":55,"title":56,"description":57,"author_name":58,"view_count":59,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3335,"a86f3430-eb78-50ab-bebe-6eef4f53ea4a","monoscope-llm-query-for-logs-traces-metrics","Monoscope — LLM Query for Logs\u002FTraces\u002FMetrics","Monoscope stores logs\u002Ftraces\u002Fmetrics in S3-compatible buckets and lets you explore them with natural-language queries plus a CLI and self-hosted UI.","Script Depot",64,{"id":61,"uuid":62,"slug":63,"title":64,"description":65,"author_name":66,"view_count":67,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1923,"68045e07-3de4-11f1-9bc6-00163e2b0d79","graylog-centralized-log-management-analysis-platform-68045e07","Graylog — Centralized Log Management and Analysis Platform","Collect, index, and analyze log data from any source with a powerful search engine, real-time alerting, and customizable dashboards built for operations teams.","AI Open Source",110,{"id":69,"uuid":70,"slug":71,"title":72,"description":73,"author_name":58,"view_count":74,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2026,"51f92d7e-3f31-11f1-9bc6-00163e2b0d79","prometheus-alertmanager-alert-routing-notification-hub-51f92d7e","Prometheus Alertmanager — Alert Routing and Notification Hub","Alertmanager handles alerts sent by Prometheus, deduplicating, grouping, and routing them to the right notification channel such as email, Slack, PagerDuty, or webhooks.",133,{"id":76,"uuid":77,"slug":78,"title":79,"description":80,"author_name":66,"view_count":81,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1542,"d1bf0e61-3939-11f1-9bc6-00163e2b0d79","rundeck-open-source-runbook-automation-job-scheduler-d1bf0e61","Rundeck — Open Source Runbook Automation and Job Scheduler","Automate operations tasks with Rundeck. Define runbooks as jobs with steps, schedule them, delegate execution to teams via self-service, and audit every action with built-in logging.",116,{"id":83,"uuid":84,"slug":85,"title":86,"description":87,"author_name":58,"view_count":67,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2012,"ef13d2c6-3f0f-11f1-9bc6-00163e2b0d79","openstatus-open-source-monitoring-status-page-platform-ef13d2c6","OpenStatus — Open-Source Monitoring and Status Page Platform","OpenStatus is an open-source uptime monitoring and status page platform that checks endpoints from multiple regions, tracks latency and availability, and serves beautiful public status pages for your services.",{"id":89,"uuid":90,"slug":91,"title":92,"description":93,"author_name":34,"view_count":94,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4277,"ee743381-c11a-4b8e-ac46-dac86d1fb7e7","claude-code-agent-incident-responder-ee743381","Claude Code Agent: Incident Responder","Handles production incidents with urgency and precision. Use IMMEDIATELY when production issues occur. Coordinates debugging, implements fixes, and documents post-mortems.",28,"tokrepo install pack\u002Fproduction-incident-response",{"pageType":97,"pageKey":8,"locale":98,"title":99,"metaDescription":100,"h1":101,"tldr":102,"bodyMarkdown":103,"faq":104,"schema":120,"internalLinks":126,"citations":139,"wordCount":152,"generatedAt":153},"pack","zh","生产事故响应工具包 — 救火工程师的 10 个 AI 工具","Oncall-Guide \u002F Devops Incident Responder \u002F PagerDuty Responder \u002F SigNoz MCP \u002F Monoscope \u002F Graylog \u002F Alertmanager \u002F Rundeck \u002F OpenStatus \u002F Incident Responder — 按事故响应流程排序：page→triage→log+trace→runbook→对外通信→postmortem。下次告警进来面对的是系统，不是人。","生产事故响应工具包 — 下一次告警来临前装好的救火支架","十个工具按事故生命周期排序：先 paging skill，再 triage agent，然后 MCP 接进日志+链路搜索，接着 alert 路由 + runbook 自动化，再到对客户的 status page，最后一个 postmortem agent。下一次出事，on-call 手里有的是脚手架，不是肾上腺素。","## 这个 pack 包含什么\n\n凌晨 2:47。PagerDuty 把你叫醒。SLO 错误预算还有 14 分钟烧光。这个 pack 就是上个季度你后悔没装的那套支架 — 不是 50 个工具的 observability 购物清单，而是**正在救火的工程师真正会伸手去抓的那十个**，按事故真实发生的顺序排好。\n\n每一个都是**开源或有开源核心**、可以跑在自己基础设施上、在你这周最难熬的十分钟里值得占一个快捷键。顺序不是字母序 — 是按生命周期走的：告警进来 → 分诊 → 查日志\u002F链路 → 执行 runbook → 对外通信 → 写复盘。\n\n## 推荐安装顺序\n\n1. **Oncall-Guide — Incident Response Subagent** — 从这里开始。Claude Code subagent，自动走完 on-call 检查清单（部署关联、错误尖刺分诊、回滚决策），灵感来自 Boris Cherny 的 oncall playbook。这是后面所有工具的大脑。\n2. **Claude Code Agent: Devops Incident Responder** — 负责事故头 90 秒的 triage agent：拉最近部署、检查 dashboard、标出可疑 commit。绑到编辑器的 slash command，MTTA 直接砍半。\n3. **Claude Code Agent: PagerDuty Incident Responder** — 把 agent 接进 PagerDuty 本身。自动 ack、升级、往事故频道发更新。消灭头五分钟「有人在看吗？」的 Slack 噪音。\n4. **SigNoz MCP Server — Query Traces, Logs & Alerts** — 给 agent 一个统一的 MCP 工具，同时 grep 分布式链路和日志。当 agent 说「p99 延迟尖刺和 cart-service 的 abc123 部署相关」，数据就是从这儿来的。\n5. **Monoscope — LLM Query for Logs\u002FTraces\u002FMetrics** — 跨技术栈的自然语言日志查询。「过去 15 分钟从新 pod 出来的 \u002Fcheckout 5xx」一句话就能查，不用开三个 Kibana dashboard。agent 用它，agent 错的时候人也用它。\n6. **Graylog — Centralized Log Management** — 如果你还没有日志底座就装这个。SigNoz 和 Monoscope 从它读，runbook 往它写，postmortem agent 从它引用。自托管，没有按 GB 收费的陷阱。\n7. **Prometheus Alertmanager — Alert Routing and Notification Hub** — 决定谁被 page、什么时候静音、抖动信号如何聚合的路由大脑。先调它再加 dashboard。大部分 pager 疲劳是 Alertmanager 配置问题，不是 dashboard 问题。\n8. **Rundeck — Open Source Runbook Automation** — runbook 变成按钮的地方。「重启 worker 池」「清缓存」「轮换只读副本」变成 on-call 点一下的 job，不用现场回忆命令。agent 也能通过权限闸门触发。\n9. **OpenStatus — Open-Source Monitoring and Status Page** — 面向客户的状态页，从同一套告警自动更新。让 on-call 不用同时兼任公关。客户在发推骂你之前先看到黄色 banner。\n10. **Claude Code Agent: Incident Responder** — 写复盘的 agent。缓解措施一上线，它就抓 Slack 频道、PagerDuty 时间线、部署历史、SigNoz 查询，拼出一份 five-whys 草稿，你只需要改不需要写。和 #1 同类型 agent，不同 prompt。\n\n## 它们怎么协同\n\n```\nPagerDuty 告警\n   │\n   ▼\nPagerDuty Responder agent  ──── ack + 第一条 triage 帖\n   │\n   ▼\nDevops Incident Responder  ──── 拉部署\u002Fdashboard\u002F可疑 commit\n   │\n   ├──► SigNoz MCP   ──► 链路 + 日志关联\n   ├──► Monoscope    ──► 自然语言查日志\n   └──► Graylog      ──► 原始日志底座\n   │\n   ▼\nAlertmanager  ──── 静音抖动信号、重新聚合\n   │\n   ▼\nRundeck  ──── 执行 runbook（重启 \u002F 清缓存 \u002F failover）\n   │\n   ▼\nOpenStatus  ──── 公开状态页自动更新\n   │\n   ▼\nIncident Responder agent  ──── 复盘草稿（five-whys + 时间线）\n```\n\n闭环完成的标志：postmortem agent 找到那个**如果上周就上线就能阻止这次 page** 的 action item。开 ticket。睡觉。\n\n## 你会遇到的取舍\n\n- **SigNoz vs Datadog** — Datadog 是闭源 SaaS 霸主。SigNoz 是当你的账单从 4K 涨到 40K\u002F月有人问起时的开源退路。MCP server 是让两个里随便哪个都能从 agent 用起来的桥梁。\n- **Monoscope vs grep + jq** — 3 人小团队 grep + jq 够用。过了 50 个服务，你需要自然语言搜索，因为凌晨 3 点没人记得每个服务的日志 schema。\n- **Rundeck vs 仓库里的 shell 脚本** — 裸脚本一直能用，直到写它的 on-call 休年假。Rundeck 加了认证、审计、点击运行 UI，你未来的自己会感谢你。\n- **一个 postmortem agent vs 自己写** — agent 第一稿能到 70%。剩下 30%（上下文、意图、blameless 表述）才是文档真正有用的部分。别把 agent 草稿不改就发出去。\n\n## 常见踩坑\n\n- **triage agent 没设速率限制** — 第一次出事 agent 30 秒打 200 次 SigNoz 查询，给正在着火的系统又加了一层压。每次事故设查询预算。\n- **跳过 Alertmanager 分组规则** — 没分组的话一个上游小抖动 page 五个团队。`group_by` 配置就是「有用的 page」和「on-call 六周烧光」之间的分水岭。\n- **状态页骗人，因为 OpenStatus 用的是同一套已经挂了的监控** — 状态页放在独立基础设施上。不同云、不同 DNS、不同 paging 链路。\n- **LLM 写完不改就发的 postmortem** — 复盘文档是改变文化的产物。没改的 LLM 草稿会侵蚀大家对这套实践的信任。终稿必须有人在 loop 里。\n- **Runbook 写在没人看的 wiki 里** — Rundeck 只有 runbook 被告警链接到才值回票价。Alertmanager → Rundeck 那条链路是承重的。",[105,108,111,114,117],{"q":106,"a":107},"整套装下来要多久？","agent 接线（Oncall-Guide + Devops Responder + PagerDuty Responder + Incident Responder）规划一天 spike；如果还没有数据底座（Graylog + SigNoz + Alertmanager），背景再加一周。Rundeck 和 OpenStatus 各占一个下午。agent 第一次出事就回本；底座第二次出事回本。",{"q":109,"a":110},"四个 Claude Code agent 都需要吗，装一个够不够？","三个承重：Oncall-Guide（playbook 大脑）、Devops Incident Responder（前 90 秒分诊）、Incident Responder（写复盘）。PagerDuty Responder 可选 — 如果你已经有一套不想被打乱的 PagerDuty workflow 就跳过。这几个 agent 共享 context 模式但解决生命周期不同阶段，揉成一个大 agent 会损失针对性。",{"q":112,"a":113},"SigNoz MCP 和 Monoscope 是不是重复了？","SigNoz MCP 给 agent 一个结构化的查询接口，链路和日志一起查（把慢链路关联到对应日志行）。Monoscope 是给人在 agent 没查到的时候自己敲自然语言用的。受众不同、人机工程不同。如果团队小、技术栈简单，可以只上 SigNoz MCP，Monoscope 后面再加。",{"q":115,"a":116},"全部能自托管吗，有没有必须用 SaaS 的？","pack 里每个工具都有完整自托管模式。PagerDuty 本身是 SaaS（responder agent 包的是 PagerDuty API）；如果想 paging 也开源，换成 OneUptime 或 Grafana OnCall — 两个都在更大的 incident-response 目录里。其他九个都能在一台笔记本或单台 VM 上跑起来测试。",{"q":118,"a":119},"如果这个 sprint 只能装三件，最小可行子集是什么？","Oncall-Guide + Devops Incident Responder + Alertmanager。前两个砍下一次事故的 MTTA；Alertmanager 砍掉侵蚀一切的 pager 疲劳税。下个 sprint 加 SigNoz MCP，再下一个加 Rundeck，再下一个加 OpenStatus。Postmortem agent 放最后 — 它只在你有了值得复盘的事故之后才发挥价值。",{"@context":121,"@type":122,"name":13,"description":123,"numberOfItems":124,"inLanguage":125},"https:\u002F\u002Fschema.org","ItemList","为 on-call 工程师准备的十个 AI 工具，覆盖告警分诊、日志\u002F链路搜索、runbook 自动化、状态页、复盘写作。",10,"zh-CN",[127,131,135],{"url":128,"anchor":129,"reason":130},"\u002Fzh\u002Fai-tools-for\u002Fobservability","AI Agent 可观测性工具","链路和日志搜索是所有事故响应工作流的底座",{"url":132,"anchor":133,"reason":134},"\u002Fzh\u002Fai-tools-for\u002Fautomation","AI Agent 自动化工具集","Runbook 自动化和告警路由是这套支架的执行脊柱",{"url":136,"anchor":137,"reason":138},"\u002Fzh\u002Ftopics","浏览其他主题 pack","相邻 pack 覆盖性能剖析、DevOps 工具链、SRE 工作流",[140,144,148],{"claim":141,"source_name":142,"source_url":143},"SigNoz 是开源可观测性平台，原生支持 MCP 服务查询链路、日志和告警","SigNoz GitHub","https:\u002F\u002Fgithub.com\u002FSigNoz\u002Fsignoz",{"claim":145,"source_name":146,"source_url":147},"Prometheus Alertmanager 处理客户端应用发送的告警，包含去重、分组和路由","Prometheus Alertmanager 文档","https:\u002F\u002Fprometheus.io\u002Fdocs\u002Falerting\u002Flatest\u002Falertmanager\u002F",{"claim":149,"source_name":150,"source_url":151},"Rundeck 是开源 runbook 自动化工具，把运维流程变成可执行 job","Rundeck 官网","https:\u002F\u002Fwww.rundeck.com\u002Fopen-source",910,"2026-05-22T10:00:00Z"]