[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-agent-observability-tracing-zh":3,"seo:pack:agent-observability-tracing:zh":78},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":77},"agent-observability-tracing","🔭","#3B82F6","stable","稳定","Agent 可观测 + 全链路追踪","ML \u002F LLM 工程师专用，回答「agent 为什么会做这一步？」的 7 件套 — LangSmith \u002F Langfuse \u002F Phoenix \u002F Helicone \u002F AgentOps \u002F OpenTelemetry for LLM。逐 span 看穿 tool call、retry、子 agent、反思循环 — 不只是按 prompt 看 token 账单的 dashboard。",[16,28,36,46,54,62,69],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},288,"49a8eb0b-b44b-46c2-b3c8-b54e55fb224f","langfuse-open-source-llm-observability-49a8eb0b","Langfuse — Open Source LLM Observability","Langfuse is an open-source LLM engineering platform for tracing, prompt management, evaluation, and debugging AI apps. 24.1K+ GitHub stars. Self-hosted or cloud. MIT.","Langfuse",276,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},236,"d570c84f-4e22-4723-806a-d23710686a5c","agentops-observability-ai-agents-d570c84f","AgentOps — Observability for AI Agents","Python SDK for AI agent monitoring. LLM cost tracking, session replay, benchmarking, and error analysis. Integrates with CrewAI, LangChain, AutoGen, and more. 5.4K+ stars.","Script Depot",227,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":42,"view_count":43,"vote_count":24,"lang_type":25,"type":44,"type_label":45},768,"4d9432ea-330f-44b6-a629-5b29627f746a","langsmith-prompt-debugging-llm-observability-4d9432ea","LangSmith — Prompt Debugging and LLM Observability","Debug, test, and monitor LLM applications in production. LangSmith provides trace visualization, prompt playground, dataset evaluation, and regression testing for AI.","Prompt Lab",291,"prompt","Prompt",{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":52,"view_count":53,"vote_count":24,"lang_type":25,"type":26,"type_label":27},303,"42fa8573-760e-4a07-a19f-43422546e9f5","phoenix-open-source-ai-observability-42fa8573","Phoenix — Open Source AI Observability","Phoenix is an AI observability platform for tracing, evaluating, and debugging LLM apps. 9.1K+ stars. OpenTelemetry, evals, prompt management.","Arize AI",257,{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":60,"view_count":61,"vote_count":24,"lang_type":25,"type":26,"type_label":27},442,"13e3c714-032f-4323-b9ee-69f38e613f45","openlit-opentelemetry-llm-observability-13e3c714","OpenLIT — OpenTelemetry LLM Observability","Monitor LLM costs, latency, and quality with OpenTelemetry-native tracing. GPU monitoring and guardrails built in. 2.3K+ stars.","AI Open Source",243,{"id":63,"uuid":64,"slug":65,"title":66,"description":67,"author_name":60,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},730,"a53444d6-2d55-4f59-ba6f-3b672d7ec458","langtrace-open-source-ai-observability-platform-a53444d6","Langtrace — Open Source AI Observability Platform","Open-source observability for LLM apps. Trace OpenAI, Anthropic, and LangChain calls with OpenTelemetry-native instrumentation and a real-time dashboard.",226,{"id":70,"uuid":71,"slug":72,"title":73,"description":74,"author_name":75,"view_count":76,"vote_count":24,"lang_type":25,"type":26,"type_label":27},92,"aa41279c-0695-4fd6-a8ec-f70e0f255cff","gemini-cli-extension-observability-monitoring-logs-aa41279c","Gemini CLI Extension: Observability — Monitoring & Logs","Gemini CLI extension for Google Cloud observability. Set up monitoring, analyze logs, create dashboards, and configure alerts.","Google · Gemini Team",307,"tokrepo install pack\u002Fagent-observability-tracing",{"pageType":79,"pageKey":8,"locale":80,"title":81,"metaDescription":82,"h1":13,"tldr":83,"bodyMarkdown":84,"faq":85,"schema":101,"internalLinks":111,"citations":124,"wordCount":137,"generatedAt":138},"pack","zh","Agent 可观测 + 全链路追踪 — LLM Agent 深度调试栈","LangSmith \u002F Langfuse \u002F Phoenix \u002F Helicone \u002F AgentOps \u002F OpenTelemetry for LLM — 逐 span 追踪 tool 调用、retry、子 agent、反思循环。TokRepo 一键安装。","7 件套，ML \u002F LLM 工程师专用，回答「这 agent 到底为什么会做这一步？」 — 开源（Langfuse \u002F Phoenix \u002F AgentOps）+ 托管（LangSmith \u002F Helicone）+ 大家都在用的 OpenTelemetry 规范。Span 树、tool 调用回放、retry 链、反思循环 diff — 是调试基元，不只是花钱看板。","## 这个 pack 包含什么\n\n某天你的 agent 静默地在两个工具之间循环了 47 轮，最后给出一个自信但错的答案 — 那一刻你会希望自己当初装了 **span 级别的追踪**，而不是只有按 prompt 看 token 账单的图。这个 pack 是给 ML \u002F LLM 工程师准备的，他们要重建 agent 到底*做了什么*：哪个子 agent 触发了、每次 tool 调用进去的参数是啥、retry 吃了几次、planner 在转向之前在想什么。\n\n| # | 资产 | 分层 | 追踪什么 |\n|---|---|---|---|\n| 1 | LangSmith | 托管 | LangChain \u002F LangGraph 原生 span、dataset 回放、eval 串联 |\n| 2 | Langfuse | 开源 | 框架无关的 span 树、prompt 版本化、evaluator hook |\n| 3 | Arize Phoenix | 开源 | OpenInference span、内置 retrieval \u002F agent 评估器、notebook 友好 |\n| 4 | Helicone | 混合 | 代理模式追踪，无需装 SDK，cost + 缓存 + session |\n| 5 | AgentOps | 开源 | Agent 会话回放、tool 调用时间线、多 agent 步骤图 |\n| 6 | OpenTelemetry for LLM | 规范 | OpenInference + GenAI 语义约定 — 厂商中立的 span 格式 |\n| 7 | 评估串联的 trace store | 模式 | 每条 trace 都打质量分，会话内分数回退立刻告警 |\n\n## 跟 LLM Observability pack 有啥区别\n\n如果你不确定装哪个：**LLM Observability** 是运行时遥测层 — token 成本、p95 延迟、错误率、prompt 版本对比 dashboard。受众是任何把 LLM 调用上线的人。**Agent Observability + Tracing** 是 agent 系统的*调试*层 — 一个用户请求会扇出 10–100 次 LLM 调用、工具调用、子 agent 切换。受众是那个盯着一段 4 分钟 agent 运行结果是垃圾的工程师，想搞清楚是哪一步在撒谎。\n\n成本 dashboard 告诉你*这个月账单涨了 30%*。深度 trace 告诉你*planner 子 agent 重复了同一个搜索 8 次，因为 tool 返回空数组而 prompt 没处理这种情况*。两个都要，但它们回答的是不同问题。\n\n## 按这个顺序装\n\n```bash\n# 整套\ntokrepo install pack\u002Fagent-observability-tracing\n\n# 或一层一层装\ntokrepo install langfuse        # 1. trace 存储 + UI\ntokrepo install opentelemetry-llm  # 2. 一次插桩、后续随便换后端\ntokrepo install phoenix         # 3. eval 串联到每条 trace\n```\n\n五层，按这个顺序装：\n\n1. **插桩器** — 包装你的 LLM SDK 和 agent 框架。默认选 OpenTelemetry + OpenInference 语义约定：一次插桩，换后端不用改代码。如果你是 LangChain 重度用户，自带的 `langchain_core.tracers` 直接写到 LangSmith \u002F Langfuse。\n2. **Trace store + UI** — Langfuse 自部署（数据自主）、LangSmith Cloud（零运维 + LangChain 紧耦合）、Phoenix 本地（notebook 友好 + 零基建）、Helicone 代理（完全不装 SDK）。四个现在都吃 OTel span。\n3. **Eval 串联** — 把离线 eval（LLM-as-judge \u002F 检索召回率 \u002F tool 调用正确率）接到同一个 trace store，每个 span 都带质量分。Phoenix 和 Langfuse 都内置；LangSmith 叫 'feedback'。\n4. **告警** — 在 *trace 级别*异常上触发，不只是单次调用：agent 跑了 >20 步、retry 深度 >5、子 agent 没调用预期 tool、planner 输出缺必填 schema。这些都是 per-prompt dashboard 根本看不见的失败模式。\n5. **会话回放** — AgentOps 和 Helicone 都把 span 归为 'session'（一个用户请求 = 一个 session）。多 agent 系统这个是必装项，没有它你在时间线里根本分不开两个并发跑的用户请求。\n\n## 常见踩坑\n\n- **只追 LLM 调用，没追 tool 调用** — 模型给出 tool 调用、你的代码执行 tool、结果喂回下一轮。只插桩 LLM SDK，tool 执行就是黑洞。把 tool 分发器也用同一个 tracer 包起来。\n- **子 agent 切换没传 `parent_span_id`** — 如果子 agent B 是 A 启动的，B 的 span 必须带 A 的 trace ID。否则 UI 显示两条没关联的时间线，你根本看不出谁调用了谁。\n- **完整推理链写成一个大字段** — 一个 30 步反思循环不该是一个巨型字符串字段 — 应该是 `reflection` 父 span 下 30 个兄弟 span。过滤、搜索、diff 在 blob 形态下全废。\n- **Agent trace 均匀采样** — 普通 run 采样 10%，但**出错的 \u002F 触发 max retries 的 \u002F eval 分低于阈值的 100% 留下**。需要调试的 bug 恰好就是你会丢掉的那些 run。\n- **被厂商锁住 span 格式** — 用 OpenInference \u002F OpenTelemetry GenAI 语义约定。这个 pack 里每个后端都吃这个。手写私有 JSON 就意味着迁移后端时整个插桩要重写。\n- **没把 prompt 版本和 trace 关联起来** — 这条 trace 是 prompt v7 跑的，但你现在已经在用 v9，trace UI 必须能看到版本号，才能 diff 新旧行为。Langfuse 和 LangSmith 都支持，第一天就接上。\n\n## 跟这些 pack 搭配使用\n\nAgent Observability 是*调试器*。**LLM Observability** pack 是*生产 dashboard*。**Multi-Agent Frameworks** pack 是*被追踪的系统*（LangGraph \u002F CrewAI \u002F AutoGen）。**LLM Eval & Guardrails** pack 是把原始 trace 转换成质量信号、并落到同一个 dashboard 的*评分引擎*。真做生产的团队四个都装 — 没有 eval 的 observability 就是漂亮时间线、没有 trace 的 eval 就是平均数。",[86,89,92,95,98],{"q":87,"a":88},"跟 LLM Observability pack 到底有啥区别？","LLM Observability 是运行时遥测层 — 每条 prompt 的成本、p95 延迟、错误率、版本对版本 dashboard，受众是所有把 LLM 调用上线的人。Agent Observability + Tracing 是 agent 系统的*调试*层，给那种一个用户请求会扇出十几次 LLM 调用、工具调用、子 agent 切换的系统用，受众是那个想逐 span 重建 agent 到底做了什么的工程师。生产团队多数两个都装：observability 给 dashboard 看、tracing 给事后复盘用。",{"q":90,"a":91},"6 个平台都装，还是挑一个？","先挑一个 trace store：要自部署 + 框架无关选 Langfuse，LangChain 原生 + 零运维选 LangSmith，notebook 玩家 + eval 优先选 Phoenix，不想动 SDK 一行代理代码选 Helicone。AgentOps 是专门做 agent 会话回放的，跟上面四个之一搭配用。OpenTelemetry 不是平台，是你的插桩应该输出的线协议格式 — 这样以后换后端不用重写代码。",{"q":93,"a":94},"能追 tool 调用和子 agent 吗，不只是 LLM 调用？","能 — 这就是这个 pack 的核心。OpenInference 语义约定定义了 `LLM` \u002F `CHAIN` \u002F `RETRIEVER` \u002F `TOOL` \u002F `AGENT` \u002F `EMBEDDING` 等 span 类型，本 pack 里每个平台都能渲染完整带父子关系的树。坑在于你得真去插桩 tool 分发器，不能只包 LLM SDK — 如果你只包模型调用，tool 执行时间就是不可见的，结果是把慢错误归因到模型上。",{"q":96,"a":97},"深度 trace 开销多大？","单 span 开销在亚毫秒级（异步批量导出）。真正的成本是存储：一次 agent 运行 30 次 LLM 调用 + 50 次 tool 调用 + 完整 input\u002Foutput payload，大约 200–500 KB。每天 1 万次 run 就是 2–5 GB\u002F天。错误和高 eval 成本的 run 100% 保留，普通 run 采样 10%，再自部署 Langfuse 或 Phoenix，存储账单可控。",{"q":99,"a":100},"不用 LangChain 的 agent（CrewAI \u002F AutoGen \u002F 自研）能用吗？","能。Langfuse \u002F Phoenix \u002F Helicone \u002F AgentOps 都是框架无关 — 接受任何来源的 OpenInference span。CrewAI 自带 AgentOps 集成；AutoGen 有 Langfuse 适配；自研 Python agent 用 OpenInference SDK 的装饰器（`@trace`）和 context manager，不依赖框架。LangSmith 是相对最 LangChain-tight 的，但它的 API 也接受任意 span。",{"@context":102,"@type":103,"name":13,"description":104,"numberOfItems":105,"inLanguage":106,"publisher":107},"https:\u002F\u002Fschema.org","CollectionPage","7 件 ML\u002FLLM 工程师调试 agent 系统的追踪资产 — span 树、tool 调用回放、retry 链、串联了 eval 的 trace store。",7,"zh-CN",{"@type":108,"name":109,"url":110},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[112,116,120],{"url":113,"anchor":114,"reason":115},"\u002Fzh\u002Fpacks\u002Fllm-observability","LLM 可观测性","互补的运行时 dashboard 层 — 成本 \u002F 延迟 \u002F 版本趋势",{"url":117,"anchor":118,"reason":119},"\u002Fzh\u002Fpacks\u002Fmulti-agent-frameworks","多智能体框架","本 pack 追踪的 agent 系统（LangGraph \u002F CrewAI \u002F AutoGen）",{"url":121,"anchor":122,"reason":123},"\u002Fzh\u002Fpacks\u002Fml-engineer-rag-eval","ML 工程师 RAG + Eval 套件","给每条 trace 打检索和答案质量分的 eval 层",[125,129,133],{"claim":126,"source_name":127,"source_url":128},"Langfuse 是开源 LLM 工程平台，含 tracing \u002F 评估 \u002F prompt 管理","langfuse\u002Flangfuse","https:\u002F\u002Fgithub.com\u002Flangfuse\u002Flangfuse",{"claim":130,"source_name":131,"source_url":132},"Arize Phoenix 是基于 OpenInference span 规范的开源 AI 可观测库","Arize-ai\u002Fphoenix","https:\u002F\u002Fgithub.com\u002FArize-ai\u002Fphoenix",{"claim":134,"source_name":135,"source_url":136},"OpenTelemetry GenAI 语义约定定义 LLM 和 agent span 的标准字段","OpenTelemetry GenAI conventions","https:\u002F\u002Fopentelemetry.io\u002Fdocs\u002Fspecs\u002Fsemconv\u002Fgen-ai\u002F",890,"2026-05-22T12:00:00Z"]