[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-llm-observability-zh":3,"seo:pack:llm-observability:zh":78},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":77},"llm-observability","📊","#EA580C","stable","稳定","LLM 可观测性","Langfuse \u002F AgentOps \u002F LangSmith \u002F Phoenix — 在 CFO 发现之前抓到 token 失控。",[16,28,36,46,54,62,69],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},288,"49a8eb0b-b44b-46c2-b3c8-b54e55fb224f","langfuse-open-source-llm-observability-49a8eb0b","Langfuse — Open Source LLM Observability","Langfuse is an open-source LLM engineering platform for tracing, prompt management, evaluation, and debugging AI apps. 24.1K+ GitHub stars. Self-hosted or cloud. MIT.","Langfuse",300,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},236,"d570c84f-4e22-4723-806a-d23710686a5c","agentops-observability-ai-agents-d570c84f","AgentOps — Observability for AI Agents","Python SDK for AI agent monitoring. LLM cost tracking, session replay, benchmarking, and error analysis. Integrates with CrewAI, LangChain, AutoGen, and more. 5.4K+ stars.","Script Depot",240,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":42,"view_count":43,"vote_count":24,"lang_type":25,"type":44,"type_label":45},768,"4d9432ea-330f-44b6-a629-5b29627f746a","langsmith-prompt-debugging-llm-observability-4d9432ea","LangSmith — Prompt Debugging and LLM Observability","Debug, test, and monitor LLM applications in production. LangSmith provides trace visualization, prompt playground, dataset evaluation, and regression testing for AI.","Prompt Lab",305,"prompt","Prompt",{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":52,"view_count":53,"vote_count":24,"lang_type":25,"type":26,"type_label":27},303,"42fa8573-760e-4a07-a19f-43422546e9f5","phoenix-open-source-ai-observability-42fa8573","Phoenix — Open Source AI Observability","Phoenix is an AI observability platform for tracing, evaluating, and debugging LLM apps. 9.1K+ stars. OpenTelemetry, evals, prompt management.","Arize AI",269,{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":60,"view_count":61,"vote_count":24,"lang_type":25,"type":26,"type_label":27},442,"13e3c714-032f-4323-b9ee-69f38e613f45","openlit-opentelemetry-llm-observability-13e3c714","OpenLIT — OpenTelemetry LLM Observability","Monitor LLM costs, latency, and quality with OpenTelemetry-native tracing. GPU monitoring and guardrails built in. 2.3K+ stars.","AI Open Source",255,{"id":63,"uuid":64,"slug":65,"title":66,"description":67,"author_name":60,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},730,"a53444d6-2d55-4f59-ba6f-3b672d7ec458","langtrace-open-source-ai-observability-platform-a53444d6","Langtrace — Open Source AI Observability Platform","Open-source observability for LLM apps. Trace OpenAI, Anthropic, and LangChain calls with OpenTelemetry-native instrumentation and a real-time dashboard.",248,{"id":70,"uuid":71,"slug":72,"title":73,"description":74,"author_name":75,"view_count":76,"vote_count":24,"lang_type":25,"type":26,"type_label":27},92,"aa41279c-0695-4fd6-a8ec-f70e0f255cff","gemini-cli-extension-observability-monitoring-logs-aa41279c","Gemini CLI Extension: Observability — Monitoring & Logs","Gemini CLI extension for Google Cloud observability. Set up monitoring, analyze logs, create dashboards, and configure alerts.","Google · Gemini Team",322,"tokrepo install pack\u002Fllm-observability",{"pageType":79,"pageKey":8,"locale":80,"title":81,"metaDescription":82,"h1":13,"tldr":83,"bodyMarkdown":84,"faq":85,"schema":101,"internalLinks":111,"citations":124,"wordCount":137,"generatedAt":138},"pack","zh","LLM 可观测性：Langfuse \u002F AgentOps \u002F LangSmith \u002F Phoenix","在 CFO 发现前抓到 token 失控。Langfuse \u002F AgentOps \u002F LangSmith \u002F Phoenix —— 每个上生产的 LLM 团队都在跑的仪表盘。TokRepo 一条命令装齐。","七个 LLM 可观测性资产 —— 开源（Langfuse \u002F Phoenix \u002F AgentOps）+ 托管（LangSmith）—— 追 prompt、给输出打分、在账单仪表盘炸之前就报警。","## 这个 pack 装了什么\n\n看不见就修不了。某天一个 prompt 回归悄悄把 token 账单翻 3 倍，你才会后悔上季度没装可观测层。这个 pack 收齐 **七个资产**，把不透明的 LLM 黑盒变成可调试、可报警、可优化的系统。\n\n| # | 资产 | 类型 | 干啥的 |\n|---|---|---|---|\n| 1 | Langfuse | 开源 | 全套 trace + 评测 + prompt 管理 —— 自建或云 |\n| 2 | AgentOps | 开源 | 针对 agent 的可观测性，带 session 回放 |\n| 3 | Arize Phoenix | 开源 | OpenInference trace + 内置评估器 |\n| 4 | LangSmith | 托管 | LangChain 第一方 trace 与数据集平台 |\n| 5 | Token 成本仪表盘 | 模式 | 按用户 \u002F 按功能 \u002F 按 prompt 版本拆分 |\n| 6 | 延迟预算报警 | 模式 | p95 \u002F p99 接 PagerDuty |\n| 7 | Prompt 版本对比 | 模式 | 两个 prompt 版本 trace 并排回放 |\n\n## 为什么要装\n\n三个直觉抓不到、可观测性能抓到的生产失败：\n\n1. **Token 静默膨胀**。一个「小」prompt 改动加了 200 token 的提醒。乘上每天 100 万次请求，预算外多了 2-6 千美金\u002F月。Langfuse 按 prompt 版本看成本，第一天就能发现。\n2. **第 95 百分位的尾巴**。平均延迟看着正常 —— 但 5% 撞冷缓存、撞重试循环、撞超大 RAG payload 的请求把用户体验拖崩。Phoenix \u002F LangSmith 的 p99 仪表盘让尾巴可见。\n3. **质量回归在单条上看不出来**。每条单独回答都看着合理。把过去 24h 的评估器分数（LLM-as-judge \u002F 检索召回 \u002F 幻觉率）跟前 7 天聚合对比，回归就跳出来了。\n\n## 一条命令装齐\n\n```bash\n# 装整个 pack\ntokrepo install pack\u002Fllm-observability\n\n# 或者只装你想先跑的平台\ntokrepo install langfuse\ntokrepo install agentops\ntokrepo install phoenix\n```\n\nTokRepo CLI 把 SDK 配置和仪表盘脚手架丢进项目，下次请求就开始流 trace —— 不用手动接接接。\n\n## 常见坑\n\n- **把完整 prompt 和 PII 日志推到第三方 SaaS**。如果 prompt 含用户数据，自建 Langfuse 或 Phoenix；别没脱敏就把原始 payload 推 LangSmith 云。三个开源方案在常规负载下单 VM 4GB 内存就够。\n- **高流量端点不采样**。每天 100 万请求全 trace 会把存储和钱包同时打爆。默认采 10%，错误时采 100%。Langfuse 和 Phoenix 原生支持。\n- **追 token 不追美元**。不同模型每 token 价不同。在平台里配一次模型定价；按美元追，不只看 token 数。CFO 关心的是美元。\n- **所有人共用一个仪表盘**。每个角色一个仪表盘 —— 工程（延迟 \u002F 错误率）、产品（每功能成本）、高管（每活跃用户成本 \u002F 周环比）。通用仪表盘没人看。\n- **不报 prompt 版本成本差**。设个报警：新 prompt 版本平均每次调用成本比上个版本偏离 >20% 就触发。这是 ROI 最高的单个报警。\n\n## 跟其他 pack 的关系\n\nLLM 可观测性是**运行时遥测层**。互补的 LLM 评测 & 护栏 pack 是**离线打分层** —— DeepEval \u002F Promptfoo \u002F Ragas。两个都要：可观测性告诉你生产里在发生什么，评测告诉你拟改的 prompt 是否更好*再*上线。\n\n多 Agent 框架（CAMEL \u002F LangGraph \u002F DeepAgents）是*被检测的系统*。如果你跑 LangGraph 工作流但看不出哪个节点挂了，那不叫可观测，叫 print 调试。框架 pack 跟这个 pack 第一天就一起装。",[86,89,92,95,98],{"q":87,"a":88},"这些工具免费吗？","Langfuse \u002F Phoenix \u002F AgentOps 是 MIT\u002FApache 2.0 开源，单 VM 跑得起来。自建免费，只付存储和计算。LangSmith 只能托管，按 trace 计费 —— 免费档够小团队用，价格往企业级走。多数团队的正确答案：先自建 Langfuse，只有当你已经深度在 LangChain 生态、想要第一方集成时才换 LangSmith。",{"q":90,"a":91},"Langfuse 跟 LangSmith 比怎么样？","Langfuse 开源、可自建、框架无关 —— LangChain \u002F LlamaIndex \u002F 原生 OpenAI SDK \u002F 自定义代码都能用。LangSmith 闭源、托管、跟 LangChain 紧耦合。功能上 trace 和 prompt 管理大致打平；LangSmith 在 LangChain 专属功能上略强，Langfuse 评估器框架和自建故事更强。看重数据主权选 Langfuse；想零运维 + LangChain 原生选 LangSmith。",{"q":93,"a":94},"用 Cursor 或 Codex CLI 也能用吗？","可观测性在 API 调用层，不是编辑器层 —— 任何打 LLM API 的工具都能接。TokRepo 装的时候把 SDK init 代码加进项目。如果你通过 Claude Code \u002F Cursor \u002F Codex CLI 代理，检测的是 agent 后端（实际调 LLM 的框架或服务），不是编辑器本身。每个平台的 SDK 都是 5 行 import。",{"q":96,"a":97},"跟 LLM 评测 pack 啥区别？","评测是离线打分 —— 给定 prompt 和参考答案，输出有多好。可观测性是运行时遥测 —— 生产里发生了什么：延迟 \u002F 成本 \u002F 错误 \u002F trace。评测喂 CI；可观测性喂仪表盘和报警。两个都要。常见模式：黄金集评测分数推进可观测平台，让质量、成本、延迟在同一个仪表盘。",{"q":99,"a":100},"这玩意儿加多少检测开销？","异步批量日志给 LLM 调用加 ~1-3ms p50 延迟 —— 跟模型本身延迟（常 500-3000ms）比可忽略。四个平台都出异步 SDK 后台批 trace。高流量端点采 10% 让存储成本可控。实际热路径开销低到没有理由不装可观测就上线。",{"@context":102,"@type":103,"name":104,"description":105,"numberOfItems":106,"publisher":107},"https:\u002F\u002Fschema.org","CollectionPage","LLM Observability","Langfuse, AgentOps, LangSmith, Phoenix and the dashboards that catch token blow-ups before your CFO does.",7,{"@type":108,"name":109,"url":110},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[112,116,120],{"url":113,"anchor":114,"reason":115},"\u002Fzh\u002Fpacks\u002Fllm-eval-guardrails","LLM 评测 & 护栏","互补的离线评分层",{"url":117,"anchor":118,"reason":119},"\u002Fzh\u002Fpacks\u002Fmulti-agent-frameworks","多 Agent 框架","这些仪表盘要追的系统",{"url":121,"anchor":122,"reason":123},"\u002Fzh\u002Ftools\u002Fclaude-code","Claude Code","发出 trace 的 agent 表面",[125,129,133],{"claim":126,"source_name":127,"source_url":128},"Langfuse open-source LLM engineering platform with tracing, evaluations, and prompt management","langfuse\u002Flangfuse","https:\u002F\u002Fgithub.com\u002Flangfuse\u002Flangfuse",{"claim":130,"source_name":131,"source_url":132},"Arize Phoenix open-source AI observability and evaluation library","Arize-ai\u002Fphoenix","https:\u002F\u002Fgithub.com\u002FArize-ai\u002Fphoenix",{"claim":134,"source_name":135,"source_url":136},"AgentOps SDK for monitoring, debugging and benchmarking AI agents","AgentOps-AI\u002Fagentops","https:\u002F\u002Fgithub.com\u002FAgentOps-AI\u002Fagentops",450,"2026-05-02T15:10:00Z"]