[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-agent-observability-tracing-es":3,"seo:pack:agent-observability-tracing:es":78},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":77},"agent-observability-tracing","🔭","#3B82F6","new","Nuevo · esta semana","Observabilidad + Tracing de Agentes","Siete picks para el ingeniero ML\u002FLLM que necesita responder '¿por qué hizo eso el agente?' — LangSmith, Langfuse, Phoenix, Helicone, AgentOps, OpenTelemetry for LLM. Trazas span por span de tool calls, retries, sub-agentes y bucles de reflexión — no solo dashboards de costo por prompt.",[16,28,36,46,54,62,69],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},288,"49a8eb0b-b44b-46c2-b3c8-b54e55fb224f","langfuse-open-source-llm-observability-49a8eb0b","Langfuse — Open Source LLM Observability","Langfuse is an open-source LLM engineering platform for tracing, prompt management, evaluation, and debugging AI apps. 24.1K+ GitHub stars. Self-hosted or cloud. MIT.","Langfuse",190,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},236,"d570c84f-4e22-4723-806a-d23710686a5c","agentops-observability-ai-agents-d570c84f","AgentOps — Observability for AI Agents","Python SDK for AI agent monitoring. LLM cost tracking, session replay, benchmarking, and error analysis. Integrates with CrewAI, LangChain, AutoGen, and more. 
5.4K+ stars.","Script Depot",161,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":42,"view_count":43,"vote_count":24,"lang_type":25,"type":44,"type_label":45},768,"4d9432ea-330f-44b6-a629-5b29627f746a","langsmith-prompt-debugging-llm-observability-4d9432ea","LangSmith — Prompt Debugging and LLM Observability","Debug, test, and monitor LLM applications in production. LangSmith provides trace visualization, prompt playground, dataset evaluation, and regression testing for AI.","Prompt Lab",199,"prompt","Prompt",{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":52,"view_count":53,"vote_count":24,"lang_type":25,"type":26,"type_label":27},303,"42fa8573-760e-4a07-a19f-43422546e9f5","phoenix-open-source-ai-observability-42fa8573","Phoenix — Open Source AI Observability","Phoenix is an AI observability platform for tracing, evaluating, and debugging LLM apps. 9.1K+ stars. OpenTelemetry, evals, prompt management.","Arize AI",175,{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":60,"view_count":61,"vote_count":24,"lang_type":25,"type":26,"type_label":27},442,"13e3c714-032f-4323-b9ee-69f38e613f45","openlit-opentelemetry-llm-observability-13e3c714","OpenLIT — OpenTelemetry LLM Observability","Monitor LLM costs, latency, and quality with OpenTelemetry-native tracing. GPU monitoring and guardrails built in. 2.3K+ stars.","AI Open Source",150,{"id":63,"uuid":64,"slug":65,"title":66,"description":67,"author_name":60,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},730,"a53444d6-2d55-4f59-ba6f-3b672d7ec458","langtrace-open-source-ai-observability-platform-a53444d6","Langtrace — Open Source AI Observability Platform","Open-source observability for LLM apps. 
Trace OpenAI, Anthropic, and LangChain calls with OpenTelemetry-native instrumentation and a real-time dashboard.",155,{"id":70,"uuid":71,"slug":72,"title":73,"description":74,"author_name":75,"view_count":76,"vote_count":24,"lang_type":25,"type":26,"type_label":27},92,"aa41279c-0695-4fd6-a8ec-f70e0f255cff","gemini-cli-extension-observability-monitoring-logs-aa41279c","Gemini CLI Extension: Observability — Monitoring & Logs","Gemini CLI extension for Google Cloud observability. Set up monitoring, analyze logs, create dashboards, and configure alerts.","Google · Gemini Team",212,"tokrepo install pack\u002Fagent-observability-tracing",{"pageType":79,"pageKey":8,"locale":25,"title":80,"metaDescription":81,"h1":82,"tldr":83,"bodyMarkdown":84,"faq":85,"schema":101,"internalLinks":110,"citations":123,"wordCount":136,"generatedAt":137},"pack","Agent Observability + Tracing — Deep Traces for LLM Agents","LangSmith, Langfuse, Phoenix, Helicone, AgentOps, OpenTelemetry for LLM — span-by-span traces across tool calls, retries, sub-agents, and reflection loops. Install via TokRepo.","Agent Observability + Tracing","Seven tracing assets for the ML\u002FLLM engineer who needs to answer 'why did the agent do that?' — open-source (Langfuse, Phoenix, AgentOps) plus hosted (LangSmith, Helicone) plus the OpenTelemetry spec everyone else builds on. Span trees, tool-call replay, retry chains, and reflection-loop diffs — debugging primitives, not just cost dashboards.","## What's in this pack\n\nThe day an agent silently loops between two tools for 47 turns and returns a confident wrong answer is the day you wish you had **span-level traces**, not a per-prompt cost chart. 
This pack is built for the ML\u002FLLM engineer trying to reconstruct what an agent actually *did*: which sub-agent fired, what arguments went into each tool call, how many retries it ate, what the planner thought before it pivoted.\n\n| # | Asset | Tier | What it traces |\n|---|---|---|---|\n| 1 | LangSmith | hosted | first-party LangChain \u002F LangGraph spans, dataset replay, eval bridge |\n| 2 | Langfuse | open-source | framework-agnostic span trees, prompt versioning, evaluator hooks |\n| 3 | Arize Phoenix | open-source | OpenInference spans, built-in retrieval \u002F agent evaluators, notebook-first |\n| 4 | Helicone | hybrid | proxy-based tracing, no SDK install, cost + caching + sessions |\n| 5 | AgentOps | open-source | agent session replay, tool-call timelines, multi-agent step graphs |\n| 6 | OpenTelemetry for LLM | spec | OpenInference + GenAI semantic conventions — vendor-neutral span format |\n| 7 | Eval-bridged trace store | pattern | every trace gets a quality score, alerted when score regresses inside a session |\n\n## How this is different from the LLM Observability pack\n\nIf you're not sure which pack to install: **LLM Observability** is the runtime telemetry layer — token cost, p95 latency, error rate, prompt-version dashboards. The audience is anyone shipping LLM calls to production. **Agent Observability + Tracing** is the *debugging* layer for systems where one user request fans out into 10–100 LLM calls, tool invocations, and sub-agent handoffs. The audience is the engineer staring at a 4-minute agent run that returned garbage and trying to figure out which step lied.\n\nA cost dashboard tells you *the bill went up 30%*. A deep trace tells you *the planner sub-agent retried the same search 8 times because the tool returned an empty array and the prompt didn't handle it*. 
You want both, but they answer different questions.\n\n## Install in a deliberate order\n\n```bash\n# Full pack\ntokrepo install pack\u002Fagent-observability-tracing\n\n# Or layer it up\ntokrepo install opentelemetry-llm  # 1. instrument once, swap backends later\ntokrepo install langfuse        # 2. trace store + UI\ntokrepo install phoenix         # 3. eval bridge for trace-level scoring\n```\n\nFive layers, install in this order:\n\n1. **Instrumenter** — wrap your LLM SDK and agent framework. OpenTelemetry + OpenInference semantic conventions is the right default: instrument once, swap backends without code changes. If you're LangChain-native, the built-in `langchain_core.tracers` writes directly to LangSmith \u002F Langfuse.\n2. **Trace store + UI** — pick Langfuse self-hosted (data sovereignty), LangSmith Cloud (zero ops, LangChain-tight), Phoenix local (notebook-first, no infra), or Helicone proxy (no SDK install at all). They all consume OTel spans now.\n3. **Eval bridge** — wire your offline evals (LLM-as-judge, retrieval recall, tool-call correctness) into the same trace store so quality scores land on every span. Phoenix and Langfuse both ship this; LangSmith calls it 'feedback'.\n4. **Alerts** — fire on *trace-level* anomalies, not just per-call ones: agent ran >20 steps, retry depth >5, sub-agent never called expected tool, planner output didn't include required schema. These are the failure modes a per-prompt dashboard misses entirely.\n5. **Session replay** — AgentOps and Helicone both group spans into 'sessions' (one user request = one session). For multi-agent systems this is non-negotiable. Without it you cannot tell two simultaneous user runs apart in the timeline.\n\n## Common pitfalls\n\n- **Tracing the LLM call but not the tool call.** The model emits a tool call; your code runs the tool; the result feeds the next LLM turn. If you only instrument the LLM SDK, the tool execution is a black hole. 
Wrap your tool dispatcher with the same tracer.\n- **No `parent_span_id` on sub-agent handoffs.** If sub-agent B is spawned by agent A, B's spans must carry A's trace ID. Otherwise the UI shows two disconnected timelines and you cannot answer 'who called whom'.\n- **Logging full reasoning chains as a single blob.** A reflection loop with 30 thoughts shouldn't be one giant string field — it should be 30 sibling spans under a `reflection` parent. Filtering, search, and diff all break on the blob shape.\n- **Sampling agent traces uniformly.** Sample 10% of normal runs, but always keep 100% of runs where the trace exited with an error, hit max retries, or had eval score below threshold. The bugs you need to debug are exactly the runs you'd otherwise drop.\n- **Vendor-locking your spans.** Use OpenInference \u002F OpenTelemetry GenAI semantic conventions. Every backend in this pack speaks them. Hand-rolling proprietary JSON means rewriting your instrumentation when you migrate.\n- **No prompt-version → trace-version link.** When a trace was generated by prompt version 7 but you've since shipped version 9, the trace UI must surface the version so you can diff old vs new behavior. Langfuse and LangSmith both support this; wire it on day one.\n\n## Pair with these packs\n\nAgent Observability is the *debugger*. The **LLM Observability** pack is the *production dashboard*. The **Multi-Agent Frameworks** pack is the *system being traced* (LangGraph, CrewAI, AutoGen). The **LLM Eval & Guardrails** pack is the *scoring engine* that turns raw traces into quality signals on the same dashboard. Real teams run all four together — observability without eval is just pretty timelines, eval without traces is just averages.",[86,89,92,95,98],{"q":87,"a":88},"How is this different from the LLM Observability pack?","LLM Observability is the runtime telemetry layer — per-prompt cost, p95 latency, error rate, version-over-version dashboards. The audience is everyone shipping LLM calls. 
Agent Observability + Tracing is the *debugging* layer for agentic systems where a single user request fans out into many LLM calls, tool invocations, and sub-agent handoffs. The audience is the engineer trying to reconstruct what an agent actually did, span by span. Most production teams need both: observability for the dashboard, tracing for the post-mortem.",{"q":90,"a":91},"Do I need all six platforms, or can I pick one?","Start with one trace store: Langfuse if you want self-host and framework-agnostic, LangSmith if you're LangChain-native and want zero-ops, Phoenix if you live in a notebook and want eval-first, Helicone if you want a one-line proxy with no SDK changes. AgentOps adds session replay specifically for agent workflows — pair it with one of the four above. OpenTelemetry isn't a platform; it's the wire format your instrumentation should emit so you can swap backends later without rewriting code.",{"q":93,"a":94},"Can I trace tool calls and sub-agents, not just LLM calls?","Yes — that's the whole point of this pack. OpenInference semantic conventions define span kinds for `LLM`, `CHAIN`, `RETRIEVER`, `TOOL`, `AGENT`, and `EMBEDDING`. Every platform here renders the full tree with parent-child links. The pitfall is that you have to actually instrument the tool dispatcher, not just the LLM SDK — if you only wrap the model call, tool execution time is invisible and you'll mis-attribute slowness to the model.",{"q":96,"a":97},"How much overhead does deep tracing add?","Per-span overhead is sub-millisecond with async batched export. The real cost is storage: a single agent run with 30 LLM calls, 50 tool calls, and full input\u002Foutput payloads is roughly 200–500 KB. At 10k runs\u002Fday that's 2–5 GB\u002Fday. Sample 100% on errors and low-eval-score runs, 10% on routine runs, and self-host Langfuse or Phoenix to keep the storage bill predictable.",{"q":99,"a":100},"Will this work with non-LangChain agents (CrewAI, AutoGen, custom)?","Yes. 
Langfuse, Phoenix, Helicone, and AgentOps are all framework-agnostic — they accept OpenInference spans from any source. CrewAI ships built-in AgentOps integration; AutoGen has Langfuse adapters; for custom Python agents the OpenInference SDK gives you decorators (`@trace`) and context managers that work without a framework. LangSmith is the one that pushes hardest on LangChain-specific features, but its API also accepts arbitrary spans.",{"@context":102,"@type":103,"name":82,"description":104,"numberOfItems":105,"inLanguage":25,"publisher":106},"https:\u002F\u002Fschema.org","CollectionPage","Seven tracing assets for ML\u002FLLM engineers debugging agentic systems — span trees, tool-call replay, retry chains, and eval-bridged trace stores.",7,{"@type":107,"name":108,"url":109},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[111,115,119],{"url":112,"anchor":113,"reason":114},"\u002Fen\u002Fpacks\u002Fllm-observability","LLM Observability","complementary runtime dashboard layer — cost, latency, version trends",{"url":116,"anchor":117,"reason":118},"\u002Fen\u002Fpacks\u002Fmulti-agent-frameworks","Multi-Agent Frameworks","the agent systems these traces instrument (LangGraph, CrewAI, AutoGen)",{"url":120,"anchor":121,"reason":122},"\u002Fen\u002Fpacks\u002Fml-engineer-rag-eval","ML Engineer RAG + Eval","the eval layer that scores every trace for retrieval and answer quality",[124,128,132],{"claim":125,"source_name":126,"source_url":127},"Langfuse open-source LLM engineering platform with tracing, evaluations, and prompt management","langfuse\u002Flangfuse","https:\u002F\u002Fgithub.com\u002Flangfuse\u002Flangfuse",{"claim":129,"source_name":130,"source_url":131},"Arize Phoenix open-source AI observability with OpenInference span schema","Arize-ai\u002Fphoenix","https:\u002F\u002Fgithub.com\u002FArize-ai\u002Fphoenix",{"claim":133,"source_name":134,"source_url":135},"OpenTelemetry GenAI semantic conventions for LLM and agent spans","OpenTelemetry GenAI 
conventions","https:\u002F\u002Fopentelemetry.io\u002Fdocs\u002Fspecs\u002Fsemconv\u002Fgen-ai\u002F",905,"2026-05-22T12:00:00Z"]