[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"workflow-judgeval-tracing-evaluation-for-agent-apps-ee57174a":3,"seo:featured-workflow:ee57174a-de3d-4b53-85c4-34bb754e90d1:zh":39,"workflow-related-judgeval-tracing-evaluation-for-agent-apps-ee57174a-ee57174a-de3d-4b53-85c4-34bb754e90d1":82},{"id":4,"uuid":5,"slug":6,"title":7,"description":8,"author_id":9,"author_name":10,"author_avatar":11,"token_estimate":12,"time_saved":12,"model_used":13,"fork_count":12,"vote_count":12,"view_count":14,"parent_id":12,"parent_uuid":13,"lang_type":15,"steps":16,"files":23,"tags":24,"has_voted":30,"visibility":19,"share_token":13,"is_featured":12,"content_hash":31,"asset_kind":28,"target_tools":32,"install_mode":36,"entrypoint":37,"risk_profile":38,"dependencies":40,"verification":45,"agent_metadata":48,"agent_fit":59,"trust":70,"provenance":78,"created_at":80,"updated_at":81},3236,"ee57174a-de3d-4b53-85c4-34bb754e90d1","judgeval-tracing-evaluation-for-agent-apps","Judgeval — Tracing + Evaluation for Agent Apps","Judgeval adds tracing and evaluation to agent apps, helping teams score behavior and monitor live traffic with a small SDK and dashboard workflow.","8a910fec-3180-11f1-9bc6-00163e2b0d79","Agent Toolkit","https:\u002F\u002Ftokrepo.com\u002Fapple-touch-icon.png",0,"",13,"en",[17],{"id":18,"step_order":19,"title":20,"description":13,"prompt_template":21,"variables":13,"depends_on":22,"expected_output":13},3799,1,"Asset","## Quick Use\n\n1. Install:\n   ```bash\n   pip install judgeval\n   ```\n2. Set credentials:\n   ```bash\n   export JUDGMENT_API_KEY=...\n   export JUDGMENT_ORG_ID=...\n   ```\n3. Verify:\n   - Run the README tracing example and confirm at least 1 trace arrives in the dashboard.\n\n## Intro\n\nJudgeval adds tracing and evaluation to agent apps, helping teams score behavior and monitor live traffic with a small SDK and dashboard workflow.\n\n- **Best for:** teams shipping agent backends who need tracing + scoring to catch regressions\n- **Works with:** Python agent services, common model SDKs, and production traffic you want to monitor\n- **Setup time:** 20–45 minutes\n\n## Practical Notes\n\n- Quant: start with 3–5 golden prompts and record a baseline score per release.\n- Quant: monitor eval latency and cost; cap evaluations per request in production.\n\n## Pattern: separate tracing from judging\n\nTreat tracing as the source of truth (what happened), and judging as an asynchronous step (how good it was).\n\nA practical rollout:\n- Trace everything in staging.\n- Pick 3 high-risk paths (tool call safety, RAG correctness, refusal behavior).\n- Add a small set of evals and expand only when signal is stable.\n\n## Operational note\n\nStore keys securely and avoid placing sensitive payloads into traces. Redaction\u002Fscrubbing should be part of the initial setup.\n\n### FAQ\n\n**Q: Do I need an account?**\nA: The README references API keys and a dashboard; plan on setting up an account for full functionality.\n\n**Q: What should I evaluate first?**\nA: Tool-call safety, correctness of retrieved facts, and refusal\u002Fguardrail compliance.\n\n**Q: How do I keep costs under control?**\nA: Sample traffic, cap evaluations per request, and run heavier suites in CI\u002Fstaging.\n\n## Source & Thanks\n\n> Source: https:\u002F\u002Fgithub.com\u002FJudgmentLabs\u002Fjudgeval\n> License: Apache-2.0\n> GitHub stars: 1,031 · forks: 93\n\n---\n\n\u003C!-- ZH -->\n\n## 快速使用\n\n1. 安装：\n   ```bash\n   pip install judgeval\n   ```\n2. 设置凭据：\n   ```bash\n   export JUDGMENT_API_KEY=...\n   export JUDGMENT_ORG_ID=...\n   ```\n3. 验证：\n   - 跑通 README 的 tracing 示例，确认至少 1 条 trace 到达 dashboard。\n\n## 简介\n\nJudgeval 提供 tracing 与评测能力：用轻量 SDK 采集运行轨迹，并把关键行为指标变成可打分、可回归的评测流程；适合把 agent 质量从“感觉”变成可监控、可对比的工程指标。\n\n- **适合谁：** 上线迭代 agent 后端、需要 tracing + 打分来抓回归的团队\n- **可搭配：** Python agent 服务、常见模型 SDK，以及你希望监控的线上流量\n- **准备时间：** 20–45 分钟\n\n## 实战建议\n\n- 量化建议：先做 3–5 条黄金样例，按版本记录基线分数。\n- 量化建议：监控评测延迟与成本；生产环境限制每次请求触发的评测数量。\n\n## 常用打法：把 tracing 与 judging 分离\n\n把 tracing 当作事实（发生了什么），把 judging 当作异步评测（好不好）。\n\n推荐落地路径：\n- staging 全量打点；\n- 先选 3 条高风险路径（工具安全、RAG 正确性、拒绝策略）；\n- 先做少量评测，信号稳定后再扩展。\n\n## 运维提示\n\n密钥要安全存放，避免把敏感 payload 写进 trace；脱敏应作为接入第一步。\n\n### FAQ\n\n**需要账号吗？**\n答：README 需要 API key 与 dashboard，完整功能通常需要注册并配置账号。\n\n**优先评测什么？**\n答：工具调用安全、检索事实正确性、拒绝\u002F护栏策略是否合规。\n\n**如何控制成本？**\n答：对线上流量采样、限制每次请求的评测数，把重评测放到 CI\u002Fstaging。\n\n## 来源与感谢\n\n> Source: https:\u002F\u002Fgithub.com\u002FJudgmentLabs\u002Fjudgeval\n> License: Apache-2.0\n> GitHub stars: 1,031 · forks: 93\n","0",[],[25],{"id":26,"name":27,"slug":28,"icon":29},11,"Scripts","script","📜",false,"5170613dc51fd5a900cd96a2b5305daf3a79c60ffb45d76443d5bfa3ae79440f",[33,34,35],"claude_code","codex","gemini_cli","single","judgeval",{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},null,{"npm":41,"pip":42,"brew":43,"system":44},[],[37],[],[],{"commands":46,"expected_files":47},[],[20],{"asset_kind":28,"target_tools":49,"install_mode":36,"entrypoint":37,"risk_profile":50,"dependencies":51,"content_hash":31,"verification":56},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":52,"pip":53,"brew":54,"system":55},[],[37],[],[],{"commands":57,"expected_files":58},[],[20],{"target":34,"score":60,"status":61,"policy":61,"why":62,"asset_kind":28,"install_mode":36},29,"stage_only",[63,64,65,66,67,68,69],"target_tools includes codex","asset_kind script","install_mode single","markdown-only","policy stage_only","asset_kind script is not activated directly for Codex","trust established",{"author_trust_level":71,"verified_publisher":30,"asset_signed_hash":31,"signature_status":72,"install_count":12,"report_count":12,"dangerous_capability_badges":73,"review_status":74,"signals":75},"established","hash_only",[28],"unreviewed",[76,77],"author has published assets","content hash available",{"owner_uuid":9,"owner_name":10,"source_url":79,"content_hash":31,"visibility":19,"created_at":80,"updated_at":81},"https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fjudgeval-tracing-evaluation-for-agent-apps","2026-05-12 16:06:05","2026-05-14 09:30:12",[83,133,177,219],{"id":84,"uuid":85,"slug":86,"title":87,"description":88,"author_id":9,"author_name":10,"author_avatar":11,"token_estimate":12,"time_saved":12,"model_used":13,"fork_count":12,"vote_count":12,"view_count":89,"parent_id":12,"parent_uuid":13,"lang_type":15,"steps":90,"files":39,"tags":91,"has_voted":30,"visibility":19,"share_token":13,"is_featured":12,"content_hash":93,"asset_kind":28,"target_tools":94,"install_mode":36,"entrypoint":95,"risk_profile":96,"dependencies":97,"verification":102,"agent_metadata":105,"agent_fit":116,"trust":118,"provenance":121,"created_at":123,"updated_at":124,"__relatedScore":125,"__relatedReasons":126,"__sharedTags":131},3153,"73cd67c3-9db6-48ed-8a31-c082f618168e","agent-evaluation-test-virtual-agents-in-ci","Agent Evaluation — Test Virtual Agents in CI","Agent Evaluation is a Python framework that runs repeatable, scored tests for virtual agents, so teams can catch regressions automatically in CI.",14,[],[92],{"id":26,"name":27,"slug":28,"icon":29},"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",[33,34,35],"README.md",{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":98,"pip":99,"brew":100,"system":101},[],[],[],[],{"commands":103,"expected_files":104},[],[],{"asset_kind":28,"target_tools":106,"install_mode":36,"entrypoint":95,"risk_profile":107,"dependencies":108,"content_hash":93,"verification":113},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":109,"pip":110,"brew":111,"system":112},[],[],[],[],{"commands":114,"expected_files":115},[],[],{"target":34,"score":60,"status":61,"policy":61,"why":117,"asset_kind":28,"install_mode":36},[63,64,65,66,67,68,69],{"author_trust_level":71,"verified_publisher":30,"asset_signed_hash":93,"signature_status":72,"install_count":12,"report_count":12,"dangerous_capability_badges":119,"review_status":74,"signals":120},[28],[76,77],{"owner_uuid":9,"owner_name":10,"source_url":122,"content_hash":93,"visibility":19,"created_at":123,"updated_at":124},"https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fagent-evaluation-test-virtual-agents-in-ci","2026-05-12 07:08:04","2026-05-14 08:17:15",104.76413688858352,[127,128,129,130],"topic-match","same-kind","same-target","same-author",[28,132],"scripts",{"id":134,"uuid":135,"slug":136,"title":137,"description":138,"author_id":9,"author_name":10,"author_avatar":11,"token_estimate":12,"time_saved":12,"model_used":13,"fork_count":12,"vote_count":12,"view_count":139,"parent_id":12,"parent_uuid":13,"lang_type":15,"steps":140,"files":39,"tags":141,"has_voted":30,"visibility":19,"share_token":13,"is_featured":12,"content_hash":93,"asset_kind":28,"target_tools":143,"install_mode":36,"entrypoint":144,"risk_profile":145,"dependencies":146,"verification":151,"agent_metadata":154,"agent_fit":165,"trust":167,"provenance":170,"created_at":172,"updated_at":173,"__relatedScore":174,"__relatedReasons":175,"__sharedTags":176},3275,"68d7a657-2e23-506c-8963-368882308d34","coze-loop-agent-prompt-eval-and-observability-hub","Coze Loop — Agent Prompt, Eval, and Observability Hub","Coze Loop unifies prompt iteration, evaluation, and trace observability, helping agent teams debug workflows without jumping across separate tools.",12,[],[142],{"id":26,"name":27,"slug":28,"icon":29},[33,34,35],"make compose-up",{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":147,"pip":148,"brew":149,"system":150},[],[],[],[],{"commands":152,"expected_files":153},[],[],{"asset_kind":28,"target_tools":155,"install_mode":36,"entrypoint":144,"risk_profile":156,"dependencies":157,"content_hash":93,"verification":162},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":158,"pip":159,"brew":160,"system":161},[],[],[],[],{"commands":163,"expected_files":164},[],[],{"target":34,"score":60,"status":61,"policy":61,"why":166,"asset_kind":28,"install_mode":36},[63,64,65,66,67,68,69],{"author_trust_level":71,"verified_publisher":30,"asset_signed_hash":93,"signature_status":72,"install_count":12,"report_count":12,"dangerous_capability_badges":168,"review_status":74,"signals":169},[28],[76,77],{"owner_uuid":9,"owner_name":10,"source_url":171,"content_hash":93,"visibility":19,"created_at":172,"updated_at":173},"https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fcoze-loop-agent-prompt-eval-and-observability-hub","2026-05-12 22:02:43","2026-05-14 00:40:01",96.67091502846026,[127,128,129,130],[28,132],{"id":178,"uuid":179,"slug":180,"title":181,"description":182,"author_id":9,"author_name":10,"author_avatar":11,"token_estimate":12,"time_saved":12,"model_used":13,"fork_count":12,"vote_count":12,"view_count":183,"parent_id":12,"parent_uuid":13,"lang_type":15,"steps":184,"files":39,"tags":185,"has_voted":30,"visibility":19,"share_token":13,"is_featured":12,"content_hash":93,"asset_kind":28,"target_tools":187,"install_mode":36,"entrypoint":95,"risk_profile":188,"dependencies":189,"verification":194,"agent_metadata":197,"agent_fit":208,"trust":210,"provenance":213,"created_at":123,"updated_at":215,"__relatedScore":216,"__relatedReasons":217,"__sharedTags":218},3154,"19beb569-331b-4aa8-a6f4-fe45cb89b6f3","agenteval-net-toolkit-for-agent-evaluation","AgentEval — .NET Toolkit for Agent Evaluation","AgentEval is a .NET evaluation toolkit for AI agents that validates tool usage, scores RAG quality, compares models, and exports regression-ready reports.",7,[],[186],{"id":26,"name":27,"slug":28,"icon":29},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":190,"pip":191,"brew":192,"system":193},[],[],[],[],{"commands":195,"expected_files":196},[],[],{"asset_kind":28,"target_tools":198,"install_mode":36,"entrypoint":95,"risk_profile":199,"dependencies":200,"content_hash":93,"verification":205},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":201,"pip":202,"brew":203,"system":204},[],[],[],[],{"commands":206,"expected_files":207},[],[],{"target":34,"score":60,"status":61,"policy":61,"why":209,"asset_kind":28,"install_mode":36},[63,64,65,66,67,68,69],{"author_trust_level":71,"verified_publisher":30,"asset_signed_hash":93,"signature_status":72,"install_count":12,"report_count":12,"dangerous_capability_badges":211,"review_status":74,"signals":212},[28],[76,77],{"owner_uuid":9,"owner_name":10,"source_url":214,"content_hash":93,"visibility":19,"created_at":123,"updated_at":215},"https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fagenteval-net-toolkit-for-agent-evaluation","2026-05-14 00:22:22",95.35463498048792,[127,128,129,130],[28,132],{"id":220,"uuid":221,"slug":222,"title":223,"description":224,"author_id":9,"author_name":10,"author_avatar":11,"token_estimate":12,"time_saved":12,"model_used":13,"fork_count":12,"vote_count":12,"view_count":225,"parent_id":12,"parent_uuid":13,"lang_type":15,"steps":226,"files":39,"tags":227,"has_voted":30,"visibility":19,"share_token":13,"is_featured":12,"content_hash":93,"asset_kind":28,"target_tools":229,"install_mode":36,"entrypoint":95,"risk_profile":230,"dependencies":231,"verification":236,"agent_metadata":239,"agent_fit":250,"trust":252,"provenance":255,"created_at":257,"updated_at":258,"__relatedScore":259,"__relatedReasons":260,"__sharedTags":261},3104,"c866ac5d-23f3-4e59-9351-a402817c90ce","trulens-evaluate-and-track-llm-apps","TruLens — Evaluate and Track LLM Apps","Instrument LLM apps and run systematic evals for RAG quality and regressions to find failure modes fast. Combine tracing and scorecards in one workflow.",5,[],[228],{"id":26,"name":27,"slug":28,"icon":29},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":232,"pip":233,"brew":234,"system":235},[],[],[],[],{"commands":237,"expected_files":238},[],[],{"asset_kind":28,"target_tools":240,"install_mode":36,"entrypoint":95,"risk_profile":241,"dependencies":242,"content_hash":93,"verification":247},[33,34,35],{"executes_code":30,"modifies_global_config":30,"requires_secrets":39,"uses_absolute_paths":30,"network_access":30},{"npm":243,"pip":244,"brew":245,"system":246},[],[],[],[],{"commands":248,"expected_files":249},[],[],{"target":34,"score":60,"status":61,"policy":61,"why":251,"asset_kind":28,"install_mode":36},[63,64,65,66,67,68,69],{"author_trust_level":71,"verified_publisher":30,"asset_signed_hash":93,"signature_status":72,"install_count":12,"report_count":12,"dangerous_capability_badges":253,"review_status":74,"signals":254},[28],[76,77],{"owner_uuid":9,"owner_name":10,"source_url":256,"content_hash":93,"visibility":19,"created_at":257,"updated_at":258},"https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Ftrulens-evaluate-and-track-llm-apps","2026-05-12 03:00:17","2026-05-14 00:52:18",93.16722687557547,[127,128,129,130],[28,132]]