[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-voice-ai-stack-zh":3,"seo:pack:voice-ai-stack:zh":70},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":69},"voice-ai-stack","🎙️","#EC4899","stable","稳定","语音 AI 栈","Zonos \u002F Moshi \u002F OpenAI Realtime \u002F LiveKit Agents — 真正能上生产的实时语音 agent 与 TTS。",[16,28,38,46,54,62],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},328,"9b6992d2-2369-45f0-9f8e-6c0c834c649b","zonos-multilingual-tts-voice-cloning-9b6992d2","Zonos — Multilingual TTS with Voice Cloning","Zonos is an open-weight TTS model trained on 200K+ hours of speech. 7.2K+ stars. Voice cloning, 5 languages, emotion control. Apache 2.0.","Script Depot",344,0,"en","script","Script",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":36,"type_label":37},740,"6172db11-6b8c-431b-8f66-f4b7af585534","moshi-real-time-ai-voice-conversation-engine-6172db11","Moshi — Real-Time AI Voice Conversation Engine","Open-source real-time voice AI by Kyutai. Full-duplex speech conversation with 200ms latency, emotion recognition, and on-device processing. Apache 2.0 licensed.","AI Open Source",407,"skill","Skill",{"id":39,"uuid":40,"slug":41,"title":42,"description":43,"author_name":44,"view_count":45,"vote_count":24,"lang_type":25,"type":36,"type_label":37},865,"0d228731-33e3-11f1-9bc6-00163e2b0d79","openai-realtime-agents-voice-ai-agent-patterns-0d228731","OpenAI Realtime Agents — Voice AI Agent Patterns","Advanced agentic patterns for voice AI built on OpenAI Realtime API. Chat-supervisor and sequential handoff patterns with WebRTC streaming. MIT, 6,800+ stars.","OpenAI",356,{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":52,"view_count":53,"vote_count":24,"lang_type":25,"type":36,"type_label":37},214,"804ee888-b285-4369-891e-15f424f587ed","livekit-agents-build-real-time-voice-ai-agents-804ee888","LiveKit Agents — Build Real-Time Voice AI Agents","Framework for building real-time voice AI agents. STT, LLM, TTS pipeline with sub-second latency. Supports OpenAI, Anthropic, Deepgram, ElevenLabs. 9.9K+ stars.","LiveKit",326,{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":60,"view_count":61,"vote_count":24,"lang_type":25,"type":36,"type_label":37},102,"ff8cbccc-09d0-4b0e-a3e2-d58268ca2d39","remotion-ai-voiceover-skill-elevenlabs-tts-ff8cbccc","Remotion AI Voiceover Skill — ElevenLabs TTS","AI skill for adding ElevenLabs text-to-speech voiceover to Remotion videos. Auto-sizes composition duration to match generated audio.","ElevenLabs",323,{"id":63,"uuid":64,"slug":65,"title":66,"description":67,"author_name":60,"view_count":68,"vote_count":24,"lang_type":25,"type":26,"type_label":27},106,"16d32da9-c5fb-43ae-b881-8444b2dcd35b","elevenlabs-python-sdk-ai-text-speech-16d32da9","ElevenLabs Python SDK — AI Text-to-Speech","Official ElevenLabs Python SDK for AI voice generation. Create realistic voiceovers with 30+ languages, voice cloning, and streaming support.",300,"tokrepo install pack\u002Fvoice-ai-stack",{"pageType":71,"pageKey":8,"locale":72,"title":73,"metaDescription":74,"h1":13,"tldr":75,"bodyMarkdown":76,"faq":77,"schema":93,"internalLinks":103,"citations":116,"wordCount":129,"generatedAt":130},"pack","zh","语音 AI 栈：Zonos \u002F Moshi \u002F OpenAI Realtime \u002F LiveKit","真正能上生产的实时语音 agent 与 TTS。Zonos \u002F Moshi \u002F OpenAI Realtime API \u002F LiveKit Agents 全栈。TokRepo 一条命令装齐。","六个语音 AI 资产 —— 开源 TTS（Zonos）+ 全双工对话（Moshi）+ 托管 speech-to-speech（OpenAI Realtime）+ WebRTC 基建（LiveKit Agents），让所有这些一起达到生产延迟目标。","## 这个 pack 装了什么\n\n语音 AI 是「笔记本上 demo」与「真上线给用户用」差距最大的领域。延迟、轮次切换、打断、barge-in 必须全部工作 —— 默认情况下都不工作。这个 pack 收齐 **六个资产**，是 2026 年真在出语音产品的团队在跑的。\n\n| # | 资产 | 层 | 为什么收 |\n|---|---|---|---|\n| 1 | OpenAI Realtime API | speech-to-speech | 托管，亚 300ms 轮次延迟，免拼 STT\u002FTTS |\n| 2 | Moshi | speech-to-speech | 开源全双工；数据留本地 |\n| 3 | Zonos | TTS | 高质量开源 TTS，带克隆 |\n| 4 | LiveKit Agents | 基建 | WebRTC + agent 编排，生产基底 |\n| 5 | 语音 agent 模式 | 设计 | 轮次 \u002F barge-in \u002F 句尾检测 |\n| 6 | 延迟预算工作表 | 运维 | 端到端 \u003C800ms 的组件清单 |\n\n## 为什么要装\n\n1.5 秒响应在语音里感觉是坏了。600ms 响应感觉是人。差距是架构性的，不只是算力 —— 看你怎么组合 STT \u002F LLM \u002F TTS 与网络层。\n\n三个架构选择决定你的语音 agent 是否活：\n\n1. **speech-to-speech vs 级联**。传统级联（audio → STT → LLM → TTS → audio）有 4 个串行瓶颈，每轮通常 1.2-2.0 秒。speech-to-speech 模型（OpenAI Realtime \u002F Moshi）跳过文本中间层，砍到 200-400ms。对话场景选 speech-to-speech；只有需要精细控制 LLM 步骤（比如音频模型还搞不定的工具调用）才选级联。\n2. **流式 vs 非流式 TTS**。非流式 TTS 等完整文本再生成音频。流式在文本前 ~100ms 就开始出音频。5 秒回答这就是 4-5 秒感知延迟差。Zonos 和多数生产 TTS 都支持流式；用它。\n3. **WebRTC vs WebSocket**。WebRTC 处理丢包 \u002F jitter \u002F 自适应码率。WebSocket 不。真在蜂窝网下，能用的电话和卡顿的电话差在传输层选啥。LiveKit Agents 把 agent 循环包进合规 WebRTC；移动端这是必选项。\n\n## 一条命令装齐\n\n```bash\n# 装整个 pack\ntokrepo install pack\u002Fvoice-ai-stack\n\n# 或者先挑一层\ntokrepo install livekit-agents\ntokrepo install moshi\ntokrepo install zonos\n```\n\nTokRepo CLI 把 agent 脚手架、room 配置、SDK init 代码丢进项目。一个干净 checkout 在 10 分钟内就能让 LiveKit room 接到 OpenAI Realtime 跑起来。\n\n## 常见坑\n\n- **对话场景用级联架构**。用户在聊天（不是命令式听写），用 speech-to-speech。级联架构 2023 年合理；2026 年是延迟惩罚，对聊天没补偿性好处。\n- **跳过语音活动检测（VAD）**。没 VAD 要么 agent 抢话（没句尾检测）要么干等固定超时。LiveKit Agents 内置 VAD，开它。\n- **没处理 barge-in**。用户在 agent 说话时开口，agent 必须 ~150ms 内检测并停。硬编码「等讲完」感觉机器人。四个引擎都支持 barge-in，但有些配置默认关。\n- **TTS prompt 不匹配语音**。「$1,234.56」念出来很糟。送 TTS 前预处理数字 \u002F 日期 \u002F 缩写。TokRepo 的 voice-agent-patterns 资产带规范化器。\n- **没给首轮延迟留余量**。会话第一回答永远比稳态慢 200-400ms，因为模型在加载缓存。用「就绪」音或连接动画把空隙盖住。\n\n## 常见误解\n\n「speech-to-speech 不能做工具调用。」过时了 —— OpenAI Realtime 原生支持函数调用，Moshi 可以包在工具路由 agent 里。2024 年的限制不再成立。\n\n「每个并发通话要一张 GPU。」中等质量纯 TTS 现代开源在 CPU 上能实时。speech-to-speech 自建 Moshi 才需 GPU，或者把延迟外包给 OpenAI Realtime。LiveKit Agents 处理连接多路复用，所以一台机器能中转多并发会话即便模型在别处。\n\n「语音克隆太危险不能上线。」Zonos 这类引擎自带需要授权的水印标记。在明确用户授权下负责任使用（比如用户给*自己的*声音做无障碍克隆）是安全且高价值的功能。风险在未授权克隆，引擎本身就劝阻。",[78,81,84,87,90],{"q":79,"a":80},"OpenAI Realtime 免费吗？","不 —— 按音频分钟计费（输入和输出），定价是纯文本 API 的几倍，因为音频 token 更密。原型阶段成本可忽略；上线产品每天上千分钟就要先算账。自建 Moshi 每分钟零成本但要 GPU。多数团队先用 Realtime 上生产，量大到能 cover GPU 账单时再迁 Moshi。",{"q":82,"a":83},"Moshi 跟 OpenAI Realtime 比怎么样？","Moshi 是 Kyutai 的开源、可自建、全双工 speech-to-speech。OpenAI Realtime 是托管、闭源，英语质量稍高。决策树：数据主权或零分钟成本 → Moshi；延迟最低 + 零基建 → OpenAI Realtime。架构模式相同，所以你代码里包一层后两者长得很像。",{"q":85,"a":86},"用 Cursor 或 Codex CLI 也能用吗？","语音 agent 是服务端服务，不是编辑器插件。用 LiveKit Agents + Realtime\u002FMoshi 构建独立应用。Cursor \u002F Codex CLI 用来*写这些 agent 的代码*（TokRepo 装会丢可跑脚手架），但运行时本身是独立服务。Codex CLI 工具页有针对 Realtime API 的 agent 构建示例。",{"q":88,"a":89},"跟 LLM 可观测性 pack 啥区别？","可观测性给你 trace —— 每轮延迟 \u002F 模型错误 \u002F token 成本。语音 AI 栈 pack 是*构建运行时*。两个都要：装语音栈来上线语音 agent，装可观测性来调第 47 轮为什么有 2 秒延迟。LiveKit Agents 发标准 OpenTelemetry trace，Langfuse \u002F Phoenix 直接吃。",{"q":91,"a":92},"能用我现有的 TTS 吗？","能。pack 文档化了 LiveKit Agents 期待的契约（音频帧 \u002F 句尾信号 \u002F barge-in 事件），ElevenLabs \u002F Cartesia \u002F Azure TTS 或任何支持流式的引擎都能接。Zonos 作为强开源默认收录。voice-agent-patterns 资产有不重写 agent 循环就换 TTS 的指南。",{"@context":94,"@type":95,"name":96,"description":97,"numberOfItems":98,"publisher":99},"https:\u002F\u002Fschema.org","CollectionPage","Voice AI Stack","Zonos, Moshi, OpenAI Realtime, LiveKit Agents — real-time voice agents and TTS that ship to production.",6,{"@type":100,"name":101,"url":102},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[104,108,112],{"url":105,"anchor":106,"reason":107},"\u002Fzh\u002Fpacks\u002Fmulti-agent-frameworks","多 Agent 框架","语音 agent 接进的编排层",{"url":109,"anchor":110,"reason":111},"\u002Fzh\u002Fpacks\u002Fllm-observability","LLM 可观测性","像文本一样追语音会话",{"url":113,"anchor":114,"reason":115},"\u002Fzh\u002Ftools\u002Fcodex-cli","Codex CLI","驱动 Realtime API 集成",[117,121,125],{"claim":118,"source_name":119,"source_url":120},"LiveKit Agents framework for building real-time AI voice and video applications","livekit\u002Fagents","https:\u002F\u002Fgithub.com\u002Flivekit\u002Fagents",{"claim":122,"source_name":123,"source_url":124},"Moshi full-duplex spoken dialogue framework from Kyutai","kyutai-labs\u002Fmoshi","https:\u002F\u002Fgithub.com\u002Fkyutai-labs\u002Fmoshi",{"claim":126,"source_name":127,"source_url":128},"OpenAI Realtime API for low-latency speech-to-speech interactions","OpenAI Realtime API","https:\u002F\u002Fplatform.openai.com\u002Fdocs\u002Fguides\u002Frealtime",539,"2026-05-02T15:20:00Z"]