[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-tts-stt-voice-stack-zh":3,"seo:pack:tts-stt-voice-stack:zh":98},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":97},"tts-stt-voice-stack","🎙️","#F97316","new","本周新建","TTS + STT 语音全家桶","做语音机器人、转写流水线、有声书旁白的 10 件套：STT 选 Whisper \u002F whisper.cpp \u002F Faster Whisper \u002F WhisperX；TTS 选 ElevenLabs \u002F Coqui \u002F Bark \u002F StyleTTS 2 \u002F Kokoro；OpenVoice 负责声音克隆。和 voice-ai-stack 互补 —— 那边是实时对话底座，这边是 STT\u002FTTS 零件库。",[16,28,36,43,50,60,68,75,82,90],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},105,"eb0f9dd6-2172-4c9f-aca9-97846b0f4d86","whisper-openai-speech-text-eb0f9dd6","Whisper — OpenAI Speech-to-Text","OpenAI's open-source speech recognition model. Transcribe audio\u002Fvideo to text with word-level timestamps in 99 languages. Essential for subtitle generation.","OpenAI",221,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},390,"e1fd7c46-bbda-4956-8649-9c3ed579ff25","whisper-cpp-local-speech-text-pure-c-c-e1fd7c46","whisper.cpp — Local Speech-to-Text in Pure C\u002FC++","High-performance port of OpenAI Whisper in C\u002FC++. No Python, no GPU required. Runs on CPU, Apple Silicon, CUDA, and even Raspberry Pi. Real-time transcription.","Script Depot",1602,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":34,"view_count":42,"vote_count":24,"lang_type":25,"type":26,"type_label":27},270,"24576b2c-a9d1-4f7a-9696-b1e5c50a17f3","faster-whisper-4x-faster-speech-text-24576b2c","Faster Whisper — 4x Faster Speech-to-Text","Faster Whisper is a reimplementation of OpenAI Whisper using CTranslate2, up to 4x faster with less memory. 
21.8K+ GitHub stars. GPU\u002FCPU, 8-bit quantization, word timestamps, VAD. MIT licensed.",202,{"id":44,"uuid":45,"slug":46,"title":47,"description":48,"author_name":34,"view_count":49,"vote_count":24,"lang_type":25,"type":26,"type_label":27},287,"c43ad870-8c99-471a-898e-b07140faf532","whisperx-70x-faster-speech-recognition-c43ad870","WhisperX — 70x Faster Speech Recognition","WhisperX provides 70x realtime speech recognition with word-level timestamps and speaker diarization. 21K+ GitHub stars. Batched inference, under 8GB VRAM. BSD-2-Clause.",237,{"id":51,"uuid":52,"slug":53,"title":54,"description":55,"author_name":56,"view_count":57,"vote_count":24,"lang_type":25,"type":58,"type_label":59},106,"16d32da9-c5fb-43ae-b881-8444b2dcd35b","elevenlabs-python-sdk-ai-text-speech-16d32da9","ElevenLabs Python SDK — AI Text-to-Speech","Official ElevenLabs Python SDK for AI voice generation. Create realistic voiceovers with 30+ languages, voice cloning, and streaming support.","ElevenLabs",194,"script","Script",{"id":61,"uuid":62,"slug":63,"title":64,"description":65,"author_name":66,"view_count":67,"vote_count":24,"lang_type":25,"type":58,"type_label":59},423,"a059dce2-6275-4ea0-a57b-e885248d8e95","coqui-tts-deep-learning-text-speech-engine-a059dce2","Coqui TTS — Deep Learning Text-to-Speech Engine","Generate speech in 1100+ languages with voice cloning. XTTS v2 streams with under 200ms latency. 44K+ GitHub stars.","TokRepo精选",285,{"id":69,"uuid":70,"slug":71,"title":72,"description":73,"author_name":34,"view_count":74,"vote_count":24,"lang_type":25,"type":26,"type_label":27},279,"814b8972-5d48-4379-9756-9a3d8ed686f7","bark-ai-text-audio-music-effects-814b8972","Bark — AI Text-to-Audio with Music & Effects","Bark is a transformer text-to-audio model by Suno that generates speech, music, and sound effects. 39.1K+ GitHub stars. 12+ languages, 100+ voice presets, non-speech audio. 
MIT licensed.",201,{"id":76,"uuid":77,"slug":78,"title":79,"description":80,"author_name":34,"view_count":81,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2462,"e7a8aaaf-453a-11f1-9bc6-00163e2b0d79","styletts-2-human-level-text-speech-via-style-diffusion-e7a8aaaf","StyleTTS 2 — Human-Level Text-to-Speech via Style Diffusion","A TTS system that achieves human-level speech synthesis through style diffusion and adversarial training with large speech language models. Fast inference with natural prosody.",108,{"id":83,"uuid":84,"slug":85,"title":86,"description":87,"author_name":88,"view_count":89,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2265,"ae7169ee-42b9-11f1-9bc6-00163e2b0d79","openvoice-instant-voice-cloning-tone-style-control-ae7169ee","OpenVoice — Instant Voice Cloning with Tone and Style Control","OpenVoice is an open-source voice cloning framework from MyShell AI that reproduces a speaker's voice from a short audio sample while giving independent control over emotion, accent, rhythm, and language.","AI Open Source",90,{"id":91,"uuid":92,"slug":93,"title":94,"description":95,"author_name":34,"view_count":96,"vote_count":24,"lang_type":25,"type":26,"type_label":27},275,"44809dfb-1735-4aae-af74-f21f4b805d0f","kokoro-lightweight-82m-tts-9-languages-44809dfb","Kokoro — Lightweight 82M TTS in 9 Languages","Kokoro is an 82M parameter text-to-speech model delivering quality comparable to larger models. 6.2K+ GitHub stars. Supports English, Spanish, French, Japanese, Chinese, and more. 
Apache 2.0.",208,"tokrepo install pack\u002Ftts-stt-voice-stack",{"pageType":99,"pageKey":8,"locale":100,"title":101,"metaDescription":102,"h1":13,"tldr":103,"bodyMarkdown":104,"faq":105,"schema":121,"internalLinks":131,"citations":144,"wordCount":157,"generatedAt":158},"pack","zh","TTS + STT 语音全家桶 — Whisper、ElevenLabs、Coqui、Bark、StyleTTS","做语音机器人、转写流水线、有声书旁白的 10 件套 — STT 选 Whisper 全家族，TTS 选 ElevenLabs \u002F Coqui \u002F Bark \u002F StyleTTS 2 \u002F Kokoro，OpenVoice 负责声音克隆。TokRepo 一键安装。","经典级联架构的 10 个零件 —— STT 挑一个 Whisper 变体，中间接 LLM，TTS 按延迟预算挑引擎。Bark 玩表现力、Kokoro 跑笔记本 CPU、ElevenLabs 拼质量、Coqui\u002FStyleTTS 自部署、OpenVoice 做声音克隆。","## 这个 pack 包含什么\n\n这是语音应用的**零件目录**。[Voice AI Stack pack](\u002Fzh\u002Fpacks\u002Fvoice-ai-stack) 提供的是实时对话底座（LiveKit、Moshi、OpenAI Realtime、Zonos）—— 那是 speech-to-speech 路线；本 pack 提供的是把经典**级联架构**拼起来用的离散 STT \u002F TTS 引擎：`麦克风 → STT → LLM → TTS → 扬声器`。\n\n级联没死。下面这些场景级联仍然是正确选择：\n\n- **要精细控制 LLM 中间环节** —— tool call、结构化输出、RAG、内容审核，任何需要拿到\u002F改写文本的地方。\n- **规模化下追成本** —— speech-to-speech 模型每分钟仍比一套调好的级联贵 3-5 倍。\n- **非实时场景** —— 转写流水线、有声书生成、播客后期、视频配音。延迟不是约束。\n- **自部署或断网部署** —— 这里每个组件都有开源选项，能跑在你自己的 GPU 甚至 CPU 上。\n\n10 个选项，按层分：\n\n| 层 | 选什么 | 什么时候用 |\n|---|---|---|\n| STT — 标杆 | Whisper | 参考实现。批量转写、多语种、口碑准确率。 |\n| STT — 本地 | whisper.cpp | 纯 C\u002FC++ 移植，CPU \u002F Apple Silicon，无 Python。移动端、边缘端唯一现实选择。 |\n| STT — 加速 | Faster Whisper | CTranslate2 提速 4×，精度不变、显存更省。 |\n| STT — 带说话人分离 | WhisperX | 70× 提速 + 词级时间戳 + 说话人分离。会议、播客。 |\n| TTS — 商业 | ElevenLabs Python SDK | 体感质量天花板，流式、声音克隆。按字符计费。 |\n| TTS — 开源框架 | Coqui TTS | 支持多种模型架构的深度学习 TTS 引擎，可自部署。 |\n| TTS — 富表现力 | Bark | Suno 的 transformer 模型，能生成音乐、音效、非语音音频。MIT。 |\n| TTS — 接近人声 | StyleTTS 2 | Style diffusion，自然度逼近商业引擎。 |\n| TTS — 轻量 | Kokoro | 82M 参数 \u002F 9 语种，笔记本 CPU 上能跑。 |\n| 声音克隆 | OpenVoice | 即时克隆，音色和风格独立可控。 |\n\n## 推荐安装顺序\n\n```bash\n# 1. 
先选 STT —— 它决定了延迟下限\ntokrepo install whisper-cpp           # 本地 CPU\n# 或\ntokrepo install faster-whisper        # GPU，支持批量 + 流式\n# 或\ntokrepo install whisperx              # 转写 + 说话人分离\n\n# 2. 按质量门槛配 TTS\ntokrepo install elevenlabs-python-sdk # 出货级质量，按字符计费\n# 或\ntokrepo install coqui-tts             # 自部署，质量过得去\n# 或\ntokrepo install kokoro                # 轻量，到处能跑\n\n# 3. 可选 —— 给品牌旁白配声音克隆\ntokrepo install openvoice\n```\n\nTokRepo CLI 把每个资产以一个 skill 的形式落进你的仓库。Claude Code \u002F Cursor \u002F Codex CLI 都能直接读到 skill 文件里现成的 Python 片段和依赖清单，再拼进你自己的应用循环。\n\n## 级联怎么拼\n\n```\n[ 麦克风 \u002F 音频文件 ]\n        │\n        ▼\n[ STT — Whisper 变体 ]\n        │  文本 + 词级时间戳\n        ▼\n[ LLM — 你的选择 ]\n        │  回复文本 + tool call\n        ▼\n[ 文本归一化 ]\n        │  数字、日期、emoji 处理\n        ▼\n[ TTS — ElevenLabs \u002F Coqui \u002F Bark \u002F Kokoro ]\n        │  流式音频帧\n        ▼\n[ 扬声器 \u002F 输出文件 ]\n```\n\n所有在生产跑的级联都做对了这几件事：\n\n1. **两端都流式。** STT 每 ~200ms 给一次部分假设；TTS 在 LLM 输出前 ~100ms 文本到达后就开始出声。LLM 一定要 token 流式。端到端体感延迟从「发完等」变成「细水长流」。\n2. **TTS 之前先归一化。** `$1,234.56` 大多数引擎会念成「美元一逗号二三四点五六」。一个 20 行的归一化函数处理货币、日期、缩写、URL，胜过一周「为啥我 agent 听起来这么蠢」。\n3. 
**把模型驻留下来。** Whisper-large 冷启动加载权重要 ~3 秒。让模型常驻在一个长生命周期进程里，第一次转写不该付这个代价。\n\n## 你会遇到的取舍\n\n- **Whisper-large vs medium vs tiny。** Tiny 能跑树莓派；large 要 GPU。多数生产团队最后落在 medium + VAD 切片这个性价比拐点。Faster Whisper 让 large 变可负担；whisper.cpp 让 tiny\u002Fbase 在 CPU 上能用。\n- **ElevenLabs vs 开源 TTS。** ElevenLabs 明显更好听，但月费 $30-330 + 字符超量计费。Coqui + StyleTTS 2 能做到「生产够用」，但要 GPU。分水岭：日字符 \u003C 10w，ElevenLabs 更划算；以上自部署。\n- **Bark vs Kokoro vs StyleTTS。** Bark 表现力强（能笑、能唱、能音效）但慢，不总好控；Kokoro 快而小，但语调中性；StyleTTS 2 接近人声但要最多显存。匹配场景 —— 游戏 NPC 用 Bark、IVR 用 Kokoro、有声书用 StyleTTS。\n- **声音克隆伦理。** OpenVoice 和 ElevenLabs 都支持基于授权的克隆。必须显式取得用户同意并记录授权日志。未经授权的克隆是丢单\u002F吃官司最直接的路。\n\n## 常见踩坑\n\n- **STT 前没挂 VAD。** 给 Whisper 喂持续静音会得到幻觉转写（最经典的是 `Thank you for watching!`）。在 Whisper 前接 30 行 `webrtcvad` 或 `silero-vad`。仅此一改就能干掉级联里最常见的 bug。\n- **LLM 回复整段才送 TTS。** 你在串行付 LLM 全延迟 + TTS 全延迟。把 LLM token 流式喂进句子缓冲区，碰到 `。`、`?`、`!` 立刻冲一句到 TTS。\n- **采样率不匹配。** Whisper 要 16kHz 单声道。TTS 输出 22.05 \u002F 24 \u002F 48 kHz。边界处重采样；不匹配会出花栗鼠音或低频闷音，QA 会甩锅给模型。\n- **把 WhisperX 当 Whisper 平替。** WhisperX 的说话人分离要依赖 `pyannote`，需要 Hugging Face token + 同意 license。生产依赖前先把鉴权打通。\n- **没记录音频 + 转写配对。** 语音应用悄悄回归 —— TTS 更新、STT 版本变更都可能默默降质。采样 1% 会话把音频和转写存下来，每周看一遍。否则只能等用户怒来反馈。",[106,109,112,115,118],{"q":107,"a":108},"为啥选级联而不是 Moshi \u002F OpenAI Realtime 这种 speech-to-speech？","三个原因。第一，可控 —— 级联让你在 STT 和 TTS 之间拿到文本，能加 tool call、RAG、内容过滤、LLM 路由，这些 audio-native 模型现在还很弱。第二，成本 —— 规模化下 Whisper + GPT-4o-mini + Kokoro 每分钟可以比 Realtime API 便宜 5-10 倍。第三，匹配 —— 非对话场景（转写、有声书、播客后期）根本没有「实时对话感」要保。speech-to-speech 那条路看 [Voice AI Stack pack](\u002Fzh\u002Fpacks\u002Fvoice-ai-stack)，本 pack 覆盖其他所有场景。",{"q":110,"a":111},"Whisper 这几个变体到底选哪个？","有 GPU 就先上 Faster Whisper —— 和标杆 Whisper 同精度、4× 吞吐、显存更省。在 CPU \u002F Apple Silicon \u002F 边缘硬件上就 whisper.cpp，没第二个现实选择。要说话人分离或词级时间戳（会议、播客、字幕）选 WhisperX。仅当复现论文或者 CTranslate2 不支持你要的模型时，才用 OpenAI 原版 Whisper。",{"q":113,"a":114},"自部署 TTS 比 ElevenLabs 到底便宜多少？","粗算：Coqui 或 StyleTTS 2 跑在单卡 A10G 上（AWS 约 $0.75\u002F小时），合理质量下每 GPU-小时能产出约 200 小时音频，也就是约 
$0.004\u002F小时音频（折合约 $0.00006\u002F分钟）。ElevenLabs Creator 档大致等价 $0.03\u002F分钟。盈亏平衡在每天 25-50 小时音频量附近；以下用 ElevenLabs 运维上更划算（省掉推理基建），以上自部署赢。Kokoro 把这个账往下拉一档 —— 它在 CPU 上就有可用速度。",{"q":116,"a":117},"Claude Code \u002F Cursor \u002F Codex CLI 能用上吗？","能。本 pack 每一项都以 TokRepo skill 形式安装，会落一个 `.md` 文件 + 示例 Python 进你仓库 —— 不管你用哪个 agent CLI，模型都能直接读到 API key 处理、流式代码、采样率转换这些上下文，再帮你拼进应用。TokRepo 上 Codex CLI 和 Cursor 的入口都带了组合多个本 pack 资产的 voice agent 示例。",{"q":119,"a":120},"TTS 和 STT 的质量能自动评测吗？","能，但要选对指标。STT：在 held-out 转写集上算 WER（word error rate），用 `jiwer` 这个库做计算。TTS：没有单一数字 —— MOS（mean opinion score）需要真人；UTMOS 和 NISQA 能给自动估算。现实可跑的循环：常备一份 50 条 golden set，STT 改动跑 WER，TTS 改动跑一个小规模 MOS panel（5 个评审、30 分钟）。不要不评测就上线 —— TTS \u002F STT 升级回归常常打在指标盲区。",{"@context":122,"@type":123,"name":13,"description":124,"numberOfItems":125,"inLanguage":126,"publisher":127},"https:\u002F\u002Fschema.org","CollectionPage","做语音机器人、转写流水线、有声书旁白的 10 个 TTS \u002F STT 零件 —— Whisper 变体、ElevenLabs、Coqui、Bark、StyleTTS 2、Kokoro、OpenVoice。",10,"zh-CN",{"@type":128,"name":129,"url":130},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[132,136,140],{"url":133,"anchor":134,"reason":135},"\u002Fzh\u002Fpacks\u002Fvoice-ai-stack","Voice AI Stack","实时对话底座（LiveKit \u002F Moshi \u002F OpenAI Realtime）—— 本 pack 的零件搭那边的 runtime",{"url":137,"anchor":138,"reason":139},"\u002Fzh\u002Fpacks\u002Fml-engineer-rag-eval","ML Engineer RAG + Eval pack","RAG 的评测方法论可以直接迁移到 STT \u002F TTS 质量追踪",{"url":141,"anchor":142,"reason":143},"\u002Fzh\u002Fpacks\u002Fcontent-creator-ai-studio","内容创作者 AI 工作室","那个 pack 里视频配音流水线吃的就是本 pack 的 TTS",[145,149,153],{"claim":146,"source_name":147,"source_url":148},"Whisper 是 OpenAI 开源的语音识别模型","openai\u002Fwhisper","https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper",{"claim":150,"source_name":151,"source_url":152},"whisper.cpp 把 Whisper 推理移植到纯 C\u002FC++，支持 CPU 和 Apple Silicon","ggerganov\u002Fwhisper.cpp","https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fwhisper.cpp",{"claim":154,"source_name":155,"source_url":156},"Bark 是 
Suno 的 transformer 文字转音频模型，支持语音、音乐、音效","suno-ai\u002Fbark","https:\u002F\u002Fgithub.com\u002Fsuno-ai\u002Fbark",880,"2026-05-22T10:00:00Z"]