[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-voice-clone-podcast-studio-zh":3,"seo:pack:voice-clone-podcast-studio:zh":98},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":97},"voice-clone-podcast-studio","🎙️","#8B5CF6","new","本周新建","Voice Cloning + Podcast — 一个人撑起整个播客","做独立播客、配音、YouTube 的单兵 10 件套：Audacity 收音 + 降噪 + 剪辑、Whisper \u002F whisper.cpp 转写出稿、ElevenLabs \u002F OpenVoice \u002F GPT-SoVITS \u002F Fish Speech \u002F Coqui TTS 做声音克隆和多语配音、KrillinAI 一键把视频 dub 成 100 国语言、VideoCaptioner 自动烤字幕。录音 → 降噪 → 克隆 → 配音 → 剪辑发布，一个人全跑。",[16,28,36,44,54,61,68,75,83,90],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1720,"44f450b6-3b20-11f1-9bc6-00163e2b0d79","audacity-free-cross-platform-audio-editor-44f450b6","Audacity — Free Cross-Platform Audio Editor","Audacity is a free, open-source digital audio editor and recorder for Windows, macOS, and Linux. It supports multi-track editing, a wide range of audio formats, real-time effects, and plugin extensibility for recording, editing, and mastering audio.","AI Open Source",119,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},105,"eb0f9dd6-2172-4c9f-aca9-97846b0f4d86","whisper-openai-speech-text-eb0f9dd6","Whisper — OpenAI Speech-to-Text","OpenAI's open-source speech recognition model. Transcribe audio\u002Fvideo to text with word-level timestamps in 99 languages. 
Essential for subtitle generation.","OpenAI",221,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":42,"view_count":43,"vote_count":24,"lang_type":25,"type":26,"type_label":27},390,"e1fd7c46-bbda-4956-8649-9c3ed579ff25","whisper-cpp-local-speech-text-pure-c-c-e1fd7c46","whisper.cpp — Local Speech-to-Text in Pure C\u002FC++","High-performance port of OpenAI Whisper in C\u002FC++. No Python, no GPU required. Runs on CPU, Apple Silicon, CUDA, and even Raspberry Pi. Real-time transcription.","Script Depot",1602,{"id":45,"uuid":46,"slug":47,"title":48,"description":49,"author_name":50,"view_count":51,"vote_count":24,"lang_type":25,"type":52,"type_label":53},106,"16d32da9-c5fb-43ae-b881-8444b2dcd35b","elevenlabs-python-sdk-ai-text-speech-16d32da9","ElevenLabs Python SDK — AI Text-to-Speech","Official ElevenLabs Python SDK for AI voice generation. Create realistic voiceovers with 30+ languages, voice cloning, and streaming support.","ElevenLabs",194,"script","Script",{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":22,"view_count":60,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2265,"ae7169ee-42b9-11f1-9bc6-00163e2b0d79","openvoice-instant-voice-cloning-tone-style-control-ae7169ee","OpenVoice — Instant Voice Cloning with Tone and Style Control","OpenVoice is an open-source voice cloning framework from MyShell AI that reproduces a speaker's voice from a short audio sample while giving independent control over emotion, accent, rhythm, and language.",90,{"id":62,"uuid":63,"slug":64,"title":65,"description":66,"author_name":22,"view_count":67,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3660,"8b48f7ce-4f09-11f1-9bc6-00163e2b0d79","gpt-sovits-few-shot-voice-cloning-text-speech-8b48f7ce","GPT-SoVITS — Few-Shot Voice Cloning and Text-to-Speech","An open-source TTS system that can clone any voice from just one minute of audio data, combining GPT-style language modeling with VITS synthesis for natural speech 
generation.",108,{"id":69,"uuid":70,"slug":71,"title":72,"description":73,"author_name":22,"view_count":74,"vote_count":24,"lang_type":25,"type":26,"type_label":27},269,"88c15e9c-439c-4e70-8b8f-cd04efe928c0","fish-speech-multilingual-tts-80-languages-88c15e9c","Fish Speech — Multilingual TTS for 80+ Languages","Fish Speech is a state-of-the-art open-source TTS system supporting 80+ languages. 29K+ GitHub stars. 4B dual-AR model, voice cloning, emotional control with 15K+ tags, real-time inference.",228,{"id":76,"uuid":77,"slug":78,"title":79,"description":80,"author_name":81,"view_count":82,"vote_count":24,"lang_type":25,"type":52,"type_label":53},423,"a059dce2-6275-4ea0-a57b-e885248d8e95","coqui-tts-deep-learning-text-speech-engine-a059dce2","Coqui TTS — Deep Learning Text-to-Speech Engine","Generate speech in 1100+ languages with voice cloning. XTTS v2 streams with under 200ms latency. 44K+ GitHub stars.","TokRepo精选",286,{"id":84,"uuid":85,"slug":86,"title":87,"description":88,"author_name":22,"view_count":89,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2632,"e0ea662e-47b4-11f1-9bc6-00163e2b0d79","krillinai-ai-video-translation-dubbing-100-languages-e0ea662e","KrillinAI — AI Video Translation and Dubbing in 100 Languages","An open-source tool that uses LLMs to translate and dub video content into over 100 languages with one-click deployment, optimized for YouTube, TikTok, and other platforms.",94,{"id":91,"uuid":92,"slug":93,"title":94,"description":95,"author_name":42,"view_count":96,"vote_count":24,"lang_type":25,"type":26,"type_label":27},110,"d12d8441-f0da-4d3d-a0c2-0f258b27336f","videocaptioner-ai-subtitle-pipeline-d12d8441","VideoCaptioner — AI Subtitle Pipeline","LLM-powered video subtitle tool: Whisper transcription + AI correction + 99-language translation + styled subtitle export. 
13,800+ stars.",238,"tokrepo install pack\u002Fvoice-clone-podcast-studio",{"pageType":99,"pageKey":8,"locale":100,"title":101,"metaDescription":102,"h1":13,"tldr":103,"bodyMarkdown":104,"faq":105,"schema":121,"internalLinks":132,"citations":145,"wordCount":158,"generatedAt":159},"pack","zh","Voice Cloning + Podcast — 一个人撑起整个播客的 10 件套","Audacity、Whisper、whisper.cpp、ElevenLabs、OpenVoice、GPT-SoVITS、Fish Speech、Coqui TTS、KrillinAI、VideoCaptioner — 独立播客 \u002F 配音师 \u002F YouTuber 的单兵装备，覆盖录音、降噪、转写、声音克隆、多语 dub、剪辑发布全流程。含安装顺序 + 授权红线。","五层、10 件、一条流水线：收录降噪（Audacity）→ 转写出稿（Whisper 家族）→ 克隆自己声音（ElevenLabs \u002F OpenVoice \u002F GPT-SoVITS）→ 100 国语言配音（KrillinAI \u002F Fish Speech \u002F Coqui TTS）→ 烤字幕剪短视频（VideoCaptioner）。克隆别人声音前先拿到授权 —— 这条不是建议是红线。","## 这个 pack 包含什么\n\n这是给一个**独立播客主 \u002F 配音师 \u002F YouTuber 一个人撑整套节目**而设计的 rig —— 没有制片人、没有混音师、没有翻译公司。10 件，刻意排序，要么开源要么有靠谱免费档。不是「所有相关工具」，是「能让一个人周一录、周五出多语字幕降噪克隆版本」的最小集。\n\n五层、每层在真有取舍的地方给两到三个选项：\n\n| 层 | 选项 | 用途 |\n|---|---|---|\n| 1. 录音 + 降噪 | Audacity | 免费 DAW。多轨录音、去嘶声\u002F喷麦\u002Fclick、什么格式都能导。 |\n| 2. 转写 | Whisper（云）· whisper.cpp（本地） | 云端精度最高；whisper.cpp 用于离线 \u002F 敏感内容 \u002F 批量 \u002F 移动端。 |\n| 3. 克隆自己的声音 | ElevenLabs · OpenVoice · GPT-SoVITS | ElevenLabs = 音质天花板，付费。OpenVoice = 即时音色+风格克隆，MIT。GPT-SoVITS = few-shot 克隆，可自部署。 |\n| 4. 多语配音 | Fish Speech · Coqui TTS · KrillinAI | Fish Speech 支持 80+ 语种。Coqui TTS = 可插拔引擎。KrillinAI 直接吃视频文件，一键 dub 成另一种语言。 |\n| 5. 
字幕 + 发布 | VideoCaptioner | 给 TikTok \u002F Reels \u002F Shorts 的竖屏剪辑烤词级字幕。 |\n\n这套是按**单兵作战**裁的尺寸。如果你是 3 人播客网络带剪辑师，Audacity 换 Reaper \u002F Adobe Audition（付费）、KrillinAI 换人工翻译、再加发布排期工具。其他人就装这套。\n\n## 推荐安装顺序\n\n**别先装声音克隆。** 录音不干净，克隆出来的也不干净。\n\n```bash\n# 阶段 1 — 收录降噪（周一）\ntokrepo install audacity\n\n# 阶段 2 — 出转写稿，让你按文字编辑不按波形（周一晚）\ntokrepo install whisper-cpp        # 本地 \u002F 免费 \u002F M 系芯片 5x 实时\n# 或\ntokrepo install whisper            # OpenAI API，精度天花板\n\n# 阶段 3 — 克隆自己的声音（周二 —— 这步一辈子只做一次）\ntokrepo install elevenlabs-python-sdk  # 3 分钟干净音频 → 录音棚级克隆\n# 或 —— 想自部署 \u002F 不想按字符付费\ntokrepo install openvoice              # 即时克隆，MIT\ntokrepo install gpt-sovits             # few-shot，建议 GPU\n\n# 阶段 4 — 把片段 dub 成其他语言（周三）\ntokrepo install fish-speech            # 多语种 TTS，80+ 语种\ntokrepo install coqui-tts              # 自部署替代品\ntokrepo install krillinai              # 全视频 dub，字幕+配音，一条命令\n\n# 阶段 5 — 发布（周四）\ntokrepo install videocaptioner         # 给短视频烤动效字幕\n```\n\nTokRepo CLI 把每个资产以 skill 形式落进你的仓库。Claude Code \u002F Cursor \u002F Codex CLI 读 skill 就能给你拼脚本 —— 比如「拿 `episode-12.wav`，Audacity headless 降噪、whisper.cpp 转写、KrillinAI 把前 60 秒 dub 成西语、VideoCaptioner 烤字幕，输出 `ep12-es.mp4`」整段就是一条 agent prompt。\n\n## 它们怎么协同\n\n```\n[ 麦克风 \u002F Riverside \u002F Zoom 录音 ]\n             │\n             ▼\n   ┌─────────────────────┐\n   │ Audacity            │  noise gate \u002F EQ \u002F 归一化 \u002F click 去除\n   └─────────────────────┘\n             │  干净 WAV\n             ▼\n   ┌─────────────────────┐\n   │ Whisper \u002F whisper.cpp │  转写 + 词级时间戳\n   └─────────────────────┘\n             │  按删文字编辑，不按拉波形\n             ▼\n   ┌─────────────────────────────────┐\n   │ 声音克隆（三选一）：           │\n   │   ElevenLabs · OpenVoice ·     │  → 你的人声模型\n   │   GPT-SoVITS                   │\n   └─────────────────────────────────┘\n             │\n             ├──► 补录某句口误：打字给「你的声音」念\n             │\n             ▼\n   ┌─────────────────────────────────┐\n   │ 多语配音（三选一）：           │\n   │   Fish Speech（TTS 
引擎）·     │\n   │   Coqui TTS · KrillinAI        │  → ES \u002F JA \u002F DE \u002F FR 音轨\n   │   （全视频流水线）             │\n   └─────────────────────────────────┘\n             │\n             ▼\n   ┌─────────────────────┐\n   │ VideoCaptioner      │  逐词烤字幕，竖屏切片\n   └─────────────────────┘\n             │\n             ▼\n   [ YouTube \u002F 小宇宙 \u002F TikTok \u002F Reels \u002F 视频号 ]\n```\n\n这套真正的解锁点是 **按文字编辑，而不是按波形编辑**。Whisper 给你带时间戳的转写以后，删一个「呃」就是从文本文件里删一个词然后重渲染 —— 5 倍提速的来源不是克隆、不是 dub，是再也不用拉 90 分钟波形。\n\n## 你会遇到的取舍\n\n- **ElevenLabs vs OpenVoice vs GPT-SoVITS 选哪个克隆自己声音。** ElevenLabs 是音质天花板 —— 3 分钟干净音频克隆出来朋友都听不出来，但月费 $5–$330 + 字符超量计费，模型还存在他们服务器上。OpenVoice 是 MIT 许可、消费级 GPU 能跑，质量是「播客开场没问题、长篇旁白勉强」。GPT-SoVITS 是开源里最强的，但每个音色都要 fine-tune 一遍。要最快出结果选 ElevenLabs；在意授权或月费选 OpenVoice \u002F GPT-SoVITS。\n- **云端 Whisper vs whisper.cpp。** 云端精度最高，中文、日文、专有名词尤其明显。whisper.cpp 不用联网、不按分钟计费、数据不出本机 —— 有名人嘉宾的播客走云端；敏感内部 \u002F 记者采访保护信源走本地。\n- **KrillinAI vs DIY（Fish Speech + Coqui）。** KrillinAI 吃一个视频文件，吐回同一个视频在新语言下的版本，唇形大致对得上、字幕带好 —— 一条命令。DIY 路径（提取音频 → 转写 → 翻译 → re-TTS → 混回去）每步可控但整合工作量是 5 倍。一稿用 KrillinAI；某一步要精调时下沉到 DIY。\n- **多语保真度现实校准。** 英语训练的克隆模型做中文 \u002F 日语 \u002F 韩语都会有「外国口音」。本 pack 里 Fish Speech 是最强的多语种 TTS。商业级本地化（付费客户）依然该找母语配音员，克隆只是草稿质量、不是播出质量。\n- **实时 vs 离线。** 本 pack 没有任何实时方案 —— 这是**后期录制棚**不是直播 rig。要实时去看 [Voice AI Stack pack](\u002Fzh\u002Fpacks\u002Fvoice-ai-stack)。\n\n## 常见踩坑（含一个伦理的）\n\n- **你没有克隆别人声音的权利。** 克隆嘉宾、公众人物、已故人士、任何你没拿到书面授权的声音 —— 一条直通诉讼、平台封号、（很多司法管辖区）刑事责任的捷径。ElevenLabs 要求克隆别人声音前先录一段同意声明；OpenVoice 和 GPT-SoVITS 不强制这点 —— **你自己来执行**。克隆前拿到书面授权，并存档。\n- **模型偏见生成你不要的口音。** 美式英语主导训练数据的克隆模型会让你的印度英语 \u002F 澳洲 \u002F 苏格兰口音听起来微微「美国化」。整季节目押注之前先把你全口音范围跑一遍测试。\n- **专有名词转写错率。** Whisper 会幻觉名字。「Linus Torvalds」90% 时候对；「Anthropic」会变「and topic」。给你节目每个反复出现的名字 \u002F 术语建一份自定义词表 + 替换脚本。\n- **长音频 token 成本。** 用云 Whisper 转写 2 小时播客没问题（$0.006\u002F分钟 ≈ $0.72）。用 ElevenLabs 多语种档 dub 2 小时播客 ≈ 10 万字符\u002F小时 ≈ 每集每语言 $20–60。承诺「每集 10 国语言」之前先算账。\n- **STT 之前先挂 VAD。** 不做语音活动检测、直接把静音段喂给 Whisper —— 你会得到经典幻觉转写 `Thank you 
for watching!` 直接烤进字幕里。在任何 STT 调用前加 30 行 `silero-vad`。\n- **没存原始母带。** 克隆 + 重混 + 重 dub 是破坏性链路。Audacity 工程原始多轨永远存档 —— 客户、律师、未来的你都会需要。\n\n## 伦理 disclaimer\n\n声音克隆有正经用途：补录自己的口误、可访问性旁白、把自己的内容 dub 成不会说的语言、ALS 患者的声音保存。也有显而易见的滥用：冒充诈骗、未经同意的 deepfake、把话塞进公众人物嘴里。**本 pack 提供工具，规矩你来定。** 克隆非自己声音前先拿书面授权。在 show notes 里披露 AI 生成音频。各大平台（YouTube \u002F TikTok \u002F Spotify \u002F Meta \u002F 小宇宙国内合规要求）现在都要求合成媒体必须披露，藏着的会被取消变现 \u002F 下架。把披露写进你的发布步骤，从第一天就这样做。",[106,109,112,115,118],{"q":107,"a":108},"克隆自己的声音合法吗？","克隆自己的声音自己用，几乎所有司法管辖区都合法。麻烦从这里开始：(1) 克隆你没有授权的声音 —— 嘉宾、名人、已故人士；(2) 用克隆冒充某人做欺诈或诽谤，就算克隆的是你自己的声音，被别人滥用也算；(3) 在要求披露的平台（YouTube \u002F TikTok \u002F Spotify \u002F Meta 现在全都要求）上隐瞒音频是 AI 生成的。给自己播客的开场、补录、把自己内容翻译 dub —— 都没问题。涉及第二个人，先拿书面授权。",{"q":110,"a":111},"ElevenLabs vs Fish Speech vs OpenVoice，哪个干嘛？","ElevenLabs 是英语 \u002F 西语 \u002F 德语的质量领导者、付费 SaaS —— 在乎音质胜过月费、能接受云依赖时选它。Fish Speech 是本 pack 里最强的开源多语 TTS —— 覆盖 80+ 语言含强劲中日，GPU 上能跑，是 ElevenLabs 在你目标语言里「太洋化」时的解药。OpenVoice 是最快的开源克隆 —— 3 秒参考音频、MIT 许可、消费级 GPU 能跑，但质量上限是「播客开场」不是「播音级旁白」。典型配置：主声音克隆走 ElevenLabs，中日 dub 走 Fish Speech，一次性角色配音走 OpenVoice。",{"q":113,"a":114},"中文音质谁最好？","中文专项：GPT-SoVITS 和 Fish Speech 都比 ElevenLabs 开箱即用更好，因为它们在更大量中文数据上训练。GPT-SoVITS 尤其有强大中文社区，公开 few-shot 教程基本都是中文的。ElevenLabs 这一年中文进步明显，但四声上仍能听出英语影响的音调瑕疵。整集中文播客或 dub 轨：用 ~30 分钟干净普通话参考 fine-tune GPT-SoVITS 或 Fish Speech；英语节目里夹一句中文：ElevenLabs 够用。",{"q":116,"a":117},"真能用 KrillinAI 一键 dub 一个 1 小时播客吗？","技术上能：喂 `episode.mp4`、选目标语言、拿回 `episode-es.mp4` 带翻译字幕和 dub 音频。现实里发布前要做一遍人工审校：(1) 翻译会曲解几个文化梗和内梗，(2) 克隆会念错你领域里的专有名词和缩写，(3) 长视频唇形 80% 片段对得上、20% 明显错位。能跑通的工作流：KrillinAI 先在 5 分钟宣传片段跑一稿；质量 OK 再全集批；审校转写校术语；重渲染。1 小时一集端到端：人工 ~3 小时 vs 外包翻译公司 ~3 天。",{"q":119,"a":120},"播客转社交剪辑哪个视频剪辑工具最快？","如果是从 90 分钟节目里剪 60 秒竖屏切片给 TikTok \u002F Shorts \u002F Reels：VideoCaptioner 是关键解锁，因为最大的时间黑洞不是剪、是给每条切片做词级字幕动效。VideoCaptioner 直接吃 Whisper 已经给你的转写，把动态词级字幕烤进竖屏导出。配 FFmpeg crop 或 Shotcut \u002F Kdenlive 做剪本身。要单 GUI 一站式剪 + 字 + 导：OpenCut 和 Shotcut 都行但单条更费时。最快路径：在 Audacity \u002F 文本编辑器里按文字剪，FFmpeg 
渲染切片，VideoCaptioner 上字幕，发布。",{"@context":122,"@type":123,"name":124,"description":125,"numberOfItems":126,"inLanguage":127,"publisher":128},"https:\u002F\u002Fschema.org","ItemList","Voice Cloning + Podcast Studio","给独立播客主 \u002F 配音师 \u002F YouTuber 的单兵 10 件套 —— 录音、降噪、转写、声音克隆、100 语种 dub、社交剪辑字幕，按刻意顺序安装。",10,"zh-CN",{"@type":129,"name":130,"url":131},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[133,137,141],{"url":134,"anchor":135,"reason":136},"\u002Fzh\u002Fpacks\u002Ftts-stt-voice-stack","TTS + STT 语音全家桶","姐妹 pack —— 那个是 STT\u002FTTS 完整零件库，本 pack 是按播客形状切出来的子集",{"url":138,"anchor":139,"reason":140},"\u002Fzh\u002Fpacks\u002Fcontent-creator-ai-studio","内容创作者 AI 工作室","节目以 YouTube 长视频 + 切片形式发，创作者 pack 覆盖封面、脚本、发布",{"url":142,"anchor":143,"reason":144},"\u002Fzh\u002Fpacks\u002Fai-music-audio-generation","AI 音乐 + 音频生成","节目背景乐、片头 sting、SFX —— 和本 pack 的人声层互补",[146,150,154],{"claim":147,"source_name":148,"source_url":149},"OpenVoice 支持即时声音克隆，音色和风格独立可控","myshell-ai\u002FOpenVoice","https:\u002F\u002Fgithub.com\u002Fmyshell-ai\u002FOpenVoice",{"claim":151,"source_name":152,"source_url":153},"Whisper 是 OpenAI 开源的语音识别模型","openai\u002Fwhisper","https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper",{"claim":155,"source_name":156,"source_url":157},"ElevenLabs 要求克隆非本人声音前完成同意验证","ElevenLabs Voice Cloning Terms","https:\u002F\u002Felevenlabs.io\u002Fterms",910,"2026-05-23T10:00:00Z"]