[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-voice-clone-podcast-studio-en":3,"seo:pack:voice-clone-podcast-studio:en":97},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":96},"voice-clone-podcast-studio","🎙️","#8B5CF6","new","New · this week","Voice Cloning + Podcast — One Person Runs the Whole Show","Ten picks for the indie podcaster, voice actor, or YouTuber running a whole show solo — Audacity for capture and noise cleanup, Whisper \u002F whisper.cpp for transcription, ElevenLabs \u002F OpenVoice \u002F GPT-SoVITS \u002F Fish Speech \u002F Coqui TTS for voice clone and multilingual dubbing, KrillinAI for one-click 100-language video dub, VideoCaptioner for subtitle baking. Recording → cleanup → clone → dub → publish, in one rig.",[16,28,36,44,54,61,67,74,82,89],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},1720,"44f450b6-3b20-11f1-9bc6-00163e2b0d79","audacity-free-cross-platform-audio-editor-44f450b6","Audacity — Free Cross-Platform Audio Editor","Audacity is a free, open-source digital audio editor and recorder for Windows, macOS, and Linux. It supports multi-track editing, a wide range of audio formats, real-time effects, and plugin extensibility for recording, editing, and mastering audio.","AI Open Source",118,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},105,"eb0f9dd6-2172-4c9f-aca9-97846b0f4d86","whisper-openai-speech-text-eb0f9dd6","Whisper — OpenAI Speech-to-Text","OpenAI's open-source speech recognition model. Transcribe audio\u002Fvideo to text with word-level timestamps in 99 languages. 
Essential for subtitle generation.","OpenAI",221,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":42,"view_count":43,"vote_count":24,"lang_type":25,"type":26,"type_label":27},390,"e1fd7c46-bbda-4956-8649-9c3ed579ff25","whisper-cpp-local-speech-text-pure-c-c-e1fd7c46","whisper.cpp — Local Speech-to-Text in Pure C\u002FC++","High-performance port of OpenAI Whisper in C\u002FC++. No Python, no GPU required. Runs on CPU, Apple Silicon, CUDA, and even Raspberry Pi. Real-time transcription.","Script Depot",1602,{"id":45,"uuid":46,"slug":47,"title":48,"description":49,"author_name":50,"view_count":51,"vote_count":24,"lang_type":25,"type":52,"type_label":53},106,"16d32da9-c5fb-43ae-b881-8444b2dcd35b","elevenlabs-python-sdk-ai-text-speech-16d32da9","ElevenLabs Python SDK — AI Text-to-Speech","Official ElevenLabs Python SDK for AI voice generation. Create realistic voiceovers with 30+ languages, voice cloning, and streaming support.","ElevenLabs",194,"script","Script",{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":22,"view_count":60,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2265,"ae7169ee-42b9-11f1-9bc6-00163e2b0d79","openvoice-instant-voice-cloning-tone-style-control-ae7169ee","OpenVoice — Instant Voice Cloning with Tone and Style Control","OpenVoice is an open-source voice cloning framework from MyShell AI that reproduces a speaker's voice from a short audio sample while giving independent control over emotion, accent, rhythm, and language.",90,{"id":62,"uuid":63,"slug":64,"title":65,"description":66,"author_name":22,"view_count":29,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3660,"8b48f7ce-4f09-11f1-9bc6-00163e2b0d79","gpt-sovits-few-shot-voice-cloning-text-speech-8b48f7ce","GPT-SoVITS — Few-Shot Voice Cloning and Text-to-Speech","An open-source TTS system that can clone any voice from just one minute of audio data, combining GPT-style language modeling with VITS synthesis for natural speech 
generation.",{"id":68,"uuid":69,"slug":70,"title":71,"description":72,"author_name":22,"view_count":73,"vote_count":24,"lang_type":25,"type":26,"type_label":27},269,"88c15e9c-439c-4e70-8b8f-cd04efe928c0","fish-speech-multilingual-tts-80-languages-88c15e9c","Fish Speech — Multilingual TTS for 80+ Languages","Fish Speech is a state-of-the-art open-source TTS system supporting 80+ languages. 29K+ GitHub stars. 4B dual-AR model, voice cloning, emotional control with 15K+ tags, real-time inference.",228,{"id":75,"uuid":76,"slug":77,"title":78,"description":79,"author_name":80,"view_count":81,"vote_count":24,"lang_type":25,"type":52,"type_label":53},423,"a059dce2-6275-4ea0-a57b-e885248d8e95","coqui-tts-deep-learning-text-speech-engine-a059dce2","Coqui TTS — Deep Learning Text-to-Speech Engine","Generate speech in 1100+ languages with voice cloning. XTTS v2 streams with under 200ms latency. 44K+ GitHub stars.","TokRepo精选",286,{"id":83,"uuid":84,"slug":85,"title":86,"description":87,"author_name":22,"view_count":88,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2632,"e0ea662e-47b4-11f1-9bc6-00163e2b0d79","krillinai-ai-video-translation-dubbing-100-languages-e0ea662e","KrillinAI — AI Video Translation and Dubbing in 100 Languages","An open-source tool that uses LLMs to translate and dub video content into over 100 languages with one-click deployment, optimized for YouTube, TikTok, and other platforms.",94,{"id":90,"uuid":91,"slug":92,"title":93,"description":94,"author_name":42,"view_count":95,"vote_count":24,"lang_type":25,"type":26,"type_label":27},110,"d12d8441-f0da-4d3d-a0c2-0f258b27336f","videocaptioner-ai-subtitle-pipeline-d12d8441","VideoCaptioner — AI Subtitle Pipeline","LLM-powered video subtitle tool: Whisper transcription + AI correction + 99-language translation + styled subtitle export. 
13,800+ stars.",238,"tokrepo install pack\u002Fvoice-clone-podcast-studio",{"pageType":98,"pageKey":8,"locale":25,"title":99,"metaDescription":100,"h1":101,"tldr":102,"bodyMarkdown":103,"faq":104,"schema":120,"internalLinks":130,"citations":143,"wordCount":156,"generatedAt":157},"pack","Voice Clone + Podcast Studio — 10 Tools to Run a Show Solo","Audacity, Whisper, whisper.cpp, ElevenLabs, OpenVoice, GPT-SoVITS, Fish Speech, Coqui TTS, KrillinAI, VideoCaptioner — a single-operator rig for recording, denoising, voice cloning, multilingual dubbing, and publish-ready captions. Install in order. Use ethically.","Voice Cloning + Podcast — A Solo-Operator Studio in 10 Picks","Five layers, ten picks, one workflow: capture and clean (Audacity) → transcribe (Whisper family) → clone your voice (ElevenLabs, OpenVoice, GPT-SoVITS) → dub into 100 languages (KrillinAI, Fish Speech, Coqui TTS) → bake captions for social cuts (VideoCaptioner). Get consent before you clone anyone.","## What's in this pack\n\nThis is the rig an indie podcaster, voice actor, or YouTuber would build to run a whole show **without a producer, sound engineer, or translation agency**. Ten picks, opinionated order, every one of them either open-source or has a serious free tier. The point is not \"all the tools that exist\" — it's \"the smallest set that lets one person record on Monday and ship a localized, captioned, denoised, voice-cloned cut by Friday\".\n\nFive layers, two or three picks per layer where there's a real tradeoff:\n\n| Layer | Picks | Why |\n|---|---|---|\n| 1. Record + clean | Audacity | Free DAW. Records multi-track, removes hiss\u002Fbreath\u002Fclick, exports anything. |\n| 2. Transcribe | Whisper (cloud) · whisper.cpp (local) | Cloud Whisper for highest accuracy; whisper.cpp for offline \u002F sensitive \u002F batch \u002F mobile. |\n| 3. Clone your voice | ElevenLabs · OpenVoice · GPT-SoVITS | ElevenLabs = top fidelity, paid. OpenVoice = instant tone+style clone, MIT. 
GPT-SoVITS = few-shot clone you self-host. |\n| 4. Dub into other languages | Fish Speech · Coqui TTS · KrillinAI | Fish Speech does 80+ languages. Coqui TTS = pluggable engine. KrillinAI takes a video file and dubs the whole thing in one click. |\n| 5. Caption + ship | VideoCaptioner | Burns word-level subtitles into vertical cuts for TikTok \u002F Reels \u002F Shorts. |\n\nThe pack is sized for **one operator**. If you're running a 3-person podcast network with editors, swap Audacity for Reaper \u002F Adobe Audition (paid), swap KrillinAI for a human translation pass, and add a publish\u002Fscheduling tool. For everyone else, this is the rig.\n\n## Install in this order\n\nDo NOT install the voice clone tools first. You need a clean recording before cloning gives a usable result.\n\n```bash\n# Stage 1 — capture and clean (Monday)\ntokrepo install audacity\n\n# Stage 2 — get a transcript so you can edit by text, not by waveform (Monday night)\ntokrepo install whisper-cpp        # local, free, ~5x realtime on M-series\n# OR\ntokrepo install whisper            # OpenAI API, highest accuracy\n\n# Stage 3 — clone your own voice (Tuesday — you only do this once)\ntokrepo install elevenlabs-python-sdk  # 3 min of clean audio → studio-grade clone\n# OR — if you want to self-host \u002F not pay per character\ntokrepo install openvoice              # instant clone, MIT\ntokrepo install gpt-sovits             # few-shot, GPU recommended\n\n# Stage 4 — dub a clip into other languages (Wednesday)\ntokrepo install fish-speech            # multilingual TTS, 80+ languages\ntokrepo install coqui-tts              # self-hosted alternative\ntokrepo install krillinai              # full-video dub, subtitles+voice, one command\n\n# Stage 5 — publish (Thursday)\ntokrepo install videocaptioner         # burn animated captions for social cuts\n```\n\nThe TokRepo CLI drops each asset as a skill file in your repo. 
Claude Code \u002F Cursor \u002F Codex CLI read the skill and can wire up the script for you — \"take `episode-12.wav`, denoise it in Audacity headless, transcribe with whisper.cpp, dub the first 60 seconds into Spanish with KrillinAI, burn captions with VideoCaptioner, output `ep12-es.mp4`\" becomes a single agent prompt.\n\n## How they fit together\n\n```\n[ Mic \u002F Riverside \u002F Zoom recording ]\n             │\n             ▼\n   ┌─────────────────────┐\n   │ Audacity            │  noise gate, EQ, normalize, click removal\n   └─────────────────────┘\n             │  clean WAV\n             ▼\n   ┌─────────────────────┐\n   │ Whisper \u002F whisper.cpp │  transcript + word timestamps\n   └─────────────────────┘\n             │  edit by deleting text, not waveform\n             ▼\n   ┌─────────────────────────────────┐\n   │ Voice clone (one of):          │\n   │   ElevenLabs · OpenVoice ·     │  → your-voice model\n   │   GPT-SoVITS                   │\n   └─────────────────────────────────┘\n             │\n             ├──► re-record a flub: type the line, your-voice speaks it\n             │\n             ▼\n   ┌─────────────────────────────────┐\n   │ Multilingual dub (one of):     │\n   │   Fish Speech (TTS engine) ·   │\n   │   Coqui TTS · KrillinAI        │  → ES \u002F JA \u002F DE \u002F FR audio track\n   │   (full video pipeline)        │\n   └─────────────────────────────────┘\n             │\n             ▼\n   ┌─────────────────────┐\n   │ VideoCaptioner      │  word-by-word burned captions, vertical cut\n   └─────────────────────┘\n             │\n             ▼\n   [ YouTube \u002F Spotify \u002F TikTok \u002F Reels \u002F Shorts ]\n```\n\nThe big unlock here is **editing by transcript, not by waveform**. Once Whisper gives you a timestamped transcript, removing an um\u002Fuh becomes deleting a word from a text file and re-rendering. 
That's where the 5x speed-up actually comes from — not the cloning, not the dubbing, but never having to scrub through a 90-minute waveform.\n\n## Tradeoffs you'll hit\n\n- **ElevenLabs vs OpenVoice vs GPT-SoVITS for cloning your own voice.** ElevenLabs is the fidelity ceiling — 3 minutes of clean audio gets you a clone friends can't tell apart, but it's $5–$330\u002Fmonth + character overage and your voice model lives on their servers. OpenVoice is MIT-licensed and runs on a consumer GPU; quality is \"good enough for podcast intros, not narration\". GPT-SoVITS is the strongest open option but needs a fine-tune pass per voice. Pick ElevenLabs for fastest result, OpenVoice\u002FGPT-SoVITS if licensing or recurring cost matters.\n- **Cloud Whisper vs whisper.cpp.** Cloud is the most accurate, especially on Chinese\u002FJapanese\u002Fproper nouns. whisper.cpp runs on a MacBook with no internet, no per-minute cost, no data leaving your machine. Podcasts with named guests → cloud. Locked-down corporate \u002F journalism with sources → local.\n- **KrillinAI vs DIY (Fish Speech + Coqui).** KrillinAI takes a video file and gives you the same video in a new language, lips kind of synced, subtitles included — one command. The DIY path (extract audio → transcribe → translate → re-TTS → mux back in) gives you control over each step but is 5x the integration work. Use KrillinAI for first pass; drop down to DIY when one step needs tuning.\n- **Multilingual fidelity reality check.** Chinese\u002FJapanese\u002FKorean clones from English-trained voice models will sound \"foreign-accented\". Fish Speech is the strongest multilingual TTS in this pack. For mission-critical localization (paid clients) you still want a native voice actor for the target language; clones get you to draft quality, not broadcast.\n- **Realtime vs offline.** Nothing in this pack is realtime — this is a *production* studio, not a live-stream rig. 
If you need live, look at [Voice AI Stack pack](\u002Fen\u002Fpacks\u002Fvoice-ai-stack) instead.\n\n## Common pitfalls (and the ethical one)\n\n- **You don't own the rights to clone someone else's voice.** Cloning a guest, a public figure, a deceased person, or any voice you don't have explicit written consent from is a fast track to a lawsuit, a platform ban, and (in many jurisdictions) criminal liability. ElevenLabs requires a consent-recording before voice cloning. OpenVoice and GPT-SoVITS do not enforce this — *you* must. Get written consent before you clone anyone, and log it.\n- **Model bias generates accents you didn't want.** Voice cloning models trained predominantly on American English will make your Indian-English \u002F Australian \u002F Scottish accent sound subtly \"American\". Test the clone across your whole accent range before committing to a season of episodes.\n- **Proper-noun transcription error rate.** Whisper hallucinates names. \"Linus Torvalds\" comes out \"Linus Torvalds\" 90% of the time; \"Anthropic\" comes out \"and topic\". Build a custom vocabulary \u002F post-process replace list for every recurring name on your show.\n- **Long-audio token cost.** Transcribing a 2-hour podcast through cloud Whisper is fine ($0.72 at $0.006\u002Fmin). Dubbing a 2-hour podcast through ElevenLabs at the multilingual rate ≈ 100k chars\u002Fhour ≈ $20–60 per language per episode. Run the math before you promise \"every episode in 10 languages\".\n- **VAD before everything.** If you skip voice-activity detection and feed silent gaps to Whisper, you'll get the famous hallucinated transcript `Thank you for watching!` baked into your subtitles. Add a 30-line `silero-vad` pass before any STT call.\n- **Not keeping the original master.** Voice clone + re-mix + re-dub is a destructive chain. 
Always keep the original multi-track Audacity project — clients, lawyers, and future-you will all need it.\n\n## Ethical disclaimer\n\nVoice cloning has legitimate uses: re-recording your own flubs, accessibility narration, dubbing your own content into languages you don't speak, voice preservation for ALS patients. It also has obvious abuses: impersonation fraud, non-consensual deepfakes, putting words in a public figure's mouth. **This pack ships the tools. The rules are on you.** Get explicit written consent before cloning any voice that isn't your own. Disclose AI-generated audio in your show notes. Many platforms (YouTube, TikTok, Spotify) now require disclosure of synthetic media and will demonetize \u002F remove content that hides it. Build the disclosure into your publish step from day one.",[105,108,111,114,117],{"q":106,"a":107},"Is it legal to clone my own voice?","Cloning your own voice for your own use is legal in essentially every jurisdiction. The trouble starts when you (1) clone a voice you don't have rights to — a guest, a celebrity, a deceased person; (2) use a clone to impersonate someone for fraud or defamation, even your own clone in someone else's hands; or (3) hide that audio is AI-generated on a platform that requires disclosure (YouTube, TikTok, Spotify, Meta all do now). For your own podcast intros, narration patches, and translated dubs of your own content, you're fine. For anything involving a second person, get written consent.",{"q":109,"a":110},"ElevenLabs vs Fish Speech vs OpenVoice — which one for what?","ElevenLabs is the quality leader for English\u002FSpanish\u002FGerman and a paid SaaS — pick it when fidelity matters more than recurring cost and you're okay with a cloud dependency. Fish Speech is the best open multilingual TTS in this pack — it covers 80+ languages including strong Chinese and Japanese, runs on your GPU, and is what you reach for when ElevenLabs sounds \"too foreign\" in your target language. 
OpenVoice is the fastest open clone — 3-second reference audio, MIT-licensed, runs on a consumer GPU, but quality tops out around \"good podcast intro\" not \"broadcast narration\". Typical setup: ElevenLabs for your main voice clone, Fish Speech for Chinese\u002FJapanese dubs, OpenVoice for one-off character voices.",{"q":112,"a":113},"Which voice clone has the best Chinese quality?","For Chinese specifically: GPT-SoVITS and Fish Speech are both stronger than ElevenLabs out of the box, because they're trained on much more Chinese data. GPT-SoVITS in particular has a strong Chinese community and most public few-shot tutorials are Chinese-language. ElevenLabs has improved Chinese significantly in the last year but still has noticeable English-influenced tonal artifacts on the 4 tones. For a Chinese-language podcast or dub track, fine-tune GPT-SoVITS or Fish Speech on ~30 minutes of clean Mandarin reference; for a single Chinese sentence in an otherwise English show, ElevenLabs is fine.",{"q":115,"a":116},"Can I really dub a 1-hour podcast in one click with this?","Technically yes with KrillinAI — feed it `episode.mp4`, pick target language, get back `episode-es.mp4` with translated subtitles and dubbed audio. Realistically you'll want a human review pass before publishing, because (1) translation will mangle a few cultural references and inside jokes, (2) the clone will mispronounce proper nouns and acronyms specific to your domain, (3) lip-sync on long-form podcast video is convincing for 80% of clips and visibly off for 20%. Workflow that actually works: KrillinAI for the first pass on a 5-minute promo clip; if quality is good, batch the whole episode; review the transcript for terminology fixes; re-render. 
End-to-end for a 1-hour episode: ~3 hours human time vs ~3 days for an outsourced translation agency.",{"q":118,"a":119},"What's the fastest video editor for podcast-to-social repurposing?","If you mean cutting 60-second vertical clips out of a 90-minute episode for TikTok\u002FShorts\u002FReels: VideoCaptioner is the unlock here, because the big time sink is not the cut — it's animating word-by-word captions on every clip. VideoCaptioner takes the transcript Whisper already gave you and burns animated word-level subtitles into a vertical export. Combine with a simple FFmpeg crop or Shotcut\u002FKdenlive for the cut itself. If you want a single GUI that does cut + caption + export, OpenCut and Shotcut both work but you'll spend more time per clip. The fast path: edit-by-transcript in Audacity \u002F a text editor, render the cut with FFmpeg, caption with VideoCaptioner, ship.",{"@context":121,"@type":122,"name":123,"description":124,"numberOfItems":125,"inLanguage":25,"publisher":126},"https:\u002F\u002Fschema.org","ItemList","Voice Cloning + Podcast Studio","Ten open and commercial tools curated for a solo podcaster, voice actor, or YouTuber — record, denoise, transcribe, clone, dub into 100 languages, and caption for social, in deliberate install order.",10,{"@type":127,"name":128,"url":129},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[131,135,139],{"url":132,"anchor":133,"reason":134},"\u002Fen\u002Fpacks\u002Ftts-stt-voice-stack","TTS + STT Voice Stack","Sister pack with the broader STT\u002FTTS component catalog — this pack is the podcast-shaped subset",{"url":136,"anchor":137,"reason":138},"\u002Fen\u002Fpacks\u002Fcontent-creator-ai-studio","Content Creator AI Studio","If your show ships as YouTube long-form + clips, the creator pack covers thumbnails, scripts, and publishing",{"url":140,"anchor":141,"reason":142},"\u002Fen\u002Fpacks\u002Fai-music-audio-generation","AI Music & Audio Generation","Background scoring, intro stings, and SFX for 
your show — pairs with this pack's voice layer",[144,148,152],{"claim":145,"source_name":146,"source_url":147},"OpenVoice supports instant voice cloning with separate tone and style control","myshell-ai\u002FOpenVoice","https:\u002F\u002Fgithub.com\u002Fmyshell-ai\u002FOpenVoice",{"claim":149,"source_name":150,"source_url":151},"Whisper is OpenAI's open-source speech recognition model","openai\u002Fwhisper","https:\u002F\u002Fgithub.com\u002Fopenai\u002Fwhisper",{"claim":153,"source_name":154,"source_url":155},"ElevenLabs requires consent verification before cloning a voice that isn't yours","ElevenLabs Voice Cloning Terms","https:\u002F\u002Felevenlabs.io\u002Fterms",920,"2026-05-23T10:00:00Z"]