[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-video-transcript-search-es":3,"seo:pack:video-transcript-search:es":97},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":96},"video-transcript-search","🎞️","#DC2626","new","Nuevo · esta semana","Pack de Transcripción y Búsqueda de Video","Diez picks para el ingeniero que quiere indexar horas de YouTube, podcasts o reuniones por lo que se dice realmente — yt-dlp para descargar la fuente, whisper.cpp \u002F Faster Whisper \u002F WhisperX para transcripciones con timestamps por palabra y diarización, Groq + AssemblyAI cuando necesitas velocidad cloud, Meetily para reuniones locales, yt-fts para búsqueda por palabra, LeMUR para resúmenes y Q&A, Remotion para subtítulos sincronizados.",[16,28,35,41,48,58,66,73,81,88],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2434,"05ad6f38-44f8-11f1-9bc6-00163e2b0d79","yt-dlp-feature-rich-audio-video-downloader-05ad6f38","yt-dlp — Feature-Rich Audio & Video Downloader","yt-dlp is a feature-rich command-line tool for downloading audio and video from thousands of websites. A community-maintained fork of youtube-dl with active development, format selection, post-processing, and SponsorBlock integration.","Script Depot",103,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":22,"view_count":34,"vote_count":24,"lang_type":25,"type":26,"type_label":27},390,"e1fd7c46-bbda-4956-8649-9c3ed579ff25","whisper-cpp-local-speech-text-pure-c-c-e1fd7c46","whisper.cpp — Local Speech-to-Text in Pure C\u002FC++","High-performance port of OpenAI Whisper in C\u002FC++. No Python, no GPU required. Runs on CPU, Apple Silicon, CUDA, and even Raspberry Pi. 
Real-time transcription.",1595,{"id":36,"uuid":37,"slug":38,"title":39,"description":40,"author_name":22,"view_count":4,"vote_count":24,"lang_type":25,"type":26,"type_label":27},270,"24576b2c-a9d1-4f7a-9696-b1e5c50a17f3","faster-whisper-4x-faster-speech-text-24576b2c","Faster Whisper — 4x Faster Speech-to-Text","Faster Whisper is a reimplementation of OpenAI Whisper using CTranslate2, up to 4x faster with less memory. 21.8K+ GitHub stars. GPU\u002FCPU, 8-bit quantization, word timestamps, VAD. MIT licensed.",{"id":42,"uuid":43,"slug":44,"title":45,"description":46,"author_name":22,"view_count":47,"vote_count":24,"lang_type":25,"type":26,"type_label":27},287,"c43ad870-8c99-471a-898e-b07140faf532","whisperx-70x-faster-speech-recognition-c43ad870","WhisperX — 70x Faster Speech Recognition","WhisperX provides 70x realtime speech recognition with word-level timestamps and speaker diarization. 21K+ GitHub stars. Batched inference, under 8GB VRAM. BSD-2-Clause.",235,{"id":49,"uuid":50,"slug":51,"title":52,"description":53,"author_name":54,"view_count":55,"vote_count":24,"lang_type":25,"type":56,"type_label":57},2988,"647a6e2e-a111-41c1-bfa4-229dc2be497d","assemblyai-diarization-auto-identify-2-10-speakers","AssemblyAI Diarization — Auto-Identify 2-10 Speakers","AssemblyAI speaker_labels separates 2-10 speakers without enrollment. Per-utterance speaker tags. For meetings, interviews, multi-party calls.","AssemblyAI",86,"script","Script",{"id":59,"uuid":60,"slug":61,"title":62,"description":63,"author_name":64,"view_count":65,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2847,"34b19e7a-a7a9-4869-9339-edbd8a20144f","groq-whisper-sub-second-speech-to-text-for-voice-agents","Groq Whisper — Sub-Second Speech-to-Text for Voice Agents","Whisper-large-v3 on Groq runs 166× realtime — 60-sec clip in \u003C400ms. 
OpenAI-compat audio.transcriptions endpoint for voice agents.","Groq",89,{"id":67,"uuid":68,"slug":69,"title":70,"description":71,"author_name":22,"view_count":72,"vote_count":24,"lang_type":25,"type":56,"type_label":57},3677,"d6a5082f-b4f9-5ddc-8d48-51d5d32dde73","yt-fts-youtube-full-text-search-cli","yt-fts — YouTube Full-Text Search CLI","yt-fts indexes YouTube transcripts for CLI, semantic search, and RAG flows; verified 1,804 stars with pip install and search\u002Fexport commands in README.",25,{"id":74,"uuid":75,"slug":76,"title":77,"description":78,"author_name":79,"view_count":80,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2101,"3270e558-4080-11f1-9bc6-00163e2b0d79","meetily-privacy-first-ai-meeting-assistant-local-3270e558","Meetily — Privacy-First AI Meeting Assistant with Local Transcription","An open-source, self-hosted AI meeting assistant that provides real-time transcription, speaker diarization, and local summarization using Whisper and Ollama, with no cloud dependency.","AI Open Source",140,{"id":82,"uuid":83,"slug":84,"title":85,"description":86,"author_name":54,"view_count":87,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2987,"bf97b4c4-021f-4912-afc9-fbba48bc48b2","lemur-run-llms-over-assemblyai-transcripts","LeMUR — Run LLMs Over AssemblyAI Transcripts","LeMUR runs Claude \u002F GPT prompts over AssemblyAI transcripts already in context. Summaries, Q&A, action items, custom JSON extraction.",84,{"id":89,"uuid":90,"slug":91,"title":92,"description":93,"author_name":94,"view_count":95,"vote_count":24,"lang_type":25,"type":26,"type_label":27},101,"7775f06a-8adf-477a-91e9-85f51682cd10","remotion-captions-subtitles-ai-powered-video-subtitles-7775f06a","Remotion Captions & Subtitles — AI-Powered Video Subtitles","AI skill for generating and rendering captions in Remotion videos. 
Supports transcription, word-level timing, and styled subtitle export.","Skill Factory",194,"tokrepo install pack\u002Fvideo-transcript-search",{"pageType":98,"pageKey":8,"locale":25,"title":99,"metaDescription":100,"h1":101,"tldr":102,"bodyMarkdown":103,"faq":104,"schema":120,"internalLinks":126,"citations":139,"wordCount":152,"generatedAt":153},"pack","Video Transcript + Search Pack — 10 Tools to Index YouTube, Podcasts, Meetings","yt-dlp, whisper.cpp, Faster Whisper, WhisperX, AssemblyAI Diarization, Groq Whisper, yt-fts, Meetily, LeMUR, Remotion Captions — a ten-asset pipeline for transcribing, diarizing, chunking, and searching video content by what's actually said. Install via TokRepo.","Video Transcript + Search — A Pipeline for Indexing Spoken Content","Ten picks chained into one pipeline: pull the source (yt-dlp), transcribe it (whisper.cpp \u002F Faster Whisper \u002F Groq Whisper), get word-level timestamps and speaker labels (WhisperX, AssemblyAI Diarization), chunk on natural turn boundaries, run keyword search (yt-fts) and LLM Q&A (LeMUR) over the transcript, and render captions back to video (Remotion). Built for researchers, journalists, and devs who want a searchable archive of spoken content.","## What's in this pack\n\nThis is the stack you build when you realise the most valuable text on the internet is locked inside YouTube videos, podcast feeds, and Zoom recordings — and the search box on each platform is useless. Ten picks chained into a deliberate pipeline: **source → transcribe → diarize → chunk → embed\u002Fsearch → summary**. Each one earns its place by doing one stage well; together they turn hours of audio into a queryable index.\n\nThe pack pairs **local and cloud options at every stage**. You start cloud while you're prototyping (Groq Whisper at sub-second latency, AssemblyAI for plug-and-play diarization), then move local once the bill or the privacy story matters (whisper.cpp + WhisperX on the box under your desk). 
Same pipeline, different backends.\n\nThe target user is the **researcher \u002F journalist \u002F dev** who has a folder of recordings and wants to grep them by what's actually said — \"find every time my guest mentioned 'sales tax' in the last 40 episodes,\" \"give me the 30-second clip where Speaker 2 disagreed,\" \"summarize last week's 9 meetings into one page.\"\n\n## Install in this order (source → transcribe → diarize → chunk → embed\u002Fsearch → summary)\n\n1. **yt-dlp** — pull the source. The cornerstone: YouTube, Vimeo, podcast RSS, X video, Bilibili, 1000+ sites with one command. `yt-dlp -x --audio-format wav \u003Curl>` gives you the WAV your transcription stack wants. Without yt-dlp, you're hand-downloading; with it, you script a whole channel's archive in a loop.\n2. **whisper.cpp** — local transcription, zero deps. C\u002FC++ port of Whisper that runs on a MacBook Air without Python, CUDA, or model wrangling. `large-v3` is the go-to model for accuracy; `medium.en` is faster if you're English-only. This is the boring tool that does 80% of the work.\n3. **Faster Whisper** — 4x speedup of OpenAI Whisper via CTranslate2. When whisper.cpp is too slow for your batch size, Faster Whisper on a 4090 transcribes one hour of audio in 2-3 minutes. Same accuracy as OpenAI's reference, ~half the VRAM.\n4. **WhisperX** — word-level timestamps + diarization in one pass. The killer feature is **forced alignment**: each word gets a millisecond-precise start\u002Fend, so your search hit can jump to the exact second instead of \"somewhere in this 30-second chunk.\" Bundles pyannote-style diarization for free.\n5. **AssemblyAI Diarization** — cloud diarization when you don't want to manage models. 2-10 speakers auto-identified, no GPU, just an HTTP call. Use it for the recordings where Speaker 1 \u002F Speaker 2 labels are non-negotiable and you don't want to debug pyannote weights.\n6. **Groq Whisper** — sub-second cloud STT. 
Same Whisper model, but on Groq's LPU it returns in under a second per minute of audio. The right call for live transcription, voice agents, or any pipeline where latency beats batch cost.\n7. **yt-fts** — full-text search across YouTube channels. Pull every video's auto-captions from a channel, drop them in SQLite FTS5, search by keyword across years of content. The crude but effective first pass before you build a real semantic-search backend.\n8. **Meetily** — privacy-first AI meeting assistant. Joins your Zoom\u002FMeet\u002FTeams, transcribes locally, generates summaries — none of it leaves your machine. The right choice when the meeting content is sensitive (interviews, hiring, M&A, therapy notes).\n9. **LeMUR** — run LLMs over transcripts at scale. AssemblyAI's LLM endpoint that accepts a transcript and a prompt and returns summaries, action items, custom JSON extraction, Q&A. The summary-and-extract stage of the pipeline without you wiring up your own RAG.\n10. **Remotion Captions & Subtitles** — render the transcript back onto video. Once you've transcribed and corrected, you usually want captions burnt in. 
Remotion turns your SRT into styled subtitles with consistent typography, animated word-by-word, exportable to MP4.\n\n## How they fit together (ASCII pipeline)\n\n```\n         ┌── yt-dlp ──┐\n         │ (pull WAV) │  ← YouTube \u002F podcast feed \u002F Zoom .m4a\n         └─────┬──────┘\n               ▼\n   ┌─── transcription ────────────────────────────┐\n   │  whisper.cpp (local, no Python)             │\n   │  Faster Whisper (4x faster on GPU)          │\n   │  Groq Whisper (cloud, sub-second)           │\n   │  → raw text + chunk timestamps              │\n   └─────────────────┬────────────────────────────┘\n                     ▼\n   ┌── word-align + diarize ──────────────────────┐\n   │  WhisperX (local, word-level + speakers)    │\n   │  AssemblyAI Diarization (cloud)             │\n   │  → SRT with [SPEAKER_01] tags + word.start  │\n   └─────────────────┬────────────────────────────┘\n                     ▼\n   ┌── timestamp-aware chunking ──────────────────┐\n   │  chunk by speaker turn OR by silence > 0.5s │\n   │  OR by 30-60s windows with 10s overlap      │\n   │  emit (text, video_id, start, end, speaker) │\n   └─────────────────┬────────────────────────────┘\n                     ▼\n      ┌─── search ─────┐    ┌─── summary ────┐\n      │ yt-fts FTS5    │    │ LeMUR (LLM)    │\n      │ + embeddings   │    │ summaries,     │\n      │ (your choice)  │    │ Q&A, extract   │\n      └────────┬───────┘    └────────┬───────┘\n               ▼                     ▼\n           Meetily UI       Remotion captions\n           (live meetings)  (burned-in subs)\n```\n\nThe **chunking step has no dedicated tool in this pack on purpose** — it's a 30-line script over WhisperX's word output. Chunk on speaker turns for diarized content, on silence > 500ms for monologue, on fixed 30-60s windows with 10s overlap for everything else. 
Whatever you embed gets `(text, video_id, start_sec, end_sec, speaker)` so a search hit can deep-link to the timestamp.\n\n## Tradeoffs you'll hit\n\n- **whisper.cpp vs Faster Whisper** — whisper.cpp wins on portability (single binary, runs on a MacBook), Faster Whisper wins on raw speed (CUDA + CTranslate2). If you process more than one hour of audio per day on a GPU box, switch to Faster Whisper; if you're transcribing on the laptop you're already using, whisper.cpp stays.\n- **WhisperX vs AssemblyAI Diarization** — WhisperX is free and diarization quality on 2-3 clearly distinct speakers is excellent; it struggles with overlapping speech, crowd recordings, and >5 speakers. AssemblyAI's hosted diarization handles those edge cases at $0.37\u002Fhour. Rule of thumb: WhisperX for podcasts (host + 1-2 guests), AssemblyAI for panels and Q&A audio.\n- **Groq Whisper vs local whisper.cpp** — Groq at \u003C1s latency wins for live captions and voice agents. Local wins for batch jobs where you're transcribing overnight, where the data can't leave your machine, or where the recording is in a language Groq's hosted model handles poorly. Pick by latency budget, not by cost — for batch they're both pennies.\n- **yt-fts (FTS5 keyword) vs semantic embeddings** — yt-fts answers \"what episode mentioned the word X\" in 5 minutes of setup. Semantic search answers \"what episode is *about* X\" but takes an embedding step, a vector store, and a query pipeline. Build yt-fts first; upgrade to embeddings only when keyword search starts missing relevant hits.\n- **Meetily vs general-purpose Whisper** — Meetily is a packaged product (joins the call, generates summaries, ships a UI). The raw Whisper pipeline is more work but you can shape every step. 
Meetily for non-technical users on your team; the Whisper pipeline for the engineering-owned archive.\n\n## Common pitfalls\n\n- **Skipping diarization on multi-speaker audio.** A transcript of a 4-person podcast without speaker tags is search-poisoned: every quote is attributed to \"the transcript\", not to a person. Always diarize before you index.\n- **Throwing away word-level timestamps.** Whisper's default chunk timestamps are 30-second windows; a search hit that says \"in this 30-second chunk\" is useless for deep-linking. Use WhisperX (or AssemblyAI, whose transcripts include a per-word `words` array) so each word has its own `start`.\n- **Chunking on fixed 30-second windows for diarized content.** A 30-second chunk often spans two speakers; the embedding then represents neither cleanly. Chunk on speaker turn boundaries first, then split long turns by sentence — not by clock.\n- **Re-transcribing every time the prompt changes.** Transcription is the expensive step. Persist the SRT + word JSON to disk, version it, and rerun only the summary\u002Fsearch pipeline against the cached output. Use a content hash of the source audio as the cache key.\n- **Trusting auto-captions blindly.** YouTube auto-captions miss names, jargon, and numbers — exactly the things you usually want to search for. Always re-transcribe with whisper.cpp `large-v3` or better before you commit to an index; the auto-captions are a free first draft, not the source of truth.\n- **Forgetting hallucination on silence.** Whisper hallucinates plausible-sounding text on long silent passages. Run a VAD (voice activity detection) pre-pass and only transcribe segments with actual speech; WhisperX runs VAD preprocessing as part of its pipeline, and Faster Whisper exposes it as `vad_filter=True`.",[105,108,111,114,117],{"q":106,"a":107},"Do I need all 10 tools or can I start smaller?","Start with three: yt-dlp to pull source, whisper.cpp for transcription, and yt-fts for search. That's a working keyword-search archive over any YouTube channel in an afternoon. 
Add WhisperX when you need word-level timestamps for deep-linking, Faster Whisper when whisper.cpp is too slow for your batch, AssemblyAI Diarization when WhisperX's speaker labels start mixing people up, Groq Whisper when you need sub-second latency for a live use case, Meetily when you need to record meetings without data leaving the machine, LeMUR when you want LLM-generated summaries over the whole archive, and Remotion Captions when you start re-rendering video with the corrected transcript burnt in. The full 10 only earns its keep once your archive is more than 50 hours of audio.",{"q":109,"a":110},"What's the realistic transcription throughput on a single GPU?","On a single RTX 4090 with Faster Whisper `large-v3` and beam_size=5: roughly 20-30x realtime, so one hour of audio in 2-3 minutes. WhisperX on the same box does ~15x realtime including alignment and diarization. whisper.cpp on an M3 MacBook Air with `medium.en` does ~5-8x realtime — fast enough for nightly batches but slow for live work. Plan capacity at one GPU-hour per 20-30 hours of source audio with Faster Whisper, or one M-series laptop-hour per 5-8 hours of source audio with whisper.cpp.",{"q":112,"a":113},"Why bother with WhisperX if Whisper already returns timestamps?","Whisper returns segment-level timestamps — each segment is roughly 30 seconds. That's fine if you just want captions on a video, but useless for two things this pack cares about: (1) deep-linking a search hit to the exact second a phrase was said, and (2) chunking on speaker turn boundaries instead of fixed clock windows. WhisperX adds forced alignment so every word gets its own `start` and `end` (typically accurate to 20-50ms), and bundles pyannote-style diarization so each word also has a speaker label. Those two features turn a transcript from a blob of text into a queryable, deep-linkable index.",{"q":115,"a":116},"Can this pack handle non-English content?","Yes, with caveats per language. 
whisper.cpp \u002F Faster Whisper \u002F WhisperX all share OpenAI Whisper's multilingual model — `large-v3` handles 90+ languages with usable quality, best on Spanish\u002FFrench\u002FGerman\u002FJapanese\u002FChinese\u002FPortuguese, weakest on low-resource African and Central Asian languages. Groq's hosted Whisper inherits the same language coverage. AssemblyAI's diarization is language-agnostic for the speaker labels but its transcript engine is best on English, Spanish, French, German, Italian, Portuguese, and Japanese. yt-fts FTS5 search is language-agnostic; LeMUR summarisation works in any language the underlying LLM (Claude \u002F GPT) handles. For Mandarin or Cantonese specifically, also consider SenseVoice as a drop-in transcription model — it outperforms Whisper on Chinese accents in our experience.",{"q":118,"a":119},"How do I keep the transcript database in sync as new episodes drop?","Treat the pipeline as a one-direction ingest. A nightly cron does: (1) `yt-dlp -x --download-archive archive.txt \u003Cchannel_url>` to fetch new episodes only — the archive file prevents re-downloads; (2) whisper.cpp or Faster Whisper transcribes each new audio file, writing JSON next to the audio with a content hash; (3) WhisperX aligns + diarizes if the file hasn't already been processed (check by hash); (4) the indexer reads the JSON, chunks on speaker turns, and upserts into yt-fts \u002F your vector store keyed on `(video_id, chunk_id)`; (5) LeMUR runs a summary prompt on each new episode and writes the result to your DB. Total cron is usually 50-80 lines of Python plus a Makefile. 
Don't try to make it incremental at the chunk level — full-file re-runs on cache miss are simpler and cheaper than tracking partial state.",{"@context":121,"@type":122,"name":123,"description":124,"numberOfItems":125,"inLanguage":25},"https:\u002F\u002Fschema.org","ItemList","Video Transcript + Search Pack","Ten AI assets curated to transcribe, diarize, chunk, and search video and audio content — yt-dlp, whisper.cpp, Faster Whisper, WhisperX, AssemblyAI Diarization, Groq Whisper, yt-fts, Meetily, LeMUR, Remotion Captions.",10,[127,131,135],{"url":128,"anchor":129,"reason":130},"\u002Fen\u002Fpacks\u002Fcontent-creator-ai-studio","Content Creator's AI Studio pack","The creator side of the same pipeline — once you can transcribe, you can repurpose into newsletter, captions, and B-roll",{"url":132,"anchor":133,"reason":134},"\u002Fen\u002Fpacks\u002Flog-analysis-search","Log Analysis + Search pack","Same search-over-time-series-text problem applied to logs instead of transcripts",{"url":136,"anchor":137,"reason":138},"\u002Fen\u002Ffeatured","Featured assets on TokRepo","These ten picks live alongside the broader curated catalog of agent-ready dev tools",[140,144,148],{"claim":141,"source_name":142,"source_url":143},"whisper.cpp is a high-performance, dependency-free C\u002FC++ port of OpenAI's Whisper that runs locally","ggerganov\u002Fwhisper.cpp on GitHub","https:\u002F\u002Fgithub.com\u002Fggerganov\u002Fwhisper.cpp",{"claim":145,"source_name":146,"source_url":147},"WhisperX provides word-level timestamps via forced alignment and integrated speaker diarization","m-bain\u002FwhisperX on GitHub","https:\u002F\u002Fgithub.com\u002Fm-bain\u002FwhisperX",{"claim":149,"source_name":150,"source_url":151},"yt-dlp is a feature-rich command-line audio\u002Fvideo downloader supporting thousands of sites","yt-dlp on GitHub","https:\u002F\u002Fgithub.com\u002Fyt-dlp\u002Fyt-dlp",1310,"2026-05-22T08:00:00Z"]