[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-ai-video-generation-pack-es":3,"seo:pack:ai-video-generation-pack:es":95},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":94},"ai-video-generation-pack","🎥","#7C3AED","new","Nuevo · esta semana","Pack de Generación de Video con IA","Diez picks para el creador y dev que genera video desde texto o imagen: modelos open-source (CogVideo, Open-Sora, AnimateDiff, Diffusers), el puente a APIs comerciales como Sora, Veo, Runway, Pika, control de cámara y movimiento con ControlNet y Motion Canvas, upscale con Real-ESRGAN, y el editor que une todos los clips.",[16,28,35,43,50,58,65,72,79,86],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2458,"7e2317bb-453a-11f1-9bc6-00163e2b0d79","cogvideo-text-image-video-generation-7e2317bb","CogVideo — Text and Image to Video Generation","An open-source video generation framework from Zhipu AI supporting text-to-video and image-to-video with CogVideoX models. Generates high-quality clips up to 6 seconds.","Script Depot",155,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":22,"view_count":34,"vote_count":24,"lang_type":25,"type":26,"type_label":27},109,"ff30d766-6654-4886-b631-e45e203f3a5e","open-sora-open-source-text-video-generation-ff30d766","Open-Sora — Open-Source Text-to-Video Generation","Open-source alternative to Sora by HPC-AI Tech. Generate videos from text prompts with an 11B parameter model. Apache 2.0 licensed. 28,800+ stars.",193,{"id":36,"uuid":37,"slug":38,"title":39,"description":40,"author_name":41,"view_count":42,"vote_count":24,"lang_type":25,"type":26,"type_label":27},777,"d848ded0-8bff-4424-8ef5-dda71b903327","together-ai-video-generation-skill-claude-code-d848ded0","Together AI Video Generation Skill for Claude Code","Skill that teaches Claude Code Together AI's video generation API. Covers text-to-video, image-to-video, and keyframe control for AI-powered video creation workflows.","Together AI",112,{"id":44,"uuid":45,"slug":46,"title":47,"description":48,"author_name":22,"view_count":49,"vote_count":24,"lang_type":25,"type":26,"type_label":27},111,"4ef1950f-2a47-4e24-9ce2-6f648dea8bed","diffusers-universal-video-image-generation-hub-4ef1950f","Diffusers — Universal Video & Image Generation Hub","Hugging Face's diffusion model library. Run CogVideoX, AnimateDiff, Stable Video Diffusion, and 50+ video\u002Fimage models with a unified API. 33,200+ stars.",173,{"id":51,"uuid":52,"slug":53,"title":54,"description":55,"author_name":56,"view_count":57,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2463,"04d7fee0-453b-11f1-9bc6-00163e2b0d79","animatediff-plug-play-animation-diffusion-models-04d7fee0","AnimateDiff — Plug-and-Play Animation for Diffusion Models","A plug-and-play motion module that turns community text-to-image Stable Diffusion models into animation generators without additional training. ICLR 2024 Spotlight paper.","AI Open Source",104,{"id":59,"uuid":60,"slug":61,"title":62,"description":63,"author_name":56,"view_count":64,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2495,"73d0fc65-45bc-11f1-9bc6-00163e2b0d79","real-esrgan-practical-image-video-super-resolution-73d0fc65","Real-ESRGAN — Practical Image and Video Super-Resolution","General-purpose image and video restoration tool that trains on pure synthetic data to handle real-world degradations including blur, noise, JPEG compression, and resize artifacts.",39,{"id":66,"uuid":67,"slug":68,"title":69,"description":70,"author_name":56,"view_count":71,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4664,"74fc6ef5-54cb-11f1-9bc6-00163e2b0d79","controlnet-add-spatial-control-diffusion-models-74fc6ef5","ControlNet — Add Spatial Control to Diffusion Models","ControlNet lets you add precise spatial conditioning such as edge maps, depth, and pose to Stable Diffusion, giving fine-grained control over AI image generation.",16,{"id":73,"uuid":74,"slug":75,"title":76,"description":77,"author_name":56,"view_count":78,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4618,"1a626bf6-548a-11f1-9bc6-00163e2b0d79","motion-canvas-create-animated-videos-code-1a626bf6","Motion Canvas — Create Animated Videos with Code","A TypeScript library and editor for creating publication-quality animated videos programmatically, combining the precision of code with a visual preview workflow.",38,{"id":80,"uuid":81,"slug":82,"title":83,"description":84,"author_name":22,"view_count":85,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4027,"f40e235a-5078-11f1-9bc6-00163e2b0d79","opencut-open-source-ai-video-editor-f40e235a","OpenCut — Open-Source AI Video Editor","An open-source alternative to CapCut for video editing with AI-assisted features, timeline editing, and professional export options.",86,{"id":87,"uuid":88,"slug":89,"title":90,"description":91,"author_name":92,"view_count":93,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3602,"eff58004-95ca-5e16-bfc4-f85ec9b207c2","generative-media-skills-muapi-npx-skills-add","Generative Media Skills — muapi + npx skills add","Generative Media Skills is a multi-modal skill library: run image\u002Fvideo recipes via muapi-cli, installable into Claude Code\u002FCursor with `npx skills add`.","Skill Factory",91,"tokrepo install pack\u002Fai-video-generation-pack",{"pageType":96,"pageKey":8,"locale":25,"title":97,"metaDescription":98,"h1":99,"tldr":100,"bodyMarkdown":101,"faq":102,"schema":118,"internalLinks":124,"citations":137,"wordCount":150,"generatedAt":151},"pack","AI Video Generation Pack — 10 Tools for Text-to-Video and Image-to-Video Workflows","CogVideo, Open-Sora, Together AI, Diffusers, AnimateDiff, Real-ESRGAN, ControlNet, Motion Canvas, OpenCut, Generative Media Skills — a deliberate stack covering open-source models, commercial APIs, camera control, motion modules, and upscale. Install via TokRepo.","AI Video Generation Pack — From Text Prompt to Final Cut","Ten picks for the creator or developer who is generating video from text or image. Open-source models for local control, a commercial-API skill for when you need Sora \u002F Veo \u002F Runway quality, camera and motion control via ControlNet and Motion Canvas, Real-ESRGAN upscale, and an editor to assemble the clips.","## What this pack is for\n\nGenerating AI video is no longer one model. It is a **pipeline**: pick a model, write a prompt, condition on a keyframe, control camera or motion, upscale, then cut. This pack assembles the ten picks that cover each stage — opinionated, not exhaustive — so you can move from a blank prompt to a watchable clip without stitching docs from ten different repos.\n\nThe stack is split deliberately: **open-source models for local control and zero per-second cost**, plus a **commercial-API bridge** for the moments when you genuinely need Sora-class quality and are willing to pay for it. Most working pipelines end up with both.\n\n## Install in this order\n\n### 1. Choose a model\n\n- **CogVideo** (#2458) — text-to-video and image-to-video. Mature open-source baseline from THUDM. Runs on a single high-VRAM GPU. Start here if you want a reproducible local pipeline.\n- **Open-Sora** (#109) — the open-source effort to replicate Sora-style results. More capable than CogVideo on long shots and motion coherence, heavier on hardware.\n- **Diffusers** (#111) — the Hugging Face hub that loads CogVideo, Stable Video Diffusion, HunyuanVideo, Wan, Mochi, and every new model the week it ships. If you want a single Python interface instead of one repo per model, install Diffusers first and treat the rest as weights.\n- **Together AI Video Generation Skill** (#777) — the commercial-API bridge. Use this when you need Sora \u002F Veo \u002F Runway \u002F Pika output quality and don't want to manage GPU infra. Pay per second, ship the same day.\n\n### 2. Write the prompt\n\n- Same rules as image generation, plus **motion verbs**: \"dolly forward\", \"orbit left\", \"static lock-off\". Models trained on cinematic data respond to film vocabulary.\n- Keep the subject in the first 12 tokens. Most text-to-video models still front-load attention.\n\n### 3. Condition on a keyframe\n\n- **ControlNet** (#4664) — feed a pose \u002F depth \u002F canny image to lock down composition. Use it when you have a specific framing in mind and don't want the model to reinvent the shot.\n- For image-to-video runs, the input image *is* the keyframe — no ControlNet needed.\n\n### 4. Add motion\n\n- **AnimateDiff** (#2463) — plug-and-play motion module for Stable Diffusion–family models. Animates an existing image-gen pipeline without retraining. Great for stylized or anime content.\n- **Motion Canvas** (#4618) — when the motion you want is deterministic (UI demos, data viz, programmatic camera moves), don't fight a diffusion model — write the motion in code.\n\n### 5. Upscale\n\n- **Real-ESRGAN** (#2495) — practical 4× super-resolution that handles video. Most generation models output 512×512 or 720p; Real-ESRGAN is how you ship 4K. Run it as the last step before encode.\n\n### 6. Assemble\n\n- **OpenCut** (#4027) — open-source AI video editor. Trim, splice, color-match generated clips. Avoids the export round-trip to a closed NLE.\n- **Generative Media Skills** (#3602) — the muapi + npx skill installer that unifies a dozen commercial generation APIs behind one CLI. Useful when an agent needs to call \"generate a 5-second clip\" without picking a vendor every time.\n\n## How they fit together\n\n```\nPrompt ─► CogVideo \u002F Open-Sora \u002F Diffusers \u002F Together API\n               │\n               ▼\n        Raw 720p clips\n               │\n   ControlNet ─┤ (optional: lock framing)\n   AnimateDiff ┤ (optional: add motion to image)\n   Motion Canvas ┤ (optional: deterministic moves)\n               ▼\n        Real-ESRGAN  ─►  4K upscaled clips\n               │\n               ▼\n            OpenCut\n               │\n               ▼\n         Final cut (mp4)\n```\n\nThe split that matters: **diffusion models hallucinate motion, code-based tools dictate motion**. Use diffusion (CogVideo, Open-Sora, AnimateDiff) when you want surprises and atmosphere. Use Motion Canvas when the camera path is non-negotiable and the audience will notice drift.\n\n## Tradeoffs you'll hit\n\n- **Local vs API** — local generation is free per second but costs you GPU time and tuning. API generation is fast and high-quality but priced per second and locked behind quotas. Run local for iteration, API for the hero shots.\n- **CogVideo vs Open-Sora** — CogVideo is more stable to set up and runs on lower VRAM. Open-Sora produces longer, more coherent shots when it works. Start with CogVideo; graduate when the gap matters.\n- **AnimateDiff vs native video models** — AnimateDiff bolts motion onto SD checkpoints (huge ecosystem of styles, mediocre coherence). Native video models train end-to-end on video (cleaner motion, fewer styles). Pick by content: stylized → AnimateDiff, realistic → CogVideo \u002F Open-Sora.\n- **Real-ESRGAN vs paid upscalers** — Real-ESRGAN is free and good enough for most web delivery. Topaz Video AI and similar paid tools are sharper on faces but cost real money. Ship Real-ESRGAN first; upgrade only if reviewers complain.\n\n## Common pitfalls\n\n- **VRAM math** — CogVideo-5B needs ~24 GB for 720p generation. Open-Sora can demand 40 GB+. Read the model card before renting a GPU.\n- **Prompt drift across frames** — long shots from any diffusion model drift in identity and lighting after ~3 seconds. Generate in 3-second chunks and stitch in OpenCut rather than fighting the model for a 10-second take.\n- **Audio is separate** — none of these tools generate matching audio. Plan a separate TTS \u002F SFX pass; the assembly happens in OpenCut.\n- **Commercial-API terms** — every commercial generation provider has different rules on commercial reuse, training opt-out, and watermarking. Read the TOS before publishing client work.",[103,106,109,112,115],{"q":104,"a":105},"Which model should I start with — CogVideo or Open-Sora?","Start with CogVideo unless you already have a 40+ GB GPU and a reason to push for longer shots. CogVideo runs on a single 24 GB card, the documentation is more complete, and the failure modes are well understood. Move to Open-Sora when CogVideo's clip-length ceiling is the bottleneck — not before.",{"q":107,"a":108},"Do I actually need both an open-source model and a commercial API skill?","Most working pipelines end up with both. Local models give you free iteration, deterministic seeds, and no per-second cost — great for testing 50 variants of a prompt. Commercial APIs (via the Together AI skill or Generative Media Skills) give you Sora \u002F Veo \u002F Runway-class output for the final hero shot. The split is iteration vs delivery.",{"q":110,"a":111},"How do I control the camera, not just the subject?","Two paths. For diffusion models, write camera verbs into the prompt (\"slow dolly forward\", \"static lock-off\", \"orbit 90 degrees\") — models trained on cinematic captions respond to film vocabulary. When you need exact framing or trajectory, switch to Motion Canvas and program the move in code, then composite the diffusion output into the framed shot.",{"q":113,"a":114},"Why ControlNet for video — isn't it for images?","ControlNet conditions a diffusion step on a structural signal — pose, depth, edges. When that step happens to be the first frame of a video generation, the entire clip inherits that composition. It is the cleanest way to keep generated video on-model when you have a specific framing in mind, especially for product or character shots where you cannot afford the model to reinvent the layout.",{"q":116,"a":117},"Can a single GPU machine actually run this whole pipeline?","Yes if you sequence the stages instead of running them in parallel. Generate with CogVideo (24 GB), unload, then run Real-ESRGAN (~6 GB), then OpenCut on CPU. The bottleneck is the generation step; everything downstream is comparatively cheap. If you only have a 16 GB card, drop to CogVideo's smaller variant or call the commercial API for generation and keep upscale + edit local.",{"@context":119,"@type":120,"name":121,"description":122,"numberOfItems":123,"inLanguage":25},"https:\u002F\u002Fschema.org","ItemList","AI Video Generation Pack","Ten picks covering open-source video generation models, commercial APIs, camera and motion control, upscale, and assembly for creators and developers building text-to-video and image-to-video pipelines.",10,[125,129,133],{"url":126,"anchor":127,"reason":128},"\u002Fen\u002Ftopics\u002Fvideo-production-ai","Video Production AI pack","Companion pack focused on editing and headless production pipelines rather than generation models",{"url":130,"anchor":131,"reason":132},"\u002Fen\u002Fai-tools-for\u002Fcontent-creation","AI tools for content creation","Broader catalog of content-creation assets that pair with generated video clips",{"url":134,"anchor":135,"reason":136},"\u002Fen\u002Ftopics","Browse other topic packs","Discover packs for AI agents, MCP, multi-agent frameworks, and more",[138,142,146],{"claim":139,"source_name":140,"source_url":141},"CogVideo is an open-source text-to-video and image-to-video model from THUDM","CogVideo GitHub","https:\u002F\u002Fgithub.com\u002FTHUDM\u002FCogVideo",{"claim":143,"source_name":144,"source_url":145},"Open-Sora is an open-source effort to reproduce Sora-style text-to-video generation","Open-Sora GitHub","https:\u002F\u002Fgithub.com\u002Fhpcaitech\u002FOpen-Sora",{"claim":147,"source_name":148,"source_url":149},"Real-ESRGAN is a practical algorithm for general image and video super-resolution","Real-ESRGAN GitHub","https:\u002F\u002Fgithub.com\u002Fxinntao\u002FReal-ESRGAN",920,"2026-05-22T00:00:00Z"]