[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-ai-web-scraping-zh":3,"seo:pack:ai-web-scraping:zh":63},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":62},"ai-web-scraping","🕷","#0369A1","stable","稳定","AI 网页抓取","Firecrawl \u002F Crawlee \u002F Crawl4AI \u002F GPT Crawler \u002F ScrapeGraphAI — 直接吐 LLM 可用 markdown 的抓取引擎，不再处理裸 HTML。",[16,28,38,46,54],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},744,"6a62a986-9f1a-4a59-88c8-b99151986854","firecrawl-web-scraping-api-ai-applications-6a62a986","Firecrawl — Web Scraping API for AI Applications","Turn any website into clean markdown or structured data for LLMs. Firecrawl handles JavaScript rendering, anti-bot bypassing, sitemaps, and batch crawling via simple API.","Firecrawl",281,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":36,"type_label":37},412,"3e8c6e91-e10e-45ba-9206-d6e3a9958c6a","crawlee-production-web-scraping-node-js-3e8c6e91","Crawlee — Production Web Scraping for Node.js","Build reliable crawlers with automatic proxy rotation, request queuing, and browser automation. By Apify. 22K+ stars.","Apify",269,"script","Script",{"id":39,"uuid":40,"slug":41,"title":42,"description":43,"author_name":44,"view_count":45,"vote_count":24,"lang_type":25,"type":26,"type_label":27},172,"cb19c9d4-6c2a-4443-80eb-043a440d79eb","crawl4ai-llm-friendly-web-crawling-cb19c9d4","Crawl4AI — LLM-Friendly Web Crawling","Open-source web crawler optimized for AI and LLM use cases. Extracts clean markdown, handles JavaScript-rendered pages, and supports structured data extraction.","Crawl4AI",298,{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":52,"view_count":53,"vote_count":24,"lang_type":25,"type":26,"type_label":27},238,"bbd3962b-db9b-4ce9-9efe-31f44d08fdff","gpt-crawler-build-custom-gpts-any-website-bbd3962b","GPT Crawler — Build Custom GPTs from Any Website","Crawl any website to generate knowledge files for custom GPTs and RAG. Output as JSON for OpenAI GPTs or any LLM knowledge base. Zero config. 22K+ stars.","AI Open Source",224,{"id":55,"uuid":56,"slug":57,"title":58,"description":59,"author_name":60,"view_count":61,"vote_count":24,"lang_type":25,"type":26,"type_label":27},243,"d34e3181-e3f5-4853-871e-83acafe0c60e","scrapegraphai-ai-powered-web-scraping-d34e3181","ScrapeGraphAI — AI-Powered Web Scraping","Python scraping library powered by LLMs. Describe what you want to extract in natural language, get structured data back. Handles dynamic pages. 23K+ stars.","Script Depot",352,"tokrepo install pack\u002Fai-web-scraping",{"pageType":64,"pageKey":8,"locale":65,"title":66,"metaDescription":67,"h1":13,"tldr":68,"bodyMarkdown":69,"faq":70,"schema":86,"internalLinks":96,"citations":108,"wordCount":121,"generatedAt":122},"pack","zh","AI 网页抓取：5 个吐 LLM 可用 markdown 的引擎","Firecrawl \u002F Crawlee \u002F Crawl4AI \u002F GPT Crawler \u002F ScrapeGraphAI — 直接吐干净 markdown 不再处理裸 HTML。TokRepo 一条命令装齐 RAG 抓取栈。","五个开源抓取引擎，跳过 BeautifulSoup 苦海，直接吐 LLM 可用 markdown。TokRepo 一条命令装整个 pack，或者按需挑一个匹配你栈的。","## 这个 pack 装了什么\n\n| # | 引擎 | 强项 | 语言 |\n|---|---|---|---|\n| 1 | Firecrawl | 托管 API + 自托管，JS 渲染，sitemap 爬取 | TypeScript |\n| 2 | Crawlee | 完整爬虫框架，带代理轮转 | TypeScript \u002F Python |\n| 3 | Crawl4AI | RAG 优化的 markdown，最快异步爬取 | Python |\n| 4 | GPT Crawler | 一份配置文件搞定聊天机器人知识库爬取 | TypeScript |\n| 5 | ScrapeGraphAI | 通过 prompt + schema 让 LLM 驱动抽取 | Python |\n\n这五个工具有一个共同认识：把裸 HTML 喂给 LLM 是 token 税。等你剥完导航栏、广告、脚本、内联样式，已经烧了几千 token 啥都没干。AI 原生抓取器把这个转换放在爬虫边缘，让检索层只看到干净 markdown。\n\n## 为什么 2026 年抓取换了打法\n\n三个变化把老抓取套路逼退休了。\n\n第一，JavaScript 渲染已是基线。SPA 和边缘渲染网站把内容藏在水合后面，2018 年那一套（`requests` + BeautifulSoup）只能拿到外壳。现代引擎包了 headless Chromium，等到正确的 network-idle 事件才抽取。\n\n第二，目的地是检索，不是展示。输出不是进搜索索引，而是进向量数据库做 RAG。优化目标从「在浏览器里渲染好」变成「干净塞进 8k token」。\n\n第三，反爬升级。Cloudflare \u002F DataDome \u002F PerimeterX 几秒就把朴素爬虫挡掉。Firecrawl 和 Crawlee 通过住宅代理轮转、浏览器指纹随机化、智能重试解决了这些问题 —— 你自己拼这些得花几周。\n\n## 一条命令装齐\n\n```bash\n# 装整个 pack\ntokrepo install pack\u002Fai-web-scraping\n\n# 或者挑跟你栈匹配的引擎\ntokrepo install firecrawl\ntokrepo install crawl4ai\ntokrepo install scrapegraphai\n```\n\n每个资产的 TokRepo 页面都打包好了安装命令、推荐配置，以及主流输出适配器（markdown、JSONL、向量库直插）。\n\n## 常见坑\n\n- **Robots.txt 和速率限制**：守规矩。多数引擎默认开 `respect_robots_txt`，关掉等 IP 被封 + 法律麻烦。设礼貌爬取延迟\n- **没开 JS 渲染就抓 JS 页面**：Firecrawl\u002FCrawl4AI 返回空，多半是水合页面没启用渲染。打开 JS 选项\n- **Markdown 风格不一致**：不同引擎吐出的 markdown 略有差异（表格、代码块、脚注）。混用引擎做同一 RAG 语料时要后处理统一\n- **PDF\u002FOffice 文件伪装成网页**：网页爬虫提不出来，转交给文档 AI 流水线 pack\n- **登录后内容**：爬登录后内容脆弱且常违反 ToS。能用官方 API 就用官方\n\n## 这个 pack 单跑不够\n\n这个 pack 是*抽取*层。要凑出完整 RAG 流水线还需要：\n\n- 向量数据库 —— 看向量数据库横评 pack（Chroma \u002F Weaviate \u002F Qdrant 等）\n- 切块 + embedding 步骤 —— 一般 LangChain 或 LlamaIndex 黏合\n- 评测循环 —— LLM 评测 & 护栏 pack 评检索相关度\n\nPDF 和 Office 输入换文档 AI 流水线 pack。交互式抓取（填表、走向导）该用浏览器自动化 pack —— 那种站点要 Playwright 风格的交互，不是爬取。\n\n## 怎么挑引擎\n\n- **想要托管 API、不想管基础设施**：Firecrawl。五个里 dev-ex 最好，JS 渲染和代理轮转内建\n- **要在自有硬件爬几百万页**：Crawlee。最成熟的爬虫框架，带队列持久化和可恢复运行\n- **用 Python 建 RAG 入库**：Crawl4AI。异步优先设计，同样机器上吞吐是同步爬虫的 3-5 倍\n- **一次性给聊天机器人导出知识库**：GPT Crawler。一份 `config.ts` 指向一个域名，吐出 JSONL 直接喂 OpenAI 文件上传\n- **页面 schema 不规整、想按意图抽取**：ScrapeGraphAI。你给它 Pydantic 模型和 prompt，它每页自己摸出 selector",[71,74,77,80,83],{"q":72,"a":73},"这些工具都免费吗？","五个全开源。Firecrawl 有托管 SaaS 版带免费额度，但你可以免费自托管。Crawlee \u002F Crawl4AI \u002F GPT Crawler \u002F ScrapeGraphAI 是 100% 自托管 BSD\u002FMIT 许可。隐藏成本是代理服务 —— 爬反爬严的站点要住宅代理，月支出 $50-200。",{"q":75,"a":76},"Firecrawl 还是 Crawl4AI 该选哪个？","想要托管端点不想管基础设施，选 Firecrawl —— API 更简洁，JS 渲染稳。要 Python 原生 + 自托管最大吞吐，选 Crawl4AI —— 异步架构原始速度比 Firecrawl 快，但运维粘合多。给 Cursor\u002FCodex CLI agent 调工具用，两个都行，Firecrawl 步骤少。",{"q":78,"a":79},"能给 Cursor \u002F Codex CLI 当工具用吗？","可以 —— 多数都有 MCP server 或 HTTP API，任何带 tool-calling 的 AI 工具都能调。Firecrawl 自带官方 MCP server。Crawl4AI 暴露 Python 函数你可以包一层。把 MCP 配置丢进 Cursor 设置或 Codex CLI agent 定义，LLM 就能按需抓取。",{"q":81,"a":82},"跟浏览器自动化 pack 有什么区别？","抓取是抽取优先：你想从 URL 可预测的页面拿 LLM 可用 markdown。浏览器自动化是交互优先：点击、填表、跳转、截图。有重叠（都用 headless Chromium），但 API 形态和典型流程不同。建 RAG 语料就用本 pack。填表就用浏览器自动化。",{"q":84,"a":85},"运维上最大的坑？","贪心爬取导致 token 爆炸。一个 sitemap 1 万页 × 每页 5k token = 5000 万 embedding token，OpenAI 价格轻松 $500+。永远先设 `max_pages` 和 `max_depth`，先跑 50 页采样，数 token，估账单，再放出去。忘了便宜，修起来贵。",{"@context":87,"@type":88,"name":89,"description":90,"numberOfItems":91,"publisher":92},"https:\u002F\u002Fschema.org","CollectionPage","AI Web Scraping","Five scraping engines that output LLM-ready markdown — Firecrawl, Crawlee, Crawl4AI, GPT Crawler, ScrapeGraphAI.",5,{"@type":93,"name":94,"url":95},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[97,101,105],{"url":98,"anchor":99,"reason":100},"\u002Fzh\u002Fpacks\u002Fdocument-ai-pipeline","文档 AI 流水线","PDF\u002FOffice 入库拍档",{"url":102,"anchor":103,"reason":104},"\u002Fzh\u002Fpacks\u002Fbrowser-automation","浏览器自动化","交互式抓取的替代方案",{"url":106,"anchor":22,"reason":107},"\u002Fzh\u002Ftools\u002Ffirecrawl","本 pack 里最受欢迎的引擎",[109,113,117],{"claim":110,"source_name":111,"source_url":112},"Firecrawl turns websites into LLM-ready markdown via a hosted or self-hosted API","mendableai\u002Ffirecrawl","https:\u002F\u002Fgithub.com\u002Fmendableai\u002Ffirecrawl",{"claim":114,"source_name":115,"source_url":116},"Crawlee is the open-source web crawling and browser automation library by Apify","apify\u002Fcrawlee","https:\u002F\u002Fgithub.com\u002Fapify\u002Fcrawlee",{"claim":118,"source_name":119,"source_url":120},"Crawl4AI is open-source and optimized for retrieval-augmented LLM input","unclecode\u002Fcrawl4ai","https:\u002F\u002Fgithub.com\u002Funclecode\u002Fcrawl4ai",502,"2026-05-02T15:00:00Z"]