[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-ocr-document-parsing-zh":3,"seo:pack:ocr-document-parsing:zh":97},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":96},"ocr-document-parsing","📄","#0EA5E9","new","本周新建","OCR + 文档解析全家桶","10 件套，给要从扫描件、PDF、截图里抠出结构化数据的工程师。现代 doc-AI（Marker \u002F Nougat \u002F Surya \u002F Zerox \u002F MinerU）、布局感知解析器（Docling \u002F Unstructured \u002F OpenDataLoader）+ 老牌 OCR（Tesseract \u002F PaddleOCR）— 按 检测→识别→表格→结构化→JSON 的顺序安排好了。",[16,28,35,44,51,59,66,75,82,89],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},210,"42976daf-a56a-4152-9afb-d5b00d130a08","marker-convert-pdf-markdown-high-accuracy-42976daf","Marker — Convert PDF to Markdown with High Accuracy","Fast, accurate PDF to Markdown + JSON converter. Handles tables, images, equations, code blocks, and multi-column layouts. GPU-accelerated. 33K+ GitHub stars.","Script Depot",135,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":22,"view_count":34,"vote_count":24,"lang_type":25,"type":26,"type_label":27},263,"66bc0630-1be7-4da3-b227-f1fdb1faa065","surya-document-ocr-90-languages-66bc0630","Surya — Document OCR for 90+ Languages","Surya is a document OCR toolkit with 19.5K+ GitHub stars. Text recognition in 90+ languages, layout analysis, table detection, reading order, and LaTeX OCR. 
Benchmarks favorably against cloud OCR services.",380,{"id":36,"uuid":37,"slug":38,"title":39,"description":40,"author_name":22,"view_count":41,"vote_count":24,"lang_type":25,"type":42,"type_label":43},413,"985fe0df-6ec5-4fd6-8d3d-3c1627b0e18d","mineru-extract-llm-ready-data-any-document-985fe0df","MinerU — Extract LLM-Ready Data from Any Document","Convert PDFs, scans, and complex documents into clean Markdown or JSON for RAG and LLM pipelines. 57K+ GitHub stars.",225,"script","Script",{"id":45,"uuid":46,"slug":47,"title":48,"description":49,"author_name":22,"view_count":50,"vote_count":24,"lang_type":25,"type":26,"type_label":27},758,"3ac555d9-d75c-4208-ba46-974e4a717234","zerox-zero-shot-pdf-ocr-ai-pipelines-3ac555d9","Zerox — Zero-Shot PDF OCR for AI Pipelines","Extract text from any PDF using vision models as OCR. Zerox converts PDF pages to images then uses GPT-4o or Claude to extract clean markdown without training.",199,{"id":52,"uuid":53,"slug":54,"title":55,"description":56,"author_name":57,"view_count":58,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4670,"ed1264b8-54cb-11f1-9bc6-00163e2b0d79","nougat-neural-optical-understanding-academic-documents-ed1264b8","Nougat — Neural Optical Understanding for Academic Documents","Nougat is a visual transformer model from Meta that converts academic PDF pages into structured Markdown, accurately preserving mathematical equations, tables, and text formatting.","AI Open Source",20,{"id":60,"uuid":61,"slug":62,"title":63,"description":64,"author_name":22,"view_count":65,"vote_count":24,"lang_type":25,"type":42,"type_label":43},173,"443e86c2-3811-496e-8e4d-6eef742ab219","docling-document-parsing-ai-443e86c2","Docling — Document Parsing for AI","IBM document parsing library. Converts PDFs, DOCX, PPTX, images, and HTML into structured markdown or JSON. 
Built for RAG pipelines and LLM ingestion.",177,{"id":67,"uuid":68,"slug":69,"title":70,"description":71,"author_name":72,"view_count":17,"vote_count":24,"lang_type":25,"type":73,"type_label":74},439,"c2ba9909-f624-414f-8aeb-fbd95c50766e","unstructured-document-etl-llm-pipelines-c2ba9909","Unstructured — Document ETL for LLM Pipelines","Extract clean data from PDFs, DOCX, HTML, images, and emails for RAG and LLM ingestion. 14K+ GitHub stars.","MCP Hub","mcp","MCP",{"id":76,"uuid":77,"slug":78,"title":79,"description":80,"author_name":22,"view_count":81,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2454,"175147cb-453a-11f1-9bc6-00163e2b0d79","paddleocr-ai-powered-ocr-toolkit-100-languages-175147cb","PaddleOCR — AI-Powered OCR Toolkit for 100+ Languages","A lightweight, production-ready OCR system supporting 100+ languages. Bridges documents and images to structured data for LLM pipelines.",94,{"id":83,"uuid":84,"slug":85,"title":86,"description":87,"author_name":22,"view_count":88,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2344,"9bb6bba9-43a4-11f1-9bc6-00163e2b0d79","tesseract-ocr-open-source-text-recognition-engine-100-9bb6bba9","Tesseract OCR — Open Source Text Recognition Engine for 100+ Languages","Tesseract is an open-source OCR engine maintained by Google, supporting over 100 languages. 
It converts images and scanned documents into machine-readable text with high accuracy across multiple output formats.",160,{"id":90,"uuid":91,"slug":92,"title":93,"description":94,"author_name":57,"view_count":95,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4036,"841f15d1-5079-11f1-9bc6-00163e2b0d79","opendataloader-pdf-ai-ready-document-parser-841f15d1","OpenDataLoader PDF — AI-Ready Document Parser","An open-source PDF parser that automates document accessibility and extracts structured, AI-ready data including tables, text, bounding boxes, and tagged content.",51,"tokrepo install pack\u002Focr-document-parsing",{"pageType":98,"pageKey":8,"locale":99,"title":100,"metaDescription":101,"h1":102,"tldr":103,"bodyMarkdown":104,"faq":105,"schema":121,"internalLinks":127,"citations":140,"wordCount":153,"generatedAt":154},"pack","zh","OCR + 文档解析全家桶 — 10 个把 PDF 和扫描件变成干净 JSON 的工具","Marker \u002F Surya \u002F Nougat \u002F Zerox \u002F MinerU \u002F Docling \u002F Unstructured \u002F OpenDataLoader \u002F Tesseract \u002F PaddleOCR — 给要从扫描件 \u002F PDF \u002F 截图里抠出结构化数据的工程师。检测布局 → OCR → 表格抽取 → 拼结构 → 输出 JSON，按部就班配齐。","OCR + 文档解析全家桶 — 检测 → OCR → 表格 → 结构化 → JSON","10 件套，按安装顺序排好：先做版面检测，再 OCR 文字，再抽表格，再拼结构，最后输出 LLM \u002F 流水线能直接吃的 JSON。上层是现代 doc-AI 模型，老牌 OCR 兜底接住新模型还会漏掉的内容。","## 这个 pack 包含什么\n\n这是一个真正工程师花一下午能搭起来的流水线：把杂乱的文档 — 扫描的发票、学术 PDF、截图、双语合同 — 转成干净的结构化数据。**顺序很重要**：上一阶段的输出就是下一阶段的输入；跳过版面检测，是 doc-AI 流水线产出垃圾的最常见原因。\n\n所有 10 个都是 2026 年仍在**活跃维护**的**开源**工具。整套装下来不小（模型权重几个 GB），但通常每个阶段挑一个就够，剩下的跳过。把这个 pack 当**菜单**用，不是清单。\n\n## 推荐安装顺序\n\n1. **Marker** — PDF 转 Markdown，端到端。**从这里开始**。Marker 一把搞定版面 + OCR + 表格 + 数学公式，是绝大多数学术 \u002F 技术 \u002F 结构化 PDF 的合理默认值。如果 Marker 的输出够好，后面就不用看了。\n2. **Surya** — 90+ 语言的文档 OCR，含版面分析、表格检测、阅读顺序、LaTeX OCR。Marker 内部就在用它。当你只需要 OCR 层而不要整套 Markdown 流水线时单独用它。\n3. **MinerU** — 从任意文档里抽 LLM-ready 数据。复杂版面（多栏论文、杂志、政府表格）比 Marker 更强。GitHub 57K+ star。**接住 Marker 投降的场景**。\n4. 
**Zerox** — 零样本 PDF OCR。把页面图片送给视觉 LLM（GPT-4o \u002F Claude \u002F Gemini），拿回 Markdown。按调用计费，省去本地 GPU 推理。**不想自己部署模型时的最快路径**。\n5. **Nougat** — Meta 出的学术文档神经网络模型，arXiv 训练。数学公式重的 PDF 它最强（公式回来是 LaTeX，不是糊掉的字形）。比 Marker 慢，但 STEM 论文上更准。\n6. **Docling** — IBM 的文档解析库。PDF \u002F DOCX \u002F PPTX \u002F 图片 \u002F HTML 都能转成结构化 Markdown 或 JSON。**整个 pack 里最通用的解析器**，输入格式不固定时用它。\n7. **Unstructured** — LLM 流水线的文档 ETL。统一 API 处理 25+ 种文件格式，把文本切成有类型的元素（Title \u002F NarrativeText \u002F Table \u002F ListItem）。**RAG 大规模入库的工业级骨架**。\n8. **OpenDataLoader PDF** — 专注给下游 agent 产出干净结构化数据的 PDF 解析器。比 Marker \u002F MinerU 轻量，**对延迟敏感比对峰值精度敏感时用它**。\n9. **PaddleOCR** — 100+ 语言的生产级 OCR，**目前开源里中文 OCR 最强**。当 Marker \u002F Surya 在非拉丁文字或极端噪声上挣扎时，用它当 OCR 层。\n10. **Tesseract OCR** — 40 岁的老黄牛。慢，对现代字体偶尔不准，但**可预测、可脚本化、能在树莓派上跑**。当 GPU 不可用、精度要求一般时的兜底方案。\n\n## 它们怎么协同\n\n```\n输入文档（PDF \u002F 扫描件 \u002F 图片 \u002F DOCX）\n   │\n   ├─ Marker  ─────────────────► 干净 Markdown（先试这个）\n   │\n   │  Marker 输出不行：\n   │\n   ├─ MinerU  ─────────────────► Markdown \u002F JSON（复杂版面）\n   │\n   │  输入是多格式（DOCX \u002F PPTX \u002F HTML）：\n   │\n   ├─ Docling  ────────────────► 结构化 Markdown\n   ├─ Unstructured  ───────────► 有类型的元素（Title \u002F Table \u002F NarrativeText）\n   │\n   │  数学公式重的学术 PDF：\n   │\n   ├─ Nougat  ─────────────────► LaTeX + Markdown\n   │\n   │  云端 LLM 比自架 GPU 便宜：\n   │\n   ├─ Zerox  ──────────────────► 视觉 LLM 走一遍出 Markdown\n   │\n   │  底层 OCR 层（上面的工具内部会调）：\n   │\n   ├─ Surya \u002F PaddleOCR \u002F Tesseract  ──► 纯文本 + bounding box\n   │\n   └─ OpenDataLoader PDF  ─────► 轻量级结构化 JSON\n```\n\n安装套路是：**Marker 当默认**，**MinerU 当升级**接 Marker 搞不定的版面，**Nougat 处理数学**，**Zerox 跳过 GPU 自架**。OCR-only 工具（Surya \u002F PaddleOCR \u002F Tesseract）是底层积木 — 上层解析器在你特定文档上失手时直接调它们。\n\n## 你会遇到的取舍\n\n- **Marker vs MinerU** — Marker 在规整 PDF 上更快、Markdown 更干净。MinerU 处理更怪的版面（中文报纸、政府表格、扫描书）但更慢、输出更脏。**在你领域里 10 份真实文档上各跑一遍再选**。\n- **本地模型 vs 视觉 LLM（Zerox）** — 一张 4090 跑 Marker 前期硬件投入高，但月处理量过几千页之后每页成本比 Zerox 走 GPT-4o \u002F Claude 便宜大概一个数量级。量小的话云端是对的。\n- 
**Surya vs PaddleOCR vs Tesseract** — Surya 是现代默认。**PaddleOCR 在中日韩阿拉伯语上赢**。Tesseract 赢在「没 GPU 也能跑」— 留它当最后兜底。\n- **Docling vs Unstructured** — Docling 输出更干净的 Markdown；Unstructured 输出更适合 RAG 切块的有类型元素。**给人看选 Docling，给检索器吃选 Unstructured**。\n\n## 常见踩坑\n\n- **跳过版面检测** — 双栏学术 PDF 上直接跑裸 Tesseract，两栏文字会交错混在一起。**永远先过一遍版面感知的工具**（Marker \u002F Surya \u002F MinerU），不要把整页盲喂给 OCR。\n- **不核对表格输出** — 这个 pack 里每个工具在无边框表格、合并表头、旋转文本上都仍会丢格。表格出来后过一道 sanity check（行数、列数、数值列 dtype）再用。\n- **GPU 显存爆掉** — Marker \u002F MinerU \u002F Nougat 满质量下都要 8-12 GB 显存。16 GB 卡上**串行跑，别并发**。\n- **双语文档** — 大多数工具按页自动检测语言，不按区域。一份左英右中的合同往往一种语言识别对了，另一种乱码。**PaddleOCR 处理得最好**；其他工具先按区域切再喂。\n- **忘了去重页眉页脚** — Marker 之类会把页码、逐页重复的页眉、脚注当正文抽出来。后处理一道，用「跨页重复出现的子串」当 key 去掉。",[106,109,112,115,118],{"q":107,"a":108},"完全不知道从哪开始，先试哪个？","Marker。它在最广范围的 PDF 上质量最高，端到端给你干净 Markdown，不用自己拼版面 - OCR - 表格的流水线。在你领域里挑 5 份真实文档跑 Marker。输出能用就结束了。不行再升级到 MinerU 处理版面、Nougat 处理数学。",{"q":110,"a":111},"这个 pack 必须要 GPU 吗？","Marker \u002F Surya \u002F MinerU \u002F Nougat — 基本上要 GPU 或一颗够强的 Apple Silicon。技术上能跑 CPU，但慢 30-100 倍，玩玩可以，干活不行。逃生通道：Zerox（卸载给视觉 LLM API）、Tesseract（设计上就纯 CPU）、PaddleOCR（有 CPU 轻量模式）。生产线建议规划一张 GPU 卡每小时处理几千页。",{"q":113,"a":114},"中日韩 \u002F 阿拉伯语文档怎么处理？","PaddleOCR 是开源里 CJK 和阿拉伯语最强的选择 — 它本来就主要为中文做的，模型权重深度优化过。Surya 覆盖 90+ 语言，混合文字处理也还行。Marker 和 MinerU 内部都会把识别交给底层 OCR 层，MinerU 尤其在开发时就重点考虑了中文覆盖。CJK 上别用 Tesseract，除非你被钉死在 CPU 上。",{"q":116,"a":117},"OCR 和文档解析有什么区别？","OCR 是窄问题：把图像像素转成文本字符串。文档解析是更大的问题：理解文档结构 — 章节、段落、表格、图、阅读顺序、引用。Tesseract 和 PaddleOCR 只做 OCR。Marker \u002F MinerU \u002F Docling \u002F Unstructured 在 OCR 之上做解析。这个 pack 两层都要的原因是：高层解析器在某一页偶尔会失败，需要可用的 OCR 层去补救。",{"q":119,"a":120},"能不能用托管 API 不自己部署？","好几个工具都有托管版或商业封装 — Marker 有托管 API，MinerU 提供托管服务，Unstructured 有 API 计划，Zerox 设计上就只是调视觉 LLM API。量小、原型阶段用托管对。量大、合规数据、文档内容不能出网络的场景，自架才是正路。**真正要看的基准是你真实工作负载下每千页成本，不是榜单上的精度数字**。",{"@context":122,"@type":123,"name":13,"description":124,"numberOfItems":125,"inLanguage":126},"https:\u002F\u002Fschema.org","ItemList","10 个为工程师整理的开源 OCR 和文档解析工具，把 
PDF、扫描件、截图变成结构化数据；从版面检测到 JSON 输出的推荐安装顺序。",10,"zh-CN",[128,132,136],{"url":129,"anchor":130,"reason":131},"\u002Fzh\u002Fai-tools-for\u002Frag","RAG 入库工具集","文档解析是大多数 RAG 流水线的第一步",{"url":133,"anchor":134,"reason":135},"\u002Fzh\u002Ffeatured","TokRepo 精选资产","这 10 个工具属于更大的精选目录",{"url":137,"anchor":138,"reason":139},"\u002Fzh\u002Ftopics","浏览其他主题 pack","和数据工程师 \u002F ML 工程师 pack 搭配使用",[141,145,149],{"claim":142,"source_name":143,"source_url":144},"Marker 端到端把 PDF 转成 Markdown，支持版面 \u002F 表格 \u002F 公式","Marker GitHub 仓库","https:\u002F\u002Fgithub.com\u002FVikParuchuri\u002Fmarker",{"claim":146,"source_name":147,"source_url":148},"MinerU 把 PDF 和扫描件转成 RAG 可用的干净 Markdown 或 JSON","MinerU GitHub 仓库","https:\u002F\u002Fgithub.com\u002Fopendatalab\u002FMinerU",{"claim":150,"source_name":151,"source_url":152},"Nougat 是 Meta AI 出的用于学术文档理解的 transformer 模型","Nougat GitHub 仓库","https:\u002F\u002Fgithub.com\u002Ffacebookresearch\u002Fnougat",900,"2026-05-22T10:00:00Z"]