[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-pdf-research-paper-rag-zh":3,"seo:pack:pdf-research-paper-rag:zh":96},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":95},"pdf-research-paper-rag","📚","#7C2D12","new","本周新建","PDF + 论文 RAG 工具包","面向被一堆 PDF 和论文淹没的研究员、分析师、律师：围绕一条真正的 RAG 流水线挑的十件套——ingest → 解析（Zerox \u002F OpenDataLoader \u002F Surya）→ 嵌入+索引（Pinecone Assistant \u002F PageIndex \u002F Cherry Studio 知识库）→ 检索+对话（RAGFlow \u002F Kotaemon）→ 重排（Cohere Rerank）→ 翻译非英文论文（PDFMathTranslate）。按顺序装，今晚就能把 200 篇 PDF 丢进一个文件夹然后跟它对话。",[16,28,36,43,51,59,66,73,80,88],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},758,"3ac555d9-d75c-4208-ba46-974e4a717234","zerox-zero-shot-pdf-ocr-ai-pipelines-3ac555d9","Zerox — Zero-Shot PDF OCR for AI Pipelines","Extract text from any PDF using vision models as OCR. Zerox converts PDF pages to images then uses GPT-4o or Claude to extract clean markdown without training.","Script Depot",205,0,"en","skill","Skill",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":26,"type_label":27},4036,"841f15d1-5079-11f1-9bc6-00163e2b0d79","opendataloader-pdf-ai-ready-document-parser-841f15d1","OpenDataLoader PDF — AI-Ready Document Parser","An open-source PDF parser that automates document accessibility and extracts structured, AI-ready data including tables, text, bounding boxes, and tagged content.","AI Open Source",62,{"id":37,"uuid":38,"slug":39,"title":40,"description":41,"author_name":22,"view_count":42,"vote_count":24,"lang_type":25,"type":26,"type_label":27},263,"66bc0630-1be7-4da3-b227-f1fdb1faa065","surya-document-ocr-90-languages-66bc0630","Surya — Document OCR for 90+ Languages","Surya is a document OCR toolkit with 19.5K+ GitHub stars. Text recognition in 90+ languages, layout analysis, table detection, reading order, and LaTeX OCR. Benchmarks favorably against cloud OCR serv",385,{"id":44,"uuid":45,"slug":46,"title":47,"description":48,"author_name":49,"view_count":50,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2820,"e8255b25-1bb1-47a8-bff9-ca5a445ce3f1","cherry-studio-knowledge-base-local-rag-with-50-formats","Cherry Studio Knowledge Base — Local RAG with 50+ Formats","Cherry Studio Knowledge Base ingests PDFs, Office docs, Markdown into a local vector index. Query offline, BYOK any LLM. Data stays on your machine.","Cherry Studio",130,{"id":52,"uuid":53,"slug":54,"title":55,"description":56,"author_name":57,"view_count":58,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2812,"63b22f3a-181d-4032-bfa8-3be176e193df","pinecone-assistant-managed-rag-service-with-auto-indexing","Pinecone Assistant — Managed RAG Service with Auto-Indexing","Pinecone Assistant is the fully managed RAG product on Pinecone. Upload PDFs, query with natural language, get cited answers — no chunking pipeline.","Pinecone",95,{"id":60,"uuid":61,"slug":62,"title":63,"description":64,"author_name":34,"view_count":65,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2171,"7421307d-416b-11f1-9bc6-00163e2b0d79","pageindex-document-index-reasoning-based-rag-7421307d","PageIndex — Document Index for Reasoning-Based RAG","A document indexing system that enables vectorless retrieval-augmented generation by building structured page-level indexes for LLM reasoning.",91,{"id":67,"uuid":68,"slug":69,"title":70,"description":71,"author_name":22,"view_count":72,"vote_count":24,"lang_type":25,"type":26,"type_label":27},245,"7785d7a8-fc57-42ab-ba6b-4a970404fadc","ragflow-deep-document-understanding-rag-engine-7785d7a8","RAGFlow — Deep Document Understanding RAG Engine","Open-source RAG engine with deep document understanding. Parses complex PDFs, tables, images. Agent-powered Q&A with citations. Multi-model. 77K+ stars.",251,{"id":74,"uuid":75,"slug":76,"title":77,"description":78,"author_name":22,"view_count":79,"vote_count":24,"lang_type":25,"type":26,"type_label":27},242,"b0f93b10-3339-4ca0-ad20-d6335a3d7785","kotaemon-open-source-rag-document-chat-b0f93b10","Kotaemon — Open-Source RAG Document Chat","Clean, open-source RAG tool for chatting with your documents. Supports PDF, DOCX, web pages. Multi-model, citation, and multi-user. Self-hostable. 25K+ stars.",232,{"id":81,"uuid":82,"slug":83,"title":84,"description":85,"author_name":86,"view_count":87,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2824,"bf323939-d2b6-4426-aa9f-9325666e7eaa","cohere-rerank-boost-rag-accuracy-with-rerank-3","Cohere Rerank — Boost RAG Accuracy with Rerank-3","Cohere Rerank scores candidates against a query using a cross-encoder. Drop into any RAG to boost top-1 hit rate by 30-50% over vector search alone.","Cohere",97,{"id":89,"uuid":90,"slug":91,"title":92,"description":93,"author_name":22,"view_count":94,"vote_count":24,"lang_type":25,"type":26,"type_label":27},389,"4c628f43-c803-45c8-ae39-a4caded80419","pdfmathtranslate-translate-pdf-papers-preserving-format-4c628f43","PDFMathTranslate — Translate PDF Papers Preserving Format","Translate PDF scientific papers while preserving math formulas, charts, and layout. Supports Google, DeepL, OpenAI, Ollama. CLI, GUI, MCP, Docker, Zotero plugin.",240,"tokrepo install pack\u002Fpdf-research-paper-rag",{"pageType":97,"pageKey":8,"locale":98,"title":99,"metaDescription":100,"h1":101,"tldr":102,"bodyMarkdown":103,"faq":104,"schema":120,"internalLinks":126,"citations":139,"wordCount":152,"generatedAt":153},"pack","zh","PDF + 论文 RAG 工具包 — 跟一堆论文对话的十件套","Zerox \u002F OpenDataLoader PDF \u002F Surya \u002F Cherry Studio 知识库 \u002F Pinecone Assistant \u002F PageIndex \u002F RAGFlow \u002F Kotaemon \u002F Cohere Rerank \u002F PDFMathTranslate。Ingest → 解析 → 索引 → 检索 → 重排 → 翻译，给研究员、分析师、律师的一条完整 RAG 流水线。","PDF + 论文 RAG 工具包 — 给被论文淹没的人的一条流水线","十件套按真 RAG 流水线顺序排：先解析（Zerox \u002F OpenDataLoader \u002F Surya），再索引（Cherry Studio 知识库 \u002F Pinecone Assistant \u002F PageIndex），再对话（RAGFlow \u002F Kotaemon），再重排（Cohere Rerank），非英文论文用 PDFMathTranslate 翻译。今晚把 200 篇 PDF 丢进去，明天早上就能跟它对话。","## 这个 pack 包含什么\n\n如果你是研究员、分析师、律师，瓶颈不在搜索 — 在 **PDF**。论文、合同、备案、白皮书、监管备忘录。大多数都是 90 年代风格的 PDF：双栏排版、扫描页、嵌入表格、比正文还重要的脚注。直接丢给通用聊天机器人每次都败在三件事上：解析错、检索蠢、模型看不到对的 chunk。\n\n这个 pack 按**流水线**组织，不是购物清单。每个工具只管一个阶段，安装顺序就是数据流动的顺序。和 [PhD 博士研究者的文献 + 复现代码包](\u002Fzh\u002Ftopics\u002Fphd-researcher-lit-code) 不同 — 后者解决文献检索和代码复现；本 pack 假设你已经攒了一堆 PDF，需要真的**跟语料对话**。\n\n## 推荐安装顺序\n\n### 阶段 1 — 解析（把 PDF 变成干净的 markdown）\n\n1. **Zerox** — 用视觉模型给任何 PDF 做 OCR。把页面截成图，让 GPT-4o 或 Claude 返回干净 markdown。脏扫描、双栏论文、合同等版式重要的场景胜出。打的赌是：前沿视觉模型在难 PDF 上吊打 2018 年的 OCR 栈，按调用次数付费。\n2. **OpenDataLoader PDF** — 文本优先、专为 AI 摄入调过的解析器。保留结构（章节、表格、列表）输出干净 JSON 或 markdown。对原生数字 PDF（arXiv 论文、近期合同）比 Zerox 快又便宜。先跑它，剩下 10% 失败的回落到 Zerox。\n3. **Surya** — 开源 OCR，支持 90+ 语言。如果语料里有中文、日文、阿拉伯文、俄文论文，这个是必备。本地跑 — 机密草稿不出本机。\n\n### 阶段 2 — 索引（把解析后的文本嵌入并存下来）\n\n4. **Cherry Studio 知识库** — 本地 RAG，原生支持 50+ 种格式。把一个 PDF 文件夹拖进去就能拿到带聊天界面的体验，全程在你笔记本上。除非要多用户或上云，从这里起步。\n5. **Pinecone Assistant** — 托管 RAG 服务，自动建索引。语料超过 ~10k 文档或者团队要共享访问时，Pinecone Assistant 把 ingest、嵌入、检索、引用全帮你封好。用隐私换规模。\n6. **PageIndex** — 面向推理型 RAG 的文档索引。不再是扁平 chunk 嵌入，而是按文档目录建层级索引。长论文（40+ 页）的检索质量明显更好，因为模型能推理「答案在文档的哪一部分」。\n\n### 阶段 3 — 对话（用户面）\n\n7. **RAGFlow** — 深度文档理解 RAG 引擎。开源里表格、复杂版式、带引用回溯的答案做得最好的一档。自托管，Docker 跑，自带完整聊天 UI + 来源高亮。\n8. **Kotaemon** — 开源 RAG 文档对话（人们真的会留下来用的 ChatPDF 克隆）。比 RAGFlow 轻，部署更简单，LLM 可热插拔，多 PDF 对话开箱可用。\n\n### 阶段 4 — 重排 + 翻译\n\n9. **Cohere Rerank** — 用 Rerank-3 拉高 RAG 准确率。塞在任何检索器前面。整个 RAG 栈里**杠杆最高的 10 行代码** — 嘈杂语料上相关性提升 20-40% 是常态。\n10. **PDFMathTranslate** — 翻译 PDF 论文，保留原版式、公式、图表。如果你的阅读列表一半是外文，想在喂索引前对照原文 — 必备。\n\n## 各阶段怎么协同\n\n```\n一个文件夹的 PDF\n   │\n   ├─ OpenDataLoader (原生数字 PDF，快)\n   │\n   ├─ Zerox (脏扫描、复杂版式)\n   │\n   └─ Surya (非英文 OCR)\n         │\n         ▼\n   干净 markdown + 结构\n         │\n         ├─ Cherry Studio 知识库 (本地，笔记本规模)\n         │\n         ├─ Pinecone Assistant (云端，团队规模)\n         │\n         └─ PageIndex (长文档，推理感知)\n               │\n               ▼\n         ┌─────────────────┐\n         │ RAGFlow         │\n         │ 或 Kotaemon     │\n         │ (聊天 UI)       │\n         └─────────────────┘\n               │\n               + 检索前接 Cohere Rerank\n               + 非英文论文 ingest 前过 PDFMathTranslate\n```\n\n关键洞察：**绝大多数翻车的 RAG demo 死在解析阶段，不是检索阶段**。如果你的表格出来只剩「表 1」没有数据，再聪明的检索器也救不回来。Day 1 砸在阶段 1，后面全都变简单。\n\n## 你会遇到的取舍\n\n- **本地 vs 云** — Cherry Studio 知识库和 Kotaemon 在笔记本上跑；Pinecone Assistant 把文本送到供应商。机密语料（法律、医疗、并购），坚持本地。\n- **RAGFlow vs Kotaemon** — RAGFlow 表格解析和引用 UI 更强；Kotaemon 部署和定制更简单。语料表格重（财报、科学论文）选 RAGFlow；散文重（法律备忘录、白皮书）选 Kotaemon。\n- **Zerox 成本** — 视觉模型 OCR 在 GPT-4o 上大概 0.01-0.03 美元\u002F页。200 篇平均 30 页的语料一次性大概 60-180 美元。持续流水线建议只把解析失败的回落到 Zerox。\n- **Cohere Rerank API key** — 多了一个第三方依赖。如果不能接受，可以自托管重排器（BGE-reranker、Jina），但集成成本是真的。\n\n## 常见踩坑\n\n- **chunk size 闭眼定 512 token** — 一般文本可以，论文里 4000 token 一个 method section 就废了。chunk size 要按文档类型调。\n- **聊天 UI 不带来源高亮** — 研究员看不到原页就不信答案。RAGFlow 和 Kotaemon 都做了，自建 UI 的话第一天就要上 citations。\n- **解析没验证就开 ingest** — 推 200 篇 PDF 进嵌入器之前，手动打开 5 篇随机的解析输出看一眼。坏解析污染索引是不可逆的。\n- **忘了重排** — 几乎每个团队都是抱怨完检索质量后第 3 周才加 Cohere Rerank。第 1 周就加。",[105,108,111,114,117],{"q":106,"a":107},"十个工具我必须全装吗？能不能先装 2-3 个？","先装三个：一个解析器（原生数字 PDF 选 OpenDataLoader PDF，脏扫描选 Zerox），一个索引（笔记本规模选 Cherry Studio 知识库），一个聊天 UI（Kotaemon）。这个三件套一下午就能跑起一个能用的多 PDF 对话。第二周觉得检索质量是瓶颈时加 Cohere Rerank，再加 PageIndex 应对长文档，最后用 PDFMathTranslate 处理外文论文。整套 10 个只在语料超过几百份时才有意义。",{"q":109,"a":110},"和「PhD 博士研究者文献 + 复现代码包」有啥区别？","研究流程的不同阶段。PhD 那个 pack 解决文献检索、文献管理、跑通论文代码（Zotero、arXiv MCP、GPT Researcher、JupyterLab、AI Scientist）。本 pack 假设你已经把 PDF 攒在文件夹里了，要从中规模化抽出结构化信息 — 这意味着一条真正的 RAG 流水线：解析、索引、检索、重排。很多研究者两个都用：PhD pack 收论文，本 pack 拷问它们。",{"q":112,"a":113},"法律合同、病例这种机密文档安全吗？","如果坚持本地优先的栈，安全。Surya 在本地跑 OCR；Cherry Studio 知识库和 Kotaemon 都能跑全本地（Ollama \u002F llama.cpp 后端）；RAGFlow 可以 Docker 自托管在内网。云端那几个（Pinecone Assistant、Cohere Rerank、Zerox via GPT-4o \u002F Claude）会把文本送出去，只给非机密语料用。TokRepo 上的「律师 AI 合同审查工具包」对隐私优先的工具有更深的覆盖。",{"q":115,"a":116},"PDF 里的表格和图，这些工具真的能抽出来吗？","表格是 PDF 解析最难的部分。开源选项里 RAGFlow 自带的表格解析器最强；OpenDataLoader PDF 在源 PDF tag 良好时能把表格结构保留成 JSON；Zerox 因为视觉模型像人一样看页面，复杂版式能扛。图表和公式更难 — 公式当前 PDFMathTranslate 是开源最好的，图大多数团队的妥协是保留图片引用，让聊天 UI 跳到原页。",{"q":118,"a":119},"从一个 PDF 文件夹到可用聊天 UI，大概多久？","笔记本上用 Cherry Studio 知识库或 Kotaemon，小语料（50 篇以下原生数字 PDF）大概 30 分钟能开始对话 — 大部分时间花在首次解析和嵌入。大语料（500 篇带扫描和表格的）要几小时流水线工作：先用 OpenDataLoader 跑一遍，失败的回落 Zerox 再跑一遍，ingest 进 RAGFlow，然后调 chunk size 和重排器。之后加一篇新 PDF 的边际成本是秒级。",{"@context":121,"@type":122,"name":13,"description":123,"numberOfItems":124,"inLanguage":125},"https:\u002F\u002Fschema.org","ItemList","按 RAG 流水线挑的十件套，给被 PDF 淹没的研究员、分析师、律师 — 解析、索引、对话、重排、翻译一条龙。",10,"zh-CN",[127,131,135],{"url":128,"anchor":129,"reason":130},"\u002Fzh\u002Ftopics\u002Fphd-researcher-lit-code","PhD 博士研究者的文献 + 复现代码包","姊妹 pack，覆盖文献检索和代码复现，与本 pack 的 PDF 流水线天然互补",{"url":132,"anchor":133,"reason":134},"\u002Fzh\u002Ftopics\u002Flawyer-ai-contract-kit","律师的 AI 合同审查工具包","机密文档工具，呼应本 pack 的本地优先选项",{"url":136,"anchor":137,"reason":138},"\u002Fzh\u002Fai-tools-for\u002Frag","TokRepo 全部 RAG 工具","本 pack 之外浏览更广的 RAG 目录",[140,144,148],{"claim":141,"source_name":142,"source_url":143},"Zerox 用视觉模型把 PDF OCR 成 markdown","Zerox GitHub","https:\u002F\u002Fgithub.com\u002Fgetomni-ai\u002Fzerox",{"claim":145,"source_name":146,"source_url":147},"RAGFlow 是深度文档理解的 RAG 引擎","RAGFlow GitHub","https:\u002F\u002Fgithub.com\u002Finfiniflow\u002Fragflow",{"claim":149,"source_name":150,"source_url":151},"Cohere Rerank 用于提升 RAG 检索相关性","Cohere Rerank 文档","https:\u002F\u002Fdocs.cohere.com\u002Fdocs\u002Frerank-overview",900,"2026-05-22T12:00:00Z"]