[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-data-engineer-agent-toolbox-zh":3,"seo:pack:data-engineer-agent-toolbox:zh":97},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":96},"data-engineer-agent-toolbox","📊","#14B8A6","new","本周新建","数据工程师的 Agent 工具箱","十个资产，让 AI agent 真正读懂你的 SQL 方言、dbt 项目结构、数仓语义和编排模式 — 终于不再瞎编字段，能直接交付模型。",[16,28,35,42,49,59,66,74,82,89],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3115,"19b431b4-ebdf-47fa-a358-88320283eb55","dbt-mcp-dbt-context-mcp-server-for-agents","dbt-mcp — dbt Context MCP Server for Agents","Give AI agents structured access to dbt project context and tools (SQL, semantic layer, docs search). Ships an experimental MCP bundle in releases.","MCP Hub",107,0,"en","mcp","MCP",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":22,"view_count":34,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3283,"216cb667-d5ae-5400-99d5-63dd528e1690","postgres-mcp-pro-index-tuning-safe-sql-tools","Postgres MCP Pro — Index Tuning + Safe SQL Tools","Postgres MCP Pro is an MCP server for PostgreSQL that runs safe SQL, explains plans, and recommends indexes so agents can tune databases faster.",70,{"id":36,"uuid":37,"slug":38,"title":39,"description":40,"author_name":22,"view_count":41,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3287,"37f3a64a-c095-5dc8-965a-670b50abc8e6","bigquery-mcp-protected-mode-for-phi-pii-guardrails","BigQuery MCP — Protected Mode for PHI\u002FPII Guardrails","BigQuery MCP runs BigQuery queries from Claude Desktop and can block sensitive columns in Protected Mode so PHI\u002FPII never enters the LLM context.",56,{"id":43,"uuid":44,"slug":45,"title":46,"description":47,"author_name":22,"view_count":48,"vote_count":24,"lang_type":25,"type":26,"type_label":27},3288,"8a01b5ca-fe0b-5120-9909-4a9505e34323","trino-mcp-oauth-2-1-query-cli-for-data-warehouses","Trino MCP — OAuth 2.1 + Query CLI for Data Warehouses","Trino MCP is a Go MCP server and CLI for Trino with OAuth 2.1 support, letting agents query catalogs and schemas with traceable user identity.",63,{"id":50,"uuid":51,"slug":52,"title":53,"description":54,"author_name":55,"view_count":56,"vote_count":24,"lang_type":25,"type":57,"type_label":58},1822,"94e852dc-3c71-11f1-9bc6-00163e2b0d79","mcp-toolbox-databases-ai-agent-database-server-94e852dc","MCP Toolbox for Databases — AI Agent Database Server","MCP Toolbox for Databases is an open-source MCP server by Google that gives AI agents secure, governed access to databases including PostgreSQL, MySQL, Spanner, BigQuery, and more.","Script Depot",137,"skill","Skill",{"id":60,"uuid":61,"slug":62,"title":63,"description":64,"author_name":55,"view_count":65,"vote_count":24,"lang_type":25,"type":57,"type_label":58},1481,"894f7271-3931-11f1-9bc6-00163e2b0d79","dbt-data-build-tool-sql-transformations-894f7271","dbt — Data Build Tool for SQL Transformations","Open-source framework for modeling, testing, and documenting SQL transformations in the modern data warehouse.",77,{"id":67,"uuid":68,"slug":69,"title":70,"description":71,"author_name":72,"view_count":73,"vote_count":24,"lang_type":25,"type":57,"type_label":58},4224,"39a2e67f-5319-11f1-9bc6-00163e2b0d79","sqlmesh-scalable-data-transformation-framework-sql-39a2e67f","SQLMesh — Scalable Data Transformation Framework for SQL","SQLMesh is an open-source data transformation framework that provides efficient, incremental builds, built-in data validation, and a virtual data environment system. It is backwards-compatible with dbt and designed to scale data pipelines without full table rebuilds.","AI Open Source",82,{"id":75,"uuid":76,"slug":77,"title":78,"description":79,"author_name":80,"view_count":81,"vote_count":24,"lang_type":25,"type":57,"type_label":58},1183,"00a6152f-371c-11f1-9bc6-00163e2b0d79","apache-airflow-programmatic-workflow-orchestration-platform-00a6152f","Apache Airflow — Programmatic Workflow Orchestration Platform","Apache Airflow is the industry-standard platform for authoring, scheduling, and monitoring data workflows. Define DAGs in Python to orchestrate ETL pipelines, ML training, data processing, and any complex workflow with dependencies.","Apache Software Foundation",133,{"id":83,"uuid":84,"slug":85,"title":86,"description":87,"author_name":55,"view_count":88,"vote_count":24,"lang_type":25,"type":57,"type_label":58},1664,"904ed27e-39eb-11f1-9bc6-00163e2b0d79","datahub-open-source-data-discovery-governance-platform-904ed27e","DataHub — Open-Source Data Discovery & Governance Platform","DataHub is a modern metadata platform for discovering, governing, and observing your data stack. Built by LinkedIn and now a top-level project at Acryl Data, it unifies metadata from warehouses, lakes, dashboards, and ML pipelines into one searchable catalog.",72,{"id":90,"uuid":91,"slug":92,"title":93,"description":94,"author_name":55,"view_count":95,"vote_count":24,"lang_type":25,"type":57,"type_label":58},1670,"16b7b083-39ec-11f1-9bc6-00163e2b0d79","sqlglot-sql-parser-transpiler-optimizer-pure-python-16b7b083","SQLGlot — SQL Parser, Transpiler & Optimizer in Pure Python","SQLGlot is a no-dependency Python library that parses, transpiles, and optimizes SQL across 20+ dialects. Convert queries between Snowflake, BigQuery, DuckDB, Spark, Postgres, and more without touching the database.",156,"tokrepo install pack\u002Fdata-engineer-agent-toolbox",{"pageType":98,"pageKey":8,"locale":99,"title":100,"metaDescription":101,"h1":102,"tldr":103,"bodyMarkdown":104,"faq":105,"schema":121,"internalLinks":126,"citations":139,"wordCount":152,"generatedAt":153},"pack","zh","数据工程师的 Agent 工具箱 — 10 个资产搞定 SQL \u002F dbt \u002F 数仓 \u002F 编排","dbt-mcp \u002F Postgres MCP Pro \u002F BigQuery MCP \u002F Trino MCP \u002F MCP Toolbox \u002F dbt \u002F SQLMesh \u002F Airflow \u002F DataHub \u002F SQLGlot — 一套 10 个资产，让 AI agent 真懂你的方言、你的项目、你的数仓语义。TokRepo 一键安装。","数据工程师的 Agent 工具箱 — 让 AI 不再瞎编 SQL","十个资产按真实数据团队的接入顺序排：先接数仓 + Postgres MCP（让 agent 读到真 schema），再上 dbt-mcp（让它读懂你的模型），再 Airflow + DataHub 管编排和血缘，最后 SQLGlot 解决「agent 把 Snowflake 语法写到 BigQuery 里」的方言漂移。","## 这个 pack 包含什么\n\n这是工作中的数据工程师为了让 AI agent 真正能帮上忙而搭起来的栈 — 不是那种「text-to-SQL 演示」一遇到 800 张表、三种方言、和一堆没人写文档的 dbt macro 就崩的玩具。\n\n每个资产在 agent 循环里只干一件事：给 agent **schema 感知**（数仓 MCP）、给它**模型感知**（dbt-mcp 套在 dbt\u002FSQLMesh 上）、给它**编排感知**（Airflow）、给它**血缘感知**（DataHub），最后给它最缺但谁也没给过的一件 — **方言感知**（SQLGlot）。五层都到位之后，agent 不再对着 Snowflake 写 `STRING_AGG`，模型一次性 compile 通过。\n\n顺序是有讲究的，每一层都是上一层的脚手架。先接好数仓 MCP，再上 dbt-mcp — 连底表都读不到，给 agent 看 dbt manifest 也没用。\n\n## 推荐安装顺序（原始 SQL → 模型 → 编排 → 质量与血缘）\n\n1. **Postgres MCP Pro** — Postgres MCP，安全 SQL、EXPLAIN 计划、索引推荐都自带。哪怕 Postgres 不是你的数仓也先装它，因为你的业务库八成是 Postgres，而 agent 驱动的探索一般从那里起步。\n2. **BigQuery MCP** — 带 PHI\u002FPII 保护模式的 BigQuery MCP。GCP 用户的数仓连接器。任何非工程角色一句「列出 top 客户」时，这个保护模式就值回票价了。\n3. **Trino MCP** — Trino\u002FPresto MCP，OAuth 2.1 接入。你有一个 federated 数仓（Iceberg + S3 + 没死透的 MySQL）想用一个 MCP 一打多时上它。\n4. **MCP Toolbox for Databases** — Google 开源的 MCP，一台 server 前置多个数据源，按声明式配置一次给 agent 暴露。比起把五个 MCP 分别接，「这五个源、这几个工具」一次写完更省心。\n5. **dbt** — 模型层。你团队就算已经在用，也装一下让 agent 有官方资产可引，并知道你项目按 dbt 约定走（`models\u002F` \u002F `schema.yml` \u002F `dbt_project.yml`）。\n6. **dbt-mcp** — 把 dbt 项目上下文（模型、语义层、文档）暴露给 agent 的 MCP。这是关键一跳：agent 回答「`fct_orders` 都有哪些列」时是去**读你的 manifest**，不是凭印象编。\n7. **SQLMesh** — 值得了解的 dbt 替代。虚拟数据环境、真正的列级血缘、语义化版本。新项目可以试；已经在 dbt 上稳定运行的团队继续 dbt。\n8. **Apache Airflow** — 编排层。Dagster \u002F Prefect 再香，生产数据流水线绝大多数还是落在 Airflow。给 agent 装上，它写 DAG 时用的就是你团队真在跑的方言。\n9. **DataHub** — 数据发现 + 血缘 + 治理。当 agent 已经能写 SQL、改 dbt 模型，下一个常见故障就是「agent 改了上游字段，把三个下游 dashboard 静默改坏」。DataHub 解决的就是这种可见性缺口。\n10. **SQLGlot** — 纯 Python 的 SQL 解析与转译器。这个 pack 里最不起眼但最关键的英雄。agent 在 Stack Overflow 上学的 SQL 是 Snowflake 和 Postgres 各一半，结果你在 BigQuery 上跑 — SQLGlot 在中间转一道，方言差异直接消失。\n\n## 它们怎么协同\n\n```\n                  ┌── Postgres MCP Pro ──┐\n                  │  （业务库 + 安全 SQL）  │\n                  └────────┬──────────────┘\n                           │\n   BigQuery MCP ──┐        │       ┌── Trino MCP\n    （数仓）       │        │       │  （联邦查询）\n                  └────────┼───────┘\n                           ▼\n              MCP Toolbox for Databases\n                  （一台 server 多源）\n                           │\n                           ▼\n                  ┌─── dbt \u002F SQLMesh ───┐\n                  │      （模型层）       │\n                  └──────────┬───────────┘\n                             │\n                             ▼\n                         dbt-mcp\n              （agent 读 manifest + 语义层）\n                             │\n                             ▼\n                    ┌──── Airflow ────┐\n                    │   （DAG 调度）    │\n                    └────────┬─────────┘\n                             │\n                             ▼\n                         DataHub\n                  （血缘 + 影响分析）\n                             │\n                             ▼\n                         SQLGlot\n             （agent 写错方言时自动转译）\n```\n\n关键连接是 **MCP 层 + dbt-mcp**：数仓 MCP 暴露物理 schema，dbt-mcp 在上面暴露逻辑模型层。两个都接上之后，agent 才能回答「这个字段是哪个模型产的，底层 SQL 长啥样」 — 这个问题答得出来，agent 才从玩具变成队友。\n\n## 你会遇到的取舍\n\n- **Airflow vs Dagster vs Prefect** — Airflow 部署量最大、招人最容易，所以你在大公司当独立数据人多半会继承一套 Airflow。Dagster 资产语义更干净、开发体验显著更好。Prefect 最轻量。除非全新项目，默认 Airflow；agent 对三者都能用。\n- **dbt vs SQLMesh** — dbt 生态全（每个 BI 都集成）但增量模型容易翻车，列级血缘要付费 Cloud 或第三方。SQLMesh 自带虚拟环境、真血缘、语义版本，但社区小。新项目试 SQLMesh；老 dbt 项目稳定运行就别折腾。\n- **ClickHouse vs BigQuery vs Snowflake** — ClickHouse 实时分析、要低延迟、能容忍 schema 不灵活；BigQuery 在 GCP 想要零运维，代价是 join 慢；Snowflake 想要弹性算力、不在乎账单。agent 不关心你选哪个，但它写的 SQL 关心 — 这就是 SQLGlot 进这个 pack 的原因。\n- **单数仓 MCP vs MCP Toolbox** — MCP Toolbox 一台 server 多源、运维简单但爆炸半径大；分仓 MCP 隔离好但 service 多。先用 Toolbox 起步，等你需要更严格的访问边界再拆。\n\n## 常见踩坑\n\n- **LLM 对着真 schema 瞎编字段** — 典型失败是 agent 把 `customer_id` 编成 `cust_id`。补救办法是先把 MCP 接上、再让 agent 查 — 不是反过来。MCP 没接，agent 必猜；MCP 接好，agent 会读。\n- **从 Stack Overflow 粘来的 SQL 方言漂移** — agent 在 BigQuery 项目里写出 Postgres 味儿的 SQL（`SUBSTRING` 不是 `SUBSTR`、`||` 不是 `CONCAT`、window 语法对不上）。流水线里加一道 SQLGlot 转译，噪声直接降一个量级。\n- **过早把 dbt-mcp 当写工具用** — 先只读。让 agent 读你的 manifest、答问题、以 PR 形式提改动。还没看它稳定工作一周就给生产 dbt 项目开写权限，等着周二早上 200 个模型挂掉吧。\n- **不出事不上血缘** — DataHub 在没出事前看着像额外负担，等 agent 改了上游字段三个 dashboard 静默给出错数那天就晚了。出事之前装好，运维成本远小于事后排查成本。\n- **忘了开 PHI\u002FPII 防护** — BigQuery MCP 的保护模式不是摆设。只要数仓里有任何受监管数据、agent 又会被数据团队以外的人用到，第一天就把列级遮罩打开。别等合规工单来。",[106,109,112,115,118],{"q":107,"a":108},"10 个非装不可吗？最小可用子集是哪几个？","最小三件套：一个数仓 MCP（业务库是 Postgres 就 Postgres MCP Pro，GCP 上就 BigQuery MCP）、dbt-mcp 套在你现有 dbt 项目上、再加 SQLGlot 做方言转译。这三件到位 agent 就能读真 schema、懂你的模型、写错方言时自动救回来。Airflow \u002F DataHub \u002F SQLMesh 这些等栈长起来再加，不要预先全装。",{"q":110,"a":111},"为啥既要数仓 MCP 又要 MCP Toolbox — 不是重复吗？","略重复但有意为之。分仓 MCP（Postgres MCP Pro \u002F BigQuery MCP \u002F Trino MCP）控制粒度细、故障隔离好；MCP Toolbox 是一台声明式 server 前置多源。大多团队会同时跑：Toolbox 做读多探索、关键数仓（一般是生产 OLTP Postgres）用专门的 MCP 加保护开关。按你的运维偏好选。",{"q":113,"a":114},"dbt-mcp 究竟怎么减幻觉的 — 它给 agent 暴露了什么？","dbt-mcp 给 agent 提供结构化访问 dbt 项目产物的能力：模型定义、schema.yml 文档、语义层查询、可选编译后的 SQL。Agent 不再猜模型有哪些列，而是去读你的 manifest；不再编指标名，而是查语义层。回答锚定在你已经定义好的真实实体上 — LLM 查它没训练过的数据时，这就是全部的游戏。",{"q":116,"a":117},"全新项目是不是该跳过 dbt 直接上 SQLMesh？","可能可以。SQLMesh 在虚拟环境、列级血缘、增量模型这些 dbt 已知痛点上语义更干净。代价是生态：dbt 几乎和每个 BI 都集成、招人容易、社区巨大。小团队、自己会折腾、全新项目 — SQLMesh 是合理的赌注；要快速上手分析师、或要和现有工具链集成 — 继续 dbt，遇到坑再考虑 SQLMesh。",{"q":119,"a":120},"通过 MCP 把 agent 连到生产数仓真的安全吗？","可以做到安全，但默认值很关键。默认只读凭据。打开数仓 MCP 自带的安全特性（Postgres MCP Pro 的 safe SQL 模式、BigQuery MCP 的 PHI\u002FPII 保护模式）。dev 和 prod 目标分开，让 agent 不可能误写 prod。任何 DDL\u002FDML 都要人工确认。审计 agent 查了啥 — 数仓本来就在记日志，去看就行。风险不是零，但当作刚入职没拿到写权限的小工程师来管，风险可控。",{"@context":122,"@type":123,"name":13,"description":124,"numberOfItems":125,"inLanguage":99},"https:\u002F\u002Fschema.org","ItemList","十个精选资产，让 AI coding agent 真正拿到 schema、dbt 项目、数仓和编排的上下文，写出能 compile 的 SQL 和能过 test 的 dbt 模型。",10,[127,131,135],{"url":128,"anchor":129,"reason":130},"\u002Fzh\u002Fai-tools-for\u002Fdata","数据工程 AI 工具合集","浏览这 10 个资产所属的更大数据工程目录",{"url":132,"anchor":133,"reason":134},"\u002Fzh\u002Ftopics","其他主题 pack","MCP server \u002F 多 agent 框架 \u002F coding agent 等其他主题包",{"url":136,"anchor":137,"reason":138},"\u002Fzh\u002Ffeatured","TokRepo 精选资产","这 10 个资产是更大「agent-ready」精选目录的一部分",[140,144,148],{"claim":141,"source_name":142,"source_url":143},"dbt 是 SQL 优先的数据转换框架","dbt 官网","https:\u002F\u002Fwww.getdbt.com\u002F",{"claim":145,"source_name":146,"source_url":147},"Apache Airflow 是程序化工作流编排平台","Apache Airflow","https:\u002F\u002Fairflow.apache.org\u002F",{"claim":149,"source_name":150,"source_url":151},"SQLGlot 是无依赖的 SQL 解析、转译与优化器","SQLGlot GitHub","https:\u002F\u002Fgithub.com\u002Ftobymao\u002Fsqlglot",900,"2026-05-22T13:00:00Z"]