[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"pack-detail-ai-cost-optimization-stack-en":3,"seo:pack:ai-cost-optimization-stack:en":102},{"code":4,"message":5,"data":6},200,"操作成功",{"pack":7},{"slug":8,"icon":9,"tone":10,"status":11,"status_label":12,"title":13,"description":14,"items":15,"install_cmd":101},"ai-cost-optimization-stack","💰","#F59E0B","new","New · this week","AI Cost Optimization — Token-Saving Engineering Stack","Ten picks for SaaS \u002F agent teams whose LLM bill is now a real line item — LiteLLM, OpenRouter, Manifest router, Portkey, Helicone cache, Cloudflare AI Gateway, LLMLingua compression, TokenCost calculator, LiteLLM cost dashboard, Fireworks fine-tune. Five layers: measure first, then cache, route, compress, fine-tune. 10–50% savings typical without quality loss.",[16,28,38,46,53,61,69,79,86,93],{"id":17,"uuid":18,"slug":19,"title":20,"description":21,"author_name":22,"view_count":23,"vote_count":24,"lang_type":25,"type":26,"type_label":27},2789,"0f113965-1adc-4435-982b-fb613fa4d157","litellm-proxy-unified-gateway-for-100-llm-apis","LiteLLM Proxy — Unified Gateway for 100+ LLM APIs","LiteLLM Proxy maps 100+ LLM providers (Anthropic, OpenAI, Bedrock, Vertex) to one OpenAI-compatible endpoint. Auth, rate limit, cost track, fallbacks.","LiteLLM (BerriAI)",92,0,"en","agent","Agent",{"id":29,"uuid":30,"slug":31,"title":32,"description":33,"author_name":34,"view_count":35,"vote_count":24,"lang_type":25,"type":36,"type_label":37},2829,"7bb772b3-1ab0-4d27-a758-1cd9acc4f6ff","openrouter-unified-api-for-300-llms-with-auto-failover","OpenRouter — Unified API for 300+ LLMs with Auto Failover","OpenRouter is one OpenAI-compatible endpoint for 300+ LLMs across 60+ providers. Transparent pricing, no markup, automatic failover when a route is down.","OpenRouter",96,"skill","Skill",{"id":39,"uuid":40,"slug":41,"title":42,"description":43,"author_name":44,"view_count":45,"vote_count":24,"lang_type":25,"type":36,"type_label":37},863,"15266cba-33d7-11f1-9bc6-00163e2b0d79","manifest-smart-llm-router-cuts-costs-70-15266cba","Manifest — Smart LLM Router That Cuts Costs 70%","Intelligent LLM routing that scores requests across 23 dimensions in under 2ms. Routes to the cheapest capable model among 300+ options from 13+ providers. MIT, 4,200+ stars.","AI Open Source",185,{"id":47,"uuid":48,"slug":49,"title":50,"description":51,"author_name":44,"view_count":52,"vote_count":24,"lang_type":25,"type":36,"type_label":37},308,"585d3a26-0cca-47cb-ac88-2797a853367d","portkey-ai-gateway-route-250-llms-585d3a26","Portkey AI Gateway — Route to 250+ LLMs","Portkey AI Gateway routes to 250+ LLMs with sub-1ms latency, 40+ guardrails, retries, fallbacks, and caching. 11.1K+ stars. Apache 2.0.",143,{"id":54,"uuid":55,"slug":56,"title":57,"description":58,"author_name":59,"view_count":60,"vote_count":24,"lang_type":25,"type":36,"type_label":37},2817,"5d1acc2e-f42d-4fce-aec7-771506f858ae","helicone-cache-cut-llm-spend-with-drop-in-response-caching","Helicone Cache — Cut LLM Spend with Drop-In Response Caching","Helicone Cache short-circuits identical LLM requests at the proxy. Set Helicone-Cache-Enabled header, exact-match responses come back in ms at zero cost.","Helicone",112,{"id":62,"uuid":63,"slug":64,"title":65,"description":66,"author_name":67,"view_count":68,"vote_count":24,"lang_type":25,"type":36,"type_label":37},624,"b1962c77-9ecf-4a84-87b1-e7d4b677dabe","cloudflare-ai-gateway-llm-proxy-cache-analytics-b1962c77","Cloudflare AI Gateway — LLM Proxy, Cache & Analytics","Free proxy gateway for LLM API calls with caching, rate limiting, cost tracking, and fallback routing across providers. Reduce costs up to 95% with response caching. 7,000+ stars.","Cloudflare",173,{"id":70,"uuid":71,"slug":72,"title":73,"description":74,"author_name":75,"view_count":76,"vote_count":24,"lang_type":25,"type":77,"type_label":78},862,"1510da0c-33d7-11f1-9bc6-00163e2b0d79","llmlingua-compress-prompts-20x-minimal-loss-1510da0c","LLMLingua — Compress Prompts 20x with Minimal Loss","Microsoft research tool for prompt compression. Reduce token usage up to 20x while maintaining LLM performance. Solves lost-in-the-middle for RAG. MIT, 6,000+ stars.","Script Depot",249,"prompt","Prompt",{"id":80,"uuid":81,"slug":82,"title":83,"description":84,"author_name":22,"view_count":85,"vote_count":24,"lang_type":25,"type":36,"type_label":37},2791,"72b2e16c-71b4-4702-87ed-f6ea3ba99f69","litellm-cost-tracking-per-project-llm-spend-dashboard","LiteLLM Cost Tracking — Per-Project LLM Spend Dashboard","LiteLLM ships a built-in cost dashboard. Track LLM spend by project, user, model, tag. Hard budgets that block at the proxy. SOC2 \u002F SSO via Pro tier.",75,{"id":87,"uuid":88,"slug":89,"title":90,"description":91,"author_name":75,"view_count":92,"vote_count":24,"lang_type":25,"type":36,"type_label":37},859,"43b26691-33ce-11f1-9bc6-00163e2b0d79","tokencost-llm-price-calculator-400-models-43b26691","TokenCost — LLM Price Calculator for 400+ Models","Client-side token counting and USD cost estimation for 400+ LLMs. 3 lines of Python to track prompt and completion costs. Supports OpenAI, Anthropic, Mistral, AWS Bedrock. MIT, 2K+ stars.",181,{"id":94,"uuid":95,"slug":96,"title":97,"description":98,"author_name":99,"view_count":100,"vote_count":24,"lang_type":25,"type":36,"type_label":37},2850,"2f07f6a8-78ac-480a-b7a4-00282133dd4d","fireworks-fine-tuning-serverless-lora-on-llama-in-30-min","Fireworks Fine-Tuning — Serverless LoRA on Llama in 30 min","Fireworks runs serverless LoRA fine-tuning on Llama, Qwen, Mixtral. Upload JSONL, get a deployed fine-tune in 30 min on the same endpoint.","Fireworks AI",49,"tokrepo install pack\u002Fai-cost-optimization-stack",{"pageType":103,"pageKey":8,"locale":25,"title":13,"metaDescription":104,"h1":13,"tldr":105,"bodyMarkdown":106,"faq":107,"schema":123,"internalLinks":173,"citations":186,"wordCount":199,"generatedAt":200},"pack","LiteLLM, OpenRouter, Manifest, Portkey, Helicone cache, Cloudflare AI Gateway, LLMLingua, TokenCost, Fireworks fine-tune — five-layer stack that typically cuts LLM bills 10–50% without hurting quality. Install via TokRepo.","Ten assets for the SaaS \u002F agent team whose LLM invoice is now a real line on the P&L. Five layers in order: measure (TokenCost, LiteLLM cost dashboard), cache (Helicone, Cloudflare AI Gateway), route (LiteLLM, OpenRouter, Manifest, Portkey), compress (LLMLingua), fine-tune (Fireworks LoRA). Most teams that follow this order land between 10% and 50% savings without measurable quality loss; the upper end is reserved for high-cache-hit chat support and high-volume classification, not greenfield agent reasoning.","## What's in this pack\n\nWhen the monthly LLM invoice crosses five figures, every engineer suddenly has an opinion about caching. This pack is the boring, ordered playbook that actually moves the number: measure before you optimize, cache before you route, route before you compress, compress before you fine-tune. Most teams that touch every layer in order book 10–50% savings without users noticing — the upper end shows up in support chat (high cache hit) and bulk classification (small fine-tuned models), not in greenfield agent reasoning where you should be careful.\n\n| # | Asset | Layer | What it does |\n|---|---|---|---|\n| 1 | LiteLLM Proxy | router | one OpenAI-compatible endpoint to 100+ providers, fallback chains, per-key budgets |\n| 2 | OpenRouter Unified API | router | hosted gateway over 300+ models, single key, automatic failover |\n| 3 | Manifest Smart Router | router | semantic routing — cheap model first, escalate only when confidence is low |\n| 4 | Portkey AI Gateway | router | enterprise gateway, 250+ LLMs, virtual keys, guardrails, caching |\n| 5 | Helicone Cache | cache | drop-in response cache via proxy header; deterministic and semantic modes |\n| 6 | Cloudflare AI Gateway | cache | edge-level proxy with cache, analytics, retries, rate limits — free tier |\n| 7 | LLMLingua | compression | up to 20× prompt compression with minimal task-quality loss |\n| 8 | LiteLLM Cost Dashboard | monitoring | per-project, per-user, per-model spend tracking with hard budget blocks |\n| 9 | TokenCost | monitoring | offline calculator for 400+ models — sanity-check estimates before shipping |\n| 10 | Fireworks Serverless LoRA | fine-tune | serverless LoRA on Llama in 30 minutes — replace a frontier model for a narrow task |\n\n## Install in this order — measure first, fine-tune last\n\n```bash\n# Layer 1: measure (do this before changing anything)\ntokrepo install tokencost                  # offline price model\ntokrepo install litellm-cost-tracking      # live per-project dashboard\n\n# Layer 2: cache (highest ROI, lowest risk for repetitive workloads)\ntokrepo install helicone-cache             # drop-in response cache\n# or: tokrepo install cloudflare-ai-gateway  # edge cache, free tier\n\n# Layer 3: route (cheap-model-first with safe escalation)\ntokrepo install litellm-proxy              # self-hosted, BYOK\n# or: tokrepo install openrouter-unified-api  # hosted, 300+ models\ntokrepo install manifest-smart-router      # semantic router on top\n\n# Layer 4: compress (only after layers 1–3 are baselined)\ntokrepo install llmlingua                  # 2–20× prompt compression\n\n# Layer 5: fine-tune (last resort — costs engineering time)\ntokrepo install fireworks-fine-tune        # LoRA on Llama, narrow task\n```\n\nThe order matters more than the picks. Skip ahead and you'll either burn engineering time fine-tuning a model that a cache would have replaced, or you'll silently degrade quality with prompt compression you can't attribute because you never instrumented the baseline. The unglamorous truth: the biggest savings usually come from layer 2 (cache) and layer 3 (route), not the fashionable layer 5.\n\n**Layer 1 — measure.** Install TokenCost as a library so every PR prints before\u002Fafter token math on staging. Install LiteLLM cost tracking (or Portkey) so production has a per-call ledger by project \u002F user \u002F model. Don't move on until you can answer 'what does one user session cost' to two significant figures.\n\n**Layer 2 — cache.** Helicone gives you exact-match caching via a single proxy header and semantic caching via embedding similarity. Cloudflare AI Gateway gives you the same at the edge with a free tier. For chatbots, FAQ, RAG Q&A, and idempotent classification, hit rates of 30–70% are realistic. For agent planning loops and creative generation, they are not — don't try.\n\n**Layer 3 — route.** LiteLLM Proxy is the self-hosted default — one OpenAI-compatible URL maps to Anthropic, Bedrock, Vertex, OpenAI, with failover chains and per-key budgets. OpenRouter is the hosted equivalent. Manifest sits on top to classify prompts and route cheap models first, escalating only on low confidence. Portkey adds enterprise features (SSO, audit, virtual keys, guardrails).\n\n**Layer 4 — compress.** LLMLingua compresses prompts up to 20× by token-level importance scoring. Quality loss depends on task: tolerated on summarization, extraction, and classification; risky on math, code generation, and complex reasoning. Always A\u002FB against an eval suite. Treat compression ratio as a budget, not a target.\n\n**Layer 5 — fine-tune.** Fireworks serverless LoRA on Llama replaces a frontier model for one narrow, high-volume task in roughly 30 minutes of training. Worth doing when you have ≥10k labeled or LLM-generated examples, the task is stable, and the frontier bill on that one task justifies the engineering time. Don't fine-tune to save 5% on a low-volume endpoint.\n\n## How they fit together\n\n```\nclient app\n   │\n   ▼\n┌──────────────┐  cache hit   ┌────────────┐\n│ Helicone \u002F   │─────────────▶│ cached     │\n│ Cloudflare   │              │ response   │\n│ AI Gateway   │              └────────────┘\n└──────┬───────┘ cache miss\n       │\n       ▼\n┌──────────────────┐  classify  ┌─────────────────┐\n│ Manifest router  │───────────▶│ cheap model     │\n│ (semantic)       │            │ (Llama \u002F Haiku) │\n└──────┬───────────┘            └─────────────────┘\n       │ low confidence \u002F escalate\n       ▼\n┌─────────────────┐  optional   ┌──────────────┐\n│ LiteLLM \u002F       │────────────▶│ LLMLingua    │\n│ OpenRouter \u002F    │  compress   │ pre-compress │\n│ Portkey gateway │             └──────┬───────┘\n└──────┬──────────┘                    │\n       │                               │\n       ▼                               ▼\n  frontier model (Opus \u002F GPT-4 \u002F Gemini Ultra)\n       │\n       ▼\nLiteLLM cost ledger + TokenCost reconciliation\n```\n\n## Tradeoffs (the honest part)\n\n- **Cache hit rate vs freshness.** 70% hit on support chat is a fortune; 70% on a stock-price assistant is a customer-trust disaster. Set TTL per route, not globally.\n- **Router latency overhead.** A semantic router adds 50–200 ms (embedding + classification). Invisible on chat, visible on streaming voice. Measure end-to-end p95 before and after.\n- **Compression quality loss.** LLMLingua at 5× is mostly free on summarization; at 20× it starts dropping facts on extraction. Pair every rollout with a held-out eval set.\n- **Cheap-model misrouting.** Routing a math problem to Haiku because the router thought it was 'simple Q&A' is a silent regression that surfaces a week later. Log routing decisions with the trace, review bottom-decile confidence weekly.\n- **Fine-tune lock-in.** A LoRA against Llama 3.1 70B is yours to host anywhere; a fine-tune on a proprietary model isn't. Pick the base deliberately.\n- **Observability isn't free either.** Break-even is usually around 1M calls\u002Fmonth — below that, the free tiers are fine.\n\n## Common pitfalls\n\n- **Optimizing before measuring.** Engineers fine-tune 'because GPT-4 is expensive' without ever instrumenting the top-spend endpoint. Eight times out of ten the bill is one feature, not the whole product.\n- **Caching private content by mistake.** A semantic cache keyed on prompt text alone will serve user A's medical chat to user B if queries embed similarly. Always scope cache key by user \u002F tenant \u002F auth-context.\n- **Routing the wrong task to the wrong model.** Tool use and JSON-mode structured output break on many cheap models. Run the routing classifier against real production traffic distribution before rollout.\n- **Confusing 'tokens saved' with 'dollars saved'.** Input tokens are 3–5× cheaper than output tokens at most providers. Track dollars, not tokens.\n- **Treating cost optimization as one-shot.** Provider prices change monthly; new cheaper models ship quarterly. Re-run the routing benchmark every quarter.\n- **Skipping the eval gate.** Every layer 3–5 change must ship behind an eval suite. 'Save 20% on tokens, lose 4% on accuracy' is rarely the trade you wanted.\n\n## Pair with these packs\n\nThis pack is the *cost* layer. Pair with **Agent Observability + Tracing** for debugging — you cannot optimize spend you can't attribute to a span. Pair with **LLM Eval & Guardrails** so every routing and compression change ships behind a quality gate. Pair with **Vector DB + RAG** if retrieval context is what is making prompts long; sometimes the cheapest token is the one you don't send.",[108,111,114,117,120],{"q":109,"a":110},"How much can I actually save with this stack?","10–50% is the honest range for most production workloads, and it's heavily workload-dependent. The lower end (10–20%) is what you get from layer 2–3 alone on a typical mixed-traffic API. The upper end (40–50%) shows up in two specific shapes: high-cache-hit chat \u002F FAQ \u002F support workloads where Helicone or Cloudflare AI Gateway catches the long tail of repeats, and high-volume narrow tasks (classification, extraction) where a fine-tuned small model replaces a frontier call. Anyone quoting '70%+' without naming the workload is either selling you something or has a specific case (e.g. 95% cache hit on a stable bot) that won't generalize. Measure your own baseline first.",{"q":112,"a":113},"Is semantic caching safe for private or multi-tenant data?","Only if you scope the cache key correctly. The default Helicone \u002F GPTCache \u002F Cloudflare semantic cache keys on prompt content; if user A asks 'what's my balance' and user B asks the same phrasing, embeddings will match and the cache will serve A's answer to B. Always add user_id, tenant_id, or auth-context to the cache key, and never cache content that contains PII in the response body. For regulated industries (health, finance) keep semantic caching off the user-data path entirely and only cache system-side things like documentation lookups and tool descriptions.",{"q":115,"a":116},"When is fine-tuning actually worth the engineering time?","Fine-tuning pays off when three things are true: (1) the task is stable — you're not still iterating on the prompt every week, (2) you have ≥10k labeled examples or can generate them from frontier model traces, and (3) the frontier-model bill on that single task alone justifies 1–2 engineer-weeks plus ongoing eval cost. Classic wins: PII extraction, intent classification, structured-data extraction from semi-structured docs, in-domain summarization. Classic losses: 'general agent reasoning', 'creative writing', anything where the prompt or task definition is still moving. Fireworks serverless LoRA on Llama keeps your weights portable — pick that over a closed fine-tune unless you have a specific reason.",{"q":118,"a":119},"OpenRouter vs LiteLLM — which one should I pick?","OpenRouter is the hosted answer: one API key, 300+ models, automatic failover, you pay them a small markup and they handle the multi-provider plumbing. LiteLLM is the self-hosted answer: you run the proxy (or use it as a Python library), bring your own provider keys, and pay only the underlying model cost. Pick OpenRouter if you want one bill, fast time-to-value, and don't want to operate a proxy. Pick LiteLLM if you have direct provider contracts (often cheaper at scale), care about data sovereignty, want a per-project cost dashboard, or are already running infra. Many teams use both: LiteLLM for production critical paths, OpenRouter for prototyping and rare-model access.",{"q":121,"a":122},"What's the cheapest way to start monitoring cost?","Start with TokenCost — it's a free offline library that handles 400+ models and lets you print before\u002Fafter estimates in any script or PR. For production, the cheapest live option is Cloudflare AI Gateway's free tier (cache + analytics + per-model breakdown, no SDK install — you just point your base_url at it) or self-hosted Langfuse \u002F Helicone open source. If you're already on LiteLLM Proxy, its built-in cost tracking is the path of least resistance — same proxy, no extra service. Hosted Helicone, Portkey, and Datadog LLM Observability are all good but only worth paying for once you're above roughly 1M calls\u002Fmonth.",{"@context":124,"@type":125,"name":13,"description":126,"numberOfItems":127,"inLanguage":25,"itemListElement":128,"publisher":169},"https:\u002F\u002Fschema.org","ItemList","Ten assets across five layers (measure, cache, route, compress, fine-tune) for SaaS and agent teams cutting LLM spend 10–50% without quality loss.",10,[129,134,138,142,146,150,154,158,162,166],{"@type":130,"position":131,"name":132,"url":133},"ListItem",1,"LiteLLM Proxy","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Flitellm-proxy-unified-gateway-for-100-llm-apis-0f113965",{"@type":130,"position":135,"name":136,"url":137},2,"OpenRouter Unified API","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fopenrouter-unified-api-for-300-llms-with-auto-failover-7bb772b3",{"@type":130,"position":139,"name":140,"url":141},3,"Manifest Smart LLM Router","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fmanifest-smart-llm-router-cuts-costs-70-15266cba",{"@type":130,"position":143,"name":144,"url":145},4,"Portkey AI Gateway","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fportkey-ai-gateway-route-250-llms-585d3a26",{"@type":130,"position":147,"name":148,"url":149},5,"Helicone Cache","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fhelicone-cache-cut-llm-spend-with-drop-in-response-caching-5d1acc2e",{"@type":130,"position":151,"name":152,"url":153},6,"Cloudflare AI Gateway","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fcloudflare-ai-gateway-llm-proxy-cache-analytics-b1962c77",{"@type":130,"position":155,"name":156,"url":157},7,"LLMLingua","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Fllmlingua-compress-prompts-20x-minimal-loss-1510da0c",{"@type":130,"position":159,"name":160,"url":161},8,"LiteLLM Cost Dashboard","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Flitellm-cost-tracking-per-project-llm-spend-dashboard-72b2e16c",{"@type":130,"position":163,"name":164,"url":165},9,"TokenCost Calculator","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Ftokencost-llm-price-calculator-400-models-43b26691",{"@type":130,"position":127,"name":167,"url":168},"Fireworks Serverless LoRA Fine-Tune","https:\u002F\u002Ftokrepo.com\u002Fen\u002Fworkflows\u002Ffireworks-fine-tuning-serverless-lora-on-llama-in-30-min-2f07f6a8",{"@type":170,"name":171,"url":172},"Organization","TokRepo","https:\u002F\u002Ftokrepo.com",[174,178,182],{"url":175,"anchor":176,"reason":177},"\u002Fen\u002Fpacks\u002Fagent-observability-tracing","Agent Observability + Tracing","the debugging layer — cost attribution requires span-level traces",{"url":179,"anchor":180,"reason":181},"\u002Fen\u002Fpacks\u002Fllm-observability","LLM Observability","runtime dashboard for latency, cost, and version trends that complements this pack's optimization tools",{"url":183,"anchor":184,"reason":185},"\u002Fen\u002Fpacks\u002Fml-engineer-rag-eval","ML Engineer RAG + Eval","ship every routing and compression change behind an eval gate so 'cheaper' doesn't quietly mean 'worse'",[187,191,195],{"claim":188,"source_name":189,"source_url":190},"LiteLLM proxy maps 100+ LLM providers to one OpenAI-compatible endpoint with cost tracking and fallbacks","BerriAI\u002Flitellm","https:\u002F\u002Fgithub.com\u002FBerriAI\u002Flitellm",{"claim":192,"source_name":193,"source_url":194},"LLMLingua achieves up to 20× prompt compression with minimal performance loss","microsoft\u002FLLMLingua","https:\u002F\u002Fgithub.com\u002Fmicrosoft\u002FLLMLingua",{"claim":196,"source_name":197,"source_url":198},"Helicone offers drop-in response caching and per-call cost analytics via proxy header","Helicone docs","https:\u002F\u002Fdocs.helicone.ai\u002Ffeatures\u002Fadvanced-usage\u002Fcaching",1290,"2026-05-23T12:00:00Z"]