원클릭으로
llm-cost-optimization
Reduce LLM API and infrastructure costs through model selection, prompt caching, batching, caching, quantization, and self-hosting strategies. Track spend by team and model, set budgets, and implement cost-aware routing.
메뉴
Reduce LLM API and infrastructure costs through model selection, prompt caching, batching, caching, quantization, and self-hosting strategies. Track spend by team and model, set budgets, and implement cost-aware routing.
Orchestrate AI/ML pipelines for data ingestion, model training, batch inference, and RAG indexing using Prefect, Airflow, or Dagster. Build reliable, observable, and retriable workflows for production AI systems.
Implement multi-layer LLM caching with exact match, semantic similarity, and provider-side prompt caching. Reduce API costs by 30–70%, cut latency, and improve throughput using Redis, GPTCache, and provider caching APIs.
Design and operationalize SRE dashboards that surface reliability, latency, error, saturation, and capacity signals across services. Use when building observability views for SLOs, incident response, and executive reliability reporting.
Deploy ML models on Kubernetes with KServe (formerly KFServing) and NVIDIA Triton Inference Server. Includes canary deployments, autoscaling, model versioning, A/B testing, and GPU resource management for production model serving.
Deploy, manage, and optimize vector databases for AI applications. Covers Qdrant, Weaviate, pgvector, and Pinecone — collection management, indexing strategies, backup, and performance tuning for production RAG and semantic search workloads.
Harden OpenClaw self-hosted environments with baseline host controls, auth tightening, secret handling, network segmentation, and safe update/rollback workflows. Use when deploying OpenClaw in home labs, startups, or production-like local AI infrastructure.
| name | llm-cost-optimization |
| description | Reduce LLM API and infrastructure costs through model selection, prompt caching, batching, caching, quantization, and self-hosting strategies. Track spend by team and model, set budgets, and implement cost-aware routing. |
| license | MIT |
| metadata | {"author":"devops-skills","version":"1.0"} |
Cut LLM costs by 50–90% with the right combination of caching, model selection, prompt optimization, and self-hosting.
Use this skill when:
| Strategy | Typical Savings | Effort |
|---|---|---|
| Semantic caching | 20–50% | Low |
| Model right-sizing | 30–70% | Low |
| Prompt compression | 10–30% | Medium |
| Provider caching (prompt cache) | 10–25% | Low |
| Batching offline workloads | 50% (Batch API) | Medium |
| Self-hosting 7–8B models | 80–95% at scale | High |
| Quantization | 30–50% VRAM cost | Medium |
# Use LiteLLM's cost tracking (automatic per-model pricing)
import litellm
response = litellm.completion(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Hello"}],
)
cost = litellm.completion_cost(response)
print(f"Cost: ${cost:.6f}")
# Add custom cost callbacks
def log_cost(kwargs, completion_response, start_time, end_time):
cost = kwargs.get("response_cost", 0)
model = kwargs.get("model")
user = kwargs.get("user")
# Send to your analytics DB
db.record_cost(user=user, model=model, cost=cost)
litellm.success_callback = [log_cost]
# Route by task complexity — don't use GPT-4o for everything
def get_model_for_task(task_type: str) -> str:
routing = {
"classification": "gpt-4o-mini", # ~30× cheaper than gpt-4o
"summarization": "gpt-4o-mini",
"extraction": "gpt-4o-mini",
"simple_qa": "gpt-4o-mini",
"complex_reasoning": "gpt-4o",
"code_generation": "claude-sonnet-4-6",
"creative_writing": "claude-opus-4-6",
}
return routing.get(task_type, "gpt-4o-mini")
# Cost comparison (per 1M tokens, 2025 approx.)
# gpt-4o-mini: input $0.15 / output $0.60
# gpt-4o: input $2.50 / output $10.00
# claude-sonnet-4-6: input $3.00 / output $15.00
# llama-3.1-8b (self): ~$0.05–0.10 all-in (GPU amortized)
# Anthropic — cache long system prompts (saves 90% on cached tokens)
import anthropic
client = anthropic.Anthropic()
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=1024,
system=[
{
"type": "text",
"text": "You are a helpful assistant.",
},
{
"type": "text",
"text": open("large-context.txt").read(), # large doc
"cache_control": {"type": "ephemeral"}, # cache this!
}
],
messages=[{"role": "user", "content": "Summarize the key points."}],
)
# First call: full price. Subsequent calls: 90% discount on cached part.
print(f"Cache read tokens: {response.usage.cache_read_input_tokens}")
# OpenAI — prompt caching is automatic for repeated prefixes >1024 tokens
# No code change needed; check usage.prompt_tokens_details.cached_tokens
import json
from openai import OpenAI
client = OpenAI()
# Prepare batch requests
requests = [
{
"custom_id": f"task-{i}",
"method": "POST",
"url": "/v1/chat/completions",
"body": {
"model": "gpt-4o-mini",
"messages": [{"role": "user", "content": f"Classify: {text}"}],
"max_tokens": 50,
}
}
for i, text in enumerate(texts)
]
# Write JSONL file
with open("batch.jsonl", "w") as f:
for req in requests:
f.write(json.dumps(req) + "\n")
# Upload and create batch
batch_file = client.files.create(file=open("batch.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
input_file_id=batch_file.id,
endpoint="/v1/chat/completions",
completion_window="24h",
)
print(f"Batch ID: {batch.id}") # poll status with client.batches.retrieve(batch.id)
import hashlib
import json
import redis
import numpy as np
from sentence_transformers import SentenceTransformer
r = redis.Redis(host="localhost", port=6379)
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
SIMILARITY_THRESHOLD = 0.92
CACHE_TTL = 3600 * 24 # 24 hours
def cached_llm_call(prompt: str, llm_fn) -> str:
# 1. Exact match (free)
exact_key = f"exact:{hashlib.sha256(prompt.encode()).hexdigest()}"
if cached := r.get(exact_key):
return cached.decode()
# 2. Semantic match
query_vec = embed_model.encode(prompt)
cached_keys = r.keys("sem:*")
for key in cached_keys:
data = json.loads(r.get(key))
similarity = np.dot(query_vec, data["embedding"]) / (
np.linalg.norm(query_vec) * np.linalg.norm(data["embedding"])
)
if similarity >= SIMILARITY_THRESHOLD:
return data["response"]
# 3. Cache miss — call LLM
response = llm_fn(prompt)
# Store exact match
r.setex(exact_key, CACHE_TTL, response)
# Store semantic embedding
sem_key = f"sem:{hashlib.sha256(prompt.encode()).hexdigest()}"
r.setex(sem_key, CACHE_TTL, json.dumps({
"embedding": query_vec.tolist(),
"response": response,
"prompt": prompt,
}))
return response
# LLMLingua — compress long prompts by 3–20× with minimal quality loss
from llmlingua import PromptCompressor
compressor = PromptCompressor(
model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
device_map="cpu",
)
compressed = compressor.compress_prompt(
long_context,
ratio=0.5, # keep 50% of tokens
rank_method="longllmlingua",
)
print(f"Original: {len(long_context.split())} words")
print(f"Compressed: {len(compressed['compressed_prompt'].split())} words")
print(f"Savings: {compressed['saving']}")
def break_even_analysis(
monthly_api_spend_usd: float,
gpu_cost_per_hour_usd: float = 2.50, # e.g., A10G on AWS
utilization: float = 0.70, # 70% GPU utilization
) -> dict:
monthly_gpu_cost = gpu_cost_per_hour_usd * 24 * 30 * utilization
break_even = monthly_gpu_cost / monthly_api_spend_usd
recommendation = (
"Self-host now — strong ROI" if break_even < 0.5 else
"Self-host if traffic grows 2×" if break_even < 0.8 else
"Stick with API — not enough scale yet"
)
return {
"monthly_gpu_cost": f"${monthly_gpu_cost:.0f}",
"monthly_api_spend": f"${monthly_api_spend_usd:.0f}",
"gpu_as_pct_of_api": f"{break_even*100:.0f}%",
"recommendation": recommendation,
}
# Example: $5k/month on OpenAI, $2.50/hr A10G
print(break_even_analysis(5000))
# → gpu_cost ~$1,260/mo = 25% of API spend → self-host now
# Emit cost metrics to Prometheus
from prometheus_client import Counter, Histogram
llm_cost_total = Counter(
"llm_cost_usd_total",
"Total LLM spend in USD",
["model", "team", "task_type"],
)
llm_tokens_total = Counter(
"llm_tokens_total",
"Total tokens used",
["model", "token_type"], # token_type: prompt, completion, cached
)
def track_call(model, team, task_type, response):
cost = calculate_cost(model, response.usage)
llm_cost_total.labels(model=model, team=team, task_type=task_type).inc(cost)
llm_tokens_total.labels(model=model, token_type="prompt").inc(
response.usage.prompt_tokens)
llm_tokens_total.labels(model=model, token_type="completion").inc(
response.usage.completion_tokens)
gpt-4o-mini or claude-haiku for 80% of tasks — they're 10–30× cheaper.