en un clic
gemini-caching
Reduce costs and latency with context caching - implicit and explicit cache management with TTL configuration
Menu
Reduce costs and latency with context caching - implicit and explicit cache management with TTL configuration
Production patterns, API key security, cost optimization, performance tuning, and monitoring
Execute Python code in Gemini's secure sandbox for data analysis, visualization, and file processing
Generate text embeddings for semantic search, RAG, and vector database integration
Implement robust error handling with retry logic, rate limiting, and circuit breaker patterns
Implement tool use with Gemini - function declarations, tool modes, parallel/compositional calling, and MCP integration
Implement Google Search grounding for real-time information with citation parsing and attribution handling
| name | gemini-caching |
| description | Reduce costs and latency with context caching - implicit and explicit cache management with TTL configuration |
| argument-hint | <caching scenario or content type> |
| allowed-tools | Read, Write, Bash(pip install, npm install, go get) |
Reduce costs and latency with context caching: $ARGUMENTS
You are a Gemini API specialist with expertise in:
| Type | How It Works | Savings | Control |
|---|---|---|---|
| Implicit | Automatic for repeated prefixes | Up to 50% | None |
| Explicit | Manual cache creation with TTL | Up to 75% | Full |
Gemini automatically caches repeated context prefixes. No configuration needed.
from google import genai
client = genai.Client()
# Large context that will be cached on first request
large_context = "..." * 10000 # Large document
# First request - full cost
response1 = client.models.generate_content(
model="gemini-2.5-flash",
contents=f"{large_context}\n\nSummarize the main points."
)
# Second request - cached prefix, reduced cost
response2 = client.models.generate_content(
model="gemini-2.5-flash",
contents=f"{large_context}\n\nWhat are the key takeaways?"
)
# Check cache usage in response
print(f"Cached tokens: {response2.usage_metadata.cached_content_token_count}")
Create persistent caches for long-term reuse with significant cost savings.
from google import genai
from google.genai.types import Content, Part
client = genai.Client()
# Create cache with large context
cache = client.caches.create(
model="gemini-2.5-flash",
contents=[
Content(
role="user",
parts=[Part(text="Here is a large document to analyze:\n\n" + large_document)]
)
],
display_name="project-documentation",
ttl="3600s" # 1 hour
)
print(f"Cache created: {cache.name}")
print(f"Expires: {cache.expire_time}")
print(f"Cached tokens: {cache.usage_metadata.total_token_count}")
# Generate using cached context
response = client.models.generate_content(
model="gemini-2.5-flash",
contents="What are the main topics covered?",
config=GenerateContentConfig(
cached_content=cache.name # Reference the cache
)
)
print(response.text)
print(f"Cached tokens used: {response.usage_metadata.cached_content_token_count}")
import { GoogleGenAI } from "@google/genai";
const ai = new GoogleGenAI({ apiKey: process.env.GOOGLE_API_KEY });
// Create cache
const cache = await ai.caches.create({
model: "gemini-2.5-flash",
contents: [
{ role: "user", parts: [{ text: largeDocument }] }
],
displayName: "project-docs",
ttl: "3600s"
});
console.log(`Cache: ${cache.name}`);
// Use cache
const response = await ai.models.generateContent({
model: "gemini-2.5-flash",
contents: "Summarize the document",
cachedContent: cache.name
});
console.log(response.text);
# Cache includes system instructions
cache = client.caches.create(
model="gemini-2.5-flash",
contents=[
Content(
role="user",
parts=[Part(text="Reference documentation:\n\n" + docs)]
)
],
system_instruction="You are a technical documentation expert. Always cite specific sections when answering.",
display_name="docs-with-system",
ttl="7200s"
)
# All requests using this cache inherit the system instruction
response = client.models.generate_content(
model="gemini-2.5-flash",
contents="How do I configure authentication?",
config=GenerateContentConfig(cached_content=cache.name)
)
from google.genai.types import FunctionDeclaration, Tool
# Define tools
search_docs = FunctionDeclaration(
name="search_docs",
description="Search documentation",
parameters={
"type": "object",
"properties": {"query": {"type": "string"}},
"required": ["query"]
}
)
# Cache with tools and context
cache = client.caches.create(
model="gemini-2.5-flash",
contents=[
Content(role="user", parts=[Part(text=large_codebase)])
],
tools=[Tool(function_declarations=[search_docs])],
display_name="codebase-with-tools",
ttl="3600s"
)
# List all caches
caches = client.caches.list()
for cache in caches:
print(f"{cache.name}: {cache.display_name}")
print(f" Expires: {cache.expire_time}")
print(f" Tokens: {cache.usage_metadata.total_token_count}")
cache = client.caches.get(name="caches/abc123")
print(f"Name: {cache.name}")
print(f"Model: {cache.model}")
print(f"Token count: {cache.usage_metadata.total_token_count}")
print(f"Create time: {cache.create_time}")
print(f"Expire time: {cache.expire_time}")
# Extend cache lifetime
updated_cache = client.caches.update(
name=cache.name,
ttl="7200s" # Extend to 2 hours
)
print(f"New expiration: {updated_cache.expire_time}")
# Delete when no longer needed
client.caches.delete(name=cache.name)
print("Cache deleted")
# Upload and cache large files
file = client.files.upload(file="large_document.pdf")
# Wait for processing
while file.state == "PROCESSING":
time.sleep(2)
file = client.files.get(name=file.name)
# Create cache with file reference
cache = client.caches.create(
model="gemini-2.5-flash",
contents=[
Content(
role="user",
parts=[
Part(file_data={"uri": file.uri, "mime_type": file.mime_type}),
Part(text="This is the document to analyze.")
]
)
],
display_name="pdf-analysis-cache",
ttl="3600s"
)
| Component | Cost (relative) |
|---|---|
| Standard input tokens | 1x |
| Cached input tokens | 0.25x (75% off) |
| Cache storage | Per token-hour |
def estimate_cache_savings(
input_tokens: int,
num_requests: int,
cache_hours: int,
price_per_1m_input: float = 0.075 # Example price
) -> dict:
"""Estimate cost savings from caching."""
# Without caching
no_cache_cost = (input_tokens / 1_000_000) * price_per_1m_input * num_requests
# With caching
first_request = (input_tokens / 1_000_000) * price_per_1m_input
cached_requests = (input_tokens / 1_000_000) * (price_per_1m_input * 0.25) * (num_requests - 1)
storage_cost = (input_tokens / 1_000_000) * 0.001 * cache_hours # Storage rate
cache_cost = first_request + cached_requests + storage_cost
return {
"without_cache": no_cache_cost,
"with_cache": cache_cost,
"savings": no_cache_cost - cache_cost,
"savings_percent": ((no_cache_cost - cache_cost) / no_cache_cost) * 100
}
# Example: 100K tokens, 50 requests, 2 hours
savings = estimate_cache_savings(100_000, 50, 2)
print(f"Savings: {savings['savings_percent']:.1f}%")
# Pattern 1: Document Q&A
doc_cache = client.caches.create(
model="gemini-2.5-flash",
contents=[Content(role="user", parts=[Part(text=document)])],
system_instruction="Answer questions based only on the provided document.",
display_name="doc-qa",
ttl="3600s"
)
# Pattern 2: Code Analysis
code_cache = client.caches.create(
model="gemini-2.5-flash",
contents=[Content(role="user", parts=[Part(text=codebase)])],
system_instruction="You are a code review expert.",
display_name="code-review",
ttl="7200s"
)
# Pattern 3: Multi-document Research
research_cache = client.caches.create(
model="gemini-2.5-flash",
contents=[
Content(role="user", parts=[Part(text=f"Document {i}:\n{doc}") for i, doc in enumerate(documents)])
],
display_name="research",
ttl="3600s"
)
| Use Case | Recommended TTL |
|---|---|
| Short session | 15-30 minutes |
| Work session | 1-2 hours |
| Daily use | 24 hours |
| Persistent reference | 7 days max |
def should_use_explicit_cache(token_count: int, expected_requests: int) -> bool:
"""Determine if explicit caching is worthwhile."""
if token_count < 32768:
return False # Below minimum
# Calculate break-even point
# Explicit caching saves 75% but has storage cost
# Generally worthwhile if > 3-4 requests expected
return expected_requests >= 4
| Pattern | Description |
|---|---|
| Document cache | Cache large documents for Q&A |
| Code cache | Cache codebase for analysis |
| Session cache | Cache conversation context |
| Reference cache | Cache static reference material |
For: $ARGUMENTS
Provide: