ワンクリックで
gemini-live-api
Build real-time audio and video applications with WebSocket streaming, voice activity detection, and session management
メニュー
Build real-time audio and video applications with WebSocket streaming, voice activity detection, and session management
Production patterns, API key security, cost optimization, performance tuning, and monitoring
Reduce costs and latency with context caching - implicit and explicit cache management with TTL configuration
Execute Python code in Gemini's secure sandbox for data analysis, visualization, and file processing
Generate text embeddings for semantic search, RAG, and vector database integration
Implement robust error handling with retry logic, rate limiting, and circuit breaker patterns
Implement tool use with Gemini - function declarations, tool modes, parallel/compositional calling, and MCP integration
| name | gemini-live-api |
| description | Build real-time audio and video applications with WebSocket streaming, voice activity detection, and session management |
| argument-hint | <real-time use case> |
| allowed-tools | Read, Write, Bash(pip install, npm install, go get) |
Build real-time audio and video applications: $ARGUMENTS
You are a Gemini API specialist with expertise in:
The Live API enables real-time, multi-modal conversations with low-latency streaming.
| Feature | Specification |
|---|---|
| Protocol | WebSocket |
| Audio Input | 16-bit PCM, 16kHz, mono |
| Audio Output | 24kHz PCM |
| Video Input | JPEG frames, max 1 FPS |
| Latency | < 500ms typical |
import asyncio
import websockets
import json
from google import genai
async def live_session():
client = genai.Client()
# Create ephemeral token for secure connection
token = await client.live.create_token(model="gemini-2.5-flash")
uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key={token}"
async with websockets.connect(uri) as ws:
# Setup message
setup = {
"setup": {
"model": "models/gemini-2.5-flash",
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"]
}
}
}
await ws.send(json.dumps(setup))
# Wait for setup complete
response = await ws.recv()
print("Setup complete:", response)
# Send text message
message = {
"clientContent": {
"turns": [{
"role": "user",
"parts": [{"text": "Hello, how are you?"}]
}],
"turnComplete": True
}
}
await ws.send(json.dumps(message))
# Receive response
while True:
response = await ws.recv()
data = json.loads(response)
if "serverContent" in data:
parts = data["serverContent"].get("modelTurn", {}).get("parts", [])
for part in parts:
if "text" in part:
print("Response:", part["text"])
if data["serverContent"].get("turnComplete"):
break
asyncio.run(live_session())
const WebSocket = require('ws');
async function liveSession() {
const apiKey = process.env.GOOGLE_API_KEY;
const ws = new WebSocket(
`wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${apiKey}`
);
ws.on('open', () => {
// Setup
ws.send(JSON.stringify({
setup: {
model: "models/gemini-2.5-flash",
generationConfig: {
responseModalities: ["AUDIO", "TEXT"]
}
}
}));
});
ws.on('message', (data) => {
const response = JSON.parse(data);
if (response.setupComplete) {
// Send message after setup
ws.send(JSON.stringify({
clientContent: {
turns: [{
role: "user",
parts: [{ text: "Hello!" }]
}],
turnComplete: true
}
}));
}
if (response.serverContent) {
const parts = response.serverContent.modelTurn?.parts || [];
for (const part of parts) {
if (part.text) {
console.log("Response:", part.text);
}
}
}
});
}
liveSession();
import asyncio
import websockets
import json
import pyaudio
import base64
async def audio_stream():
# Audio setup
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
audio = pyaudio.PyAudio()
stream = audio.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK
)
uri = f"wss://generativelanguage.googleapis.com/ws/..."
async with websockets.connect(uri) as ws:
# Setup with audio
setup = {
"setup": {
"model": "models/gemini-2.5-flash",
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Aoede"
}
}
}
}
}
}
await ws.send(json.dumps(setup))
await ws.recv() # Wait for setup
# Stream audio
try:
while True:
audio_data = stream.read(CHUNK)
encoded = base64.b64encode(audio_data).decode()
message = {
"realtimeInput": {
"mediaChunks": [{
"mimeType": "audio/pcm;rate=16000",
"data": encoded
}]
}
}
await ws.send(json.dumps(message))
# Check for responses (non-blocking)
try:
response = await asyncio.wait_for(ws.recv(), timeout=0.01)
handle_response(json.loads(response))
except asyncio.TimeoutError:
pass
finally:
stream.stop_stream()
stream.close()
audio.terminate()
import base64
import wave
def handle_audio_response(data, output_file):
"""Handle incoming audio data."""
if "serverContent" in data:
parts = data["serverContent"].get("modelTurn", {}).get("parts", [])
for part in parts:
if "inlineData" in part:
audio_data = base64.b64decode(part["inlineData"]["data"])
# Audio is 24kHz PCM
output_file.writeframes(audio_data)
Enable automatic turn detection:
setup = {
"setup": {
"model": "models/gemini-2.5-flash",
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"],
"speechConfig": {
"voiceActivityDetection": {
"disabled": False,
"startSpeechThreshold": 0.5,
"endSpeechThreshold": 0.3,
"prefixPaddingMs": 300,
"suffixPaddingMs": 500
}
}
}
}
}
Send video frames for real-time analysis:
import cv2
import base64
async def video_stream(ws):
cap = cv2.VideoCapture(0) # Webcam
while True:
ret, frame = cap.read()
if not ret:
break
# Encode frame as JPEG
_, buffer = cv2.imencode('.jpg', frame)
encoded = base64.b64encode(buffer).decode()
message = {
"realtimeInput": {
"mediaChunks": [{
"mimeType": "image/jpeg",
"data": encoded
}]
}
}
await ws.send(json.dumps(message))
# Limit to 1 FPS
await asyncio.sleep(1)
cap.release()
| Voice Name | Description |
|---|---|
| Aoede | Bright, expressive |
| Charon | Deep, authoritative |
| Fenrir | Warm, friendly |
| Kore | Clear, professional |
| Puck | Energetic, youthful |
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Aoede"
}
}
}
For secure client-side connections:
# Server-side: Generate token
token = await client.live.create_token(
model="gemini-2.5-flash",
ttl_seconds=300 # 5 minutes
)
# Client uses token instead of API key
uri = f"wss://...?token={token}"
# Send interrupt to stop current response
interrupt_message = {
"clientContent": {
"interrupt": True
}
}
await ws.send(json.dumps(interrupt_message))
# Enable session state for multi-turn
setup = {
"setup": {
"model": "models/gemini-2.5-flash",
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"]
},
"sessionConfig": {
"enableServerState": True
}
}
}
import asyncio
import websockets
import json
import pyaudio
import base64
from threading import Thread
from queue import Queue
class LiveConversation:
def __init__(self, api_key):
self.api_key = api_key
self.audio_queue = Queue()
self.running = False
async def connect(self):
uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key={self.api_key}"
async with websockets.connect(uri) as ws:
self.ws = ws
self.running = True
# Setup
await ws.send(json.dumps({
"setup": {
"model": "models/gemini-2.5-flash",
"generationConfig": {
"responseModalities": ["AUDIO", "TEXT"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {"voiceName": "Aoede"}
}
}
}
}
}))
await ws.recv() # Setup complete
# Start tasks
await asyncio.gather(
self.send_audio(),
self.receive_responses()
)
async def send_audio(self):
"""Stream microphone audio."""
audio = pyaudio.PyAudio()
stream = audio.open(
format=pyaudio.paInt16,
channels=1,
rate=16000,
input=True,
frames_per_buffer=1024
)
try:
while self.running:
data = stream.read(1024, exception_on_overflow=False)
encoded = base64.b64encode(data).decode()
await self.ws.send(json.dumps({
"realtimeInput": {
"mediaChunks": [{
"mimeType": "audio/pcm;rate=16000",
"data": encoded
}]
}
}))
await asyncio.sleep(0.01)
finally:
stream.stop_stream()
stream.close()
audio.terminate()
async def receive_responses(self):
"""Handle incoming responses."""
while self.running:
try:
response = await self.ws.recv()
data = json.loads(response)
if "serverContent" in data:
parts = data["serverContent"].get("modelTurn", {}).get("parts", [])
for part in parts:
if "text" in part:
print(f"AI: {part['text']}")
if "inlineData" in part:
# Queue audio for playback
audio_data = base64.b64decode(part["inlineData"]["data"])
self.audio_queue.put(audio_data)
except websockets.exceptions.ConnectionClosed:
break
def stop(self):
self.running = False
# Usage
conversation = LiveConversation(api_key="your-api-key")
asyncio.run(conversation.connect())
| Model | Live API Support |
|---|---|
| Gemini 3 Pro | Yes |
| Gemini 3 Flash | Yes |
| Gemini 2.5 Pro | Yes |
| Gemini 2.5 Flash | Yes |
| Gemini 2.5 Flash-Lite | No |
| Pattern | Description |
|---|---|
| Voice assistant | Audio in -> Audio out |
| Video description | Video + Audio -> Audio narration |
| Real-time translation | Audio in -> Translated audio out |
| Interactive demo | Multi-modal conversation |
For: $ARGUMENTS
Provide: