mit einem Klick
inspect-scout
// Analyze AI agent transcripts using Inspect Scout scanners, grep patterns, and LLM-based analysis
// Analyze AI agent transcripts using Inspect Scout scanners, grep patterns, and LLM-based analysis
| name | inspect-scout |
| description | Analyze AI agent transcripts using Inspect Scout scanners, grep patterns, and LLM-based analysis |
| user-invocable | false |
Inspect Scout is a library for analyzing AI agent transcripts with parallel processing and visualization capabilities. Use this when analyzing evaluation logs for patterns, behaviors, or specific events.
Installation: pip install inspect-scout
Source: https://github.com/meridianlabs-ai/inspect_scout
A Transcript represents an LLM conversation to analyze (e.g., an agent rollout or sample from an Inspect eval). It contains:
class Transcript(TranscriptInfo):
# Identity
transcript_id: str # Globally unique ID (e.g., sample uuid)
source_type: str | None # Type of source (e.g., "eval_log")
source_id: str | None # Source ID (e.g., eval_id)
source_uri: str | None # URI for source data (e.g., log file path)
# Task info
task_set: str | None # Benchmark name
task_id: str | None # Dataset sample ID
task_repeat: int | None # Epoch number
# Agent/model info
agent: str | None # Agent used
model: str | None # Main model
model_options: dict | None # Generation options
# Results
score: JsonValue | None # Task score
success: bool | None # Success/failure boolean
error: str | None # Error message if failed
limit: str | None # Limit that caused exit (tokens, messages, etc.)
# Metrics
message_count: int | None
total_time: float | None
total_tokens: int | None
# Content
messages: list[ChatMessage] # Conversation messages
events: list[Event] # Events from transcript
metadata: dict[str, Any] # Additional metadata
Scanners are functions that analyze transcript content and return Result objects. Decorated with @scanner to specify what content to scan.
from inspect_scout import scanner, Scanner, Result
@scanner(messages="all") # Scan all messages
def my_scanner() -> Scanner[Transcript]:
async def scan(transcript: Transcript) -> Result:
# Analyze transcript
return Result(value=True, explanation="Found pattern")
return scan
@scanner(messages=["assistant"]) # Only assistant messages
def assistant_scanner() -> Scanner[Transcript]:
...
@scanner(events=["tool"]) # Only tool events
def tool_scanner() -> Scanner[ToolEvent]:
...
@scanner(messages=["user", "assistant"], events=["tool", "error"])
def combined_scanner() -> Scanner[Transcript]:
...
class Result(BaseModel):
uuid: str | None # Unique identifier
value: JsonValue # The scan result value
answer: str | None # Answer extracted from model output
explanation: str | None # Explanation of result
metadata: dict | None # Additional metadata
references: list[Reference] # References to messages/events
label: str | None # Label for multi-result scanners
type: str | None # Type designation for value
Uses an LLM to analyze transcripts based on a question.
from inspect_scout import scanner, llm_scanner, Scanner
from inspect_scout._transcript.types import Transcript
@scanner(messages="all")
def did_agent_succeed() -> Scanner[Transcript]:
return llm_scanner(
question="Did the agent successfully complete its task?",
answer="boolean"
)
@scanner(messages="all")
def classify_failure() -> Scanner[Transcript]:
return llm_scanner(
question="Why did the agent fail?",
answer=["gave_up", "hit_limit", "wrong_approach", "environment_error", "other"]
)
@scanner(messages="all")
def rate_performance() -> Scanner[Transcript]:
return llm_scanner(
question="Rate the agent's performance from 1-10",
answer="numeric"
)
Answer types:
"boolean" - True/False"numeric" - Number"string" - Free textlist[str] - Single choice from labelsAnswerMultiLabel - Multi-label classificationAnswerStructured - Custom structured outputAdditional options:
llm_scanner(
question="...",
answer="boolean",
model="openai/gpt-4o", # Model to use
template="...", # Custom prompt template
template_variables={...}, # Extra template vars
preprocessor=..., # Message preprocessing
retry_refusals=3, # Retry on refusals
name="my_scanner" # Scanner name
)
Fast pattern-based scanning without LLM calls.
from inspect_scout import scanner, grep_scanner, Scanner
from inspect_scout._transcript.types import Transcript
@scanner(messages=["assistant"])
def find_secrets() -> Scanner[Transcript]:
return grep_scanner(["password", "secret", "token", "api_key"])
@scanner(messages="all")
def find_urls() -> Scanner[Transcript]:
return grep_scanner(r"https?://\S+", regex=True)
@scanner(messages="all", events=["tool"])
def categorize_errors() -> Scanner[Transcript]:
return grep_scanner({
"permission_denied": ["permission denied", "access denied"],
"not_found": ["not found", "no such file"],
"timeout": ["timeout", "timed out"],
})
Options:
grep_scanner(
pattern, # str, list[str], or dict[str, str|list[str]]
regex=False, # Treat as regex
ignore_case=True, # Case insensitive
word_boundary=False, # Match whole words only
name="my_grep" # Scanner name
)
Returns:
Result with value=count, explanation=context snippetslist[Result] with one per label# Basic scan
scout scan scanner.py -T ./logs --model openai/gpt-4o
# With transcript filtering
scout scan scanner.py -T ./logs -F "task_set='cybench'" --model openai/gpt-4o
# Multiple scanners
scout scan scanners/ -T ./logs --model openai/gpt-4o
# Specific scanner from file
scout scan scanner.py::my_scanner -T ./logs --model openai/gpt-4o
# Launch Scout View UI
scout view
# View specific scan results
scout view --scans ./scans/scan_id=abc123
# View remote results
scout view --scans s3://my-bucket/scans
# List scans
scout scan list
# Resume interrupted scan
scout scan resume "scans/scan_id=iGEYSF6N7J3AoxzQmGgrZs"
# Mark incomplete scan as complete
scout scan complete "scan_id"
scout scan scanner.py -T ./logs \
--max-transcripts 25 \ # Concurrent transcripts (default: 25)
--max-connections 10 \ # Concurrent API requests
--max-processes 4 # Parsing/scanning processes (default: 4)
from inspect_scout import transcripts_from, columns as c
# From local eval logs
transcripts = transcripts_from("./logs")
# From S3
transcripts = transcripts_from("s3://bucket/logs")
# With filtering
transcripts = transcripts_from("./logs")
transcripts = transcripts.where(c.task_set == "cybench")
transcripts = transcripts.where(c.model == "gpt-4")
transcripts = transcripts.where("score > 0.5") # SQL-style
transcripts = transcripts.limit(100)
transcripts = transcripts.shuffle(seed=42)
from inspect_scout import scan, ScanJob, scanjob
# Single scanner
await scan(
my_scanner(),
transcripts="./logs",
model="openai/gpt-4o"
)
# Multiple scanners via ScanJob
@scanjob
def analysis_job() -> ScanJob:
return ScanJob(
scanners=[
did_agent_succeed(),
classify_failure(),
find_secrets(),
],
transcripts="./logs",
model="openai/gpt-4o"
)
from inspect_scout import scan_results_df, scan_status
# Check scan status
status = scan_status("scans/scan_id=abc123")
print(status.complete) # True/False
print(status.summary) # Counts, errors, etc.
# Get results as DataFrames
results = scan_results_df("scans/scan_id=abc123")
# Access by scanner name
df = results.scanners["my_scanner"]
# Available columns in result DataFrame:
# - transcript_id, source_id, source_uri
# - task_set, task_id, model, agent
# - score, success, error, limit
# - value, value_type, answer, explanation
# - message_references, event_references
# - scan_error, scan_total_tokens
# Row granularity options
results = scan_results_df("scans/...", rows="results") # One row per result
results = scan_results_df("scans/...", rows="transcripts") # One row per transcript
from inspect_scout import scan_results_db
db = scan_results_db("scans/scan_id=abc123")
# Use db.connection for SQL queries
Create scout.yaml for reusable configuration:
transcripts: ./logs
model: openai/gpt-4o
# Or with filters
transcripts:
path: s3://bucket/logs
where: "task_set = 'cybench'"
Then run: scout scan scanner.py (uses config automatically)
Convert messages to a string for LLM analysis:
from inspect_scout._scanner.extract import messages_as_str, MessagesPreprocessor
# Basic usage (in a scanner)
messages_str, extract_refs = await messages_as_str(transcript)
# With preprocessing
messages_str, extract_refs = await messages_as_str(
transcript,
preprocessor=MessagesPreprocessor(
exclude_system=True, # Remove system messages
exclude_reasoning=True, # Remove reasoning tokens
exclude_tool_calls=True, # Remove tool calls
),
include_ids=True # Include message IDs for references
)
Validate scanner results against ground truth:
# validation.csv
# id,target
# sample_123,true
# sample_456,false
from inspect_scout import scan
await scan(
my_scanner(),
transcripts="./logs",
validation="validation.csv"
)
Results include validation_target, validation_result, and validation_predicate columns.
@scanner(messages="all", events=["tool"])
def analyze_strategy() -> Scanner[Transcript]:
return llm_scanner(
question="""Analyze the agent's strategy:
1. What approach did it take?
2. Did it adapt when encountering obstacles?
3. What tools did it use most?""",
answer="string"
)
@scanner(messages="all", events=["tool", "error"])
def detect_errors() -> Scanner[Transcript]:
return grep_scanner({
"command_errors": ["command not found", "permission denied", "no such file"],
"api_errors": ["rate limit", "timeout", "connection refused"],
"logic_errors": ["assertion failed", "unexpected", "invalid"],
})
@scanner(messages="all")
def verify_success() -> Scanner[Transcript]:
return llm_scanner(
question="Did the agent retrieve the flag and output it correctly?",
answer="boolean"
)
from inspect_scout._llm_scanner import AnswerMultiLabel
@scanner(messages="all")
def classify_behaviors() -> Scanner[Transcript]:
return llm_scanner(
question="What behaviors did the agent exhibit?",
answer=AnswerMultiLabel(labels=[
"exploration",
"exploitation",
"backtracking",
"tool_chaining",
"giving_up"
])
)