| name | Vizra ADK Evaluation Framework |
| description | Test and evaluate AI agents with automated evaluations, assertions, and LLM-as-a-Judge patterns |
Vizra ADK Evaluation Framework
The evaluation framework enables automated testing of AI agents at scale, including LLM-as-a-Judge evaluation patterns.
Core Concepts
| Component | Purpose |
|---|---|
| Evaluation | Test suite containing test cases for an agent |
| Test Case | Input/expected output pair for testing |
| Assertion | Validation rule for agent responses |
| Judge | LLM-based evaluation of response quality |
Creating Evaluations
Basic Evaluation
<?php
namespace App\Evaluations;
use Vizra\VizraADK\Evaluations\BaseEvaluation;
/**
 * Evaluation suite targeting the `customer_service` agent.
 *
 * Defines three scenarios (a greeting, a refund request and a complaint),
 * each pairing one input message with the assertion names to apply to it.
 */
class CustomerServiceEvaluation extends BaseEvaluation
{
    protected string $agent = 'customer_service';

    protected string $description = 'Evaluates customer service agent responses';

    /**
     * Build the list of test cases in the order: greeting, refund, complaint.
     *
     * @return array<int, array<string, mixed>>
     */
    public function testCases(): array
    {
        // Plain opening message; only input and assertions are supplied.
        $greetingCase = [
            'name' => 'greeting_response',
            'input' => 'Hello, I need help with my order',
            'assertions' => [
                'contains_greeting',
                'offers_assistance',
                'professional_tone',
            ],
        ];

        // Refund scenario; additionally carries order details under `context`.
        $refundCase = [
            'name' => 'refund_request',
            'input' => 'I want a refund for order #12345',
            'context' => [
                'order_id' => '12345',
                'order_status' => 'delivered',
            ],
            'assertions' => [
                'acknowledges_request',
                'asks_for_reason',
                'explains_policy',
            ],
        ];

        // Complaint scenario with an upset customer message.
        $complaintCase = [
            'name' => 'complaint_handling',
            'input' => 'This is terrible service! I\'ve been waiting for weeks!',
            'assertions' => [
                'empathetic_response',
                'apologizes',
                'offers_solution',
                'no_defensive_language',
            ],
        ];

        return [$greetingCase, $refundCase, $complaintCase];
    }
}
With Expected Outputs
/**
 * Return a single test case that supplies an `expected` string next to the
 * input and lists the `exact_match` assertion.
 *
 * @return array<int, array<string, mixed>>
 */
public function testCases(): array
{
    $businessHoursCase = [
        'name' => 'specific_answer',
        'input' => 'What are your business hours?',
        'expected' => 'Monday to Friday, 9 AM to 5 PM',
        'assertions' => [
            'exact_match',
        ],
    ];

    return [$businessHoursCase];
}
Creating Assertions
Basic Assertion
<?php
namespace App\Evaluations\Assertions;
use Vizra\VizraADK\Evaluations\BaseAssertion;
/**
 * Passes when the response contains one of a fixed set of greetings.
 *
 * Greetings are matched case-insensitively as whole words/phrases. Plain
 * substring matching would let "hi" match inside words such as "this" or
 * "shipping", making the assertion pass for almost any response.
 */
class ContainsGreetingAssertion extends BaseAssertion
{
    protected string $description = 'Response should contain a greeting';

    /**
     * @param string $response Agent response text to inspect.
     * @param array  $context  Unused by this assertion.
     *
     * @return bool True when any greeting occurs as a whole word or phrase.
     */
    public function evaluate(string $response, array $context = []): bool
    {
        $greetings = ['hello', 'hi', 'good morning', 'good afternoon', 'welcome'];

        $alternatives = implode('|', array_map(
            static fn (string $greeting): string => preg_quote($greeting, '/'),
            $greetings,
        ));

        // \b anchors each greeting at word boundaries; `i` makes the match
        // case-insensitive and `u` treats the subject as UTF-8. preg_match()
        // returns false for invalid UTF-8 input, which counts as "no greeting".
        return preg_match('/\b(?:' . $alternatives . ')\b/iu', $response) === 1;
    }

    /**
     * Message reported when the assertion fails.
     */
    public function failureMessage(): string
    {
        return 'Response did not contain a greeting';
    }
}
Parameterized Assertion
/**
 * Passes when the response contains a configured keyword.
 *
 * Matching is case-insensitive unless `$caseSensitive` is true.
 */
class ContainsKeywordAssertion extends BaseAssertion
{
    protected string $description = 'Response should contain specific keyword';

    /**
     * @param string $keyword       Text that must occur in the response.
     * @param bool   $caseSensitive When true, the keyword must match byte-for-byte.
     */
    public function __construct(
        protected string $keyword,
        protected bool $caseSensitive = false
    ) {}

    /**
     * @param string $response Agent response text to inspect.
     * @param array  $context  Unused by this assertion.
     *
     * @return bool True when the keyword occurs in the response.
     */
    public function evaluate(string $response, array $context = []): bool
    {
        if ($this->caseSensitive) {
            return str_contains($response, $this->keyword);
        }

        // mb_stripos() compares case-insensitively for multibyte text as well;
        // strtolower() only folds ASCII letters, so a keyword such as "Érable"
        // would never match "érable". Requires the mbstring extension.
        return mb_stripos($response, $this->keyword) !== false;
    }

    /**
     * Message reported when the assertion fails; includes the keyword.
     */
    public function failureMessage(): string
    {
        return "Response did not contain keyword: {$this->keyword}";
    }
}
// Parameterized assertions are listed as instances rather than names. Each one
// checks a single keyword; the second constructor argument is omitted, so the
// default (case-insensitive matching) applies.
'assertions' => [
new ContainsKeywordAssertion('refund'),
new ContainsKeywordAssertion('policy'),
],
JSON Schema Assertion
use Vizra\VizraADK\Evaluations\Assertions\JsonSchemaAssertion;
// The schema is written as a PHP array using JSON Schema keywords: an object
// that must have `status` and `message`; `status` is a string limited to
// 'success' or 'error', `message` is a string, and `data` (optional, since it
// is not listed under `required`) is an object.
// NOTE(review): presumably the response text is decoded as JSON before
// validation — confirm against JsonSchemaAssertion.
'assertions' => [
new JsonSchemaAssertion([
'type' => 'object',
'required' => ['status', 'message'],
'properties' => [
'status' => ['type' => 'string', 'enum' => ['success', 'error']],
'message' => ['type' => 'string'],
'data' => ['type' => 'object'],
],
]),
],
LLM-as-a-Judge
Use an LLM to evaluate response quality:
Basic Judge
use Vizra\VizraADK\Evaluations\Assertions\LlmJudgeAssertion;
// The judge is configured with named arguments: `criteria` is free-form text
// describing what a good response looks like, and `model` is given as the
// string 'gpt-4o' (presumably the LLM used as the judge — confirm).
'assertions' => [
new LlmJudgeAssertion(
criteria: 'The response should be helpful, accurate, and professional',
model: 'gpt-4o'
),
],
Detailed Rubric
// Rubric-style judge. The criteria text is a nowdoc (<<<'CRITERIA'), so it is
// passed verbatim with no variable interpolation.
new LlmJudgeAssertion(
criteria: <<<'CRITERIA'
Evaluate the customer service response on these dimensions:
1. Empathy (0-3): Does the response acknowledge the customer's feelings?
2. Helpfulness (0-3): Does it provide actionable assistance?
3. Accuracy (0-3): Is the information correct?
4. Professionalism (0-3): Is the tone appropriate?
Total score should be at least 9/12 to pass.
CRITERIA,
model: 'gpt-4o',
// 9 of 12 rubric points equals 0.75, which matches this value.
// NOTE(review): presumably the threshold is the minimum passing fraction —
// confirm how the judge maps rubric scores onto it.
threshold: 0.75
),
Comparative Judge
// Comparative judge: takes a reference response plus criteria describing how
// the agent's response should relate to it.
// `$idealResponse` is not defined in this snippet and must be supplied by the
// surrounding code; no `use` line for LlmComparativeAssertion is shown here
// either, so it needs to be imported where this is used.
new LlmComparativeAssertion(
referenceResponse: $idealResponse,
criteria: 'Response should be as helpful as or better than the reference',
model: 'gpt-4o'
),
Running Evaluations
CLI Commands
# Default invocation with no options.
php artisan vizra:eval:run
# Name a single evaluation class with --evaluation.
php artisan vizra:eval:run --evaluation=CustomerServiceEvaluation
# Same command with the --verbose flag.
php artisan vizra:eval:run --verbose
# Combine --evaluation with --case to name one test case of that evaluation.
php artisan vizra:eval:run --evaluation=CustomerServiceEvaluation --case=greeting_response
# Request JSON via --format.
php artisan vizra:eval:run --format=json
# Name an output file via --output.
php artisan vizra:eval:run --output=results.json
Programmatic Execution
use Vizra\VizraADK\Services\EvaluationRunner;
// Resolve the runner from the container and execute one evaluation class.
$runner = app(EvaluationRunner::class);
$results = $runner->run(CustomerServiceEvaluation::class);

// Print one verdict line per test case, followed by each failed assertion.
foreach ($results->testCases as $testCase) {
    $label = "{$testCase->name}: ";
    $verdict = $testCase->passed ? 'PASSED' : 'FAILED';
    echo $label . $verdict . "\n";

    foreach ($testCase->assertions as $assertion) {
        // Passing assertions are not reported.
        if ($assertion->passed) {
            continue;
        }

        echo " - {$assertion->name}: {$assertion->message}\n";
    }
}

// Overall summary line.
echo "Passed: {$results->passedCount}/{$results->totalCount}\n";
Advanced Evaluation Patterns
Context Setup
/**
 * Evaluation for the `order_assistant` agent that runs against an order
 * record created in setUp() and deleted again in tearDown().
 */
class OrderEvaluation extends BaseEvaluation
{
    protected string $agent = 'order_assistant';

    /**
     * Order fixture shared by the test cases.
     *
     * Declared explicitly: assigning to an undeclared property creates a
     * dynamic property, which is deprecated as of PHP 8.2.
     */
    protected ?Order $testOrder = null;

    /**
     * Create the order fixture with status "processing".
     */
    protected function setUp(): void
    {
        $this->testOrder = Order::factory()->create([
            'status' => 'processing',
            'total' => 99.99,
        ]);
    }

    /**
     * Delete the order fixture, if one was created.
     */
    protected function tearDown(): void
    {
        // Null-safe so a failed or skipped setUp() does not raise a second error here.
        $this->testOrder?->delete();
    }

    /**
     * @return array<int, array<string, mixed>>
     */
    public function testCases(): array
    {
        return [
            [
                'name' => 'order_status_query',
                'input' => "What's the status of my order?",
                'context' => [
                    // Closure defers reading the id until the value is resolved,
                    // so the fixture does not have to exist when testCases() runs.
                    'order_id' => fn() => $this->testOrder->id,
                ],
                'assertions' => ['mentions_processing_status'],
            ],
        ];
    }
}
Dynamic Test Cases
/**
 * Build test cases from the scenarios listed in `test_scenarios.json`.
 *
 * Each scenario must provide `id`, `input`, `expected` and `checks`; the
 * checks are turned into assertions by buildAssertions().
 *
 * @return array<int, array<string, mixed>>
 *
 * @throws \RuntimeException         When the scenario file cannot be read.
 * @throws \JsonException            When the file does not contain valid JSON.
 * @throws \UnexpectedValueException When the decoded JSON is not an array.
 */
public function testCases(): array
{
    $path = 'test_scenarios.json';

    $json = file_get_contents($path);
    if ($json === false) {
        // Fail loudly: otherwise the suite would silently run zero test cases.
        throw new \RuntimeException("Unable to read scenario file: {$path}");
    }

    $scenarios = json_decode($json, true, flags: \JSON_THROW_ON_ERROR);
    if (!is_array($scenarios)) {
        throw new \UnexpectedValueException("Scenario file must contain a JSON array: {$path}");
    }

    $testCases = [];
    foreach ($scenarios as $scenario) {
        $testCases[] = [
            'name' => $scenario['id'],
            'input' => $scenario['input'],
            'expected' => $scenario['expected'],
            'assertions' => $this->buildAssertions($scenario['checks']),
        ];
    }

    return $testCases;
}
Regression Testing
/**
 * Builds one test case per stored evaluation result whose `status` is
 * "failed" and whose `fixed` flag is true.
 */
class RegressionEvaluation extends BaseEvaluation
{
    /**
     * @return array One entry per matching result, carrying its original
     *               input and failure reason.
     */
    public function testCases(): array
    {
        $fixedFailures = EvaluationResult::where('status', 'failed')
            ->where('fixed', true)
            ->get();

        // Converts one stored result into the test-case array shape.
        $toTestCase = function ($result) {
            return [
                'name' => "regression_{$result->id}",
                'input' => $result->input,
                'assertions' => ['should_not_regress'],
                'context' => ['original_failure' => $result->failure_reason],
            ];
        };

        return $fixedFailures->map($toTestCase)->toArray();
    }
}
Built-in Assertions
| Assertion | Purpose |
|---|---|
| ExactMatchAssertion | Response exactly matches the expected output |
| ContainsAssertion | Response contains substring |
| RegexAssertion | Response matches regex pattern |
| JsonSchemaAssertion | Response matches JSON schema |
| LengthAssertion | Response length within bounds |
| SentimentAssertion | Response has expected sentiment |
| LlmJudgeAssertion | LLM evaluates response quality |
| NoHallucinationAssertion | Response doesn't contain made-up facts |
| SafetyAssertion | Response doesn't contain harmful content |
Custom Assertion Example
/**
 * Delegates to an LLM judge to check a response against a list of known facts.
 */
class NoHallucinationAssertion extends BaseAssertion
{
    protected string $description = 'Response should not contain hallucinated facts';

    /**
     * @param array $knownFacts Facts handed to the judge as context.
     */
    public function __construct(
        protected array $knownFacts
    ) {}

    /**
     * @param string $response Agent response text to inspect.
     * @param array  $context  Unused by this assertion.
     *
     * @return bool The judge's verdict, returned unchanged.
     */
    public function evaluate(string $response, array $context = []): bool
    {
        $judge = app(LlmJudge::class);

        // NOTE(review): the prompt asks whether unsupported facts ARE present.
        // If the judge returns true for "yes", this assertion passes exactly
        // when a hallucination is found — confirm LlmJudge::evaluate()'s
        // return semantics and invert or rephrase if needed.
        return $judge->evaluate(
            prompt: "Does this response contain any facts not supported by the known facts?",
            response: $response,
            context: ['known_facts' => $this->knownFacts]
        );
    }

    /**
     * Message reported when the assertion fails.
     */
    public function failureMessage(): string
    {
        return 'Response contained facts not supported by the known facts';
    }
}
Evaluation Reports
Generate HTML Report
# Passes --report=html together with --output=report.html.
# NOTE(review): the other examples choose the output type with --format
# (json, junit); confirm that --report is the intended option name here.
php artisan vizra:eval:run --report=html --output=report.html
Report Contents
- Overall pass/fail statistics
- Per-test-case results
- Assertion details
- Response comparisons
- Timing metrics
- Failure analysis
CI/CD Integration
GitHub Actions Example
# Run the evaluations, requesting JUnit format and writing to results.xml.
- name: Run Agent Evaluations
  run: |
    php artisan vizra:eval:run --format=junit --output=results.xml

# Upload the results file as a build artifact.
# `if: always()` keeps the upload running when the evaluation step fails,
# which is when the report is needed most. upload-artifact v3 has been
# retired by GitHub, so v4 is used.
- name: Upload Results
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: evaluation-results
    path: results.xml
Artisan Commands
# make:eval takes an evaluation class name (presumably scaffolds that class — confirm).
php artisan vizra:make:eval CustomerServiceEvaluation
# make:assertion takes an assertion class name (presumably scaffolds that class — confirm).
php artisan vizra:make:assertion ContainsGreetingAssertion
# Run evaluations (no options).
php artisan vizra:eval:run
# eval:list, by its name, lists the available evaluations.
php artisan vizra:eval:list
Best Practices
- Test edge cases - Include difficult scenarios, not just happy paths
- Use multiple assertions - Combine rule-based and LLM-based checks
- Version test cases - Track changes to test expectations over time
- Automate in CI/CD - Run evaluations on every deploy
- Monitor trends - Track pass rates over time to detect regressions