This guide covers proven patterns for structuring your evaluations.
Pattern 1: Simple Assertion
The most straightforward pattern is a plain assert, just like in pytest:
@eval(input="What is 2 + 2?", dataset="demo")
async def test_simple(ctx: EvalContext):
    ctx.output = await my_agent(ctx.input)
    assert ctx.output == "4", f"Expected 4, got {ctx.output}"
Use when: Most evaluations. This is the default approach.
Pattern 2: Pre-populated Decorator
Configure inputs in the decorator for cleaner code:
@eval(input="What is the capital of France?", reference="Paris", dataset="geography")
async def test_capital(ctx: EvalContext):
    ctx.output = await my_agent(ctx.input)
    assert ctx.output.lower() == ctx.reference.lower()
Use when: Clean separation of test data from logic.
Pattern 3: Target Hook
Separate agent invocation from scoring:
async def call_agent(ctx: EvalContext):
    """Reusable agent caller."""
    ctx.output = await my_agent(ctx.input)

@eval(input="What's the weather like?", target=call_agent, dataset="qa")
async def test_weather(ctx: EvalContext):
    # ctx.output already populated by target
    assert "weather" in ctx.output.lower(), "Should mention weather"

@eval(input="Tell me a joke", target=call_agent, dataset="creative")
async def test_humor(ctx: EvalContext):
    # Same agent, different assertions
    assert len(ctx.output) > 20, "Response too short"
Use when: Multiple tests with same agent, different assertions.
Pattern 4: Multiple Results
Return multiple results from one function:
from twevals import eval, EvalResult
@eval(dataset="batch")
def test_batch_prompts():
    prompts = [
        ("Hello", "greeting"),
        ("Goodbye", "farewell"),
        ("Help me", "support"),
    ]
    results = []
    for prompt, expected_intent in prompts:
        output = my_agent(prompt)
        detected = detect_intent(output)
        results.append(EvalResult(
            input=prompt,
            output=output,
            reference=expected_intent,
            scores=[{
                "key": "intent",
                "passed": detected == expected_intent,
                "notes": f"Detected: {detected}"
            }]
        ))
    return results
Use when: Dynamic test case generation, external data sources.
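For the external-data case, the same shape applies. Here is a minimal sketch that loads cases from a JSON file; the prompts.json path, my_agent, and detect_intent are illustrative, not part of the framework:
import json

from twevals import eval, EvalResult

@eval(dataset="batch_from_file")
def test_cases_from_file():
    # Illustrative file: a list of {"prompt": ..., "intent": ...} objects
    with open("prompts.json") as f:
        cases = json.load(f)
    results = []
    for case in cases:
        output = my_agent(case["prompt"])    # same hypothetical agent as above
        detected = detect_intent(output)     # same hypothetical intent helper as above
        results.append(EvalResult(
            input=case["prompt"],
            output=output,
            reference=case["intent"],
            scores=[{
                "key": "intent",
                "passed": detected == case["intent"],
                "notes": f"Detected: {detected}"
            }]
        ))
    return results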
Pattern 5: Parametrized with Context
Combine parametrize with context for clean data-driven tests:
@eval(dataset="sentiment", default_score_key="accuracy")
@parametrize("input,reference", [
("I love this product!", "positive"),
("This is terrible", "negative"),
("It's okay I guess", "neutral"),
])
async def test_sentiment(ctx: EvalContext):
    # ctx.input and ctx.reference are auto-populated (special names)
    ctx.output = await analyze_sentiment(ctx.input)
    assert ctx.output == ctx.reference
Or with custom parameter names, which must appear in the function signature:
@eval(dataset="sentiment", default_score_key="accuracy")
@parametrize("text,expected", [
("I love this product!", "positive"),
("This is terrible", "negative"),
])
async def test_sentiment(ctx: EvalContext, text, expected):
    ctx.input = text
    ctx.output = await analyze_sentiment(text)
    assert ctx.output == expected
Use when: Testing many input/output pairs.
Pattern 6: File-Level Defaults
Share configuration across multiple evaluations:
# evals/customer_service.py
twevals_defaults = {
    "labels": ["production"],
    "metadata": {"department": "support"}
}

@eval(input="I want a refund")
async def test_refund(ctx: EvalContext):
    ctx.output = await support_agent(ctx.input)
    assert "refund" in ctx.output.lower(), "Should acknowledge refund"

@eval(input="Your product broke")
async def test_complaint(ctx: EvalContext):
    ctx.output = await support_agent(ctx.input)
    assert "sorry" in ctx.output.lower() or "apologize" in ctx.output.lower(), \
        "Should express empathy"

@eval(input="Edge case query", labels=["experimental"])  # Override labels
async def test_edge_case(ctx: EvalContext):
    ...
Use when: Many evaluations share configuration.
Pattern 7: Multi-Metric Evaluation
Use add_score() when you need multiple named metrics or numeric scores:
@eval(input="Explain quantum computing", dataset="qa")
async def test_comprehensive(ctx: EvalContext):
    ctx.output = await my_agent(ctx.input)
    # Must-pass requirements as assertions
    assert ctx.output is not None, "Got no output"
    assert len(ctx.output) > 50, "Response too short"
    # Numeric/named metrics with add_score
    keywords = ["quantum", "superposition", "qubit"]
    matches = sum(1 for kw in keywords if kw in ctx.output.lower())
    ctx.add_score(matches / len(keywords), f"{matches}/{len(keywords)} keywords", key="coverage")
    ctx.add_score(len(ctx.output) < 500, f"Length: {len(ctx.output)}", key="brevity")
Use when: You need numeric scores or multiple independent metrics.
Pattern 8: Comparison Testing
Compare multiple approaches:
@eval(input="Explain recursion simply", dataset="model_comparison", metadata_from_params=["model"])
@parametrize("model", ["gpt-4", "gpt-3.5", "claude-3"])
async def test_model_comparison(ctx: EvalContext, model):
    ctx.output = await call_model(model, ctx.input)
    assert len(ctx.output) > 50, "Response too short"
    assert "recursion" in ctx.output.lower(), "Should mention recursion"
Use when: Benchmarking models, comparing approaches.
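If you also want a numeric signal that is easy to compare across models, rather than only pass/fail assertions, you can combine this with add_score() from Pattern 7. A minimal sketch reusing the call_model helper above (the 400-character brevity threshold is arbitrary):
@eval(input="Explain recursion simply", dataset="model_comparison", metadata_from_params=["model"])
@parametrize("model", ["gpt-4", "gpt-3.5", "claude-3"])
async def test_model_comparison_scored(ctx: EvalContext, model):
    ctx.output = await call_model(model, ctx.input)
    # Record a comparable per-model metric alongside the pass/fail assertion
    ctx.add_score(len(ctx.output) < 400, f"Length: {len(ctx.output)}", key="brevity")
    assert "recursion" in ctx.output.lower(), "Should mention recursion"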
Pattern 9: Error Handling
For robustness testing, you may want to catch specific errors:
@eval(input="Complex query", dataset="robustness", timeout=10.0)
async def test_with_error_handling(ctx: EvalContext):
    try:
        ctx.output = await my_agent(ctx.input)
    except TimeoutError:
        ctx.output = "TIMEOUT"
        assert False, "Agent timed out"
    except Exception as e:
        ctx.output = f"ERROR: {e}"
        assert False, f"Agent raised exception: {e}"
    # If we get here, agent succeeded
    assert len(ctx.output) > 0, "Empty response"
Use when: Testing reliability, edge cases.
Anti-Pattern: Forgetting to Set Output
Always set ctx.output before assertions so results are recorded for debugging.
# BAD: No output recorded
@eval(dataset="demo")
async def test_bad(ctx: EvalContext):
    ctx.input = "test"
    result = await my_agent(ctx.input)
    assert result  # Output not recorded for debugging!
# GOOD: Always set output before assertions
@eval(dataset="demo")
async def test_good(ctx: EvalContext):
    ctx.input = "test"
    ctx.output = await my_agent(ctx.input)
    assert ctx.output  # Now output is preserved