from twevals import eval, parametrize, EvalContext
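
# Task function: run the RAG agent on a row's input and stash the response
# and its retrieved source documents on the context for scoring below.
# (run_rag_agent, hallucination_judge, and correctness_judge are assumed to
# be defined elsewhere in the project.)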
async def run_agent(ctx: EvalContext):
    results = await run_rag_agent(ctx.input)
    ctx.output = results.response
    ctx.metadata["source_docs"] = results.metadata["source_docs"]
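
# @eval registers this as an eval over the "rag_qa" dataset with run_agent
# as its target; @parametrize fans it out across (input, reference) rows.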
@eval(target=run_agent, dataset="rag_qa")
@parametrize("input,reference", [
("What is our refund policy?", "30-day money-back guarantee"),
("How do I reset my password?", "Click 'Forgot Password' on the login page"),
("What payment methods do you accept?", "Visa, Mastercard, and PayPal"),
("How long does shipping take?", "3-5 business days for standard shipping"),
("Can I change my order after placing it?", "Within 1 hour of placing the order"),
# ... hundreds more rows
])
async def test_rag_agent(ctx: EvalContext):
"""Evaluate a RAG Agent for correctness and hallucinations"""
# LLM-as-a-Judge for hallucination
    score, reasoning = await hallucination_judge(
        answer=ctx.output,
        sources=ctx.metadata["source_docs"],
    )
    ctx.add_score(score, reasoning, key="hallucination")

    # Hard pass/fail: check the answer against the row's reference.
    correctness_result = await correctness_judge(
        answer=ctx.output,
        reference=ctx.reference,
    )
    assert correctness_result.is_correct, correctness_result.explanation