🤖 AIP Evaluation Tutorial
Prerequisites
Required Parameters for AIP
Required Keys (Langfuse – Optional)
If you want experiment tracking, you will also need LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, and LANGFUSE_HOST (see step 2).
2. Setup Environment and Configuration
Set the following environment variables, for example in a .env file at your project root:

# OpenAI API Key for evaluation models
OPENAI_API_KEY=your_openai_api_key_here

# Langfuse (Optional - for experiment tracking)
LANGFUSE_PUBLIC_KEY=your_langfuse_public_key
LANGFUSE_SECRET_KEY=your_langfuse_secret_key
LANGFUSE_HOST=your_langfuse_host_url

If you use Langfuse, verify that the client can authenticate:

import os
from langfuse import get_client

langfuse = get_client()

if langfuse.auth_check():
    print("Langfuse client is authenticated and ready!")
else:
    print("Authentication failed. Please check your credentials and host.")
3. Prepare Your Dataset
Input Fields
Each record provides: query, generated_response, expected_response, agent_trajectory, and expected_agent_trajectory; retrieved_context can additionally be supplied as metadata.
Example Dataset Structure
query,generated_response,expected_response,agent_trajectory,expected_agent_trajectory
"What is the weather in San Francisco?","The weather in San Francisco is 75 degrees and partly cloudy.","It's 75 degrees and partly cloudy in San Francisco.","[{""role"":""user"",""content"":""What is the weather in San Francisco?""},{""role"":""assistant"",""content"":""The weather in San Francisco is 75 degrees and partly cloudy.""}]","[{""role"":""user"",""content"":""What's the weather like in San Francisco?""},{""role"":""assistant"",""content"":""It's 75 degrees and partly cloudy in San Francisco.""}]"Langfuse Mapping (Optional)
Langfuse Mapping (Optional)

If you track experiments in Langfuse, this mapping tells the tracker which fields to record as input, expected output, and metadata:

langfuse_mapping = {
    "input": {
        "query": "query",
        "agent_trajectory": "agent_trajectory",
        "expected_agent_trajectory": "expected_agent_trajectory"
    },
    "expected_output": {
        "expected_response": "expected_response"
    },
    "metadata": {
        "generated_response": "generated_response",
        "retrieved_context": "retrieved_context"
    }
}
4. Configure the AgentEvaluator
Trajectory-Only Evaluation
from gllm_evals.evaluator.agent_evaluator import AgentEvaluator

evaluator = AgentEvaluator(
    model_credentials=os.getenv("OPENAI_API_KEY"),
    use_reference=True,
    continuous=True,
)
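To sanity-check this configuration, you can pass the evaluator to the same evaluate() helper used in the Langfuse example below (step 7). The sketch below assumes the experiment tracker argument is optional and that the bundled sample dataset items can simply be passed through by the inference function; both are assumptions, not documented guarantees.

import asyncio
import os

from gllm_evals.dataset.simple_agent_dataset import load_simple_agent_dataset
from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.agent_evaluator import AgentEvaluator


async def passthrough(item):
    # The sample dataset already contains responses and trajectories,
    # so no inference is needed here.
    return item


async def run():
    results = await evaluate(
        data=load_simple_agent_dataset(),
        inference_fn=passthrough,
        evaluators=[
            AgentEvaluator(
                model_credentials=os.getenv("OPENAI_API_KEY"),
                use_reference=True,
                continuous=True,
            )
        ],
    )
    print(results)


asyncio.run(run())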
Combined Evaluation

from gllm_evals.constant import DefaultValues
from gllm_evals.evaluator.trajectory_generation_evaluator import TrajectoryGenerationEvaluator

evaluator = TrajectoryGenerationEvaluator(
    # Model for agent trajectory evaluation (execution quality)
    agent_model=DefaultValues.AGENT_EVALS_MODEL,
    agent_model_credentials=os.getenv("OPENAI_API_KEY"),
    # Model for generation evaluation (response quality)
    generation_model=DefaultValues.MODEL,
    generation_model_credentials=os.getenv("OPENAI_API_KEY"),
)

Custom Configuration Options
7. Advanced Evaluation with Langfuse Integration
import asyncio
import os

from langfuse import get_client

from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.agent_evaluator import AgentEvaluator
from gllm_evals.dataset.simple_agent_dataset import load_simple_agent_dataset
from gllm_evals.experiment_tracker.langfuse_experiment_tracker import LangfuseExperimentTracker


async def main():
    dataset = load_simple_agent_dataset()

    async def generate_agent_response(item):
        # The sample dataset already contains the agent's output, so this
        # inference function just forwards the relevant fields.
        return {
            "query": item.get("query"),
            "generated_response": item.get("generated_response"),
            "agent_trajectory": item.get("agent_trajectory"),
            "expected_response": item.get("expected_response"),
            "expected_agent_trajectory": item.get("expected_agent_trajectory"),
        }

    results = await evaluate(
        data=dataset,
        inference_fn=generate_agent_response,
        evaluators=[
            AgentEvaluator(
                model_credentials=os.getenv("OPENAI_API_KEY"),
                use_reference=True,
            )
        ],
        experiment_tracker=LangfuseExperimentTracker(
            langfuse_client=get_client(),
            mapping=langfuse_mapping,  # defined in step 3 (Langfuse Mapping)
        ),
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
8. Understanding Evaluation Results
Trajectory-Only Results
{
    "trajectory_accuracy": {
        "score": 1.0,
        "explanation": "The trajectory shows good progression..."
    }
}

Combined Evaluation Results (using TrajectoryGenerationEvaluator)
{
    "trajectory_accuracy": {"score": 1.0},
    "geval_generation_evals": {"score": 1, "relevancy_rating": "good"},
    "final_result": {"score": 1, "relevancy_rating": "good"}
}
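Individual scores can be pulled out of these results directly, assuming the dictionary shape shown above. A small illustration:

# Example of reading a single evaluator result with the shape shown above.
result = {
    "trajectory_accuracy": {
        "score": 1.0,
        "explanation": "The trajectory shows good progression...",
    }
}

trajectory = result["trajectory_accuracy"]
if trajectory["score"] < 1.0:
    # Surface the judge's explanation when the trajectory is not perfect.
    print("Trajectory issue:", trajectory["explanation"])
else:
    print("Trajectory score:", trajectory["score"])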
Best Practices

Troubleshooting
Issue
Fix
How to Generate Expected Agent Trajectory
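The dataset example above treats the expected trajectory as a JSON-encoded list of chat messages, so one straightforward approach is to hand-author the ideal message sequence for each query. The snippet below only illustrates that format; how you obtain the ideal sequence (manual labeling, a trusted agent run, etc.) is up to you.

import json

# Hand-authored reference trajectory for one query, in the same
# role/content message format used by the dataset example above.
expected_agent_trajectory = json.dumps([
    {"role": "user", "content": "What's the weather like in San Francisco?"},
    {"role": "assistant", "content": "It's 75 degrees and partly cloudy in San Francisco."},
])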
Best Practices for Reference Trajectory Generation
Additional Resources