🔄 End-to-End Evaluation
Quick Start
1. Create evaluate_example.py:
import asyncio
import os

from langfuse import get_client

from gllm_evals import load_simple_qa_dataset
from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.experiment_tracker.langfuse_experiment_tracker import LangfuseExperimentTracker
from gllm_evals.utils.shared_functionality import inference_fn


async def main():
    """Run an end-to-end evaluation over the bundled sample QA dataset."""
    results = await evaluate(
        # Built-in sample dataset of question/expected-answer pairs.
        data=load_simple_qa_dataset(),
        # Callable that produces the output under evaluation for each example.
        inference_fn=inference_fn,
        # GEval-based generation evaluator; reads GOOGLE_API_KEY from the environment.
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
        # Optional: log the run to Langfuse.
        experiment_tracker=LangfuseExperimentTracker(langfuse_client=get_client()),
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())

2. Run the script:

python evaluate_example.py

3. Example output:
[
    [{
        'geval_generation_evals': {
            'relevancy_rating': 'good',
            'possible_issues': [],
            'score': 1,
            'completeness': {
                'score': 3,
                'explanation': "The expected output contains the substantive statement 'Paris' as the answer to the question. The actual output, 'The capital of France is Paris,' includes the information 'Paris' as the capital, matching the substantive statement in the expected output. All key information is present, although with additional phrasing, which does not affect the score."
            },
            'groundedness': {
                'score': 3,
                'explanation': "The response directly answers the question, stating that Paris is the capital of France. This information is clearly and explicitly supported by the context, which says, 'Paris is the capital and largest city of France.' There are no unsupported or extraneous statements."
            },
            'redundancy': {
                'score': 1,
                'explanation': 'The response clearly states the capital of France in a single, concise sentence without any redundancy or repeated information. Each key point is presented just once and the message is direct.'
            }
        }
    }],
    [{
        'geval_generation_evals': {
            'relevancy_rating': 'good',
            'possible_issues': [],
            'score': 1,
            'completeness': {
                'score': 3,
                'explanation': "The generated output exactly matches the substantive statement in the expected output. The answer '4' is correct and all required information is present."
            },
            'groundedness': {
                'score': 3,
                'explanation': "The response accurately answers the question, and the answer '4' is explicitly supported by the context which states '2+2 equals 4.' There is complete alignment between the response, context, and question intent, with no extraneous or unsupported information."
            },
            'redundancy': {
                'score': 1,
                'explanation': "The response provides the answer '4' to the question with no repetition or unnecessary elaboration. Each idea is presented only once, and the answer is concise and to the point without restating or paraphrasing the key point."
            }
        }
    }],
    [{
        'geval_generation_evals': {
            'relevancy_rating': 'good',
            'possible_issues': [],
            'score': 1,
            'completeness': {
                'score': 3,
                'explanation': 'The generated output matches the key substantive statement from the expected output, correctly identifying Jupiter as the largest planet in our solar system. Although the generated output is shorter, it fully captures the essential information required by the question.'
            },
            'groundedness': {
                'score': 3,
                'explanation': "The response correctly identifies Jupiter as the largest planet in our solar system, which is directly supported by the context stating, 'Jupiter is the fifth planet from the Sun and the largest in the Solar System.' There are no unsupported or extraneous statements."
            },
            'redundancy': {
                'score': 1,
                'explanation': "The response provides the correct answer, 'Jupiter,' with no restatement, repetition, or unnecessary elaboration. There is only one key idea, and it is presented directly and concisely."
            }
        }
    }]
]

Function Signature
async def evaluate(
    data: str | BaseDataset,
    inference_fn: Callable,
    evaluators: list[BaseEvaluator | BaseMetric],
    experiment_tracker: BaseExperimentTracker | None = None,
    batch_size: int = 10,
    **kwargs: Any,
) -> list[list[EvaluationOutput]]

Parameters
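The notes below are inferred from the signature above; consult the gllm_evals source for the authoritative contract.

data: the dataset to evaluate, either a BaseDataset instance (e.g. the object returned by load_simple_qa_dataset()) or a string reference to a dataset.
inference_fn: the callable invoked on each example to produce the output being evaluated.
evaluators: a list of BaseEvaluator or BaseMetric instances; each one scores every example.
experiment_tracker: an optional BaseExperimentTracker (such as LangfuseExperimentTracker) that logs the run; defaults to None, which skips tracking.
batch_size: how many examples are processed per batch (default 10).
**kwargs: additional keyword arguments forwarded to the evaluation run.

The return value is a list with one inner list per example, each containing one EvaluationOutput per evaluator, matching the Quick Start output above.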
Usage Example
Using data from Google Sheets
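Because data accepts a plain string, the sketch below assumes a Google Sheets URL can be passed directly as the dataset reference; the exact string format the loader expects is an assumption here, so verify it against the gllm_evals documentation.

import asyncio
import os

from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.utils.shared_functionality import inference_fn

# Hypothetical sheet reference: replace <sheet-id> with your own, and confirm
# that gllm_evals accepts Google Sheets URLs as the `data` string.
SHEET_URL = "https://docs.google.com/spreadsheets/d/<sheet-id>"


async def main():
    """Evaluate a dataset stored in a Google Sheet."""
    results = await evaluate(
        data=SHEET_URL,
        inference_fn=inference_fn,
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())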
Using Langfuse Experiment Tracker with custom mapping
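The tracker below is constructed the same way as in the Quick Start; the mapping keyword is a hypothetical illustration of remapping dataset fields to the names Langfuse records, so check the LangfuseExperimentTracker reference for the actual option name and accepted keys.

from langfuse import get_client

from gllm_evals.experiment_tracker.langfuse_experiment_tracker import LangfuseExperimentTracker

# `mapping` is hypothetical: the real keyword and accepted keys may differ.
tracker = LangfuseExperimentTracker(
    langfuse_client=get_client(),
    mapping={
        "question": "input",                  # dataset field -> Langfuse field (assumed)
        "generated_answer": "output",
        "expected_answer": "expected_output",
    },
)

Pass tracker as experiment_tracker= to evaluate() exactly as in the Quick Start.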
Example Scenario
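For instance, suppose you are evaluating a question-answering assistant. Each dataset row supplies a question, supporting context, and an expected answer; inference_fn produces the assistant's answer, and GEvalGenerationEvaluator scores it for completeness, groundedness, and redundancy, yielding per-example results like the Quick Start output above.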
inference_fn Examples
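The built-in gllm_evals.utils.shared_functionality.inference_fn used in the Quick Start can be replaced with your own callable. The sketch below assumes each example arrives as a dict with a question field and that a string response is returned; that contract is an assumption, so mirror the built-in's signature in real use.

# Minimal custom inference_fn sketch. The per-example dict input and string
# output are assumptions; check the built-in inference_fn for the contract.
async def my_inference_fn(example: dict) -> str:
    question = example["question"]  # assumed field name
    # Call your model, chain, or API here; a canned reply keeps the sketch
    # self-contained and runnable.
    return f"The answer to '{question}' goes here."

Pass it to evaluate(..., inference_fn=my_inference_fn, ...) in place of the built-in.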