🔄 Evaluate Helper Function
Quick Start
1. Create an evaluation script and save it as evaluate_example.py:
import asyncio
import os

from gllm_evals import load_simple_qa_dataset
from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.utils.shared_functionality import inference_fn


async def main():
    """Main function."""
    results = await evaluate(
        data=load_simple_qa_dataset(),
        inference_fn=inference_fn,
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())

2. Run the script:
python evaluate_example.py

3. Inspect the printed results:
{
    "experiment_urls": {
        "run_url": "/path/to/experiments/experiment_results.csv",
        "leaderboard_url": "/path/to/experiments/leaderboard.csv",
    },
    "run_id": "default_simple_qa_data_55d8ad1d",
    "dataset_name": "simple_qa_data",
    "timestamp": "2026-01-31T10:34:05.930843",
    "num_samples": 4,
    "metadata": {
        "batch_size": 10,
        "evaluator_parameters": {
            "evaluator_0": {
                "name": "generation",
                "batch_status_check_interval": 30.0,
                "batch_max_iterations": 120,
                "run_parallel": True,
                "judge": None,
                "good_thresholds": {
                    "completeness": (">=", 3),
                    "redundancy": ("<=", 1),
                    "groundedness": (">=", 3),
                    "language_consistency": (">=", 1),
                    "refusal_alignment": (">=", 1),
                },
                "bad_thresholds": {
                    "completeness": ("<=", 1),
                    "redundancy": (">=", 3),
                    "groundedness": ("<=", 1),
                    "language_consistency": ("<=", 0),
                    "refusal_alignment": ("<=", 0),
                },
                "metric_0": {
                    "evaluation_steps": [
                        "Step 1. Understand the Question...",
                        "Step 2. Identify Substantive Statements...",
                        "Step 3. Normalize and Compare Meaning...",
                        "Step 4. Detect Matches and Contradictions...",
                        "Step 5. Apply Pragmatic Rules...",
                        "- Critical Numeric Impact...",
                        "Step 6. Output Requirements...",
                    ],
                    "batch_status_check_interval": 30.0,
                    "batch_max_iterations": 120,
                    "name": "completeness",
                    "_evaluation_lock": None,
                },
                "metric_1": {
                    "evaluation_steps": [...],
                    "batch_status_check_interval": 30.0,
                    "batch_max_iterations": 120,
                    "name": "groundedness",
                    "_evaluation_lock": None,
                },
                "metric_2": {
                    "evaluation_steps": [...],
                    "batch_status_check_interval": 30.0,
                    "batch_max_iterations": 120,
                    "name": "redundancy",
                    "_evaluation_lock": None,
                },
                "metric_3": {
                    "evaluation_steps": [...],
                    "batch_status_check_interval": 30.0,
                    "batch_max_iterations": 120,
                    "name": "language_consistency",
                    "_evaluation_lock": None,
                },
                "metric_4": {
                    "evaluation_steps": [...],
                    "batch_status_check_interval": 30.0,
                    "batch_max_iterations": 120,
                    "name": "refusal_alignment",
                    "_evaluation_lock": None,
                },
            }
        },
        "dataset_name": "simple_qa_data",
    },
    "summary_result": {},
}
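The "experiment_urls" entries point to CSV files written for the run. Below is a minimal sketch of post-processing them, assuming the summary returned by evaluate() behaves like the dict printed above (the exact return type may differ; see the signature below):

import csv


def print_leaderboard(summary: dict) -> None:
    """Print each row of the leaderboard CSV referenced in the evaluation summary."""
    leaderboard_path = summary["experiment_urls"]["leaderboard_url"]  # path shown in the output above
    with open(leaderboard_path, newline="") as csv_file:
        for row in csv.DictReader(csv_file):
            print(row)


# Usage, inside main() after evaluate() returns:
# print_leaderboard(results)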
Function Signature
async def evaluate(
    data: str | BaseDataset,
    inference_fn: Callable,
    evaluators: list[BaseEvaluator | BaseMetric],
    experiment_tracker: BaseExperimentTracker | None = None,
    batch_size: int = 10,
    allow_batch_evaluation: bool = False,
    summary_evaluators: list[SummaryEvaluatorCallable] | None = None,
    **kwargs: Any,
) -> list[list[EvaluationOutput]]

Parameters
Usage Example
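The sketch below repeats the Quick Start call but spells out the optional arguments from the signature. The values simply restate the defaults, so treat it as an illustration of where each knob goes rather than a recommended configuration.

import asyncio
import os

from gllm_evals import load_simple_qa_dataset
from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.utils.shared_functionality import inference_fn


async def run_evaluation():
    """Evaluate the sample dataset with the optional arguments written out explicitly."""
    results = await evaluate(
        data=load_simple_qa_dataset(),
        inference_fn=inference_fn,
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
        experiment_tracker=None,       # plug in a BaseExperimentTracker to log runs externally
        batch_size=10,                 # default shown in the signature
        allow_batch_evaluation=False,  # default shown in the signature
        summary_evaluators=None,       # see "Summary Evaluator Example" below
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(run_evaluation())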
Using data from Google Sheets
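Because data accepts a plain string as well as a BaseDataset, one generic approach is to pull the sheet down through Google Sheets' standard CSV export endpoint and hand the downloaded file to evaluate(). This is only a sketch: it assumes the string form of data accepts a local CSV path (confirm against the Parameters section), and the sheet ID is a placeholder.

import asyncio
import os
import urllib.request

from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.utils.shared_functionality import inference_fn

# Hypothetical sheet ID; the export URL is Google Sheets' standard CSV export endpoint.
SHEET_ID = "your-google-sheet-id"
EXPORT_URL = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv"


async def main():
    """Download the sheet as CSV, then evaluate it."""
    local_path = "sheet_dataset.csv"
    urllib.request.urlretrieve(EXPORT_URL, local_path)  # the sheet must be link-readable

    results = await evaluate(
        data=local_path,  # assumption: a CSV path is a valid string form of `data`
        inference_fn=inference_fn,
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())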
Using Langfuse Experiment Tracker with custom mapping
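The experiment_tracker argument accepts any BaseExperimentTracker implementation. The sketch below only shows where a Langfuse-backed tracker would plug in; the commented-out import path, constructor arguments, and field mapping are hypothetical placeholders rather than confirmed API, so replace them with the real tracker class and options.

import asyncio
import os

from gllm_evals import load_simple_qa_dataset
from gllm_evals.evaluate import evaluate
from gllm_evals.evaluator.geval_generation_evaluator import GEvalGenerationEvaluator
from gllm_evals.utils.shared_functionality import inference_fn

# Hypothetical import path, shown only to illustrate wiring a tracker in:
# from gllm_evals.experiment_tracker.langfuse_experiment_tracker import LangfuseExperimentTracker


async def main():
    """Run an evaluation with an external experiment tracker attached."""
    # Hypothetical constructor and custom field mapping:
    # tracker = LangfuseExperimentTracker(
    #     public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    #     secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    #     mapping={"question": "input", "expected_answer": "expected_output"},
    # )
    results = await evaluate(
        data=load_simple_qa_dataset(),
        inference_fn=inference_fn,
        evaluators=[GEvalGenerationEvaluator(model_credentials=os.getenv("GOOGLE_API_KEY"))],
        experiment_tracker=None,  # replace None with the tracker instance above
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())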
Example Scenario
inference_fn Examples
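The Quick Start borrows the shared inference_fn helper, but in practice you supply a callable that runs your own system on each example. The library defines the exact contract; the sketch below assumes each example arrives as a dict with a "question" field and that a dict carrying the generated answer should be returned, and it uses an OpenAI client purely as an illustrative backend, so adapt the field names, the async form, and the model call to your setup.

import os

from openai import AsyncOpenAI

client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))


async def my_inference_fn(example: dict) -> dict:
    """Generate an answer for one dataset example (field names here are assumptions)."""
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": example["question"]}],
    )
    return {"generated_response": response.choices[0].message.content}

The callable is then passed straight to evaluate(..., inference_fn=my_inference_fn).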
Summary Evaluator Example
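summary_evaluators computes run-level aggregates on top of the per-sample results. The SummaryEvaluatorCallable contract comes from the library; the sketch below assumes the callable receives the list of per-sample outputs and returns a dict of aggregate scores, and the "passed" attribute it inspects is a placeholder for whatever field your evaluator actually emits.

def pass_rate_summary(results: list) -> dict:
    """Return the fraction of samples marked as passing (output schema assumed)."""
    if not results:
        return {"pass_rate": 0.0}
    passed = sum(1 for r in results if getattr(r, "passed", False))  # "passed" is an assumed field
    return {"pass_rate": passed / len(results)}


# Plugged in via the summary_evaluators argument:
# results = await evaluate(..., summary_evaluators=[pass_rate_summary])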