Evaluator for RAG system quality using RAGAS.
Wraps LlamaIndex LLM and embedding models for use with the RAGAS
evaluation framework. Supports multiple evaluation metrics.
Attributes:
- judge_llm – Wrapped LLM for evaluations
- embedding_model – Wrapped embeddings for metrics
- evaluator_function – Function to run evaluations
- metrics – List of RAGAS metrics to evaluate
Source code in src/evaluation/evaluators.py
class RagasEvaluator:
    """Evaluator for RAG system quality using RAGAS.

    Wraps LlamaIndex LLM and embedding models for use with the RAGAS
    evaluation framework. Supports multiple evaluation metrics.

    Attributes:
        judge_llm: Wrapped LLM for evaluations
        embedding_model: Wrapped embeddings for metrics
        evaluator_function: Function to run evaluations
        metrics: List of RAGAS metrics to evaluate
    """

    def __init__(
        self,
        judge_llm: BaseLLM,
        embedding_model: BaseEmbedding,
        evaluator_function: Callable = ragas_evaluate,
    ) -> None:
        """Initialize RAGAS evaluator with models.

        Args:
            judge_llm: LLM for evaluation judgments
            embedding_model: Model for embedding comparisons
            evaluator_function: Optional custom evaluation function
        """
        self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
        self.embedding_model = LlamaIndexEmbeddingsWrapper(embedding_model)
        self.evaluator_function = evaluator_function
        self.metrics = [
            answer_relevancy,
            faithfulness,
            harmfulness,
            context_recall,
        ]

    def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
        """Evaluate response quality using RAGAS metrics.

        Args:
            response: Query response to evaluate
            item: Dataset item containing ground truth

        Returns:
            Series: Scores for each metric
        """
        dataset = Dataset.from_dict(
            {
                "question": [item.input["query_str"]],
                "contexts": [[n.node.text for n in response.source_nodes]],
                "answer": [response.response],
                "ground_truth": [item.expected_output["result"]],
            }
        )
        return (
            self.evaluator_function(
                metrics=self.metrics,
                dataset=dataset,
                llm=self.judge_llm,
                embeddings=self.embedding_model,
            )
            .to_pandas()
            .iloc[0]
        )
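Example: a minimal usage sketch. The concrete OpenAI LLM and embedding classes, the model names, and the import path for RagasEvaluator are assumptions for illustration; any LlamaIndex BaseLLM / BaseEmbedding pair can be passed, since both are wrapped internally for RAGAS.

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from src.evaluation.evaluators import RagasEvaluator  # import path assumed from the source location above

# Plain LlamaIndex objects are passed in; the evaluator wraps them
# with LlamaIndexLLMWrapper / LlamaIndexEmbeddingsWrapper itself.
evaluator = RagasEvaluator(
    judge_llm=OpenAI(model="gpt-4o-mini"),                            # assumed judge model
    embedding_model=OpenAIEmbedding(model="text-embedding-3-small"),  # assumed embedding model
)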
__init__(judge_llm, embedding_model, evaluator_function=ragas_evaluate)
Initialize RAGAS evaluator with models.
Parameters:
- judge_llm (BaseLLM) – LLM for evaluation judgments
- embedding_model (BaseEmbedding) – Model for embedding comparisons
- evaluator_function (Callable, default: ragas_evaluate) – Optional custom evaluation function
Source code in src/evaluation/evaluators.py
def __init__(
    self,
    judge_llm: BaseLLM,
    embedding_model: BaseEmbedding,
    evaluator_function: Callable = ragas_evaluate,
) -> None:
    """Initialize RAGAS evaluator with models.

    Args:
        judge_llm: LLM for evaluation judgments
        embedding_model: Model for embedding comparisons
        evaluator_function: Optional custom evaluation function
    """
    self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
    self.embedding_model = LlamaIndexEmbeddingsWrapper(embedding_model)
    self.evaluator_function = evaluator_function
    self.metrics = [
        answer_relevancy,
        faithfulness,
        harmfulness,
        context_recall,
    ]
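Because evaluator_function is injectable, ragas.evaluate can be swapped out in tests. A sketch of that pattern, assuming the mock models from llama-index-core; the fake_evaluate stub is hypothetical and only mirrors the surface this class relies on (a result object exposing to_pandas()).

import pandas as pd
from llama_index.core.embeddings import MockEmbedding
from llama_index.core.llms import MockLLM

from src.evaluation.evaluators import RagasEvaluator  # import path assumed

def fake_evaluate(metrics, dataset, llm, embeddings):
    """Hypothetical stand-in for ragas.evaluate: one perfect score per metric."""
    class _Result:
        def to_pandas(self) -> pd.DataFrame:
            # RAGAS metric objects expose a .name attribute.
            return pd.DataFrame([{m.name: 1.0 for m in metrics}])
    return _Result()

evaluator = RagasEvaluator(
    judge_llm=MockLLM(),                         # avoids real LLM calls
    embedding_model=MockEmbedding(embed_dim=8),  # avoids real embedding calls
    evaluator_function=fake_evaluate,
)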
evaluate(response, item)
Evaluate response quality using RAGAS metrics.
Parameters:
- response (Response) – Query response to evaluate
- item (DatasetItemClient) – Dataset item containing ground truth

Returns:
- Series – Scores for each metric
Source code in src/evaluation/evaluators.py
def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
    """Evaluate response quality using RAGAS metrics.

    Args:
        response: Query response to evaluate
        item: Dataset item containing ground truth

    Returns:
        Series: Scores for each metric
    """
    dataset = Dataset.from_dict(
        {
            "question": [item.input["query_str"]],
            "contexts": [[n.node.text for n in response.source_nodes]],
            "answer": [response.response],
            "ground_truth": [item.expected_output["result"]],
        }
    )
    return (
        self.evaluator_function(
            metrics=self.metrics,
            dataset=dataset,
            llm=self.judge_llm,
            embeddings=self.embedding_model,
        )
        .to_pandas()
        .iloc[0]
    )
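A sketch of driving evaluate over a Langfuse dataset and reading the scores. The dataset name, the query_engine, and the evaluator instance are assumptions about the surrounding pipeline; only the item and response shapes match what this method expects.

from langfuse import Langfuse

langfuse = Langfuse()                        # credentials read from the environment
dataset = langfuse.get_dataset("rag-eval")   # hypothetical dataset name

for item in dataset.items:
    # query_engine: any LlamaIndex query engine returning a Response;
    # evaluator: a RagasEvaluator constructed as shown earlier.
    response = query_engine.query(item.input["query_str"])
    scores = evaluator.evaluate(response, item)
    # The returned Series is keyed by metric name.
    print(scores["faithfulness"], scores["answer_relevancy"])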