Evaluator for RAG system quality using the RAGAS framework.
Provides automatic evaluation of RAG pipeline quality using multiple
metrics from the RAGAS evaluation framework. Evaluates answer relevancy,
factual consistency (faithfulness), harmfulness, and source context recall.
Source code in src/evaluation/evaluators/ragas.py
class RagasEvaluator:
    """Evaluator for RAG system quality using the RAGAS framework.

    Provides automatic evaluation of RAG pipeline quality using multiple
    metrics from the RAGAS evaluation framework. Evaluates answer relevancy,
    factual consistency (faithfulness), harmfulness, and source context recall.
    """

    def __init__(
        self,
        judge_llm: BaseLLM,
        judge_embedding_model: BaseEmbedding,
        evaluator_function: Callable = ragas_evaluate,
    ) -> None:
        """Initialize RAGAS evaluator with required models and configuration.

        Args:
            judge_llm: LlamaIndex LLM used to evaluate response quality
            judge_embedding_model: Embedding model for semantic comparisons
            evaluator_function: Function that runs the evaluation pipeline,
                defaults to the standard RAGAS evaluate function
        """
        self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
        self.judge_embedding_model = LlamaIndexEmbeddingsWrapper(
            judge_embedding_model
        )
        self.evaluator_function = evaluator_function
        self.metrics = [
            answer_relevancy,
            faithfulness,
            harmfulness,
            context_recall,
        ]

    def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
        """Evaluate a RAG response against multiple quality metrics.

        Calculates RAGAS evaluation metrics comparing the response to ground truth
        and source contexts. Creates the temporary dataset structure required by
        the RAGAS evaluation framework.

        Args:
            response: LlamaIndex response object containing the generated answer
                and source nodes used for retrieval
            item: Langfuse dataset item containing the original query and
                expected ground truth answer

        Returns:
            Series: Pandas Series containing individual scores for each metric
                (answer relevancy, faithfulness, harmfulness, context recall)
        """
        dataset = Dataset.from_dict(
            {
                "question": [item.input["query_str"]],
                "contexts": [[n.node.text for n in response.source_nodes]],
                "answer": [response.response],
                "ground_truth": [item.expected_output["result"]],
            }
        )
        return (
            self.evaluator_function(
                metrics=self.metrics,
                dataset=dataset,
                llm=self.judge_llm,
                embeddings=self.judge_embedding_model,
            )
            .to_pandas()
            .iloc[0]
        )
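Example usage, as a minimal sketch: the OpenAI judge models are only one possible choice (any LlamaIndex BaseLLM / BaseEmbedding works), and query_engine and item stand in for a LlamaIndex query engine and a Langfuse dataset item obtained elsewhere; they are not part of this module.

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Judge models used to grade answers; swap in any LlamaIndex-compatible models.
evaluator = RagasEvaluator(
    judge_llm=OpenAI(model="gpt-4o-mini"),
    judge_embedding_model=OpenAIEmbedding(),
)

# `query_engine` and `item` are hypothetical names assumed to exist already:
# the query engine produces the Response, and the Langfuse item supplies the
# query and the expected ground-truth answer.
response = query_engine.query(item.input["query_str"])
scores = evaluator.evaluate(response, item)  # pandas Series, one value per metric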
__init__(judge_llm, judge_embedding_model, evaluator_function=ragas_evaluate)
Initialize RAGAS evaluator with required models and configuration.
Parameters:
- judge_llm (BaseLLM) – LlamaIndex LLM used to evaluate response quality
- judge_embedding_model (BaseEmbedding) – Embedding model for semantic comparisons
- evaluator_function (Callable, default: ragas_evaluate) – Function that runs the evaluation pipeline; defaults to the standard RAGAS evaluate function
Source code in src/evaluation/evaluators/ragas.py
def __init__(
    self,
    judge_llm: BaseLLM,
    judge_embedding_model: BaseEmbedding,
    evaluator_function: Callable = ragas_evaluate,
) -> None:
    """Initialize RAGAS evaluator with required models and configuration.

    Args:
        judge_llm: LlamaIndex LLM used to evaluate response quality
        judge_embedding_model: Embedding model for semantic comparisons
        evaluator_function: Function that runs the evaluation pipeline,
            defaults to the standard RAGAS evaluate function
    """
    self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
    self.judge_embedding_model = LlamaIndexEmbeddingsWrapper(
        judge_embedding_model
    )
    self.evaluator_function = evaluator_function
    self.metrics = [
        answer_relevancy,
        faithfulness,
        harmfulness,
        context_recall,
    ]
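Because the evaluation pipeline is injected via evaluator_function, it can be replaced, for example to stub out RAGAS in unit tests so no judge LLM calls are made. A minimal sketch, assuming a stub result object that only needs a to_pandas() method shaped like the real RAGAS result; all names and scores below are hypothetical.

import pandas as pd

class _StubRagasResult:
    """Mimics the RAGAS result object just enough for RagasEvaluator.evaluate()."""

    def to_pandas(self) -> pd.DataFrame:
        # One row of fixed scores; the real result would also carry the dataset columns.
        return pd.DataFrame(
            [
                {
                    "answer_relevancy": 1.0,
                    "faithfulness": 1.0,
                    "harmfulness": 0.0,
                    "context_recall": 1.0,
                }
            ]
        )

def _stub_evaluate(metrics, dataset, llm, embeddings):
    # Same keyword interface RagasEvaluator uses, but no model calls are made.
    return _StubRagasResult()

evaluator = RagasEvaluator(
    judge_llm=judge_llm,                     # any LlamaIndex BaseLLM (placeholder)
    judge_embedding_model=judge_embeddings,  # any LlamaIndex BaseEmbedding (placeholder)
    evaluator_function=_stub_evaluate,
)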
evaluate(response, item)
Evaluate a RAG response against multiple quality metrics.
Calculates RAGAS evaluation metrics comparing the response to ground truth
and source contexts. Creates the temporary dataset structure required by the
RAGAS evaluation framework.
Parameters:
- response (Response) – LlamaIndex response object containing the generated answer and source nodes used for retrieval
- item (DatasetItemClient) – Langfuse dataset item containing the original query and expected ground truth answer

Returns:
- Series – Pandas Series containing individual scores for each metric (answer relevancy, faithfulness, harmfulness, context recall)
Source code in src/evaluation/evaluators/ragas.py
def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
    """Evaluate a RAG response against multiple quality metrics.

    Calculates RAGAS evaluation metrics comparing the response to ground truth
    and source contexts. Creates the temporary dataset structure required by
    the RAGAS evaluation framework.

    Args:
        response: LlamaIndex response object containing the generated answer
            and source nodes used for retrieval
        item: Langfuse dataset item containing the original query and
            expected ground truth answer

    Returns:
        Series: Pandas Series containing individual scores for each metric
            (answer relevancy, faithfulness, harmfulness, context recall)
    """
    dataset = Dataset.from_dict(
        {
            "question": [item.input["query_str"]],
            "contexts": [[n.node.text for n in response.source_nodes]],
            "answer": [response.response],
            "ground_truth": [item.expected_output["result"]],
        }
    )
    return (
        self.evaluator_function(
            metrics=self.metrics,
            dataset=dataset,
            llm=self.judge_llm,
            embeddings=self.judge_embedding_model,
        )
        .to_pandas()
        .iloc[0]
    )
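The returned row typically carries the dataset columns (question, contexts, answer, ground_truth) alongside the metric scores, so callers usually pick out the metrics they care about. A hedged sketch; the metric column names below are RAGAS's default names and may differ between RAGAS versions.

# `evaluator`, `response`, and `item` as in the earlier usage sketch.
scores = evaluator.evaluate(response, item)

metric_names = ("answer_relevancy", "faithfulness", "harmfulness", "context_recall")
for name in metric_names:
    if name in scores.index:  # guard in case a metric column is named differently
        print(f"{name}: {scores[name]:.3f}")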