Evaluators

This module contains the evaluators used to assess RAG response quality with RAGAS and record the results in Langfuse.

LangfuseEvaluator

Evaluator for tracking RAG performance in Langfuse.

Combines query engine execution with RAGAS evaluation and uploads results to Langfuse for monitoring.

Attributes:
  • query_engine

    Engine for generating responses

  • ragas_evaluator

    Evaluator for quality metrics

  • langfuse_dataset_service

    Service for dataset access

  • run_name

    Name of evaluation run

  • run_metadata

    Additional run context
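
Example (a minimal usage sketch: the query engine, dataset service and RAGAS evaluator are assumed to be constructed elsewhere in the project, and the dataset name is illustrative):

# Sketch only: query_engine, dataset_service and ragas_evaluator are assumed
# to exist already; run_metadata must carry a "build_name" key, which is used
# as the Langfuse run name.
evaluator = LangfuseEvaluator(
    query_engine=query_engine,
    langfuse_dataset_service=dataset_service,
    ragas_evaluator=ragas_evaluator,
    run_metadata={"build_name": "deploy-2024-05-01"},
)

# Queries every item of the named Langfuse dataset, scores the responses with
# RAGAS and uploads the scores to the corresponding traces.
evaluator.evaluate(dataset_name="regression-questions")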

Source code in src/evaluation/evaluators.py
class LangfuseEvaluator:
    """Evaluator for tracking RAG performance in Langfuse.

    Combines query engine execution with RAGAS evaluation and
    uploads results to Langfuse for monitoring.

    Attributes:
        query_engine: Engine for generating responses
        ragas_evaluator: Evaluator for quality metrics
        langfuse_dataset_service: Service for dataset access
        run_name: Name of evaluation run
        run_metadata: Additional run context
    """

    def __init__(
        self,
        query_engine: RagQueryEngine,
        langfuse_dataset_service: LangfuseDatasetService,
        ragas_evaluator: RagasEvaluator,
        run_metadata: dict,
    ) -> None:
        """Initialize Langfuse evaluator.

        Args:
            query_engine: Engine for response generation
            langfuse_dataset_service: Dataset access service
            ragas_evaluator: Quality metrics evaluator
            run_metadata: Run context information
        """
        self.query_engine = query_engine
        self.ragas_evaluator = ragas_evaluator
        self.langfuse_dataset_service = langfuse_dataset_service
        self.run_name = run_metadata["build_name"]
        self.run_metadata = run_metadata

    def evaluate(self, dataset_name: str) -> None:
        """Evaluate dataset and record results in Langfuse.

        Args:
            dataset_name: Name of dataset to evaluate

        Note:
            Uploads scores for answer relevancy, context recall,
            faithfulness and harmfulness when available.
        """
        langfuse_dataset = self.langfuse_dataset_service.get_dataset(
            dataset_name
        )

        for item in langfuse_dataset.items:

            response = self.query_engine.query(
                str_or_query_bundle=item.input["query_str"],
                chainlit_message_id=None,
                source_process=SourceProcess.DEPLOYMENT_EVALUATION,
            ).get_response()

            scores = self.ragas_evaluator.evaluate(response=response, item=item)

            trace = self.query_engine.get_current_langfuse_trace()
            trace.update(output=response.response)
            item.link(
                trace_or_observation=trace,
                run_name=self.run_name,
                run_description="Deployment evaluation",
                run_metadata=self.run_metadata,
            )

            # TODO: How to handle NaNs?
            if not isnan(scores["answer_relevancy"]):
                trace.score(
                    name="Answer Relevancy", value=scores["answer_relevancy"]
                )
            if not isnan(scores["context_recall"]):
                trace.score(
                    name="Context Recall", value=scores["context_recall"]
                )
            if not isnan(scores["faithfulness"]):
                trace.score(name="Faithfulness", value=scores["faithfulness"])
            if not isnan(scores["harmfulness"]):
                trace.score(name="Harmfulness", value=scores["harmfulness"])

__init__(query_engine, langfuse_dataset_service, ragas_evaluator, run_metadata)

Initialize Langfuse evaluator.

Parameters:
  • query_engine (RagQueryEngine) –

    Engine for response generation

  • langfuse_dataset_service (LangfuseDatasetService) –

    Dataset access service

  • ragas_evaluator (RagasEvaluator) –

    Quality metrics evaluator

  • run_metadata (dict) –

    Run context information
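
Note that run_metadata must include a "build_name" key; it is read in the constructor and used as the Langfuse run name. For example (values are illustrative):

# "build_name" is required and becomes the run name; the full dict is later
# attached to the Langfuse run as metadata.
run_metadata = {
    "build_name": "nightly-2024-05-01",
    "git_commit": "abc123",
}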

Source code in src/evaluation/evaluators.py
def __init__(
    self,
    query_engine: RagQueryEngine,
    langfuse_dataset_service: LangfuseDatasetService,
    ragas_evaluator: RagasEvaluator,
    run_metadata: dict,
) -> None:
    """Initialize Langfuse evaluator.

    Args:
        query_engine: Engine for response generation
        langfuse_dataset_service: Dataset access service
        ragas_evaluator: Quality metrics evaluator
        run_metadata: Run context information
    """
    self.query_engine = query_engine
    self.ragas_evaluator = ragas_evaluator
    self.langfuse_dataset_service = langfuse_dataset_service
    self.run_name = run_metadata["build_name"]
    self.run_metadata = run_metadata

evaluate(dataset_name)

Evaluate dataset and record results in Langfuse.

Parameters:
  • dataset_name (str) –

    Name of dataset to evaluate

Note

Uploads scores for answer relevancy, context recall, faithfulness and harmfulness when available.
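
For example, a deployment evaluation step might call it once per tracked dataset (the dataset names below are illustrative):

# Each call iterates the dataset's items, queries the engine, scores the
# responses with RAGAS and uploads the scores to Langfuse.
for name in ("smoke-test-questions", "regression-questions"):
    evaluator.evaluate(dataset_name=name)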

Source code in src/evaluation/evaluators.py
def evaluate(self, dataset_name: str) -> None:
    """Evaluate dataset and record results in Langfuse.

    Args:
        dataset_name: Name of dataset to evaluate

    Note:
        Uploads scores for answer relevancy, context recall,
        faithfulness and harmfulness when available.
    """
    langfuse_dataset = self.langfuse_dataset_service.get_dataset(
        dataset_name
    )

    for item in langfuse_dataset.items:

        response = self.query_engine.query(
            str_or_query_bundle=item.input["query_str"],
            chainlit_message_id=None,
            source_process=SourceProcess.DEPLOYMENT_EVALUATION,
        ).get_response()

        scores = self.ragas_evaluator.evaluate(response=response, item=item)

        trace = self.query_engine.get_current_langfuse_trace()
        trace.update(output=response.response)
        item.link(
            trace_or_observation=trace,
            run_name=self.run_name,
            run_description="Deployment evaluation",
            run_metadata=self.run_metadata,
        )

        # TODO: How to handle NaNs?
        if not isnan(scores["answer_relevancy"]):
            trace.score(
                name="Answer Relevancy", value=scores["answer_relevancy"]
            )
        if not isnan(scores["context_recall"]):
            trace.score(
                name="Context Recall", value=scores["context_recall"]
            )
        if not isnan(scores["faithfulness"]):
            trace.score(name="Faithfulness", value=scores["faithfulness"])
        if not isnan(scores["harmfulness"]):
            trace.score(name="Harmfulness", value=scores["harmfulness"])

RagasEvaluator

Evaluator for RAG system quality using RAGAS.

Wraps LlamaIndex LLM and embedding models for use with RAGAS evaluation framework. Supports multiple evaluation metrics.

Attributes:
  • judge_llm

    Wrapped LLM for evaluations

  • embedding_model

    Wrapped embeddings for metrics

  • evaluator_function

    Function to run evaluations

  • metrics

    List of RAGAS metrics to evaluate
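
A minimal construction sketch, assuming LlamaIndex OpenAI wrappers serve as judge and embedding models (any BaseLLM / BaseEmbedding pair works; the import paths and model names depend on the installed llama-index version):

# Sketch only: any LlamaIndex BaseLLM / BaseEmbedding implementations can be
# passed; the OpenAI classes and model names here are illustrative.
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

ragas_evaluator = RagasEvaluator(
    judge_llm=OpenAI(model="gpt-4o-mini"),
    embedding_model=OpenAIEmbedding(model="text-embedding-3-small"),
)
# Evaluates answer relevancy, faithfulness, harmfulness and context recall.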

Source code in src/evaluation/evaluators.py
class RagasEvaluator:
    """Evaluator for RAG system quality using RAGAS.

    Wraps LlamaIndex LLM and embedding models for use with RAGAS
    evaluation framework. Supports multiple evaluation metrics.

    Attributes:
        judge_llm: Wrapped LLM for evaluations
        embedding_model: Wrapped embeddings for metrics
        evaluator_function: Function to run evaluations
        metrics: List of RAGAS metrics to evaluate
    """

    def __init__(
        self,
        judge_llm: BaseLLM,
        embedding_model: BaseEmbedding,
        evaluator_function: Callable = ragas_evaluate,
    ) -> None:
        """Initialize RAGAS evaluator with models.

        Args:
            judge_llm: LLM for evaluation judgments
            embedding_model: Model for embedding comparisons
            evaluator_function: Optional custom evaluation function
        """
        self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
        self.embedding_model = LlamaIndexEmbeddingsWrapper(embedding_model)
        self.evaluator_function = evaluator_function

        self.metrics = [
            answer_relevancy,
            faithfulness,
            harmfulness,
            context_recall,
        ]

    def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
        """Evaluate response quality using RAGAS metrics.

        Args:
            response: Query response to evaluate
            item: Dataset item containing ground truth

        Returns:
            Series: Scores for each metric
        """
        dataset = Dataset.from_dict(
            {
                "question": [item.input["query_str"]],
                "contexts": [[n.node.text for n in response.source_nodes]],
                "answer": [response.response],
                "ground_truth": [item.expected_output["result"]],
            }
        )
        return (
            self.evaluator_function(
                metrics=self.metrics,
                dataset=dataset,
                llm=self.judge_llm,
                embeddings=self.embedding_model,
            )
            .to_pandas()
            .iloc[0]
        )

__init__(judge_llm, embedding_model, evaluator_function=ragas_evaluate)

Initialize RAGAS evaluator with models.

Parameters:
  • judge_llm (BaseLLM) –

    LLM for evaluation judgments

  • embedding_model (BaseEmbedding) –

    Model for embedding comparisons

  • evaluator_function (Callable, default: ragas_evaluate) –

    Optional custom evaluation function
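
Because the RAGAS call is injected through evaluator_function, it can be replaced in tests. A hedged sketch of a stub that mimics the shape the class relies on, an object exposing to_pandas(); the score values are placeholders, not real metrics:

import pandas as pd

class StubRagasResult:
    """Mimics the result object returned by the real RAGAS evaluate call."""

    def to_pandas(self) -> pd.DataFrame:
        # One row of placeholder scores for the single evaluated sample.
        return pd.DataFrame(
            [{"answer_relevancy": 1.0, "faithfulness": 1.0,
              "harmfulness": 0.0, "context_recall": 1.0}]
        )

def stub_evaluate(metrics, dataset, llm, embeddings):
    # Accepts the keyword arguments RagasEvaluator.evaluate() passes through.
    return StubRagasResult()

stub_evaluator = RagasEvaluator(
    judge_llm=judge_llm,               # any BaseLLM instance (assumed available)
    embedding_model=embedding_model,   # any BaseEmbedding instance (assumed available)
    evaluator_function=stub_evaluate,  # bypasses the real RAGAS call
)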

Source code in src/evaluation/evaluators.py
def __init__(
    self,
    judge_llm: BaseLLM,
    embedding_model: BaseEmbedding,
    evaluator_function: Callable = ragas_evaluate,
) -> None:
    """Initialize RAGAS evaluator with models.

    Args:
        judge_llm: LLM for evaluation judgments
        embedding_model: Model for embedding comparisons
        evaluator_function: Optional custom evaluation function
    """
    self.judge_llm = LlamaIndexLLMWrapper(judge_llm)
    self.embedding_model = LlamaIndexEmbeddingsWrapper(embedding_model)
    self.evaluator_function = evaluator_function

    self.metrics = [
        answer_relevancy,
        faithfulness,
        harmfulness,
        context_recall,
    ]

evaluate(response, item)

Evaluate response quality using RAGAS metrics.

Parameters:
  • response (Response) –

    Query response to evaluate

  • item (DatasetItemClient) –

    Dataset item containing ground truth

Returns:
  • Series –

    Scores for each metric
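
The Series is indexed by metric name, and RAGAS may return NaN when a metric cannot be computed, so callers typically guard the values. A short sketch (response and item are assumed to come from the query engine and a Langfuse dataset item):

from math import isnan

scores = ragas_evaluator.evaluate(response=response, item=item)

for metric in ("answer_relevancy", "context_recall", "faithfulness", "harmfulness"):
    if not isnan(scores[metric]):
        print(f"{metric}: {scores[metric]:.3f}")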

Source code in src/evaluation/evaluators.py
def evaluate(self, response: Response, item: DatasetItemClient) -> Series:
    """Evaluate response quality using RAGAS metrics.

    Args:
        response: Query response to evaluate
        item: Dataset item containing ground truth

    Returns:
        Series: Scores for each metric
    """
    dataset = Dataset.from_dict(
        {
            "question": [item.input["query_str"]],
            "contexts": [[n.node.text for n in response.source_nodes]],
            "answer": [response.response],
            "ground_truth": [item.expected_output["result"]],
        }
    )
    return (
        self.evaluator_function(
            metrics=self.metrics,
            dataset=dataset,
            llm=self.judge_llm,
            embeddings=self.embedding_model,
        )
        .to_pandas()
        .iloc[0]
    )