Langfuse

This module contains functionality related to the langfuse module in evaluation.evaluators.

LangfuseEvaluator

Evaluator that tracks RAG performance metrics in Langfuse.

Integrates chat engine execution with RAGAS evaluation and publishes quality metrics to Langfuse for monitoring and analysis.
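
A minimal usage sketch, assuming the collaborators (a LangfuseChatEngine, a LangfuseDatasetService, and a RagasEvaluator) have already been constructed elsewhere, for example via their factories; the dataset name and build name below are placeholders:

# Hedged sketch: wiring a LangfuseEvaluator by hand.
# `chat_engine`, `dataset_service`, and `ragas_evaluator` are assumed to be
# pre-configured LangfuseChatEngine, LangfuseDatasetService, and RagasEvaluator
# instances; the string values are placeholders.
evaluator = LangfuseEvaluator(
    chat_engine=chat_engine,
    langfuse_dataset_service=dataset_service,
    ragas_evaluator=ragas_evaluator,
    run_metadata={"build_name": "build-1234"},  # "build_name" is required; it becomes the run name
)

# Runs every non-archived item of the named Langfuse dataset through the chat
# engine, scores the responses with RAGAS, and records the scores on the traces.
evaluator.evaluate(dataset_name="qa-regression-set")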

Source code in src/evaluation/evaluators/langfuse.py
class LangfuseEvaluator:
    """Evaluator that tracks RAG performance metrics in Langfuse.

    Integrates chat engine execution with RAGAS evaluation and
    publishes quality metrics to Langfuse for monitoring and analysis.
    """

    def __init__(
        self,
        chat_engine: LangfuseChatEngine,
        langfuse_dataset_service: LangfuseDatasetService,
        ragas_evaluator: RagasEvaluator,
        run_metadata: dict,
    ) -> None:
        """Initialize the Langfuse evaluator with required components.

        Args:
            chat_engine: The chat engine that will generate responses
            langfuse_dataset_service: Service to retrieve evaluation datasets
            ragas_evaluator: Component to calculate quality metrics
            run_metadata: Dictionary containing metadata about the evaluation run
        """
        self.chat_engine = chat_engine
        self.ragas_evaluator = ragas_evaluator
        self.langfuse_dataset_service = langfuse_dataset_service
        self.run_name = run_metadata["build_name"]
        self.run_metadata = run_metadata

    def evaluate(self, dataset_name: str) -> None:
        """Run evaluation on a dataset and record results in Langfuse.

        Processes each item in the dataset, generates responses using the chat engine,
        calculates evaluation metrics, and uploads all results to Langfuse for monitoring.

        Args:
            dataset_name: Identifier of the dataset to evaluate

        Note:
            Records scores for answer relevancy, context recall, faithfulness, and
            harmfulness metrics when they are available (not NaN values).
        """
        langfuse_dataset = self.langfuse_dataset_service.get_dataset(
            dataset_name
        )

        for item in langfuse_dataset.items:

            if item.status == DatasetStatus.ARCHIVED:
                continue

            response = self.chat_engine.chat(
                message=item.input["query_str"],
                chat_history=[],
                chainlit_message_id=None,
                source_process=SourceProcess.DEPLOYMENT_EVALUATION,
            )

            scores = self.ragas_evaluator.evaluate(response=response, item=item)

            trace = self.chat_engine.get_current_langfuse_trace()
            trace.update(output=response.response)
            item.link(
                trace_or_observation=trace,
                run_name=self.run_name,
                run_description="Deployment evaluation",
                run_metadata=self.run_metadata,
            )

            # TODO: How to handle NaNs?
            if not isnan(scores["answer_relevancy"]):
                trace.score(
                    name="Answer Relevancy", value=scores["answer_relevancy"]
                )
            if not isnan(scores["context_recall"]):
                trace.score(
                    name="Context Recall", value=scores["context_recall"]
                )
            if not isnan(scores["faithfulness"]):
                trace.score(name="Faithfulness", value=scores["faithfulness"])
            if not isnan(scores["harmfulness"]):
                trace.score(name="Harmfulness", value=scores["harmfulness"])

__init__(chat_engine, langfuse_dataset_service, ragas_evaluator, run_metadata)

Initialize the Langfuse evaluator with required components.

Parameters:
  • chat_engine (LangfuseChatEngine) – The chat engine that will generate responses
  • langfuse_dataset_service (LangfuseDatasetService) – Service to retrieve evaluation datasets
  • ragas_evaluator (RagasEvaluator) – Component to calculate quality metrics
  • run_metadata (dict) – Dictionary containing metadata about the evaluation run (see the example below)
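
The only key read by __init__ itself is "build_name", which becomes the Langfuse run name; the whole dictionary is also forwarded as the run metadata when each trace is linked to the dataset run. A hedged example mirroring the keys the factory below populates (all values are placeholders):

run_metadata = {
    "build_name": "build-1234",              # required: becomes the run name
    "llm_configuration": "chat-llm",         # extra metadata recorded with the run
    "judge_llm_configuration": "judge-llm",  # extra metadata recorded with the run
}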

Source code in src/evaluation/evaluators/langfuse.py
def __init__(
    self,
    chat_engine: LangfuseChatEngine,
    langfuse_dataset_service: LangfuseDatasetService,
    ragas_evaluator: RagasEvaluator,
    run_metadata: dict,
) -> None:
    """Initialize the Langfuse evaluator with required components.

    Args:
        chat_engine: The chat engine that will generate responses
        langfuse_dataset_service: Service to retrieve evaluation datasets
        ragas_evaluator: Component to calculate quality metrics
        run_metadata: Dictionary containing metadata about the evaluation run
    """
    self.chat_engine = chat_engine
    self.ragas_evaluator = ragas_evaluator
    self.langfuse_dataset_service = langfuse_dataset_service
    self.run_name = run_metadata["build_name"]
    self.run_metadata = run_metadata

evaluate(dataset_name)

Run evaluation on a dataset and record results in Langfuse.

Processes each item in the dataset, generates responses using the chat engine, calculates evaluation metrics, and uploads all results to Langfuse for monitoring.

Parameters:
  • dataset_name (str) – Identifier of the dataset to evaluate

Note

Records scores for answer relevancy, context recall, faithfulness, and harmfulness metrics when they are available (not NaN values).
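
The four NaN guards in the source below share one pattern: look up the metric in the scores mapping returned by the RAGAS evaluator, skip it if the value is NaN, and otherwise attach it to the trace via trace.score(name=..., value=...). A hedged, behaviour-equivalent sketch of that pattern as a single loop (the metric keys and display names are taken from the source; `scores` and `trace` are as in the method body):

from math import isnan

# Metric keys returned by the RAGAS evaluator, mapped to the display names
# used for the Langfuse scores.
METRIC_DISPLAY_NAMES = {
    "answer_relevancy": "Answer Relevancy",
    "context_recall": "Context Recall",
    "faithfulness": "Faithfulness",
    "harmfulness": "Harmfulness",
}

for key, display_name in METRIC_DISPLAY_NAMES.items():
    value = scores[key]
    if not isnan(value):  # skip metrics that came back as NaN
        trace.score(name=display_name, value=value)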

Source code in src/evaluation/evaluators/langfuse.py
def evaluate(self, dataset_name: str) -> None:
    """Run evaluation on a dataset and record results in Langfuse.

    Processes each item in the dataset, generates responses using the chat engine,
    calculates evaluation metrics, and uploads all results to Langfuse for monitoring.

    Args:
        dataset_name: Identifier of the dataset to evaluate

    Note:
        Records scores for answer relevancy, context recall, faithfulness, and
        harmfulness metrics when they are available (not NaN values).
    """
    langfuse_dataset = self.langfuse_dataset_service.get_dataset(
        dataset_name
    )

    for item in langfuse_dataset.items:

        if item.status == DatasetStatus.ARCHIVED:
            continue

        response = self.chat_engine.chat(
            message=item.input["query_str"],
            chat_history=[],
            chainlit_message_id=None,
            source_process=SourceProcess.DEPLOYMENT_EVALUATION,
        )

        scores = self.ragas_evaluator.evaluate(response=response, item=item)

        trace = self.chat_engine.get_current_langfuse_trace()
        trace.update(output=response.response)
        item.link(
            trace_or_observation=trace,
            run_name=self.run_name,
            run_description="Deployment evaluation",
            run_metadata=self.run_metadata,
        )

        # TODO: How to handle NaNs?
        if not isnan(scores["answer_relevancy"]):
            trace.score(
                name="Answer Relevancy", value=scores["answer_relevancy"]
            )
        if not isnan(scores["context_recall"]):
            trace.score(
                name="Context Recall", value=scores["context_recall"]
            )
        if not isnan(scores["faithfulness"]):
            trace.score(name="Faithfulness", value=scores["faithfulness"])
        if not isnan(scores["harmfulness"]):
            trace.score(name="Harmfulness", value=scores["harmfulness"])

LangfuseEvaluatorFactory

Bases: Factory

Factory for creating LangfuseEvaluator instances.

Creates properly configured evaluators based on the provided configuration.
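
A hedged usage sketch, assuming the Factory base class exposes the same classmethod-style create(configuration) entry point that this module already uses for the other factories, and that `configuration` is a fully loaded EvaluationConfiguration:

# `configuration` is assumed to be a populated EvaluationConfiguration instance.
evaluator = LangfuseEvaluatorFactory.create(configuration)
evaluator.evaluate(dataset_name="qa-regression-set")  # placeholder dataset name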

Source code in src/evaluation/evaluators/langfuse.py
class LangfuseEvaluatorFactory(Factory):
    """Factory for creating LangfuseEvaluator instances.

    Creates properly configured evaluators based on the provided configuration.
    """

    _configuration_class: Type = EvaluationConfiguration

    @classmethod
    def _create_instance(
        cls, configuration: EvaluationConfiguration
    ) -> LangfuseEvaluator:
        """Create a new LangfuseEvaluator instance.

        Args:
            configuration: Complete evaluation configuration containing
                           settings for the chat engine, datasets, and metrics

        Returns:
            A fully configured LangfuseEvaluator instance ready for evaluation
        """
        chat_engine = ChatEngineRegistry.get(
            configuration.augmentation.chat_engine.name
        ).create(configuration)
        langfuse_dataset_service = LangfuseDatasetServiceFactory.create(
            configuration.augmentation.langfuse
        )
        ragas_evaluator = RagasEvaluatorFactory.create(configuration.evaluation)
        return LangfuseEvaluator(
            chat_engine=chat_engine,
            langfuse_dataset_service=langfuse_dataset_service,
            ragas_evaluator=ragas_evaluator,
            run_metadata={
                "build_name": configuration.metadata.build_name,
                "llm_configuration": configuration.augmentation.chat_engine.llm.name,
                "judge_llm_configuration": configuration.evaluation.judge_llm.name,
            },
        )