Semantic similarity

Evaluation modules.

SemanticSimilarityEvaluator

Bases: BaseEvaluator

Embedding similarity evaluator.

Evaluate the quality of a question answering system by comparing the similarity between embeddings of the generated answer and the reference answer.

Inspired by this paper:

- Semantic Answer Similarity for Evaluating Question Answering Models: https://arxiv.org/pdf/2108.06130.pdf
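
With the default settings, the score is the cosine similarity between the two answer embeddings (an assumption about what `SimilarityMode.DEFAULT` computes; other modes and a custom `similarity_fn` are also supported), and the response passes when the score meets the threshold:

$$
\text{score} = \frac{\mathbf{e}_{\text{response}} \cdot \mathbf{e}_{\text{reference}}}{\lVert \mathbf{e}_{\text{response}} \rVert \, \lVert \mathbf{e}_{\text{reference}} \rVert},
\qquad
\text{passing} \iff \text{score} \ge \text{similarity\_threshold}
$$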

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `embed_model` | `Optional[BaseEmbedding]` | Embedding model used to embed the response and reference. Resolved from `Settings` (or the deprecated `service_context`) when not given. | `None` |
| `similarity_fn` | `Optional[Callable[..., float]]` | Custom function mapping two embeddings to a similarity score. Cannot be combined with `similarity_mode`. | `None` |
| `similarity_mode` | `Optional[SimilarityMode]` | Similarity mode used when no `similarity_fn` is given. Falls back to `SimilarityMode.DEFAULT`. | `None` |
| `similarity_threshold` | `float` | Embedding similarity threshold for "passing". Defaults to 0.8. | `0.8` |
| `service_context` | `Optional[ServiceContext]` | Deprecated. Service context. | `None` |
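
A minimal usage sketch (assuming an embedding model is configured via `Settings` or passed explicitly as `embed_model`; the example strings are illustrative):

```python
# Minimal usage sketch, assuming a default embedding model is available
# (e.g. configured via Settings or passed as embed_model=...).
import asyncio

from llama_index.core.evaluation import SemanticSimilarityEvaluator

evaluator = SemanticSimilarityEvaluator(similarity_threshold=0.8)

result = asyncio.run(
    evaluator.aevaluate(
        response="The Eiffel Tower is located in Paris, France.",
        reference="The Eiffel Tower stands in Paris.",
    )
)

print(result.score)    # embedding similarity between response and reference
print(result.passing)  # True when score >= similarity_threshold
```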
Source code in llama-index-core/llama_index/core/evaluation/semantic_similarity.py
```python
class SemanticSimilarityEvaluator(BaseEvaluator):
    """Embedding similarity evaluator.

    Evaluate the quality of a question answering system by
    comparing the similarity between embeddings of the generated answer
    and the reference answer.

    Inspired by this paper:
    - Semantic Answer Similarity for Evaluating Question Answering Models
        https://arxiv.org/pdf/2108.06130.pdf

    Args:
        service_context (Optional[ServiceContext]): Service context.
        similarity_threshold (float): Embedding similarity threshold for "passing".
            Defaults to 0.8.
    """

    def __init__(
        self,
        embed_model: Optional[BaseEmbedding] = None,
        similarity_fn: Optional[Callable[..., float]] = None,
        similarity_mode: Optional[SimilarityMode] = None,
        similarity_threshold: float = 0.8,
        # deprecated
        service_context: Optional[ServiceContext] = None,
    ) -> None:
        self._embed_model = embed_model or embed_model_from_settings_or_context(
            Settings, service_context
        )
        if similarity_fn is None:
            similarity_mode = similarity_mode or SimilarityMode.DEFAULT
            self._similarity_fn = lambda x, y: similarity(x, y, mode=similarity_mode)
        else:
            if similarity_mode is not None:
                raise ValueError(
                    "Cannot specify both similarity_fn and similarity_mode"
                )
            self._similarity_fn = similarity_fn

        self._similarity_threshold = similarity_threshold

    def _get_prompts(self) -> PromptDictType:
        """Get prompts."""
        return {}

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluationResult:
        del query, contexts, kwargs  # Unused

        if response is None or reference is None:
            raise ValueError("Must specify both response and reference")

        response_embedding = await self._embed_model.aget_text_embedding(response)
        reference_embedding = await self._embed_model.aget_text_embedding(reference)

        similarity_score = self._similarity_fn(response_embedding, reference_embedding)
        passing = similarity_score >= self._similarity_threshold
        return EvaluationResult(
            score=similarity_score,
            passing=passing,
            feedback=f"Similarity score: {similarity_score}",
        )
```
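
As the constructor shows, a custom `similarity_fn` can replace the built-in scoring (it cannot be combined with `similarity_mode`). A sketch with a hand-rolled dot-product scorer, which is illustrative rather than the library's own implementation:

```python
# Sketch of a custom similarity_fn: a plain dot product, which equals cosine
# similarity when the embedding model returns unit-normalized vectors.
# Assumes an embedding model is configured via Settings or embed_model=...
from typing import List

from llama_index.core.evaluation import SemanticSimilarityEvaluator


def dot_product(x: List[float], y: List[float]) -> float:
    return sum(a * b for a, b in zip(x, y))


evaluator = SemanticSimilarityEvaluator(
    similarity_fn=dot_product,
    similarity_threshold=0.8,
)
```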