🚀 DeepEval - Open Source Evals with Tracing¶
This code tutorial shows how you can easily trace and evaluate your LlamaIndex Agents. You can read more about the DeepEval framework here: https://docs.confident-ai.com/docs/getting-started
DeepEval's LlamaIndex integration allows you to trace your LlamaIndex agents and evaluate them using DeepEval's default metrics. Read more about the integration here: https://deepeval.com/integrations/frameworks/llamaindex
Feel free to check out our repository here on GitHub: https://github.com/confident-ai/deepeval
Quickstart¶
Install the following packages:
!pip install -q llama-index
!pip install -U -q deepeval
This step is optional and only needed if you want a server-hosted dashboard. (Psst, I think you should!)
!deepeval login
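If you'd prefer to authenticate programmatically rather than through the interactive login, you can pass your Confident AI API key to deepeval.login instead; this is the approach the example scripts later in this tutorial use. A minimal sketch, assuming the key is stored in a CONFIDENT_API_KEY environment variable:

import os
import deepeval

# Equivalent to `deepeval login`, but non-interactive.
# Assumes CONFIDENT_API_KEY is set (e.g. loaded from a .env file).
deepeval.login(os.getenv("CONFIDENT_API_KEY"))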
End-to-End Evals¶
deepeval allows you to evaluate LlamaIndex applications end-to-end in under a minute.
Create a FunctionAgent with the list of metrics you wish to use, then invoke your LlamaIndex application's run method as usual.
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument

from deepeval.integrations.llama_index import (
    instrument_llama_index,
    FunctionAgent,
)
from deepeval.metrics import AnswerRelevancyMetric

# Hook DeepEval's tracing into LlamaIndex's instrumentation dispatcher
instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


answer_relevancy_metric = AnswerRelevancyMetric()

# DeepEval's FunctionAgent wraps LlamaIndex's agent and attaches the metrics
agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)


async def llm_app(input: str):
    return await agent.run(input)


asyncio.run(llm_app("What is 2 * 3?"))
Evaluations are supported for LlamaIndex FunctionAgent, ReActAgent and CodeActAgent. Only metrics with the LLM parameters input and output are eligible for evaluation.
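To see what "metrics with LLM parameters input and output" means in practice, here is a minimal sketch of scoring such a metric by hand. The integration does this mapping for you at run time; the LLMTestCase below is illustrative, not the integration's exact internals:

from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# AnswerRelevancyMetric only needs the input and the actual output,
# which is why it is eligible for agent evaluation.
test_case = LLMTestCase(
    input="What is 2 * 3?",
    actual_output="2 * 3 = 6",
)

metric = AnswerRelevancyMetric()
metric.measure(test_case)
print(metric.score, metric.reason)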
Synchronous¶
Create an EvaluationDataset from a list of Goldens, then run your agent on each golden yielded by evals_iterator:
from deepeval.dataset import EvaluationDataset, Golden

dataset = EvaluationDataset(
    goldens=[Golden(input="What is 3 * 12?"), Golden(input="What is 4 * 13?")]
)

for golden in dataset.evals_iterator():
    # Block on each run so goldens are processed one at a time
    asyncio.run(llm_app(golden.input))
Asynchronous¶
from deepeval.dataset import EvaluationDataset, Golden
import asyncio

dataset = EvaluationDataset(
    goldens=[Golden(input="What's 7 * 8?"), Golden(input="What's 7 * 6?")]
)

for golden in dataset.evals_iterator():
    task = asyncio.create_task(llm_app(golden.input))
    dataset.evaluate(task)
⚠️ Warning: DeepEval uses event loops to manage asynchronous operations.¶
Jupyter notebooks maintain their own event loop, which may lead to unexpected behavior, hangs, or runtime errors when running DeepEval examples directly in a notebook cell.
Recommendation: to avoid these issues, run your DeepEval examples as standalone Python scripts (.py files) instead of inside a Jupyter notebook.
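If you still want to experiment inside a notebook, one common workaround (an assumption on our part, not part of this tutorial) is the nest_asyncio package, which patches the notebook's running loop so nested asyncio.run calls don't collide:

# Notebook-only workaround; standalone scripts don't need this.
import nest_asyncio

nest_asyncio.apply()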
Examples¶
Here are some example scripts.
# Synchronous (End-to-End Evals)
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden
from dotenv import load_dotenv

load_dotenv()

deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


answer_relevancy_metric = AnswerRelevancyMetric()

agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)


async def llm_app(input: str):
    return await agent.run(input)


dataset = EvaluationDataset(
    goldens=[Golden(input="What is 3 * 12?"), Golden(input="What is 4 * 13?")]
)

for golden in dataset.evals_iterator():
    # Block on each run so goldens are processed one at a time
    asyncio.run(llm_app(golden.input))
# Asynchronous (End-to-End Evals)
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden
from dotenv import load_dotenv

load_dotenv()

# Don't forget to set up tracing
deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


answer_relevancy_metric = AnswerRelevancyMetric()

agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)

goldens = [Golden(input="What's 7 * 8?"), Golden(input="What's 7 * 6?")]


async def llm_app(golden: Golden):
    await agent.run(golden.input)


def main():
    dataset = EvaluationDataset(goldens=goldens)
    for golden in dataset.evals_iterator():
        task = asyncio.create_task(llm_app(golden))
        dataset.evaluate(task)


if __name__ == "__main__":
    main()
# End-to-End Evals with a metric collection
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.dataset import Golden
from dotenv import load_dotenv

load_dotenv()

# Don't forget to set up tracing
deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metric_collection="test_collection_1",
)


async def llm_app(golden: Golden):
    await agent.run(golden.input)


asyncio.run(llm_app(Golden(input="What is 3 * 12?")))
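Note that this last script passes metric_collection instead of a local metrics list. Our understanding is that test_collection_1 is a placeholder for a metric collection you have created on Confident AI, so the traced runs are evaluated server-side rather than with locally instantiated metrics.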