DeepEval: Evaluation and Observability for LlamaIndex¶

DeepEval (by Confident AI) now integrates with LlamaIndex, giving you end-to-end visibility and evaluation tools for your LlamaIndex agents.

Quickstart¶

Install the following packages:

In [ ]:

Copied!

!pip install -U deepeval llama-index
!pip install -U deepeval llama-index

Log in with your Confident AI API key and instrument LlamaIndex with DeepEval:

In [ ]:

Copied!

import llama_index.core.instrumentation as instrument

import deepeval
from deepeval.integrations.llama_index import instrument_llama_index

# Log in to Confident AI; replace the placeholder with your own API key.
deepeval.login("<your-confident-api-key>")

# Pass the dispatcher returned by LlamaIndex's instrumentation module to
# DeepEval's instrumentation hook.
instrument_llama_index(instrument.get_dispatcher())
import llama_index.core.instrumentation as instrument

import deepeval
from deepeval.integrations.llama_index import instrument_llama_index

# Log in to Confident AI; replace the placeholder with your own API key.
deepeval.login("<your-confident-api-key>")

# Pass the dispatcher returned by LlamaIndex's instrumentation module to
# DeepEval's instrumentation hook.
instrument_llama_index(instrument.get_dispatcher())

Example Agent¶

⚠️ Note: DeepEval may not work reliably in Jupyter notebooks due to event loop conflicts. It is recommended to run examples in a standalone Python script instead.

In [ ]:

Copied!





import os
import time
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from llama_index.core.agent.workflow import FunctionAgent

import deepeval
from deepeval.integrations.llama_index import instrument_llama_index

# Don't forget to set up tracing: log in to Confident AI with your API key.
deepeval.login("<your-confident-api-key>")

# Instrument LlamaIndex by passing its dispatcher to DeepEval.
instrument_llama_index(instrument.get_dispatcher())

# OpenAI(...) below is constructed without an explicit key, so it is
# presumably read from this environment variable. Replace the placeholder.
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    # NOTE(review): the docstring is kept verbatim; the agent framework
    # presumably surfaces it to the LLM as the tool description -- confirm.
    product = a * b
    return product


# Build an agent whose only tool is multiply(), using the gpt-4o-mini
# OpenAI model and a calculation-focused system prompt.
agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
)


async def main():
    """Send one multiplication question to the agent and print its reply."""
    answer = await agent.run("What's 7 * 8?")
    print(answer)


# Drive the coroutine only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
import os
import time
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from llama_index.core.agent.workflow import FunctionAgent

import deepeval
from deepeval.integrations.llama_index import instrument_llama_index

# Don't forget to set up tracing: log in to Confident AI with your API key.
deepeval.login("<your-confident-api-key>")

# Instrument LlamaIndex by passing its dispatcher to DeepEval.
instrument_llama_index(instrument.get_dispatcher())

# OpenAI(...) below is constructed without an explicit key, so it is
# presumably read from this environment variable. Replace the placeholder.
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    # NOTE(review): the docstring is kept verbatim; the agent framework
    # presumably surfaces it to the LLM as the tool description -- confirm.
    product = a * b
    return product


# Build an agent whose only tool is multiply(), using the gpt-4o-mini
# OpenAI model and a calculation-focused system prompt.
agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
)


async def main():
    """Send one multiplication question to the agent and print its reply."""
    answer = await agent.run("What's 7 * 8?")
    print(answer)


# Drive the coroutine only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

You can directly view the traces in the Observatory by clicking on the link in the output printed in the console.

Online Evaluations¶

You can use DeepEval to evaluate your LlamaIndex agents on Confident AI.

Create a metric collection on Confident AI.
Pass the metric collection name to DeepEval's LlamaIndex agent wrapper.

In [ ]:

Copied!





import os
import time
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument

import deepeval
from deepeval.integrations.llama_index import FunctionAgent
from deepeval.integrations.llama_index import instrument_llama_index

# Log in to Confident AI; replace the placeholder with your own API key.
deepeval.login("<your-confident-api-key>")

# Instrument LlamaIndex by passing its dispatcher to DeepEval.
instrument_llama_index(instrument.get_dispatcher())

# OpenAI(...) below is constructed without an explicit key, so it is
# presumably read from this environment variable. Replace the placeholder.
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    # NOTE(review): the docstring is kept verbatim; the agent framework
    # presumably surfaces it to the LLM as the tool description -- confirm.
    product = a * b
    return product


# FunctionAgent here is the one imported from deepeval.integrations.llama_index
# (DeepEval's agent wrapper), not the LlamaIndex class.
# metric_collection is the name of the metric collection created on
# Confident AI; replace "test_collection_1" with your own collection name.
agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metric_collection="test_collection_1",
)


async def main():
    """Send one multiplication question to the agent and print its reply."""
    answer = await agent.run("What's 7 * 8?")
    print(answer)


# Drive the coroutine only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
import os
import time
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument

import deepeval
from deepeval.integrations.llama_index import FunctionAgent
from deepeval.integrations.llama_index import instrument_llama_index

# Log in to Confident AI; replace the placeholder with your own API key.
deepeval.login("<your-confident-api-key>")

# Instrument LlamaIndex by passing its dispatcher to DeepEval.
instrument_llama_index(instrument.get_dispatcher())

# OpenAI(...) below is constructed without an explicit key, so it is
# presumably read from this environment variable. Replace the placeholder.
os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    # NOTE(review): the docstring is kept verbatim; the agent framework
    # presumably surfaces it to the LLM as the tool description -- confirm.
    product = a * b
    return product


# FunctionAgent here is the one imported from deepeval.integrations.llama_index
# (DeepEval's agent wrapper), not the LlamaIndex class.
# metric_collection is the name of the metric collection created on
# Confident AI; replace "test_collection_1" with your own collection name.
agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metric_collection="test_collection_1",
)


async def main():
    """Send one multiplication question to the agent and print its reply."""
    answer = await agent.run("What's 7 * 8?")
    print(answer)


# Drive the coroutine only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

DeepEval: Evaluation and Observability for LlamaIndex¶

Quickstart¶

Example Agent¶

Online Evaluations¶

References¶