Jina Embeddings¶

If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.

In [ ]:

Copied!

%pip install llama-index-embeddings-jinaai
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-jinaai
%pip install llama-index-llms-openai

In [ ]:

Copied!

!pip install llama-index
!pip install llama-index

You may also need other packages that do not come direcly with llama-index

In [ ]:

Copied!

!pip install Pillow
!pip install Pillow

For this example, you will need an API key which you can get from https://jina.ai/embeddings/

In [ ]:

Copied!

# Initilise with your api key
import os

jinaai_api_key = "YOUR_JINAAI_API_KEY"
os.environ["JINAAI_API_KEY"] = jinaai_api_key
# Initilise with your api key
import os

jinaai_api_key = "YOUR_JINAAI_API_KEY"
os.environ["JINAAI_API_KEY"] = jinaai_api_key

Embed text and queries with Jina embedding models through JinaAI API¶

You can encode your text and your queries using the JinaEmbedding class

In [ ]:

Copied!





from llama_index.embeddings.jinaai import JinaEmbedding

embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
)

embeddings = embed_model.get_text_embedding("This is the text to embed")
print("Text dim:", len(embeddings))
print("Text embed:", embeddings[:5])

embeddings = embed_model.get_query_embedding("This is the query to embed")
print("Query dim:", len(embeddings))
print("Query embed:", embeddings[:5])
from llama_index.embeddings.jinaai import JinaEmbedding

embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
)

embeddings = embed_model.get_text_embedding("This is the text to embed")
print("Text dim:", len(embeddings))
print("Text embed:", embeddings[:5])

embeddings = embed_model.get_query_embedding("This is the query to embed")
print("Query dim:", len(embeddings))
print("Query embed:", embeddings[:5])

Embed images and queries with Jina CLIP through JinaAI API¶

You can also encode your images and your queries using the JinaEmbedding class

In [ ]:

Copied!





from llama_index.embeddings.jinaai import JinaEmbedding
from PIL import Image
import requests
from numpy import dot
from numpy.linalg import norm

embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-clip-v1",
)

image_url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcStMP8S3VbNCqOQd7QQQcbvC_FLa1HlftCiJw&s"
im = Image.open(requests.get(image_url, stream=True).raw)
print("Image:")
display(im)

image_embeddings = embed_model.get_image_embedding(image_url)
print("Image dim:", len(image_embeddings))
print("Image embed:", image_embeddings[:5])

text_embeddings = embed_model.get_text_embedding(
    "Logo of a pink blue llama on dark background"
)
print("Text dim:", len(text_embeddings))
print("Text embed:", text_embeddings[:5])

cos_sim = dot(image_embeddings, text_embeddings) / (
    norm(image_embeddings) * norm(text_embeddings)
)
print("Cosine similarity:", cos_sim)
from llama_index.embeddings.jinaai import JinaEmbedding
from PIL import Image
import requests
from numpy import dot
from numpy.linalg import norm

embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-clip-v1",
)

image_url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcStMP8S3VbNCqOQd7QQQcbvC_FLa1HlftCiJw&s"
im = Image.open(requests.get(image_url, stream=True).raw)
print("Image:")
display(im)

image_embeddings = embed_model.get_image_embedding(image_url)
print("Image dim:", len(image_embeddings))
print("Image embed:", image_embeddings[:5])

text_embeddings = embed_model.get_text_embedding(
    "Logo of a pink blue llama on dark background"
)
print("Text dim:", len(text_embeddings))
print("Text embed:", text_embeddings[:5])

cos_sim = dot(image_embeddings, text_embeddings) / (
    norm(image_embeddings) * norm(text_embeddings)
)
print("Cosine similarity:", cos_sim)

Embed in batches¶

You can also embed text in batches, the batch size can be controlled by setting the embed_batch_size parameter (the default value will be 10 if not passed, and it should not be larger than 2048)

In [ ]:

Copied!





embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
    embed_batch_size=16,
)

embeddings = embed_model.get_text_embedding_batch(
    ["This is the text to embed", "More text can be provided in a batch"]
)

print(len(embeddings))
print(embeddings[0][:5])
embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
    embed_batch_size=16,
)

embeddings = embed_model.get_text_embedding_batch(
    ["This is the text to embed", "More text can be provided in a batch"]
)

print(len(embeddings))
print(embeddings[0][:5])

Let's build a RAG pipeline using Jina AI Embeddings¶

Download Data¶

In [ ]:

Copied!

!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

Imports¶

In [ ]:

Copied!

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.llms.openai import OpenAI
from llama_index.core.response.notebook_utils import display_source_node

from IPython.display import Markdown, display
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

from llama_index.llms.openai import OpenAI
from llama_index.core.response.notebook_utils import display_source_node

from IPython.display import Markdown, display

Load Data¶

In [ ]:

Copied!

documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

Build index¶

In [ ]:

Copied!





your_openai_key = "YOUR_OPENAI_KEY"
llm = OpenAI(api_key=your_openai_key)
embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
    embed_batch_size=16,
)

index = VectorStoreIndex.from_documents(
    documents=documents, embed_model=embed_model
)
your_openai_key = "YOUR_OPENAI_KEY"
llm = OpenAI(api_key=your_openai_key)
embed_model = JinaEmbedding(
    api_key=jinaai_api_key,
    model="jina-embeddings-v2-base-en",
    embed_batch_size=16,
)

index = VectorStoreIndex.from_documents(
    documents=documents, embed_model=embed_model
)

Build retriever¶

In [ ]:

Copied!

search_query_retriever = index.as_retriever()

search_query_retrieved_nodes = search_query_retriever.retrieve(
    "What happened after the thesis?"
)
search_query_retriever = index.as_retriever()

search_query_retrieved_nodes = search_query_retriever.retrieve(
    "What happened after the thesis?"
)

In [ ]:

Copied!

for n in search_query_retrieved_nodes:
    display_source_node(n, source_length=2000)
for n in search_query_retrieved_nodes:
    display_source_node(n, source_length=2000)