Reranking top pages from PDF using LlamaParse and ZeroEntropy¶
In this guide, we'll build a simple workflow to parse PDF documents into text using LlamaParse and then query and rerank the textual data.
Pre-requisites¶
- Python 3.8+
- The zeroentropy client
- The llama_cloud_services client
- A ZeroEntropy API key (Get yours here)
- A LlamaParse API key (Get yours here)
What You'll Learn¶
- How to use LlamaParse to accurately convert PDF documents into markdown
- How to use ZeroEntropy to semantically index and query the parsed documents
- How to rerank your results using ZeroEntropy's reranker zerank-1 to boost accuracy
Setting up your ZeroEntropy Client and LlamaParse Client¶
First, install dependencies:
!pip install zeroentropy python-dotenv llama_cloud_services requests
Now load your API keys and initialize the clients:
# Get your API keys from the ZeroEntropy and LlamaParse websites
# https://dashboard.zeroentropy.dev/
# https://docs.cloud.llamaindex.ai/api_key
ZEROENTROPY_API_KEY = "your_api_key_here"
LLAMAPARSE_API_KEY = "your_api_key_here"
from zeroentropy import AsyncZeroEntropy, ConflictError
from llama_cloud_services import LlamaParse
import os
# We initialize the AsyncZeroEntropy client in order to parse multiple documents in parallel
# If you want to parse a single document, you can use the synchronous client instead
zclient = AsyncZeroEntropy(api_key=ZEROENTROPY_API_KEY)
# We initialize the llama_parse client to parse the PDF documents into text
llamaParser = LlamaParse(
api_key=LLAMAPARSE_API_KEY,
num_workers=1, # if multiple files passed, split in `num_workers` API calls
result_type="text",
verbose=True,
language="en", # optionally define a language, default=en
)
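As the comment above notes, a synchronous client also works if you only need to index documents one at a time. A minimal sketch, assuming the zeroentropy package exposes a synchronous ZeroEntropy class mirroring the async API used in this guide:
from zeroentropy import ZeroEntropy
# Assumption: synchronous counterpart of AsyncZeroEntropy with the same method names
sync_client = ZeroEntropy(api_key=ZEROENTROPY_API_KEY)
# Synchronous calls need no await, e.g.:
# sync_client.collections.add(collection_name="my_collection")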
Adding a collection to the ZeroEntropy client¶
collection_name = "my_collection"
await zclient.collections.add(collection_name=collection_name)
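Note that re-running this cell after the collection already exists raises a ConflictError (imported above). To make the cell idempotent, you can wrap the call, for example:
try:
    await zclient.collections.add(collection_name=collection_name)
except ConflictError:
    print(f"Collection '{collection_name}' already exists, reusing it")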
Now define a function to download and extract PDF files from Dropbox directly to memory:
import requests
import zipfile
import asyncio
import io
from typing import List, Tuple
def download_and_extract_dropbox_zip_to_memory(
url: str,
) -> List[Tuple[str, bytes]]:
"""Download and extract a zip file from Dropbox URL directly to memory.
Returns:
List of tuples containing (filename, file_content_bytes)
"""
try:
# Download the zip file
print(f"Downloading zip file from: {url}")
response = requests.get(url, stream=True)
response.raise_for_status()
# Read zip content into memory
zip_content = io.BytesIO()
for chunk in response.iter_content(chunk_size=8192):
zip_content.write(chunk)
zip_content.seek(0)
# Extract files from zip in memory
files_in_memory = []
with zipfile.ZipFile(zip_content, "r") as zip_ref:
for file_info in zip_ref.infolist():
if (
not file_info.is_dir()
and file_info.filename.lower().endswith(".pdf")
):
file_content = zip_ref.read(file_info.filename)
files_in_memory.append((file_info.filename, file_content))
print(
f"Loaded {file_info.filename} ({len(file_content)} bytes)"
)
print(
f"Successfully loaded {len(files_in_memory)} PDF files into memory"
)
return files_in_memory
except Exception as e:
print(f"Error downloading/extracting zip file: {e}")
raise
# Download and extract files from Dropbox directly to memory
dropbox_url = "https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1"
files_in_memory = download_and_extract_dropbox_zip_to_memory(dropbox_url)
Downloading zip file from: https://www.dropbox.com/scl/fi/oi6kf91gz8h76d2wt57mb/example_docs.zip?rlkey=mf21tvyb65tyrjkr1t2szt226&dl=1
Loaded example_docs/S-P-Global-2024-Annual-Report.pdf (2434264 bytes)
Loaded example_docs/annual-report-sg-en-spy.pdf (603698 bytes)
Loaded example_docs/dashboard-sp-500-factor.pdf (1717787 bytes)
Successfully loaded 3 PDF files into memory
Parsing PDFs using LlamaParse¶
Now let's parse the downloaded PDF files directly in memory using LlamaParse:
# Create file-like objects for LlamaParse
file_objects = []
file_names = []
for filename, file_content in files_in_memory:
# Create a file-like object from bytes
file_obj = io.BytesIO(file_content)
file_obj.name = filename # Set the name attribute for LlamaParse
file_objects.append(file_obj)
file_names.append(filename)
# Parse all PDF files at once using LlamaParse
# Pass extra_info with each file name, which LlamaParse needs when parsing in-memory byte streams
print(f"Parsing {len(file_objects)} PDF files...")
# Use async parsing to avoid nested event loop issues
text_data = await asyncio.gather(
*[
llamaParser.aparse(file_obj, extra_info={"file_name": name})
for file_obj, name in zip(file_objects, file_names)
]
)
print(f"Successfully parsed {len(text_data)} documents")
Parsing 3 PDF files...
Started parsing the file under job_id a1324745-c58b-4a24-b757-c6a6a58e57cd
Started parsing the file under job_id 326b947e-9d95-4dc3-aeaf-440b9cc03016
Started parsing the file under job_id b8534aa0-ed69-4079-a720-1b2471066c6f
............
Successfully parsed 3 documents
Organizing your documents¶
Once parsed, we organize the results into a list of documents, where each document is a list of its page texts.
docs = []
for doc in text_data:
    # Collect the text of every page in the parsed document
    pages = [page.text for page in doc.pages]
    docs.append(pages)
print(f"Organized {len(docs)} documents with pages")
if docs:
print(f"First document has {len(docs[0])} pages")
Organized 3 documents with pages
First document has 104 pages
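Before indexing, it's worth a quick sanity check that the parsed text looks reasonable, for example by previewing the start of the first page of the first document:
# Preview the first few hundred characters of the first parsed page
if docs and docs[0]:
    print(docs[0][0][:300])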
Querying with ZeroEntropy¶
We'll now define functions to upload the documents as text pages asynchronously.
import asyncio
from tqdm.asyncio import tqdm
sem = asyncio.Semaphore(16)
async def add_document_with_pages(
collection_name: str, filename: str, pages: list, doc_index: int
):
"""Add a single document with multiple pages to the collection."""
async with sem: # Limit concurrent operations
for retry in range(3): # Retry logic
try:
response = await zclient.documents.add(
collection_name=collection_name,
path=filename, # Use the actual filename as path
content={
"type": "text-pages",
"pages": pages, # Send list of strings directly
},
)
return response
except ConflictError:
print(
f"Document '{filename}' already exists in collection '{collection_name}'"
)
break
except Exception as e:
if retry == 2: # Last retry
print(f"Failed to add document '{filename}': {e}")
return None
await asyncio.sleep(0.1 * (retry + 1))  # Brief linear backoff before retrying
async def upload_documents_async(
docs: list, file_names: list, collection_name: str
):
"""
Upload documents asynchronously to ZeroEntropy collection.
Args:
docs: 2D array where docs[i] contains the list of pages (strings) for document i
file_names: Array where file_names[i] contains the path for document i
collection_name: Name of the collection to add documents to
"""
# Validate input arrays have same length
if len(docs) != len(file_names):
raise ValueError("docs and file_names must have the same length")
# Print starting message
print(f"Starting upload of {len(docs)} documents...")
# Create tasks for all documents
tasks = [
add_document_with_pages(collection_name, file_names[i], docs[i], i)
for i in range(len(docs))
]
# Execute all tasks concurrently with progress bar
results = await tqdm.gather(*tasks, desc="Uploading Documents")
# Count successful uploads
successful = sum(1 for result in results if result is not None)
print(f"Successfully uploaded {successful}/{len(docs)} documents")
return results
Querying documents with ZeroEntropy¶
First, upload the documents:
await upload_documents_async(docs, file_names, collection_name)
Starting upload of 3 documents...
Uploading Documents: 100%|██████████| 3/3 [00:00<00:00, 3.42it/s]
Successfully uploaded 3/3 documents
[DocumentAddResponse(message='Success!'), DocumentAddResponse(message='Success!'), DocumentAddResponse(message='Success!')]
Then query for the top 5 pages:
response = await zclient.queries.top_pages(
collection_name=collection_name,
query="What are the top 100 stocks in the S&P 500?",
k=5,
)
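Each entry in response.results carries the document path, the page_index, and a relevance score; these are the same fields the reranking function below reads. You can inspect the initial ranking like this:
# Print the initial (pre-rerank) ranking returned by top_pages
for i, result in enumerate(response.results, 1):
    print(f"{i}. {result.path} (page {result.page_index}) - score: {result.score:.4f}")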
Now let's define a function to rerank the pages in the response:
async def rerank_top_pages_with_metadata(
query: str, top_pages_response, collection_name: str
):
"""
Rerank the results from a top_pages query and return re-ordered list with metadata.
Args:
query: The query string to use for reranking
top_pages_response: The response object from zclient.queries.top_pages()
collection_name: Name of the collection to fetch page content from
Returns:
List of dicts with 'path', 'page_index', and 'rerank_score' in reranked order
"""
# Fetch page content and store metadata for each result
documents = []
metadata = []
for result in top_pages_response.results:
# Fetch the actual page content
page_info = await zclient.documents.get_page_info(
collection_name=collection_name,
path=result.path,
page_index=result.page_index,
include_content=True,
)
# Get page content and ensure it's not empty
page_content = page_info.page.content
if page_content and page_content.strip():
documents.append(page_content.strip())
metadata.append(
{
"path": result.path,
"page_index": result.page_index,
"original_score": result.score,
}
)
else:
# Include empty pages with fallback content
documents.append("No content available")
metadata.append(
{
"path": result.path,
"page_index": result.page_index,
"original_score": result.score,
}
)
if not documents:
raise ValueError("No documents found to rerank")
# Perform reranking
rerank_response = await zclient.models.rerank(
model="zerank-1", query=query, documents=documents
)
# Create re-ordered list with metadata
reranked_results = []
for rerank_result in rerank_response.results:
original_metadata = metadata[rerank_result.index]
reranked_results.append(
{
"path": original_metadata["path"],
"page_index": original_metadata["page_index"],
"rerank_score": rerank_result.relevance_score,
}
)
return reranked_results
Run the function and see the results!
reranked_results = await rerank_top_pages_with_metadata(
query="What are the top 100 stocks in the S&P 500?",
top_pages_response=response,
collection_name=collection_name,
)
# Display results
print("Reranked Results with Metadata:")
for i, result in enumerate(reranked_results, 1):
print(
f"Rank {i}: {result['path']} (Page {result['page_index']}) - Score: {result['rerank_score']:.4f}"
)
Reranked Results with Metadata:
Rank 1: example_docs/dashboard-sp-500-factor.pdf (Page 9) - Score: 0.8472
Rank 2: example_docs/dashboard-sp-500-factor.pdf (Page 12) - Score: 0.8311
Rank 3: example_docs/dashboard-sp-500-factor.pdf (Page 8) - Score: 0.7941
Rank 4: example_docs/dashboard-sp-500-factor.pdf (Page 2) - Score: 0.4571
Rank 5: example_docs/dashboard-sp-500-factor.pdf (Page 4) - Score: 0.4511
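To read the text of the best match, you can fetch it with the same get_page_info call used inside the reranking function:
# Fetch and preview the content of the top reranked page
top = reranked_results[0]
page_info = await zclient.documents.get_page_info(
    collection_name=collection_name,
    path=top["path"],
    page_index=top["page_index"],
    include_content=True,
)
print(page_info.page.content[:500])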
✅ That's It!¶
You've now built a working semantic search engine that processes PDF files entirely in memory using ZeroEntropy and LlamaParse — no local file storage required!