Advanced RAG with temporal filters using LlamaIndex and KDB.AI vector store¶
KDB.AI is a powerful knowledge-based vector database and search engine that allows you to build scalable, reliable AI applications, using real-time data, by providing advanced search, recommendation and personalization.
This example demonstrates how to use KDB.AI to run semantic search, summarization and analysis of financial regulations around some specific moment in time.
To access your end point and API keys, sign up to KDB.AI here.
To set up your development environment, follow the instructions on the KDB.AI pre-requisites page.
The following examples demonstrate some of the ways you can interact with KDB.AI through LlamaIndex.
Install dependencies with Pip¶
In [ ]:
Copied!
# %pip install llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai
# %pip install kdbai_client pandas
# %pip install llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai
# %pip install kdbai_client pandas
Import dependencies¶
In [ ]:
Copied!
from getpass import getpass
import re
import os
import shutil
import time
import urllib
import pandas as pd
from llama_index.core import (
Settings,
SimpleDirectoryReader,
ServiceContext,
StorageContext,
VectorStoreIndex,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.kdbai import KDBAIVectorStore
import pykx as kx
import kdbai_client as kdbai
OUTDIR = "pdf"
RESET = True
# LLM = 'gpt-3.5-turbo'
LLM = "gpt-4-turbo-preview" # Expensive !!!
EMBEDDING = "sentence-transformers/all-mpnet-base-v2"
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
from getpass import getpass
import re
import os
import shutil
import time
import urllib
import pandas as pd
from llama_index.core import (
Settings,
SimpleDirectoryReader,
ServiceContext,
StorageContext,
VectorStoreIndex,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.kdbai import KDBAIVectorStore
import pykx as kx
import kdbai_client as kdbai
OUTDIR = "pdf"
RESET = True
# LLM = 'gpt-3.5-turbo'
LLM = "gpt-4-turbo-preview" # Expensive !!!
EMBEDDING = "sentence-transformers/all-mpnet-base-v2"
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
Create KDB.AI session and table¶
In [ ]:
Copied!
KDBAI_ENDPOINT = "http://localhost:8082"
KDBAI_API_KEY = None
KDBAI_TABLE_NAME = "reports"
session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)
if KDBAI_TABLE_NAME in session.list():
session.table(KDBAI_TABLE_NAME).drop()
schema = dict(
columns=[
dict(name="document_id", pytype="bytes"),
dict(name="text", pytype="bytes"),
dict(
name="embedding",
vectorIndex=dict(type="flat", metric="L2", dims=768),
),
dict(name="title", pytype="bytes"),
dict(name="publication_date", pytype="datetime64[ns]"),
]
)
table = session.create_table(KDBAI_TABLE_NAME, schema)
KDBAI_ENDPOINT = "http://localhost:8082"
KDBAI_API_KEY = None
KDBAI_TABLE_NAME = "reports"
session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)
if KDBAI_TABLE_NAME in session.list():
session.table(KDBAI_TABLE_NAME).drop()
schema = dict(
columns=[
dict(name="document_id", pytype="bytes"),
dict(name="text", pytype="bytes"),
dict(
name="embedding",
vectorIndex=dict(type="flat", metric="L2", dims=768),
),
dict(name="title", pytype="bytes"),
dict(name="publication_date", pytype="datetime64[ns]"),
]
)
table = session.create_table(KDBAI_TABLE_NAME, schema)
Financial reports urls and metadata¶
In [ ]:
Copied!
INPUT_URLS = [
"https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf",
"https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf",
]
METADATA = {
"pdf/PLAW-106publ102.pdf": {
"title": "GRAMM–LEACH–BLILEY ACT, 1999",
"publication_date": pd.to_datetime("1999-11-12"),
},
"pdf/PLAW-111publ203.pdf": {
"title": "DODD-FRANK WALL STREET REFORM AND CONSUMER PROTECTION ACT, 2010",
"publication_date": pd.to_datetime("2010-07-21"),
},
}
INPUT_URLS = [
"https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf",
"https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf",
]
METADATA = {
"pdf/PLAW-106publ102.pdf": {
"title": "GRAMM–LEACH–BLILEY ACT, 1999",
"publication_date": pd.to_datetime("1999-11-12"),
},
"pdf/PLAW-111publ203.pdf": {
"title": "DODD-FRANK WALL STREET REFORM AND CONSUMER PROTECTION ACT, 2010",
"publication_date": pd.to_datetime("2010-07-21"),
},
}
Download PDF files locally¶
In [ ]:
Copied!
%%time
CHUNK_SIZE = 512 * 1024
def download_file(url):
print("Downloading %s..." % url)
out = os.path.join(OUTDIR, os.path.basename(url))
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
logging.exception("Failed to download %s !" % url)
else:
with open(out, "wb") as f:
while True:
chunk = response.read(CHUNK_SIZE)
if chunk:
f.write(chunk)
else:
break
return out
if RESET:
if os.path.exists(OUTDIR):
shutil.rmtree(OUTDIR)
os.mkdir(OUTDIR)
local_files = [download_file(x) for x in INPUT_URLS]
local_files[:10]
%%time
CHUNK_SIZE = 512 * 1024
def download_file(url):
print("Downloading %s..." % url)
out = os.path.join(OUTDIR, os.path.basename(url))
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
logging.exception("Failed to download %s !" % url)
else:
with open(out, "wb") as f:
while True:
chunk = response.read(CHUNK_SIZE)
if chunk:
f.write(chunk)
else:
break
return out
if RESET:
if os.path.exists(OUTDIR):
shutil.rmtree(OUTDIR)
os.mkdir(OUTDIR)
local_files = [download_file(x) for x in INPUT_URLS]
local_files[:10]
Downloading https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf... Downloading https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf... CPU times: user 33 ms, sys: 25.4 ms, total: 58.4 ms Wall time: 6.09 s
Load local PDF files with LlamaIndex¶
In [ ]:
Copied!
%%time
def get_metadata(filepath):
return METADATA[filepath]
documents = SimpleDirectoryReader(
input_files=local_files,
file_metadata=get_metadata,
)
docs = documents.load_data()
len(docs)
%%time
def get_metadata(filepath):
return METADATA[filepath]
documents = SimpleDirectoryReader(
input_files=local_files,
file_metadata=get_metadata,
)
docs = documents.load_data()
len(docs)
CPU times: user 3.94 s, sys: 22.5 ms, total: 3.96 s Wall time: 3.96 s
Out[ ]:
994
Setup LlamaIndex RAG pipeline using KDB.AI vector store¶
In [ ]:
Copied!
%%time
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING)
llm = OpenAI(temperature=0, model=LLM)
vector_store = KDBAIVectorStore(table)
service_context = ServiceContext.from_defaults(
embed_model=embed_model, llm=llm
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
docs,
service_context=service_context,
storage_context=storage_context,
transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],
)
%%time
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING)
llm = OpenAI(temperature=0, model=LLM)
vector_store = KDBAIVectorStore(table)
service_context = ServiceContext.from_defaults(
embed_model=embed_model, llm=llm
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
docs,
service_context=service_context,
storage_context=storage_context,
transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],
)
<timed exec>:4: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.
CPU times: user 1min 23s, sys: 3min 3s, total: 4min 27s Wall time: 32.4 s
Setup the LlamaIndex Query Engine¶
In [ ]:
Copied!
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K,
filter=[("<", "publication_date", "2008-09-15")],
sort_by="publication_date",
)
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K,
filter=[("<", "publication_date", "2008-09-15")],
sort_by="publication_date",
)
CPU times: user 24.1 ms, sys: 3.74 ms, total: 27.9 ms Wall time: 26.6 ms
Before the 2008 crisis¶
In [ ]:
Copied!
%%time
result = query_engine.query(
"""
What was the main financial regulation in the US before the 2008 financial crisis ?
"""
)
print(result.response)
%%time
result = query_engine.query(
"""
What was the main financial regulation in the US before the 2008 financial crisis ?
"""
)
print(result.response)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Before the 2008 financial crisis, the main financial regulation in the US included a variety of laws and regulatory measures, but one of the most significant frameworks was established by the Gramm-Leach-Bliley Act of 1999. This act repealed parts of the Glass-Steagall Act of 1933, allowing banks to offer a broader range of financial services, including investment, commercial banking, and insurance services. Other regulatory measures and entities, such as the Securities and Exchange Commission (SEC) and laws like the Sarbanes-Oxley Act of 2002, also played key roles in the financial regulatory landscape prior to the crisis. CPU times: user 202 ms, sys: 53.5 ms, total: 256 ms Wall time: 18.1 s
In [ ]:
Copied!
%%time
result = query_engine.query(
"""
Is the Gramm-Leach-Bliley Act of 1999 enough to prevent the 2008 crisis. Search the document and explain its strenghts and weaknesses to regulate the US stock market.
"""
)
print(result.response)
%%time
result = query_engine.query(
"""
Is the Gramm-Leach-Bliley Act of 1999 enough to prevent the 2008 crisis. Search the document and explain its strenghts and weaknesses to regulate the US stock market.
"""
)
print(result.response)
The Gramm-Leach-Bliley Act of 1999, also known as the Financial Services Modernization Act, aimed to modernize financial services by removing barriers between banking, securities, and insurance companies, allowing them to offer each other's services. While the Act contributed to financial services integration and competition, its effectiveness in preventing crises like that of 2008 is debatable due to its strengths and weaknesses in regulating the US stock market. Strengths: 1. Enhanced Competition: By allowing financial institutions to merge and offer a broader range of services, the Act fostered competition, innovation, and efficiency in the financial sector. 2. Functional Regulation: The Act maintained that activities within financial institutions would be regulated by the appropriate functional regulator (e.g., securities activities by the SEC), aiming for expertise-based oversight. Weaknesses: 1. Increased Systemic Risk: The Act's facilitation of larger, more complex financial institutions may have contributed to systemic risk, as failures of these institutions could have more significant impacts on the financial system. 2. Regulatory Gaps and Oversight Challenges: The integration of different financial services under one roof made it challenging for regulators to oversee and manage the risks of these conglomerates effectively. The Act did not fully address the need for a systemic risk regulator or enhance oversight of the shadow banking system, which played a significant role in the 2008 crisis. 3. Weakened Consumer Privacy Protections: While the Act included provisions for protecting consumers' personal financial information, critics argue that it also allowed for increased sharing of this information among financial entities, potentially undermining consumer privacy. In summary, while the Gramm-Leach-Bliley Act of 1999 had the potential to foster innovation and efficiency in the financial sector by breaking down barriers between different types of financial services, its weaknesses in addressing systemic risk and regulatory oversight challenges may have limited its effectiveness in preventing financial crises like that of 2008. CPU times: user 177 ms, sys: 45.6 ms, total: 223 ms Wall time: 31.6 s
After the 2008 crisis¶
In [ ]:
Copied!
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K,
filter=[(">=", "publication_date", "2008-09-15")],
sort_by="publication_date",
)
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K,
filter=[(">=", "publication_date", "2008-09-15")],
sort_by="publication_date",
)
CPU times: user 217 µs, sys: 99 µs, total: 316 µs Wall time: 320 µs
In [ ]:
Copied!
%%time
result = query_engine.query(
"""
What happened on the 15th of September 2008 ? Answer from your own knowledge only.
"""
)
print(result.response)
%%time
result = query_engine.query(
"""
What happened on the 15th of September 2008 ? Answer from your own knowledge only.
"""
)
print(result.response)
I'm unable to provide an answer based on the given instructions. CPU times: user 151 ms, sys: 22 ms, total: 173 ms Wall time: 12.7 s
In [ ]:
Copied!
%%time
result = query_engine.query(
"""
What was the new US financial regulation enacted after the 2008 crisis to increase the market regulation and to improve consumer sentiment ?
"""
)
print(result.response)
%%time
result = query_engine.query(
"""
What was the new US financial regulation enacted after the 2008 crisis to increase the market regulation and to improve consumer sentiment ?
"""
)
print(result.response)
The Dodd-Frank Wall Street Reform and Consumer Protection Act, 2010. CPU times: user 184 ms, sys: 23.1 ms, total: 207 ms Wall time: 17.1 s
In depth analysis¶
In [ ]:
Copied!
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K, sort_by="publication_date"
)
%%time
# Using gpt-3.5-turbo, the 16k tokens context size can only fit around 15 pages of document.
# Using gpt-4-turbo-preview, the 128k tokens context size can take 100 pages.
K = 100
query_engine = index.as_query_engine(
similarity_top_k=K, sort_by="publication_date"
)
CPU times: user 381 µs, sys: 2 µs, total: 383 µs Wall time: 399 µs
In [ ]:
Copied!
%%time
result = query_engine.query(
"""
Analyse the US financial regulations before and after the 2008 crisis and produce a report of all related arguments to explain what happened, and to ensure that does not happen again.
Use both the provided context and your own knowledge but do mention explicitely which one you use.
"""
)
print(result.response)
%%time
result = query_engine.query(
"""
Analyse the US financial regulations before and after the 2008 crisis and produce a report of all related arguments to explain what happened, and to ensure that does not happen again.
Use both the provided context and your own knowledge but do mention explicitely which one you use.
"""
)
print(result.response)
Before the 2008 financial crisis, the US financial system was characterized by deregulation and an increase in complex financial products such as mortgage-backed securities and derivatives. The Gramm-Leach-Bliley Act of 1999 repealed the Glass-Steagall Act, allowing banks to engage in investment activities, which led to increased risk-taking. The lack of transparency and understanding of these complex financial products, coupled with inadequate oversight, contributed to the financial crisis. After the 2008 crisis, the Dodd-Frank Wall Street Reform and Consumer Protection Act was enacted in 2010 to address the regulatory gaps and weaknesses revealed by the crisis. The Act aimed to increase transparency, protect consumers, and prevent the occurrence of a similar crisis. Key provisions included the creation of the Financial Stability Oversight Council to monitor systemic risk, the establishment of the Consumer Financial Protection Bureau to protect consumers from abusive financial practices, and the introduction of the Volcker Rule to limit speculative investments by banks. Additionally, the Act imposed stricter capital requirements and introduced mechanisms for the orderly liquidation of failing financial institutions to prevent bailouts. To ensure that a similar crisis does not happen again, it is crucial to maintain vigilant regulatory oversight, promote transparency in financial markets, and ensure that financial institutions have robust risk management practices in place. Continuous monitoring of systemic risks and the ability to adapt regulations in response to evolving financial products and practices are also essential. This analysis is based on the context provided and my own knowledge of the US financial regulations before and after the 2008 crisis. CPU times: user 1.11 s, sys: 1.99 s, total: 3.1 s Wall time: 29.8 s