Fine-tuning Llama2 for Better Structured Outputs With Gradient and LlamaIndex#
In this notebook we show you how to fine-tune llama2-7b to be better at producing structured outputs, using gradient.ai.
This is similar in format to our OpenAI Functions Fine-tuning Notebook.
NOTE: This is an alternative to our repo/guide on fine-tuning llama2-7b with Modal: https://github.com/run-llama/modal_finetune_sql
!pip install llama-index gradientai -q
import os
from llama_index.llms import GradientBaseModelLLM
from llama_index.finetuning.gradient.base import GradientFinetuneEngine
os.environ["GRADIENT_ACCESS_TOKEN"] = os.getenv("GRADIENT_API_KEY")
os.environ["GRADIENT_WORKSPACE_ID"] = "<insert_workspace_id>"
Fine-tuning Using GPT-4 Pydantic Programs#
In this section we show how to log inputs + GPT-4 generated outputs through our low-level Pydantic Program module. We use that dataset to fine-tune llama2.
from pydantic import BaseModel
class Album(BaseModel):
    """Data model for an album."""

    name: str
    artist: str
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms import OpenAI, GradientBaseModelLLM
from llama_index.program import LLMTextCompletionProgram
from llama_index.output_parsers import PydanticOutputParser
openai_handler = LlamaDebugHandler()
openai_callback = CallbackManager([openai_handler])
openai_llm = OpenAI(model="gpt-4", callback_manager=openai_callback)
gradient_handler = LlamaDebugHandler()
gradient_callback = CallbackManager([gradient_handler])
base_model_slug = "llama2-7b-chat"
gradient_llm = GradientBaseModelLLM(
    base_model_slug=base_model_slug,
    max_tokens=300,
    callback_manager=gradient_callback,
    is_chat_model=True,
)
# HACK: set chat model
# from llama_index.llms.types import LLMMetadata
# gradient_llm.metadata = LLMMetadata(
#     context_window=1024,
#     num_output=gradient_llm.max_tokens or 20,
#     is_chat_model=True,
#     is_function_calling_model=False,
#     model_name=gradient_llm._model.id,
# )
# try running both through LLMTextCompletionProgram
prompt_template_str = """\
Generate an example album, with an artist and a list of songs. \
Using the movie {movie_name} as inspiration.\
"""
openai_program = LLMTextCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Album),
    prompt_template_str=prompt_template_str,
    llm=openai_llm,
    verbose=True,
)
gradient_program = LLMTextCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Album),
    prompt_template_str=prompt_template_str,
    llm=gradient_llm,
    verbose=True,
)
response = openai_program(movie_name="The Shining")
print(str(response))
tmp = openai_handler.get_llm_inputs_outputs()
print(tmp[0][0].payload["messages"][0])
# print(tmp[0][1])
response = gradient_program(movie_name="The Shining")
print(str(response))
tmp = gradient_handler.get_llm_inputs_outputs()
print(tmp[0][0].payload["messages"][0])
Defining Pydantic Model + Program#
Here, we define the GPT-4 powered Pydantic program that will generate structured outputs into a Pydantic object (an Album).
from llama_index.program import LLMTextCompletionProgram
from pydantic import BaseModel
from llama_index.llms import OpenAI
from llama_index.callbacks import GradientAIFineTuningHandler
from llama_index.callbacks import CallbackManager
from llama_index.output_parsers import PydanticOutputParser
from typing import List
class Song(BaseModel):
    """Data model for a song."""

    title: str
    length_seconds: int


class Album(BaseModel):
    """Data model for an album."""

    name: str
    artist: str
    songs: List[Song]
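As a quick sanity check on the nested schema, we can construct an Album by hand (illustrative only; the values below are made up):

# illustrative: the nested model validates as expected
Album(
    name="Example Album",
    artist="Example Artist",
    songs=[Song(title="Intro", length_seconds=90)],
)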
finetuning_handler = GradientAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])
llm_gpt4 = OpenAI(model="gpt-4", callback_manager=callback_manager)
prompt_template_str = """\
Generate an example album, with an artist and a list of songs. \
Using the movie {movie_name} as inspiration.\
"""
openai_program = LLMTextCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Album),
    prompt_template_str=prompt_template_str,
    llm=llm_gpt4,
    verbose=True,
)
Log Inputs/Outputs#
We define some sample movie names as inputs and log the outputs through the program.
# NOTE: we need >= 10 movies to use Gradient fine-tuning
movie_names = [
    "The Shining",
    "The Departed",
    "Titanic",
    "Goodfellas",
    "Pretty Woman",
    "Home Alone",
    "Caged Fury",
    "Edward Scissorhands",
    "Total Recall",
    "Ghost",
    "Tremors",
    "RoboCop",
    "Rocky V",
]
from tqdm.notebook import tqdm
for movie_name in tqdm(movie_names):
    output = openai_program(movie_name=movie_name)
    print(output.json())
events = finetuning_handler.get_finetuning_events()
events
finetuning_handler.save_finetuning_events("mock_finetune_songs.jsonl")
Wrote 14 examples to mock_finetune_songs.jsonl
!cat mock_finetune_songs.jsonl
Fine-tune on the Dataset#
We now define a fine-tuning engine and fine-tune on the mock dataset.
# define base model
base_model_slug = "llama2-7b-chat"
base_llm = GradientBaseModelLLM(
    base_model_slug=base_model_slug, max_tokens=500, is_chat_model=True
)
from llama_index.finetuning import GradientFinetuneEngine
finetune_engine = GradientFinetuneEngine(
    base_model_slug=base_model_slug,
    # model_adapter_id='805c6fd6-daa8-4fc8-a509-bebb2f2c1024_model_adapter',
    name="movies_structured",
    data_path="mock_finetune_songs.jsonl",
    verbose=True,
    max_steps=200,
    batch_size=1,
)
finetune_engine.model_adapter_id
'1f810f84-c4b8-43b0-b6b0-10d2cbdaf92f_model_adapter'
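If you persist this model_adapter_id, later runs can resume fine-tuning the same adapter instead of creating a new one. A minimal sketch, using the model_adapter_id parameter shown (commented out) above; the id is a placeholder:

# resume fine-tuning an existing adapter on a later run (sketch)
finetune_engine = GradientFinetuneEngine(
    base_model_slug=base_model_slug,
    model_adapter_id="<saved_model_adapter_id>",  # placeholder: your saved id
    name="movies_structured",
    data_path="mock_finetune_songs.jsonl",
    verbose=True,
    max_steps=200,
    batch_size=1,
)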
# adjust epochs as necessary
epochs = 2
for i in range(epochs):
    print(f"** EPOCH {i} **")
    finetune_engine.finetune()
ft_llm = finetune_engine.get_finetuned_model(
    max_tokens=500, is_chat_model=True
)
# # NOTE: same as doing the following
# from llama_index.llms import GradientModelAdapterLLM
# ft_llm = GradientModelAdapterLLM(
#     model_adapter_id=finetune_engine.model_adapter_id,
#     max_tokens=500,
# )
Try it Out!#
We obtain the fine-tuned LLM and use it with the Pydantic program.
# try a slightly modified prompt_template_str
new_prompt_template_str = """\
Generate an example album, with an artist and a list of songs. \
Using the movie {movie_name} as inspiration.\
Please only generate one album.
"""
gradient_program = LLMTextCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Album),
    # prompt_template_str=prompt_template_str,
    prompt_template_str=new_prompt_template_str,
    llm=ft_llm,
    verbose=True,
)
gradient_program(movie_name="Goodfellas")
Album(name='Wiseguy Melodies', artist='Tommy DeVito & The Gangsters', songs=[Song(title='Life in the Fast Lane', length_seconds=210), Song(title='Money and Power', length_seconds=240), Song(title='Goodfellas', length_seconds=270), Song(title='Betrayal', length_seconds=200), Song(title='Downfall', length_seconds=180)])
gradient_program(movie_name="Chucky")
# you wouldn't get this with normal llama2-7b!
base_gradient_program = LLMTextCompletionProgram.from_defaults(
    output_parser=PydanticOutputParser(Album),
    prompt_template_str=prompt_template_str,
    llm=base_llm,
    verbose=True,
)
# throws an error
base_gradient_program(movie_name="Goodfellas")
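Since the base model's raw output typically fails Pydantic validation, you can wrap the call to see the failure without halting the notebook (a minimal sketch):

# catch the expected parse/validation failure from the base model
try:
    base_gradient_program(movie_name="Goodfellas")
except Exception as e:
    print(f"Base llama2-7b failed to produce a parseable Album: {e!r}")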
Fine-tuning Structured Outputs through a RAG System#
A common use case for structured outputs is extracting them through a RAG system.
Here we show how to create a training dataset of context-augmented inputs + structured outputs over an unstructured document. We can then fine-tune the LLM and plug it into a RAG system to perform retrieval + output extraction.
!mkdir data && wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
from pydantic import Field
from typing import List
class Citation(BaseModel):
    """Citation class."""

    author: str = Field(
        ..., description="Inferred first author (usually last name)"
    )
    year: int = Field(..., description="Inferred year")
    desc: str = Field(
        ...,
        description=(
            "Inferred description from the text of the work that the author is"
            " cited for"
        ),
    )


class Response(BaseModel):
    """List of author citations.

    Extracted over unstructured text.
    """

    citations: List[Citation] = Field(
        ...,
        description=(
            "List of author citations (organized by author, year, and"
            " description)."
        ),
    )
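To see roughly what schema the output parser will ask the LLM to follow, you can print the Response model's JSON schema (pydantic v1 API, which this version of LlamaIndex targets):

# inspect the JSON schema for the structured output (pydantic v1 API)
print(Response.schema_json(indent=2))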
Load Data + Setup#
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index import Document, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from pathlib import Path
from llama_index.callbacks import GradientAIFineTuningHandler
loader = PyMuPDFReader()
docs0 = loader.load(file_path=Path("./data/llama2.pdf"))
doc_text = "\n\n".join([d.get_content() for d in docs0])
metadata = {
    "paper_title": "Llama 2: Open Foundation and Fine-Tuned Chat Models"
}
docs = [Document(text=doc_text, metadata=metadata)]
chunk_size = 1024
node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
nodes = node_parser.get_nodes_from_documents(docs)
len(nodes)
89
# setup GPT-4 context - to generate "ground-truth" data given queries
finetuning_handler = GradientAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])
gpt_4_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4-0613", temperature=0.3),
    callback_manager=callback_manager,
    chunk_size=chunk_size,
    # force using prompts instead of openai function calling for structured outputs
    pydantic_program_mode="llm",
)
# setup gradient.ai context
base_model_slug = "llama2-7b-chat"
base_llm = GradientBaseModelLLM(
    base_model_slug=base_model_slug, max_tokens=500, is_chat_model=True
)
gradient_context = ServiceContext.from_defaults(
    llm=base_llm,
    # callback_manager=callback_manager,
    chunk_size=chunk_size,
    pydantic_program_mode="llm",
)
# setup eval context (for question generation)
eval_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4-0613", temperature=0), chunk_size=chunk_size
)
Generate Dataset#
Here we show how to generate a training dataset over these unstructured chunks/nodes.
We generate questions to extract citations over different contexts. We run these questions through a GPT-4 RAG pipeline, extract structured outputs, and log inputs/outputs.
# setup dataset generator
from llama_index.evaluation import DatasetGenerator
from llama_index import SummaryIndex, PromptTemplate
from tqdm.notebook import tqdm
from tqdm.asyncio import tqdm_asyncio
fp = open("data/qa_pairs.jsonl", "w")
question_gen_prompt = PromptTemplate(
"""
{query_str}
Context:
{context_str}
Questions:
"""
)
question_gen_query = """\
Snippets from a research paper are given below. They contain citations.
Please generate questions from the text asking about these citations.
For instance, here are some sample questions:
Which citations correspond to related works on transformer models?
Tell me about authors that worked on advancing RLHF.
Can you tell me citations corresponding to all computer vision works? \
"""
qr_pairs = []
node_questions_tasks = []
for idx, node in enumerate(nodes[:39]):
    num_questions = 1  # change this number to generate more questions per node
    dataset_generator = DatasetGenerator(
        [node],
        question_gen_query=question_gen_query,
        text_question_template=question_gen_prompt,
        service_context=eval_context,
        metadata_mode="all",
        num_questions_per_chunk=num_questions,
    )
    task = dataset_generator.agenerate_questions_from_nodes(num=num_questions)
    node_questions_tasks.append(task)
node_questions_lists = await tqdm_asyncio.gather(*node_questions_tasks)
100%|██████████| 39/39 [00:27<00:00, 1.41it/s]
len(node_questions_lists)
39
node_questions_lists[1]
['Which citations are mentioned in the section about RLHF Results?']
# [optional] save
import pickle
pickle.dump(node_questions_lists, open("llama2_questions.pkl", "wb"))
# [optional] load questions
node_questions_lists = pickle.load(open("llama2_questions.pkl", "rb"))
from llama_index import VectorStoreIndex
gpt4_index = VectorStoreIndex(nodes[:39], service_context=gpt_4_context)
gpt4_query_engine = gpt4_index.as_query_engine(
    output_cls=Response, similarity_top_k=1
)
from json import JSONDecodeError
for idx, node in enumerate(tqdm(nodes[:39])):
    node_questions_0 = node_questions_lists[idx]
    for question in node_questions_0:
        try:
            # note: we don't need the response; events are logged through the fine-tuning handler
            gpt4_query_engine.query(question)
        except Exception as e:
            print(f"Error for question {question}, {repr(e)}")
finetuning_handler.save_finetuning_events("llama2_citation_events.jsonl")
Wrote 39 examples to llama2_citation_events.jsonl
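Optionally, you can peek at the first logged example to verify the format (assuming the standard one-JSON-object-per-line .jsonl layout):

# optional: inspect the first logged fine-tuning example
import json

with open("llama2_citation_events.jsonl") as f:
    first_example = json.loads(f.readline())
print(list(first_example.keys()))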
Setup Fine-tuning#
We kick off fine-tuning over the generated dataset.
from llama_index.finetuning import GradientFinetuneEngine
finetune_engine = GradientFinetuneEngine(
    base_model_slug=base_model_slug,
    # model_adapter_id='23a71710-47b3-43be-9be2-58a3efbccf2b_model_adapter',
    name="llama2_structured",
    data_path="llama2_citation_events.jsonl",
    verbose=True,
    max_steps=200,
    batch_size=1,
)
# save this for future runs
finetune_engine.model_adapter_id
'23a71710-47b3-43be-9be2-58a3efbccf2b_model_adapter'
# adjust epochs as necessary
epochs = 2
for i in range(epochs):
    print(f"** EPOCH {i} **")
    finetune_engine.finetune()
Use within RAG Pipeline#
Let's plug the fine-tuned LLM into a full RAG pipeline that produces structured outputs.
ft_llm = finetune_engine.get_finetuned_model(max_tokens=500)
ft_service_context = ServiceContext.from_defaults(llm=ft_llm)
from llama_index import VectorStoreIndex
vector_index = VectorStoreIndex(nodes, service_context=ft_service_context)
query_engine = vector_index.as_query_engine(
    output_cls=Response, similarity_top_k=1
)
# setup baseline as well
base_index = VectorStoreIndex(nodes, service_context=gradient_context)
base_query_engine = base_index.as_query_engine(
    output_cls=Response, similarity_top_k=1
)
query_str = "Which citations are mentioned in the section about RLHF Results?"
# query_str = """\
# Which citation corresponds to the concept of collecting data that represents \
# empirically sampled human preferences in RLHF?\
# """
# query_str = "Which citations in the paper discuss the development and release of Llama 2?"
# query_str = "Which citations are mentioned in the section on RLHF Results?"
# query_str = "Which citation discusses the carbon output related to the production of AI hardware?"
response = query_engine.query(query_str)
print(str(response))
Let's take a look at the sources.
# view sources
print(response.source_nodes[0].get_content())
Let's compare against the baseline (the base llama2-7b model). Notice that the query engine throws an error!
# throws an error!
base_response = base_query_engine.query(query_str)
print(str(base_response))
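As before, you can wrap the baseline call in a try/except to inspect the failure without halting the notebook (a minimal sketch):

try:
    base_response = base_query_engine.query(query_str)
    print(str(base_response))
except Exception as e:
    print(f"Base llama2-7b failed structured extraction: {e!r}")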
As a reference, let's also compare against gpt-4.
# as a reference, take a look at GPT-4 response
gpt4_response = gpt4_query_engine.query(query_str)
print(str(gpt4_response))