32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
class SimpleMultiModalQueryEngine(BaseQueryEngine):
    """Simple multi-modal retriever query engine.

    Retrieves nodes with a multi-modal retriever, splits them into image and
    text nodes, and answers the query with a single multi-modal LLM call.
    Assumes that the retrieved text context fits within the context window of
    the LLM, along with the images.

    Args:
        retriever (MultiModalVectorIndexRetriever): A retriever object.
        multi_modal_llm (Optional[MultiModalLLM]): Multi-modal LLM to use. When
            omitted (or falsy), an ``OpenAIMultiModal`` model
            (``gpt-4-vision-preview``, ``max_new_tokens=1000``) is created.
        text_qa_template (Optional[BasePromptTemplate]): Text QA prompt
            template. Defaults to ``DEFAULT_TEXT_QA_PROMPT``.
        image_qa_template (Optional[BasePromptTemplate]): Image QA prompt
            template. Also defaults to ``DEFAULT_TEXT_QA_PROMPT``.
        node_postprocessors (Optional[List[BaseNodePostprocessor]]): Node
            postprocessors applied, in order, to retrieved nodes.
        callback_manager (Optional[CallbackManager]): A callback manager. It is
            also assigned to every node postprocessor.
        **kwargs: Accepted for call-site compatibility; not used by this class.

    Raises:
        ImportError: If no ``multi_modal_llm`` is given and the
            ``llama-index-multi-modal-llms-openai`` package is not installed.
    """

    def __init__(
        self,
        retriever: "MultiModalVectorIndexRetriever",
        multi_modal_llm: Optional[MultiModalLLM] = None,
        text_qa_template: Optional[BasePromptTemplate] = None,
        image_qa_template: Optional[BasePromptTemplate] = None,
        node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
        callback_manager: Optional[CallbackManager] = None,
        **kwargs: Any,
    ) -> None:
        self._retriever = retriever
        if multi_modal_llm:
            self._multi_modal_llm = multi_modal_llm
        else:
            # Import lazily so the OpenAI integration stays an optional
            # dependency. Only the import is guarded, so an error raised while
            # constructing the model is not misreported as a missing package.
            try:
                from llama_index.multi_modal_llms.openai import (
                    OpenAIMultiModal,
                )  # pants: no-infer-dep
            except ImportError as e:
                raise ImportError(
                    "`llama-index-multi-modal-llms-openai` package cannot be found. "
                    "Please install it by using "
                    "`pip install llama-index-multi-modal-llms-openai`"
                ) from e

            self._multi_modal_llm = OpenAIMultiModal(
                model="gpt-4-vision-preview", max_new_tokens=1000
            )

        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
        # NOTE(review): the image template falls back to the *text* QA prompt,
        # while `_get_response_with_images` only supplies `query_str` when
        # formatting it -- confirm the default formats correctly without
        # `context_str`.
        self._image_qa_template = image_qa_template or DEFAULT_TEXT_QA_PROMPT

        self._node_postprocessors = node_postprocessors or []
        callback_manager = callback_manager or CallbackManager([])
        # Share one callback manager between the engine and its postprocessors.
        for node_postprocessor in self._node_postprocessors:
            node_postprocessor.callback_manager = callback_manager

        super().__init__(callback_manager)

    def _get_prompts(self) -> Dict[str, Any]:
        """Get prompts.

        Only the text QA template is exposed here.
        """
        return {"text_qa_template": self._text_qa_template}

    def _get_prompt_modules(self) -> PromptMixinType:
        """Get prompt sub-modules (this engine has none)."""
        return {}

    def _apply_node_postprocessors(
        self, nodes: List[NodeWithScore], query_bundle: QueryBundle
    ) -> List[NodeWithScore]:
        """Run every configured postprocessor over ``nodes``, in order.

        Each postprocessor receives the output of the previous one.
        """
        for node_postprocessor in self._node_postprocessors:
            nodes = node_postprocessor.postprocess_nodes(
                nodes, query_bundle=query_bundle
            )
        return nodes

    def retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for the query and apply the node postprocessors."""
        nodes = self._retriever.retrieve(query_bundle)
        return self._apply_node_postprocessors(nodes, query_bundle=query_bundle)

    async def aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Async variant of :meth:`retrieve`.

        Retrieval is awaited; the postprocessors are applied synchronously.
        """
        nodes = await self._retriever.aretrieve(query_bundle)
        return self._apply_node_postprocessors(nodes, query_bundle=query_bundle)

    def _format_text_qa_prompt(
        self, query_bundle: QueryBundle, text_nodes: List[NodeWithScore]
    ) -> str:
        """Format the text QA template with the text nodes' content as context."""
        context_str = "\n\n".join(
            [r.get_content(metadata_mode=MetadataMode.LLM) for r in text_nodes]
        )
        return self._text_qa_template.format(
            context_str=context_str, query_str=query_bundle.query_str
        )

    @staticmethod
    def _image_documents(image_nodes: List[NodeWithScore]) -> List[ImageNode]:
        """Unwrap scored nodes, keeping only those wrapping an ``ImageNode``."""
        return [
            image_node.node
            for image_node in image_nodes
            if isinstance(image_node.node, ImageNode)
        ]

    @staticmethod
    def _merge_source_nodes(
        nodes: List[NodeWithScore],
        additional_source_nodes: Optional[Sequence[NodeWithScore]],
    ) -> List[NodeWithScore]:
        """Return ``nodes`` followed by any additional source nodes.

        When no additional nodes are supplied, ``nodes`` is returned unchanged
        (same object), so the default call path behaves as it did before.
        """
        if not additional_source_nodes:
            return nodes
        return [*nodes, *additional_source_nodes]

    def synthesize(
        self,
        query_bundle: QueryBundle,
        nodes: List[NodeWithScore],
        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
    ) -> RESPONSE_TYPE:
        """Generate a response from already-retrieved nodes.

        Args:
            query_bundle: The query; its ``query_str`` fills the prompt.
            nodes: Retrieved nodes, split into image and text nodes by
                ``_get_image_and_text_nodes``.
            additional_source_nodes: Extra nodes reported in the response's
                ``source_nodes`` after ``nodes``; they are not sent to the LLM.

        Returns:
            A ``Response`` whose metadata carries ``text_nodes`` and
            ``image_nodes``.
        """
        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
        fmt_prompt = self._format_text_qa_prompt(query_bundle, text_nodes)

        llm_response = self._multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=self._image_documents(image_nodes),
        )
        return Response(
            response=str(llm_response),
            source_nodes=self._merge_source_nodes(nodes, additional_source_nodes),
            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
        )

    def _get_response_with_images(
        self,
        prompt_str: str,
        image_nodes: List[NodeWithScore],
    ) -> RESPONSE_TYPE:
        """Answer ``prompt_str`` using only the given image nodes.

        The image QA template is formatted with ``query_str=prompt_str`` and
        sent to the multi-modal LLM together with the images.
        """
        # NOTE(review): `assert` is stripped under `python -O`; kept as-is so
        # the exception type seen by callers does not change.
        assert all(isinstance(node.node, ImageNode) for node in image_nodes)

        fmt_prompt = self._image_qa_template.format(
            query_str=prompt_str,
        )

        llm_response = self._multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=self._image_documents(image_nodes),
        )
        return Response(
            response=str(llm_response),
            source_nodes=image_nodes,
            metadata={"image_nodes": image_nodes},
        )

    async def asynthesize(
        self,
        query_bundle: QueryBundle,
        nodes: List[NodeWithScore],
        additional_source_nodes: Optional[Sequence[NodeWithScore]] = None,
    ) -> RESPONSE_TYPE:
        """Async variant of :meth:`synthesize` (uses the LLM's ``acomplete``).

        Args:
            query_bundle: The query; its ``query_str`` fills the prompt.
            nodes: Retrieved nodes, split into image and text nodes by
                ``_get_image_and_text_nodes``.
            additional_source_nodes: Extra nodes reported in the response's
                ``source_nodes`` after ``nodes``; they are not sent to the LLM.

        Returns:
            A ``Response`` whose metadata carries ``text_nodes`` and
            ``image_nodes``.
        """
        image_nodes, text_nodes = _get_image_and_text_nodes(nodes)
        fmt_prompt = self._format_text_qa_prompt(query_bundle, text_nodes)

        llm_response = await self._multi_modal_llm.acomplete(
            prompt=fmt_prompt,
            image_documents=self._image_documents(image_nodes),
        )
        return Response(
            response=str(llm_response),
            source_nodes=self._merge_source_nodes(nodes, additional_source_nodes),
            metadata={"text_nodes": text_nodes, "image_nodes": image_nodes},
        )

    def _query(self, query_bundle: QueryBundle) -> RESPONSE_TYPE:
        """Answer a query: retrieve, then synthesize, inside callback events."""
        with self.callback_manager.event(
            CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}
        ) as query_event:
            # The RETRIEVE event is nested inside the enclosing QUERY event.
            with self.callback_manager.event(
                CBEventType.RETRIEVE,
                payload={EventPayload.QUERY_STR: query_bundle.query_str},
            ) as retrieve_event:
                nodes = self.retrieve(query_bundle)

                retrieve_event.on_end(
                    payload={EventPayload.NODES: nodes},
                )

            response = self.synthesize(
                query_bundle,
                nodes=nodes,
            )

            query_event.on_end(payload={EventPayload.RESPONSE: response})

        return response

    def image_query(self, image_path: QueryType, prompt_str: str) -> RESPONSE_TYPE:
        """Answer an image query.

        Calls the retriever's ``image_to_image_retrieve`` with ``image_path``,
        then asks the multi-modal LLM ``prompt_str`` about the retrieved image
        nodes. Node postprocessors are not applied on this path.
        """
        with self.callback_manager.event(
            CBEventType.QUERY, payload={EventPayload.QUERY_STR: str(image_path)}
        ) as query_event:
            with self.callback_manager.event(
                CBEventType.RETRIEVE,
                payload={EventPayload.QUERY_STR: str(image_path)},
            ) as retrieve_event:
                nodes = self._retriever.image_to_image_retrieve(image_path)

                retrieve_event.on_end(
                    payload={EventPayload.NODES: nodes},
                )

            # Only the image half of the split is used for the answer.
            image_nodes, _ = _get_image_and_text_nodes(nodes)
            response = self._get_response_with_images(
                prompt_str=prompt_str,
                image_nodes=image_nodes,
            )

            query_event.on_end(payload={EventPayload.RESPONSE: response})

        return response

    async def _aquery(self, query_bundle: QueryBundle) -> RESPONSE_TYPE:
        """Answer a query asynchronously: retrieve, then synthesize."""
        with self.callback_manager.event(
            CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}
        ) as query_event:
            with self.callback_manager.event(
                CBEventType.RETRIEVE,
                payload={EventPayload.QUERY_STR: query_bundle.query_str},
            ) as retrieve_event:
                nodes = await self.aretrieve(query_bundle)

                retrieve_event.on_end(
                    payload={EventPayload.NODES: nodes},
                )

            response = await self.asynthesize(
                query_bundle,
                nodes=nodes,
            )

            query_event.on_end(payload={EventPayload.RESPONSE: response})

        return response

    @property
    def retriever(self) -> "MultiModalVectorIndexRetriever":
        """Get the retriever object."""
        return self._retriever
|