class SentenceEmbeddingOptimizer(BaseNodePostprocessor):
    """Optimization of a text chunk given the query by shortening the input text."""

    # Fraction of sentences to keep, e.g. 0.5 keeps the top 50% by similarity.
    percentile_cutoff: Optional[float] = Field(
        description="Percentile cutoff for the top k sentences to use."
    )
    # Minimum similarity score a sentence must reach to be kept.
    threshold_cutoff: Optional[float] = Field(
        description="Threshold cutoff for similarity for each sentence to use."
    )

    _embed_model: BaseEmbedding = PrivateAttr()
    _tokenizer_fn: Callable[[str], List[str]] = PrivateAttr()

    context_before: Optional[int] = Field(
        description="Number of sentences before retrieved sentence for further context"
    )
    context_after: Optional[int] = Field(
        description="Number of sentences after retrieved sentence for further context"
    )

    def __init__(
        self,
        embed_model: Optional[BaseEmbedding] = None,
        percentile_cutoff: Optional[float] = None,
        threshold_cutoff: Optional[float] = None,
        tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
        context_before: Optional[int] = None,
        context_after: Optional[int] = None,
    ):
        """Optimizer class that is passed into BaseGPTIndexQuery.

        Should be set like this:

        .. code-block:: python
            from llama_index.core.optimization.optimizer import Optimizer
            optimizer = SentenceEmbeddingOptimizer(
                percentile_cutoff=0.5
                this means that the top 50% of sentences will be used.
                Alternatively, you can set the cutoff using a threshold
                on the similarity score. In this case only sentences with a
                similarity score higher than the threshold will be used.
                threshold_cutoff=0.7
                these cutoffs can also be used together.
            )

            query_engine = index.as_query_engine(
                optimizer=optimizer
            )
            response = query_engine.query("<query_str>")
        """
        super().__init__(
            percentile_cutoff=percentile_cutoff,
            threshold_cutoff=threshold_cutoff,
            context_after=context_after,
            context_before=context_before,
        )
        # Resolution order: explicit argument -> global Settings -> OpenAI default.
        self._embed_model = embed_model or Settings.embed_model
        if self._embed_model is None:
            try:
                from llama_index.embeddings.openai import (
                    OpenAIEmbedding,
                )  # pants: no-infer-dep

                self._embed_model = OpenAIEmbedding()
            except ImportError:
                raise ImportError(
                    "`llama-index-embeddings-openai` package not found, "
                    "please run `pip install llama-index-embeddings-openai`"
                )

        # Default sentence splitter is NLTK's punkt tokenizer.
        if tokenizer_fn is None:
            tokenizer = globals_helper.punkt_tokenizer
            tokenizer_fn = tokenizer.tokenize
        self._tokenizer_fn = tokenizer_fn

    @classmethod
    def class_name(cls) -> str:
        return "SentenceEmbeddingOptimizer"

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Optimize a node text given the query by shortening the node text.

        For each node: split its text into sentences, embed them, keep only the
        sentences most similar to the query (per ``percentile_cutoff`` and/or
        ``threshold_cutoff``), pad each kept sentence with surrounding context
        sentences, and overwrite the node content with the shortened text.

        Args:
            nodes: Nodes whose content will be shortened in place.
            query_bundle: Query used for similarity ranking; if ``None`` the
                nodes are returned unchanged.

        Returns:
            The same list of nodes, with each node's content replaced.

        Raises:
            ValueError: If the cutoffs eliminate every sentence of a node.
        """
        if query_bundle is None:
            return nodes

        # Defaults for context padding. NOTE: these are kept as locals —
        # the previous implementation assigned them back onto ``self``,
        # mutating the postprocessor's configuration as a side effect of
        # a query.
        context_before = self.context_before if self.context_before is not None else 1
        context_after = self.context_after if self.context_after is not None else 1

        for node_with_score in nodes:
            text = node_with_score.node.get_content(metadata_mode=MetadataMode.LLM)
            split_text = self._tokenizer_fn(text)

            # Lazily compute (and cache on the bundle) the query embedding.
            if query_bundle.embedding is None:
                query_bundle.embedding = (
                    self._embed_model.get_agg_embedding_from_queries(
                        query_bundle.embedding_strs
                    )
                )

            text_embeddings = self._embed_model._get_text_embeddings(split_text)

            num_top_k = None
            threshold = None
            if self.percentile_cutoff is not None:
                # Clamp to at least one sentence: for short texts the
                # percentile could otherwise truncate to 0 and always fail.
                num_top_k = max(1, int(len(split_text) * self.percentile_cutoff))
            if self.threshold_cutoff is not None:
                threshold = self.threshold_cutoff

            top_similarities, top_idxs = get_top_k_embeddings(
                query_embedding=query_bundle.embedding,
                embeddings=text_embeddings,
                similarity_fn=self._embed_model.similarity,
                similarity_top_k=num_top_k,
                embedding_ids=list(range(len(text_embeddings))),
                similarity_cutoff=threshold,
            )

            if len(top_idxs) == 0:
                raise ValueError("Optimizer returned zero sentences.")

            range_min, range_max = 0, len(split_text)
            # Each kept sentence is expanded with its neighbors for context.
            top_sentences = [
                " ".join(
                    split_text[
                        max(idx - context_before, range_min) : min(
                            idx + context_after + 1, range_max
                        )
                    ]
                )
                for idx in top_idxs
            ]

            logger.debug("> Top %d sentences with scores:\n", len(top_idxs))
            if logger.isEnabledFor(logging.DEBUG):
                for rank, (sentence, similarity) in enumerate(
                    zip(top_sentences, top_similarities)
                ):
                    logger.debug("%d. %s (%s)", rank, sentence, similarity)

            node_with_score.node.set_content(" ".join(top_sentences))

        return nodes