BGE-M3 is a multilingual embedding model with multi-functionality:
Dense retrieval, Sparse retrieval and Multi-vector retrieval.
Parameters:
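index_path: directory containing PLAID index files. Note: this is not a constructor argument; the instance attribute is currently fixed to "storage/bge_m3_index".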
model_name: BGE-M3 Hugging Face model name.
Default: "BAAI/bge-m3".
show_progress: whether to show progress bar when building index.
Default: False. Currently a no-op for BGE-M3.
doc_maxlen: max document length. Default: 8192.
query_maxlen: max query length. Default: 8192.
Source code in llama-index-integrations/indices/llama-index-indices-managed-bge-m3/llama_index/indices/managed/bge_m3/base.py
class BGEM3Index(BaseIndex[IndexDict]):
    """
    Index that stores BGE-M3 embeddings and scores them at query time.

    BGE-M3 is a multilingual embedding model with multi-functionality:
    Dense retrieval, Sparse retrieval and Multi-vector retrieval.

    All three representations are computed when the index is built, kept in
    memory in ``_multi_embed_store`` and combined as a weighted average when
    querying.

    NOTE(review): earlier documentation described this as a PLAID index; no
    PLAID-specific call is visible in this class -- confirm before relying
    on that wording.

    Parameters:
        nodes: nodes to index.
        objects: index nodes forwarded to the base index.
        index_struct: existing index structure to reuse.
        storage_context: storage context forwarded to the base index.
        model_name: BGE-M3 Hugging Face model name.
            Default: "BAAI/bge-m3".
        index_name: name of the index. Default: "".
        show_progress: whether to show progress bar when building index.
            Default: False. Forwarded to the base index only; the encoding
            step in this class does not use it.
        pooling_method: forwarded to ``BGEM3FlagModel``. Default: "cls".
        normalize_embeddings: forwarded to ``BGEM3FlagModel``. Default: True.
        use_fp16: forwarded to ``BGEM3FlagModel``. Default: False.
        batch_size: batch size used when encoding. Default: 32.
        doc_maxlen: max document length. Default: 8192.
        query_maxlen: max query length. Default: 8192.
        weights_for_different_modes: weights applied to the dense, sparse
            and multi-vector scores, in that order. ``None`` weights all
            three equally.

    """

    def __init__(
        self,
        nodes: Optional[Sequence[BaseNode]] = None,
        objects: Optional[Sequence[IndexNode]] = None,
        index_struct: Optional[IndexDict] = None,
        storage_context: Optional[StorageContext] = None,
        model_name: str = "BAAI/bge-m3",
        index_name: str = "",
        show_progress: bool = False,
        pooling_method: str = "cls",
        normalize_embeddings: bool = True,
        use_fp16: bool = False,
        batch_size: int = 32,
        doc_maxlen: int = 8192,
        query_maxlen: int = 8192,
        weights_for_different_modes: Optional[List[float]] = None,
        **kwargs: Any,
    ) -> None:
        """
        Load the BGE-M3 model and initialise the base index.

        Raises:
            ImportError: if the FlagEmbedding package is not installed.

        """
        # Not configurable through the constructor at the moment.
        self.index_path = "storage/bge_m3_index"
        self.index_name = index_name
        self.batch_size = batch_size
        self.doc_maxlen = doc_maxlen
        self.query_maxlen = query_maxlen
        self.weights_for_different_modes = weights_for_different_modes
        self._multi_embed_store = None
        self._docs_pos_to_node_id: Dict[int, str] = {}

        try:
            from FlagEmbedding import BGEM3FlagModel
        except ImportError as exc:
            # A single string keeps the message readable; passing two
            # positional arguments would render the message as a tuple.
            raise ImportError(
                "Please install FlagEmbedding to use this feature from the repo: "
                "https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3"
            ) from exc

        # The model must exist before the base class runs, because the base
        # initialiser receives the nodes and _build_index_from_nodes needs
        # self.model.
        self.model = BGEM3FlagModel(
            model_name,
            pooling_method=pooling_method,
            normalize_embeddings=normalize_embeddings,
            use_fp16=use_fp16,
        )
        super().__init__(
            nodes=nodes,
            index_struct=index_struct,
            index_name=index_name,
            storage_context=storage_context,
            show_progress=show_progress,
            objects=objects,
            **kwargs,
        )

    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
        """Insertion is not supported; always raises NotImplementedError."""
        raise NotImplementedError("BGEM3Index does not support insertion yet.")

    def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
        """Deletion is not supported; always raises NotImplementedError."""
        raise NotImplementedError("BGEM3Index does not support deletion yet.")

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a BGEM3Retriever bound to this index."""
        # Imported lazily, as in the original code (presumably to avoid a
        # circular import with the retriever module).
        from .retriever import BGEM3Retriever

        return BGEM3Retriever(index=self, object_map=self._object_map, **kwargs)

    @property
    def ref_doc_info(self) -> Dict[str, RefDocInfo]:
        """Not supported; always raises NotImplementedError."""
        raise NotImplementedError("BGEM3Index does not support ref_doc_info.")

    def _build_index_from_nodes(
        self, nodes: Sequence[BaseNode], **kwargs: Any
    ) -> IndexDict:
        """
        Encode every node with the BGE-M3 model and build the index struct.

        Each node is registered under its position in ``nodes`` (as a string
        id), and the dense, sparse and multi-vector encodings of all nodes
        are stored in ``_multi_embed_store``.

        Returns:
            the populated IndexDict.

        """
        index_struct = IndexDict()

        docs_list = []
        for i, node in enumerate(nodes):
            docs_list.append(node.get_content())
            # Position i is the row of this document in every stored
            # embedding collection; query() maps it back to the node id.
            self._docs_pos_to_node_id[i] = node.node_id
            index_struct.add_node(node, text_id=str(i))

        self._multi_embed_store = self.model.encode(
            docs_list,
            batch_size=self.batch_size,
            max_length=self.doc_maxlen,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=True,
        )
        return index_struct

    def persist(self, persist_dir: str) -> None:
        """
        Write the storage context and the embedding store to ``persist_dir``.

        WARNING: an existing ``persist_dir`` is removed recursively before
        anything is written.
        """
        # Check if the destination directory exists
        if os.path.exists(persist_dir):
            # Remove the existing destination directory
            shutil.rmtree(persist_dir)
        self._storage_context.persist(persist_dir=persist_dir)
        # Save _multi_embed_store; the context manager closes (and flushes)
        # the file even if pickling fails.
        with open(Path(persist_dir) / "multi_embed_store.pkl", "wb") as store_file:
            pickle.dump(self._multi_embed_store, store_file)

    @classmethod
    def load_from_disk(
        cls,
        persist_dir: str,
        model_name: str = "BAAI/bge-m3",
        index_name: str = "",
        weights_for_different_modes: Optional[List[float]] = None,
    ) -> "BGEM3Index":
        """
        Rebuild an index from a directory previously written by ``persist``.

        Parameters:
            persist_dir: directory holding the storage context files and
                "multi_embed_store.pkl".
            model_name: BGE-M3 Hugging Face model name.
            index_name: name of the index.
            weights_for_different_modes: weights applied to the dense,
                sparse and multi-vector scores, in that order.

        Returns:
            the restored index.

        """
        sc = StorageContext.from_defaults(persist_dir=persist_dir)
        # cls (not the hard-coded class name) so subclasses get an instance
        # of their own type.
        index = cls(
            model_name=model_name,
            index_name=index_name,
            index_struct=sc.index_store.index_structs()[0],
            storage_context=sc,
            weights_for_different_modes=weights_for_different_modes,
        )
        # The index struct was filled with str(position) keys when the index
        # was built; turn them back into integer positions.
        index._docs_pos_to_node_id = {
            int(k): v for k, v in index.index_struct.nodes_dict.items()
        }
        # SECURITY: unpickling can execute arbitrary code. Only load
        # directories that were written by persist() and come from a
        # trusted source.
        with open(Path(persist_dir) / "multi_embed_store.pkl", "rb") as store_file:
            index._multi_embed_store = pickle.load(store_file)
        return index

    def query(self, query_str: str, top_k: int = 10) -> List[NodeWithScore]:
        """
        Query the stored BGE-M3 embeddings.

        The query is encoded in all three modes, scored against every stored
        document, and the three scores are merged as a weighted average.

        Parameters:
            query_str: the query text.
            top_k: maximum number of results to return. Default: 10.

        Returns:
            list of NodeWithScore, best match first.

        Raises:
            ValueError: if the index holds no embeddings yet, or if the
                configured weights sum to zero.

        """
        if self._multi_embed_store is None:
            # Fail with an explicit message instead of the opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                "BGEM3Index holds no embeddings; build it from nodes or use "
                "load_from_disk before querying."
            )

        if self.weights_for_different_modes is None:
            weights_for_different_modes = [1.0, 1.0, 1.0]
            weight_sum = 3.0
        else:
            weights_for_different_modes = self.weights_for_different_modes
            weight_sum = sum(weights_for_different_modes)
        if weight_sum == 0:
            # Dividing by zero would silently produce inf/nan scores.
            raise ValueError("weights_for_different_modes must not sum to zero.")

        query_embed = self.model.encode(
            query_str,
            batch_size=self.batch_size,
            max_length=self.query_maxlen,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=True,
        )

        dense_scores = np.matmul(
            query_embed["dense_vecs"], self._multi_embed_store["dense_vecs"].T
        )
        sparse_scores = np.array(
            [
                self.model.compute_lexical_matching_score(
                    query_embed["lexical_weights"], doc_lexical_weights
                )
                for doc_lexical_weights in self._multi_embed_store["lexical_weights"]
            ]
        )
        colbert_scores = np.array(
            [
                self.model.colbert_score(
                    query_embed["colbert_vecs"], doc_colbert_vecs
                ).item()
                for doc_colbert_vecs in self._multi_embed_store["colbert_vecs"]
            ]
        )

        combined_scores = (
            dense_scores * weights_for_different_modes[0]
            + sparse_scores * weights_for_different_modes[1]
            + colbert_scores * weights_for_different_modes[2]
        ) / weight_sum

        # argsort is ascending; reverse it to get the best matches first.
        topk_indices = np.argsort(combined_scores)[::-1][:top_k]
        # Plain floats rather than NumPy scalars for the score field.
        topk_scores = [float(combined_scores[idx]) for idx in topk_indices]
        node_doc_ids = [self._docs_pos_to_node_id[idx] for idx in topk_indices]
        nodes = self.docstore.get_nodes(node_doc_ids)

        return [
            NodeWithScore(node=node, score=score)
            for node, score in zip(nodes, topk_scores)
        ]
def query(self, query_str: str, top_k: int = 10) -> List[NodeWithScore]:
    """
    Query the stored BGE-M3 embeddings.

    The query is encoded in dense, sparse and multi-vector form, scored
    against every stored document, and the three scores are merged as a
    weighted average using ``self.weights_for_different_modes`` (equal
    weights when that attribute is None).

    Parameters:
        query_str: the query text.
        top_k: maximum number of results to return. Default: 10.

    Returns:
        list of NodeWithScore, best match first.

    Raises:
        ValueError: if the index holds no embeddings yet, or if the
            configured weights sum to zero.

    """
    if self._multi_embed_store is None:
        # Fail with an explicit message instead of the opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            "BGEM3Index holds no embeddings; build it from nodes or use "
            "load_from_disk before querying."
        )

    if self.weights_for_different_modes is None:
        weights_for_different_modes = [1.0, 1.0, 1.0]
        weight_sum = 3.0
    else:
        weights_for_different_modes = self.weights_for_different_modes
        weight_sum = sum(weights_for_different_modes)
    if weight_sum == 0:
        # Dividing by zero would silently produce inf/nan scores.
        raise ValueError("weights_for_different_modes must not sum to zero.")

    query_embed = self.model.encode(
        query_str,
        batch_size=self.batch_size,
        max_length=self.query_maxlen,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=True,
    )

    dense_scores = np.matmul(
        query_embed["dense_vecs"], self._multi_embed_store["dense_vecs"].T
    )
    sparse_scores = np.array(
        [
            self.model.compute_lexical_matching_score(
                query_embed["lexical_weights"], doc_lexical_weights
            )
            for doc_lexical_weights in self._multi_embed_store["lexical_weights"]
        ]
    )
    colbert_scores = np.array(
        [
            self.model.colbert_score(
                query_embed["colbert_vecs"], doc_colbert_vecs
            ).item()
            for doc_colbert_vecs in self._multi_embed_store["colbert_vecs"]
        ]
    )

    combined_scores = (
        dense_scores * weights_for_different_modes[0]
        + sparse_scores * weights_for_different_modes[1]
        + colbert_scores * weights_for_different_modes[2]
    ) / weight_sum

    # argsort is ascending; reverse it to get the best matches first.
    topk_indices = np.argsort(combined_scores)[::-1][:top_k]
    # Plain floats rather than NumPy scalars for the score field.
    topk_scores = [float(combined_scores[idx]) for idx in topk_indices]
    node_doc_ids = [self._docs_pos_to_node_id[idx] for idx in topk_indices]
    nodes = self.docstore.get_nodes(node_doc_ids)

    return [
        NodeWithScore(node=node, score=score)
        for node, score in zip(nodes, topk_scores)
    ]