Splits the JSON format of DoclingReader into nodes corresponding
to respective document elements from Docling's data model
(paragraphs, headings, tables etc.).
Parameters:
Name
Type
Description
Default
chunker
BaseChunker
The chunker to use. Defaults to HierarchicalChunker().
HierarchicalChunker()
id_func
NodeIDGenCallable, optional
The node ID generation function to use. Defaults to _uuid4_node_id_gen.
_uuid4_node_id_gen
Source code in llama-index-integrations/node_parser/llama-index-node-parser-docling/llama_index/node_parser/docling/base.py
class DoclingNodeParser(NodeParser):
    """Docling format node parser.

    Splits the JSON format of `DoclingReader` into nodes corresponding
    to respective document elements from Docling's data model
    (paragraphs, headings, tables etc.).

    Args:
        chunker (BaseChunker, optional): The chunker to use. Defaults to
            `HierarchicalChunker()`.
        id_func (NodeIDGenCallable, optional): The node ID generation
            function to use. Defaults to `_uuid4_node_id_gen`.
    """

    @runtime_checkable
    class NodeIDGenCallable(Protocol):
        """Call signature expected of a node ID generator: ``(i, node) -> str``."""

        def __call__(self, i: int, node: BaseNode) -> str:
            ...

    @staticmethod
    def _uuid4_node_id_gen(i: int, node: BaseNode) -> str:
        """Return a fresh random UUID4 string.

        Both arguments are ignored; they are accepted only so the function
        matches the `NodeIDGenCallable` signature.
        """
        return str(uuid.uuid4())

    # Chunker used to split each Docling document into chunks.
    chunker: BaseChunker = HierarchicalChunker()
    # Callable invoked as `id_func(i=<chunk index>, node=<source document>)`
    # to produce the ID of each emitted node.
    id_func: NodeIDGenCallable = _uuid4_node_id_gen

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[BaseNode]:
        """Convert Docling JSON documents into one `TextNode` per chunk.

        Each input node is validated as an `LIDocument`, its content is
        parsed as a JSON-serialized `DLDocument`, and the result is split
        with `self.chunker`. Every chunk becomes a `TextNode` that carries
        the chunk text, the chunk's exported metadata and a `SOURCE`
        relationship pointing back to the input document.

        Args:
            nodes: Input nodes whose content is Docling JSON.
            show_progress: Forwarded to `get_tqdm_iterable` to toggle the
                progress display.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            The nodes for all chunks of all inputs, in input order and,
            within one input, in the order the chunker yields them.

        Raises:
            Nothing is caught here: errors raised while validating an input
            node or parsing its JSON content propagate to the caller.
        """
        nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
            items=nodes, show_progress=show_progress, desc="Parsing nodes"
        )
        all_nodes: list[BaseNode] = []
        for input_node in nodes_with_progress:
            li_doc = LIDocument.model_validate(input_node)
            dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
            chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
            # `i` restarts at 0 for every input document.
            for i, chunk in enumerate(chunk_iter):
                rels: dict[NodeRelationship, RelatedNodeType] = {
                    NodeRelationship.SOURCE: li_doc.as_related_node_info(),
                }
                metadata = chunk.meta.export_json_dict()
                # Only exclude keys that are actually present in the
                # exported metadata.
                excl_embed_keys = [
                    k for k in chunk.meta.excluded_embed if k in metadata
                ]
                excl_llm_keys = [k for k in chunk.meta.excluded_llm if k in metadata]
                node = TextNode(
                    id_=self.id_func(i=i, node=li_doc),
                    text=chunk.text,
                    excluded_embed_metadata_keys=excl_embed_keys,
                    excluded_llm_metadata_keys=excl_llm_keys,
                    relationships=rels,
                )
                # NOTE(review): metadata is assigned after construction rather
                # than passed to the constructor — presumably deliberate;
                # confirm before changing.
                node.metadata = metadata
                all_nodes.append(node)
        return all_nodes