Bases: NodeParser
Docling format node parser.
Splits the JSON format of DoclingReader
into nodes corresponding
to respective document elements from Docling's data model
(paragraphs, headings, tables etc.).
Parameters:
Name |
Type |
Description |
Default |
chunker
|
BaseChunker
|
The chunker to use. Defaults to HierarchicalChunker() .
|
required
|
id_func(NodeIDGenCallable,
|
optional
|
The node ID generation function to use. Defaults to _uuid4_node_id_gen .
|
required
|
Source code in llama-index-integrations/node_parser/llama-index-node-parser-docling/llama_index/node_parser/docling/base.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77 | class DoclingNodeParser(NodeParser):
"""Docling format node parser.
Splits the JSON format of `DoclingReader` into nodes corresponding
to respective document elements from Docling's data model
(paragraphs, headings, tables etc.).
Args:
chunker (BaseChunker, optional): The chunker to use. Defaults to `HierarchicalChunker()`.
id_func(NodeIDGenCallable, optional): The node ID generation function to use. Defaults to `_uuid4_node_id_gen`.
"""
@runtime_checkable
class NodeIDGenCallable(Protocol):
def __call__(self, i: int, node: BaseNode) -> str:
...
@staticmethod
def _uuid4_node_id_gen(i: int, node: BaseNode) -> str:
return str(uuid.uuid4())
chunker: BaseChunker = HierarchicalChunker()
id_func: NodeIDGenCallable = _uuid4_node_id_gen
def _parse_nodes(
self,
nodes: Sequence[BaseNode],
show_progress: bool = False,
**kwargs: Any,
) -> list[BaseNode]:
nodes_with_progress: Iterable[BaseNode] = get_tqdm_iterable(
items=nodes, show_progress=show_progress, desc="Parsing nodes"
)
all_nodes: list[BaseNode] = []
for input_node in nodes_with_progress:
li_doc = LIDocument.model_validate(input_node)
dl_doc: DLDocument = DLDocument.model_validate_json(li_doc.get_content())
chunk_iter = self.chunker.chunk(dl_doc=dl_doc)
for i, chunk in enumerate(chunk_iter):
rels: dict[NodeRelationship, RelatedNodeType] = {
NodeRelationship.SOURCE: li_doc.as_related_node_info(),
}
metadata = chunk.meta.export_json_dict()
excl_embed_keys = [
k for k in chunk.meta.excluded_embed if k in metadata
]
excl_llm_keys = [k for k in chunk.meta.excluded_llm if k in metadata]
node = TextNode(
id_=self.id_func(i=i, node=li_doc),
text=chunk.text,
excluded_embed_metadata_keys=excl_embed_keys,
excluded_llm_metadata_keys=excl_llm_keys,
relationships=rels,
)
node.metadata = metadata
all_nodes.append(node)
return all_nodes
|