Unstructured element

Node parsers.

UnstructuredElementNodeParser #

Bases: BaseElementNodeParser

Unstructured element node parser.

Splits a document into Text Nodes and Index Nodes corresponding to embedded objects (e.g. tables).
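
A minimal usage sketch (the file name and reader choice are illustrative assumptions; `unstructured` and `lxml` must be installed):

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import UnstructuredElementNodeParser

# Documents whose text contains HTML tables (e.g. loaded from an HTML file).
documents = SimpleDirectoryReader(input_files=["report.html"]).load_data()

# Pass llm=... to control table summarization; otherwise the globally
# configured LLM is used.
node_parser = UnstructuredElementNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)

# Tables become Index Nodes, other content becomes Text Nodes; both can be
# indexed downstream.
index = VectorStoreIndex(nodes)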

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
class UnstructuredElementNodeParser(BaseElementNodeParser):
    """Unstructured element node parser.

    Splits a document into Text Nodes and Index Nodes corresponding to embedded objects
    (e.g. tables).

    """

    partitioning_parameters: Optional[Dict[str, Any]] = Field(
        default={},
        description="Extra dictionary representing parameters of the partitioning process.",
    )

    def __init__(
        self,
        callback_manager: Optional[CallbackManager] = None,
        llm: Optional[Any] = None,
        summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
        partitioning_parameters: Optional[Dict[str, Any]] = {},
    ) -> None:
        """Initialize."""
        try:
            import lxml  # noqa  # pants: no-infer-dep
            import unstructured  # noqa  # pants: no-infer-dep
        except ImportError:
            raise ImportError(
                "You must install the `unstructured` and `lxml` "
                "package to use this node parser."
            )
        callback_manager = callback_manager or CallbackManager([])

        return super().__init__(
            callback_manager=callback_manager,
            llm=llm,
            summary_query_str=summary_query_str,
            partitioning_parameters=partitioning_parameters,
        )

    @classmethod
    def class_name(cls) -> str:
        return "UnstructuredElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        await self.aextract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(
            elements, node, ref_doc_text=node.get_content()
        )

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_elements(
        self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
    ) -> List[Element]:
        """Extract elements from text."""
        from unstructured.partition.html import partition_html  # pants: no-infer-dep

        table_filters = table_filters or []
        elements = partition_html(text=text, **self.partitioning_parameters)
        output_els = []
        for idx, element in enumerate(elements):
            if "unstructured.documents.elements.Table" in str(type(element)):
                should_keep = all(tf(element) for tf in table_filters)
                if should_keep:
                    table_df = html_to_df(str(element.metadata.text_as_html))
                    output_els.append(
                        Element(
                            id=f"id_{idx}",
                            type="table",
                            element=element,
                            table=table_df,
                        )
                    )
                else:
                    # table failed the filter; keep it as plain Text so we don't lose context
                    from unstructured.documents.elements import Text

                    new_element = Text(str(element))
                    output_els.append(
                        Element(id=f"id_{idx}", type="text", element=new_element)
                    )
            else:
                output_els.append(Element(id=f"id_{idx}", type="text", element=element))
        return output_els

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        table_df = html_to_df(table_element.metadata.text_as_html)

        # keep tables that parsed successfully, are non-empty, and have more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

Get nodes from node.

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes
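
A short sketch of calling get_nodes_from_node directly on a TextNode whose content is HTML (the markup is an illustrative assumption; an LLM must be configured for the table summaries):

from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.schema import TextNode

html = (
    "<p>Quarterly results</p>"
    "<table>"
    "<tr><th>Quarter</th><th>Revenue</th></tr>"
    "<tr><td>Q1</td><td>100</td></tr>"
    "</table>"
)

parser = UnstructuredElementNodeParser()
nodes = parser.get_nodes_from_node(TextNode(text=html))

# A mix of TextNode (prose) and IndexNode (table) objects, all pointing back
# to the source node via the SOURCE relationship.
print([type(n).__name__ for n in nodes])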

aget_nodes_from_node async #

aget_nodes_from_node(node: TextNode) -> List[BaseNode]

Get nodes from node.

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
async def aget_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    await self.aextract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(
        elements, node, ref_doc_text=node.get_content()
    )

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes
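
The async variant awaits the table summaries via aextract_table_summaries but otherwise mirrors get_nodes_from_node. A sketch, where the wrapping coroutine is an assumption of the example:

import asyncio

from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.schema import TextNode

async def parse_html_node(html: str):
    parser = UnstructuredElementNodeParser()
    # Summaries for the extracted tables are generated asynchronously.
    return await parser.aget_nodes_from_node(TextNode(text=html))

nodes = asyncio.run(
    parse_html_node(
        "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
    )
)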

extract_elements #

extract_elements(text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any) -> List[Element]

Extract elements from text.

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def extract_elements(
    self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
) -> List[Element]:
    """Extract elements from text."""
    from unstructured.partition.html import partition_html  # pants: no-infer-dep

    table_filters = table_filters or []
    elements = partition_html(text=text, **self.partitioning_parameters)
    output_els = []
    for idx, element in enumerate(elements):
        if "unstructured.documents.elements.Table" in str(type(element)):
            should_keep = all(tf(element) for tf in table_filters)
            if should_keep:
                table_df = html_to_df(str(element.metadata.text_as_html))
                output_els.append(
                    Element(
                        id=f"id_{idx}",
                        type="table",
                        element=element,
                        table=table_df,
                    )
                )
            else:
                # table failed the filter; keep it as plain Text so we don't lose context
                from unstructured.documents.elements import Text

                new_element = Text(str(element))
                output_els.append(
                    Element(id=f"id_{idx}", type="text", element=new_element)
                )
        else:
            output_els.append(Element(id=f"id_{idx}", type="text", element=element))
    return output_els
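
A sketch of inspecting the raw Element objects before node conversion (the HTML string is an illustrative assumption; no LLM is needed at this stage):

from llama_index.core.node_parser import UnstructuredElementNodeParser

parser = UnstructuredElementNodeParser()
elements = parser.extract_elements(
    "<p>intro</p>"
    "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>",
    table_filters=[parser.filter_table],
)

for el in elements:
    # el.type is "text" or "table"; table elements also carry a pandas
    # DataFrame in el.table.
    print(el.id, el.type, el.table.shape if el.type == "table" else None)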

filter_table #

filter_table(table_element: Any) -> bool

Filter tables.

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def filter_table(self, table_element: Any) -> bool:
    """Filter tables."""
    table_df = html_to_df(table_element.metadata.text_as_html)

    # keep tables that parsed successfully, are non-empty, and have more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1
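
filter_table can be overridden in a subclass to change which tables are promoted to Index Nodes. A sketch, where the two-row threshold is an illustrative assumption and pandas.read_html stands in for the module's html_to_df helper:

import io
from typing import Any

import pandas as pd

from llama_index.core.node_parser import UnstructuredElementNodeParser

class StrictTableNodeParser(UnstructuredElementNodeParser):
    def filter_table(self, table_element: Any) -> bool:
        # Keep only tables that parse, have at least two data rows, and more
        # than one column.
        try:
            table_df = pd.read_html(
                io.StringIO(str(table_element.metadata.text_as_html))
            )[0]
        except ValueError:
            return False
        return len(table_df) >= 2 and len(table_df.columns) > 1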