Bases: BaseNodePostprocessor
NER PII Node processor.
Uses a HF transformers model.
Source code in llama-index-core/llama_index/core/postprocessor/pii.py
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144 | class NERPIINodePostprocessor(BaseNodePostprocessor):
"""NER PII Node processor.
Uses a HF transformers model.
"""
pii_node_info_key: str = "__pii_node_info__"
@classmethod
def class_name(cls) -> str:
return "NERPIINodePostprocessor"
def mask_pii(self, ner: Callable, text: str) -> Tuple[str, Dict]:
"""Mask PII in text."""
new_text = text
response = ner(text)
mapping = {}
for entry in response:
entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]"
new_text = new_text.replace(entry["word"], entity_group_tag).strip()
mapping[entity_group_tag] = entry["word"]
return new_text, mapping
def _postprocess_nodes(
self,
nodes: List[NodeWithScore],
query_bundle: Optional[QueryBundle] = None,
) -> List[NodeWithScore]:
"""Postprocess nodes."""
from transformers import pipeline # pants: no-infer-dep
ner = pipeline("ner", grouped_entities=True)
# swap out text from nodes, with the original node mappings
new_nodes = []
for node_with_score in nodes:
node = node_with_score.node
new_text, mapping_info = self.mask_pii(
ner, node.get_content(metadata_mode=MetadataMode.LLM)
)
new_node = deepcopy(node)
new_node.excluded_embed_metadata_keys.append(self.pii_node_info_key)
new_node.excluded_llm_metadata_keys.append(self.pii_node_info_key)
new_node.metadata[self.pii_node_info_key] = mapping_info
new_node.set_content(new_text)
new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))
return new_nodes
|
mask_pii
mask_pii(ner: Callable, text: str) -> Tuple[str, Dict]
Mask PII in text.
Source code in llama-index-core/llama_index/core/postprocessor/pii.py
109
110
111
112
113
114
115
116
117
118 | def mask_pii(self, ner: Callable, text: str) -> Tuple[str, Dict]:
"""Mask PII in text."""
new_text = text
response = ner(text)
mapping = {}
for entry in response:
entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]"
new_text = new_text.replace(entry["word"], entity_group_tag).strip()
mapping[entity_group_tag] = entry["word"]
return new_text, mapping
|