Bases: BaseNodePostprocessor
Presidio PII node postprocessor.
Uses Presidio to detect and mask PII in node text.
Source code in llama-index-integrations/postprocessor/llama-index-postprocessor-presidio/llama_index/postprocessor/presidio/base.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
class PresidioPIINodePostprocessor(BaseNodePostprocessor):
    """Node postprocessor that masks PII using Presidio.

    Each node's content is analysed with a Presidio ``AnalyzerEngine``
    (language fixed to ``"en"``) and every detected entity is rewritten by
    the ``EntityTypeCountAnonymizer`` operator. The ``mapping`` dict handed
    to that operator is attached to every returned node's metadata under
    ``pii_node_info_key``, and that key is excluded from both the embedding
    and LLM metadata views.
    """

    # Metadata key under which ``mapping`` is stored on each processed node.
    pii_node_info_key: str = "__pii_node_info__"
    # Passed to the anonymizer operator as "entity_mapping".
    # NOTE(review): presumably filled in by EntityTypeCountAnonymizer -- confirm.
    entity_mapping: Dict[str, Dict] = {}
    # Passed to the anonymizer operator as "deanonymize_mapping".
    # NOTE(review): presumably placeholder -> original text -- confirm.
    mapping: Dict[str, str] = {}

    @classmethod
    def class_name(cls) -> str:
        """Return the class identifier string for this postprocessor."""
        return "PresidioPIINodePostprocessor"

    def mask_pii(self, text: str) -> str:
        """Return ``text`` with detected PII rewritten by the anonymizer.

        Args:
            text: Text to analyse; it is analysed as English.

        Returns:
            The anonymized text.
        """
        # NOTE(review): both engines are rebuilt on every call; if their
        # construction turns out to be costly, consider reusing them.
        analyzer = AnalyzerEngine()
        results = analyzer.analyze(text=text, language="en")

        engine = AnonymizerEngine()
        engine.add_anonymizer(EntityTypeCountAnonymizer)
        anonymized = engine.anonymize(
            text=text,
            analyzer_results=results,
            operators={
                # "DEFAULT" applies the same operator to every entity type.
                "DEFAULT": OperatorConfig(
                    "EntityTypeCountAnonymizer",
                    {
                        "entity_mapping": self.entity_mapping,
                        "deanonymize_mapping": self.mapping,
                    },
                )
            },
        )
        return anonymized.text

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Return copies of ``nodes`` whose content has been PII-masked.

        Args:
            nodes: Scored nodes to process; the input nodes are not modified.
            query_bundle: Unused by this postprocessor.

        Returns:
            New ``NodeWithScore`` objects carrying the masked content, the
            original score, and ``mapping`` under ``pii_node_info_key``.
        """
        # swap out text from nodes, with the original node mappings
        new_nodes = []
        for node_with_score in nodes:
            node = node_with_score.node
            new_text = self.mask_pii(node.get_content(metadata_mode=MetadataMode.LLM))

            # Work on a copy so the caller's node is left untouched.
            new_node = deepcopy(node)

            # Keep the mapping out of the embed/LLM metadata views. The
            # membership check avoids piling up duplicate entries when a node
            # is run through this postprocessor more than once.
            for excluded_keys in (
                new_node.excluded_embed_metadata_keys,
                new_node.excluded_llm_metadata_keys,
            ):
                if self.pii_node_info_key not in excluded_keys:
                    excluded_keys.append(self.pii_node_info_key)

            # The same dict object is attached to every node, so all nodes see
            # the mapping as it stands after later nodes are processed too.
            new_node.metadata[self.pii_node_info_key] = self.mapping
            new_node.set_content(new_text)
            new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))
        return new_nodes
|