Skip to content

PII

Node PostProcessor module.

PIINodePostprocessor #

Bases: BaseNodePostprocessor

PII Node processor.

NOTE: This is a beta feature; the API might change.

Parameters:

Name Type Description Default
llm LLM

The local LLM to use for prediction.

required
pii_str_tmpl str
'The current context information is provided. \nA task is also provided to mask the PII within the context. \nReturn the text, with all PII masked out, and a mapping of the original PII to the masked PII. \nReturn the output of the task in JSON. \nContext:\nHello Zhang Wei, I am John. Your AnyCompany Financial Services, LLC credit card account 1111-0000-1111-0008 has a minimum payment of $24.53 that is due by July 31st. Based on your autopay settings, we will withdraw your payment. Task: Mask out the PII, replace each PII with a tag, and return the text. Return the mapping in JSON. \nOutput: \nHello [NAME1], I am [NAME2]. Your AnyCompany Financial Services, LLC credit card account [CREDIT_CARD_NUMBER] has a minimum payment of $24.53 that is due by [DATE_TIME]. Based on your autopay settings, we will withdraw your payment. Output Mapping:\n{{"NAME1": "Zhang Wei", "NAME2": "John", "CREDIT_CARD_NUMBER": "1111-0000-1111-0008", "DATE_TIME": "July 31st"}}\nContext:\n{context_str}\nTask: {query_str}\nOutput: \n'
pii_node_info_key str
'__pii_node_info__'
Source code in llama-index-core/llama_index/core/postprocessor/pii.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
class PIINodePostprocessor(BaseNodePostprocessor):
    """PII Node processor.

    Asks an LLM to replace the PII in each node's content with tags, and
    stores the mapping parsed from the LLM response in the node's metadata.

    NOTE: this is a beta feature, the API might change.

    Args:
        llm (LLM): The local LLM to use for prediction.
        pii_str_tmpl (str): Prompt template string. It is formatted with
            ``context_str`` (the text to mask) and ``query_str`` (the task
            description). ``mask_pii`` expects the LLM response to contain
            the marker ``Output Mapping:`` followed by JSON.
        pii_node_info_key (str): Metadata key under which the parsed mapping
            is stored on every returned node.

    """

    llm: LLM
    pii_str_tmpl: str = DEFAULT_PII_TMPL
    pii_node_info_key: str = "__pii_node_info__"

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this postprocessor class."""
        return "PIINodePostprocessor"

    def mask_pii(self, text: str) -> Tuple[str, Dict]:
        """Mask PII in text.

        Args:
            text: The text whose PII should be masked.

        Returns:
            A ``(masked_text, mapping)`` tuple: the stripped part of the LLM
            response before the ``Output Mapping:`` marker, and the JSON
            parsed from the part after it.

        Raises:
            ValueError: If the response does not contain the
                ``Output Mapping:`` marker, or if the text following it is
                not valid JSON (``json.JSONDecodeError``, a ``ValueError``
                subclass).
        """
        pii_prompt = PromptTemplate(self.pii_str_tmpl)
        # TODO: allow customization
        task_str = (
            "Mask out the PII, replace each PII with a tag, and return the text. "
            "Return the mapping in JSON."
        )
        marker = "Output Mapping:"

        response = self.llm.predict(pii_prompt, context_str=text, query_str=task_str)
        splits = response.split(marker)
        if len(splits) < 2:
            # Fail with an actionable message instead of a bare IndexError.
            # The response itself is deliberately left out of the message:
            # it may still contain the unmasked PII.
            raise ValueError(
                f"LLM response does not contain the {marker!r} marker; "
                "cannot extract the PII mapping."
            )
        text_output = splits[0].strip()
        json_str_output = splits[1].strip()
        json_dict = json.loads(json_str_output)
        return text_output, json_dict

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Postprocess nodes.

        Args:
            nodes: Scored nodes whose content should be masked.
            query_bundle: Not used by this postprocessor.

        Returns:
            A new list with one masked copy per input node, carrying the
            original score. The input nodes are left unmodified.
        """
        # swap out text from nodes, with the original node mappings
        new_nodes = []
        for node_with_score in nodes:
            node = node_with_score.node
            new_text, mapping_info = self.mask_pii(
                node.get_content(metadata_mode=MetadataMode.LLM)
            )
            # Work on a copy so the caller's node keeps its original text.
            new_node = deepcopy(node)
            # Exclude the mapping (it holds the original PII) from the
            # metadata used for embedding and LLM calls. Guard the append so
            # that re-processing an already processed node does not pile up
            # duplicate entries.
            for excluded_keys in (
                new_node.excluded_embed_metadata_keys,
                new_node.excluded_llm_metadata_keys,
            ):
                if self.pii_node_info_key not in excluded_keys:
                    excluded_keys.append(self.pii_node_info_key)
            new_node.metadata[self.pii_node_info_key] = mapping_info
            new_node.set_content(new_text)
            new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))

        return new_nodes

mask_pii #

mask_pii(text: str) -> Tuple[str, Dict]

Mask PII in text.

Source code in llama-index-core/llama_index/core/postprocessor/pii.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def mask_pii(self, text: str) -> Tuple[str, Dict]:
    """Mask PII in text.

    Args:
        text: The text whose PII should be masked.

    Returns:
        A ``(masked_text, mapping)`` tuple: the stripped part of the LLM
        response before the ``Output Mapping:`` marker, and the JSON parsed
        from the part after it.

    Raises:
        ValueError: If the response does not contain the ``Output Mapping:``
            marker, or if the text following it is not valid JSON
            (``json.JSONDecodeError``, a ``ValueError`` subclass).
    """
    pii_prompt = PromptTemplate(self.pii_str_tmpl)
    # TODO: allow customization
    task_str = (
        "Mask out the PII, replace each PII with a tag, and return the text. "
        "Return the mapping in JSON."
    )
    marker = "Output Mapping:"

    response = self.llm.predict(pii_prompt, context_str=text, query_str=task_str)
    splits = response.split(marker)
    if len(splits) < 2:
        # Fail with an actionable message instead of a bare IndexError. The
        # response itself is deliberately left out of the message: it may
        # still contain the unmasked PII.
        raise ValueError(
            f"LLM response does not contain the {marker!r} marker; "
            "cannot extract the PII mapping."
        )
    text_output = splits[0].strip()
    json_str_output = splits[1].strip()
    json_dict = json.loads(json_str_output)
    return text_output, json_dict