Skip to content

Feishu wiki

FeishuWikiReader #

Bases: BaseReader

Feishu Wiki reader.

Reads the pages of a Feishu wiki space.

Source code in llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
class FeishuWikiReader(BaseReader):
    """Feishu Wiki reader.

    Reads the documents attached to the nodes of a Feishu wiki space through
    the Feishu open API. Requests are authenticated with a tenant access
    token that is fetched on first use and refreshed once it has expired.

    """

    host = "https://open.feishu.cn"
    wiki_nodes_url_path = "/open-apis/wiki/v2/spaces/{}/nodes"
    documents_raw_content_url_path = "/open-apis/docx/v1/documents/{}/raw_content"
    tenant_access_token_internal_url_path = (
        "/open-apis/auth/v3/tenant_access_token/internal"
    )
    # Seconds to wait for any single HTTP request; without a timeout a stalled
    # connection would block the reader forever.
    request_timeout = 30

    def __init__(self, app_id: str, app_secret: str) -> None:
        """Create a reader for the given Feishu application.

        Args:
            app_id: The unique identifier of the application is obtained after the application is created.
            app_secret: Application key, obtained after creating the application.
        """
        super().__init__()
        self.app_id = app_id
        self.app_secret = app_secret

        # The token is fetched lazily; an empty token / zero expiry forces a
        # refresh on the first request.
        self.tenant_access_token = ""
        self.expire = 0

    def load_data(self, space_id: str, parent_node_token: str = None) -> List[Document]:
        """Load the documents of a Feishu wiki space.

        Args:
            space_id (str): a space id.
            parent_node_token (str[optional]): a parent node token of the space;
                when given, only nodes below that node are read.

        Returns:
            One ``Document`` per distinct document found in the space, in the
            order the documents were discovered. Documents whose content
            could not be fetched are skipped.

        Raises:
            ValueError: if ``space_id`` is None.
        """
        import logging

        if space_id is None:
            raise ValueError('Must specify a "space_id" in `load_kwargs`.')

        document_ids = self._load_space(space_id, parent_node_token=parent_node_token)
        # dict.fromkeys de-duplicates while keeping discovery order, so the
        # result is deterministic (a set would shuffle it between runs).
        document_ids = list(dict.fromkeys(document_ids))

        results = []
        for document_id in document_ids:
            doc = self._load_doc(document_id)
            if doc is None:
                # _load_doc is best-effort and returns None on failure; a
                # Document cannot be built from a missing text.
                logging.getLogger(__name__).warning(
                    "Skipping Feishu document %s: content could not be loaded",
                    document_id,
                )
                continue
            results.append(Document(text=doc, extra_info={"document_id": document_id}))
        return results

    def _auth_headers(self) -> dict:
        """Return request headers carrying a currently valid access token."""
        if self.tenant_access_token == "" or self.expire < time.time():
            self._update_tenant_access_token()
        return {
            "Authorization": f"Bearer {self.tenant_access_token}",
            "Content-Type": "application/json; charset=utf-8",
        }

    def _load_space(self, space_id: str, parent_node_token: str = None) -> List[str]:
        """Collect the object tokens of all nodes below a point in a space.

        Walks every page of the node listing and recurses into nodes that
        report children. The walk is best-effort: if a request fails, the
        tokens gathered so far are returned.

        Args:
            space_id: the space id.
            parent_node_token: node to start from; the space root when omitted.

        Returns:
            The ``obj_token`` of every node found (possibly with duplicates).
        """
        url = self.host + self.wiki_nodes_url_path.format(space_id)
        obj_token_list: List[str] = []
        page_token = None

        while True:
            # Build the query via `params` so values are URL-encoded.
            params = {}
            if parent_node_token:
                params["parent_node_token"] = parent_node_token
            if page_token:
                params["page_token"] = page_token

            # Resolved outside the try block so that token-refresh failures
            # are not swallowed together with listing failures.
            headers = self._auth_headers()
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    params=params,
                    timeout=self.request_timeout,
                )
                result = response.json()
            except (requests.RequestException, ValueError):
                break

            data = result.get("data") if isinstance(result, dict) else None
            if not data:
                break

            for item in data.get("items") or []:
                obj_token_list.append(item["obj_token"])
                if item.get("has_child"):
                    obj_token_list.extend(
                        self._load_space(
                            space_id=space_id, parent_node_token=item["node_token"]
                        )
                    )

            # The listing is paginated; keep going until the API reports
            # that no further page exists.
            page_token = data.get("page_token")
            if not data.get("has_more") or not page_token:
                break

        return obj_token_list

    def _load_doc(self, document_id: str) -> str:
        """Load a document from Feishu Docs.

        Args:
            document_id: the document id.

        Returns:
            The document text, or None when the request fails or the
            response carries no data.
        """
        url = self.host + self.documents_raw_content_url_path.format(document_id)
        headers = self._auth_headers()
        try:
            response = requests.get(
                url, headers=headers, timeout=self.request_timeout
            )
            result = response.json()
        except (requests.RequestException, ValueError):
            return None

        data = result.get("data") if isinstance(result, dict) else None
        if not data:
            return None
        return data.get("content")

    def _update_tenant_access_token(self) -> None:
        """Fetch a fresh tenant access token and record when it expires."""
        url = self.host + self.tenant_access_token_internal_url_path
        headers = {"Content-Type": "application/json; charset=utf-8"}
        data = {"app_id": self.app_id, "app_secret": self.app_secret}
        response = requests.post(
            url,
            data=json.dumps(data),
            headers=headers,
            timeout=self.request_timeout,
        )
        # Parse the body once instead of once per field.
        payload = response.json()
        self.tenant_access_token = payload["tenant_access_token"]
        # "expire" is added to the current time to get the expiry timestamp.
        self.expire = time.time() + payload["expire"]

    def set_lark_domain(self, host: str) -> None:
        """Set lark domain.

        Overrides the base host that every request URL of this instance is
        built from (the class default is ``https://open.feishu.cn``).

        Args:
            host: scheme and host to prefix the API paths with.
        """
        self.host = host

load_data #

load_data(space_id: str, parent_node_token: str = None) -> List[Document]

Load the documents of a Feishu wiki space.

Parameters:

Name Type Description Default
space_id str

a space id.

required
parent_node_token str[optional]

a parent node token of the space

None
Source code in llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def load_data(self, space_id: str, parent_node_token: str = None) -> List[Document]:
    """Load the documents of a Feishu wiki space.

    Args:
        space_id (str): a space id.
        parent_node_token (str[optional]): a parent node token of the space;
            when given, only nodes below that node are read.

    Returns:
        One ``Document`` per distinct document found in the space, in the
        order the documents were discovered. Documents whose content
        could not be fetched are skipped.

    Raises:
        ValueError: if ``space_id`` is None.
    """
    import logging

    if space_id is None:
        raise ValueError('Must specify a "space_id" in `load_kwargs`.')

    document_ids = self._load_space(space_id, parent_node_token=parent_node_token)
    # dict.fromkeys de-duplicates while keeping discovery order, so the
    # result is deterministic (a set would shuffle it between runs).
    document_ids = list(dict.fromkeys(document_ids))

    results = []
    for document_id in document_ids:
        doc = self._load_doc(document_id)
        if doc is None:
            # _load_doc returns None on failure; a Document cannot be built
            # from a missing text.
            logging.getLogger(__name__).warning(
                "Skipping Feishu document %s: content could not be loaded",
                document_id,
            )
            continue
        results.append(Document(text=doc, extra_info={"document_id": document_id}))
    return results

set_lark_domain #

set_lark_domain(host: str) -> None

Set lark domain.

Source code in llama-index-integrations/readers/llama-index-readers-feishu-wiki/llama_index/readers/feishu_wiki/base.py
136
137
138
def set_lark_domain(self, host: str) -> None:
    """Set lark domain.

    Overrides the base host that every request URL of this instance is
    built from (the class default is ``https://open.feishu.cn``).

    Args:
        host: scheme and host to prefix the API paths with.
    """
    self.host = host