Bases: BaseReader
Wordpress reader. Reads data from a Wordpress workspace.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| wordpress_subdomain | str | Wordpress subdomain | required |
Source code in llama-index-integrations/readers/llama-index-readers-wordpress/llama_index/readers/wordpress/base.py
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
class WordpressReader(BaseReader):
    """Wordpress reader. Reads posts from a Wordpress site over its REST API.

    Args:
        url (str): Base URL of the Wordpress site. The path
            ``/wp-json/wp/v2/posts`` is appended to it.
        password (str): Password sent together with ``username`` as HTTP
            basic auth.
        username (str): Username sent together with ``password`` as HTTP
            basic auth. Requests are sent unauthenticated when either
            value is empty.
    """

    def __init__(self, url: str, password: str, username: str) -> None:
        """Initialize Wordpress reader."""
        self.url = url
        self.username = username
        self.password = password

    def load_data(self) -> List[Document]:
        """Load every post of the site as a document.

        The HTML of each post is reduced to plain text with BeautifulSoup.
        The post's id, title, link and modification date are attached as
        ``extra_info``.

        Returns:
            List[Document]: One document per post, in the order returned by
                ``get_all_posts``.

        Raises:
            KeyError: If a post lacks ``id``, ``link`` or ``modified``.
        """
        from bs4 import BeautifulSoup

        results = []
        for article in self.get_all_posts():
            # A post with empty or missing content still yields a document,
            # with empty text, instead of feeding a dict/None to the parser.
            html = self._rendered(article.get("content")) or ""
            body = BeautifulSoup(html, "html.parser").get_text()
            extra_info = {
                "id": article["id"],
                "title": self._rendered(article.get("title")),
                "url": article["link"],
                "updated_at": article["modified"],
            }
            results.append(Document(text=body, extra_info=extra_info))
        return results

    @staticmethod
    def _rendered(field):
        """Return the string held by a post field, or None if there is none.

        Accepts either a mapping with a ``"rendered"`` entry or a bare string.
        """
        if isinstance(field, dict):
            field = field.get("rendered")
        return field if isinstance(field, str) else None

    def get_all_posts(self):
        """Fetch all pages of posts and return them as one flat list.

        Returns:
            list: The concatenated ``"articles"`` of every page, starting at
                page 1 and following ``"next_page"`` until it is None.
        """
        posts = []
        next_page = 1
        while next_page is not None:
            response = self.get_posts_page(next_page)
            posts.extend(response["articles"])
            next_page = response["next_page"]
        return posts

    def get_posts_page(self, current_page: int = 1):
        """Fetch a single page of up to 100 posts.

        Args:
            current_page (int): 1-based page number to request.

        Returns:
            dict: ``{"articles": <decoded JSON body>, "next_page": <int or None>}``.
                ``next_page`` is None once ``current_page`` reaches the value
                of the ``X-WP-TotalPages`` response header (treated as 1
                when the header is absent).

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        import requests

        # Tolerate a trailing slash on the configured URL.
        base_url = self.url.rstrip("/")
        url = f"{base_url}/wp-json/wp/v2/posts?per_page=100&page={current_page}"

        # Only authenticate when both credentials were actually supplied.
        auth = None
        if self.username and self.password:
            auth = (self.username, self.password)
        response = requests.get(url, auth=auth)
        # Fail loudly here: an error payload is a JSON object, not a list of
        # posts, and would otherwise corrupt the result of get_all_posts.
        response.raise_for_status()

        num_pages = int(response.headers.get("X-WP-TotalPages", 1))
        next_page = current_page + 1 if num_pages > current_page else None

        articles = json.loads(response.text)
        return {"articles": articles, "next_page": next_page}
|
load_data
Load data from the workspace.
Returns:

| Type | Description |
| --- | --- |
| List[Document] | List[Document]: List of documents. |
Source code in llama-index-integrations/readers/llama-index-readers-wordpress/llama_index/readers/wordpress/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def load_data(self) -> List[Document]:
    """Load every post of the site as a document.

    The HTML of each post is reduced to plain text with BeautifulSoup.
    The post's id, title, link and modification date are attached as
    ``extra_info``.

    Returns:
        List[Document]: One document per post, in the order returned by
            ``get_all_posts``.

    Raises:
        KeyError: If a post lacks ``id``, ``link`` or ``modified``.
    """
    from bs4 import BeautifulSoup

    def rendered(field):
        # Accept a mapping with a "rendered" entry or a bare string; anything
        # else (missing field, non-string value) is reported as None.
        if isinstance(field, dict):
            field = field.get("rendered")
        return field if isinstance(field, str) else None

    results = []
    for article in self.get_all_posts():
        # A post with empty or missing content still yields a document,
        # with empty text, instead of feeding a dict/None to the parser.
        html = rendered(article.get("content")) or ""
        body = BeautifulSoup(html, "html.parser").get_text()
        extra_info = {
            "id": article["id"],
            "title": rendered(article.get("title")),
            "url": article["link"],
            "updated_at": article["modified"],
        }
        results.append(Document(text=body, extra_info=extra_info))
    return results
|