Skip to content

Wordpress

WordpressReader #

Bases: BaseReader

Wordpress reader. Reads data from a Wordpress workspace.

Parameters:

Name Type Description Default
url str

Base URL of the Wordpress site (e.g. https://example.com)

required
password Optional[str]

Wordpress application password for authenticated requests. Default None.

None
username Optional[str]

Wordpress username for authenticated requests. Default None.

None
get_pages bool

Retrieve static Wordpress 'pages'. Default True.

True
get_posts bool

Retrieve Wordpress 'posts' (blog entries). Default True.

True
Source code in llama-index-integrations/readers/llama-index-readers-wordpress/llama_index/readers/wordpress/base.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
class WordpressReader(BaseReader):
    """Wordpress reader. Reads pages and posts from a Wordpress site
    via the WP REST API (``/wp-json/wp/v2/``).

    Args:
        url (str): Base URL of the Wordpress site (e.g. "https://example.com").
        password (Optional[str]): Wordpress application password, used together
            with ``username`` for HTTP Basic auth.  Default None (unauthenticated).
        username (Optional[str]): Wordpress username.  Default None.
        get_pages (bool): Retrieve static Wordpress 'pages'.  Default True.
        get_posts (bool): Retrieve Wordpress 'posts' (blog entries).  Default True.
    """

    def __init__(
        self,
        url: str,
        password: Optional[str] = None,
        username: Optional[str] = None,
        get_pages: bool = True,
        get_posts: bool = True,
    ) -> None:
        """Initialize Wordpress reader."""
        self.url = url
        self.username = username
        self.password = password
        self.get_pages = get_pages
        self.get_posts = get_posts

    def load_data(self) -> List[Document]:
        """Load data from the Wordpress site.

        Returns:
            List[Document]: One document per retrieved page/post.  The HTML
            body is stripped to plain text; id/title/url/updated_at are
            stored in ``extra_info``.
        """
        from bs4 import BeautifulSoup, GuessedAtParserWarning

        #  Suppressing this warning because guessing at the parser is the
        #  desired behavior -- we don't want to force lxml on packages
        #  where it's not installed.
        warnings.filterwarnings("ignore", category=GuessedAtParserWarning)

        results = []
        articles = []

        if self.get_pages:
            articles.extend(self.get_all_posts("pages"))

        if self.get_posts:
            articles.extend(self.get_all_posts("posts"))

        for article in articles:
            # The REST API normally returns {"content": {"rendered": ...}};
            # fall back to the raw value if "rendered" is absent.
            body = article.get("content", {}).get("rendered", None)
            if body is None:
                body = article.get("content")

            soup = BeautifulSoup(body)
            body = soup.get_text()

            title = article.get("title", {}).get("rendered", None)
            if not title:
                title = article.get("title")

            extra_info = {
                "id": article["id"],
                "title": title,
                "url": article["link"],
                "updated_at": article["modified"],
            }

            results.append(
                Document(
                    text=body,
                    extra_info=extra_info,
                )
            )
        return results

    def get_all_posts(self, post_type: str):
        """Fetch every article of ``post_type`` by walking REST pagination."""
        posts = []
        next_page = 1

        while next_page is not None:
            response = self.get_posts_page(post_type, next_page)
            posts.extend(response["articles"])
            next_page = response["next_page"]

        return posts

    def get_posts_page(self, post_type: str, current_page: int = 1):
        """Fetch one page (up to 100 items) of ``post_type``.

        Returns:
            dict: ``{"articles": [...], "next_page": page-number-or-None}``
            where ``next_page`` is None once the last page has been fetched.
        """
        import requests

        url = f"{self.url}/wp-json/wp/v2/{post_type}?per_page=100&page={current_page}"

        # Bug fix: the credentials accepted by __init__ were previously never
        # used, so protected content silently came back empty/forbidden.  Use
        # HTTP Basic auth (Wordpress application passwords) when provided.
        auth = (
            (self.username, self.password)
            if self.username and self.password
            else None
        )
        response = requests.get(url, auth=auth)

        # Wordpress reports the total page count in this header; assume a
        # single page when it is missing.
        num_pages = int(response.headers.get("X-WP-TotalPages", 1))
        next_page = current_page + 1 if num_pages > current_page else None

        return {"articles": response.json(), "next_page": next_page}

load_data #

load_data() -> List[Document]

Load data from the workspace.

Returns:

Type Description
List[Document]

List[Document]: List of documents.

Source code in llama-index-integrations/readers/llama-index-readers-wordpress/llama_index/readers/wordpress/base.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def load_data(self) -> List[Document]:
    """Load data from the workspace.

    Returns:
        List[Document]: List of documents.
    """
    from bs4 import BeautifulSoup, GuessedAtParserWarning

    #  Guessing at the parser is intentional here: we do not want to force
    #  lxml onto installs that lack it, so silence BeautifulSoup's warning.
    warnings.filterwarnings("ignore", category=GuessedAtParserWarning)

    articles = []
    if self.get_pages:
        articles += self.get_all_posts("pages")
    if self.get_posts:
        articles += self.get_all_posts("posts")

    documents = []
    for entry in articles:
        # REST responses nest the HTML under content.rendered; fall back
        # to the raw "content" value when "rendered" is absent.
        content = entry.get("content", {}).get("rendered", None)
        if content is None:
            content = entry.get("content")

        text = BeautifulSoup(content).get_text()

        heading = entry.get("title", {}).get("rendered", None)
        heading = heading or entry.get("title")

        metadata = {
            "id": entry["id"],
            "title": heading,
            "url": entry["link"],
            "updated_at": entry["modified"],
        }

        documents.append(Document(text=text, extra_info=metadata))

    return documents