Web Page Reader¶
Demonstrates our web page reader.
In [ ]:
Copied!
%pip install llama-index-readers-web
%pip install llama-index-readers-web
In [ ]:
Copied!
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
Using SimpleWebPageReader¶
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
In [ ]:
Copied!
!pip install llama-index
!pip install llama-index
In [ ]:
Copied!
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os
In [ ]:
Copied!
# NOTE: the html_to_text=True option requires html2text to be installed
# NOTE: the html_to_text=True option requires html2text to be installed
In [ ]:
Copied!
documents = SimpleWebPageReader(html_to_text=True).load_data(
["http://paulgraham.com/worked.html"]
)
documents = SimpleWebPageReader(html_to_text=True).load_data(
["http://paulgraham.com/worked.html"]
)
In [ ]:
Copied!
documents[0]
documents[0]
In [ ]:
Copied!
index = SummaryIndex.from_documents(documents)
index = SummaryIndex.from_documents(documents)
In [ ]:
Copied!
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
In [ ]:
Copied!
display(Markdown(f"<b>{response}</b>"))
display(Markdown(f"{response}"))
Using TrafilaturaWebReader¶
In [ ]:
Copied!
from llama_index.readers.web import TrafilaturaWebReader
from llama_index.readers.web import TrafilaturaWebReader
In [ ]:
Copied!
documents = TrafilaturaWebReader().load_data(
["http://paulgraham.com/worked.html"]
)
documents = TrafilaturaWebReader().load_data(
["http://paulgraham.com/worked.html"]
)
In [ ]:
Copied!
index = SummaryIndex.from_documents(documents)
index = SummaryIndex.from_documents(documents)
In [ ]:
Copied!
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
In [ ]:
Copied!
display(Markdown(f"<b>{response}</b>"))
display(Markdown(f"{response}"))
Using RssReader¶
In [ ]:
Copied!
from llama_index.core import SummaryIndex
from llama_index.readers.web import RssReader
documents = RssReader().load_data(
["https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"]
)
index = SummaryIndex.from_documents(documents)
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What happened in the news today?")
from llama_index.core import SummaryIndex
from llama_index.readers.web import RssReader
documents = RssReader().load_data(
["https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"]
)
index = SummaryIndex.from_documents(documents)
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What happened in the news today?")