Bases: BasePydanticReader
Youtube Transcript reader.
Source code in llama-index-integrations/readers/llama-index-readers-youtube-transcript/llama_index/readers/youtube_transcript/base.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class YoutubeTranscriptReader(BasePydanticReader):
    """Reader that loads YouTube video transcripts as ``Document`` objects.

    Each supplied link is matched against ``YOUTUBE_URL_PATTERNS`` to obtain
    a video id, and the transcript for that id is fetched through
    ``YouTubeTranscriptApi.get_transcript``.
    """

    is_remote: bool = True

    @classmethod
    def class_name(cls) -> str:
        """Get the name identifier of the class."""
        return "YoutubeTranscriptReader"

    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = ["en"],
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcripts of the given YouTube videos.

        Args:
            ytlinks (List[str]): YouTube links for which transcripts are to
                be read.
            languages (Optional[List[str]]): Language codes forwarded to
                ``YouTubeTranscriptApi.get_transcript``. ``None`` is treated
                like the default, ``["en"]``.
            **load_kwargs (Any): Accepted for interface compatibility; this
                method does not use them.

        Returns:
            List[Document]: One document per link, in input order. The text
            is the transcript chunks joined with newlines; the video id is
            used as the document id and stored under ``"video_id"`` in
            ``extra_info``.

        Raises:
            ValueError: If a link matches none of the supported URL patterns.
        """
        # Work on a private copy: the signature default is a shared list, and
        # ``None`` must not be passed on to the transcript API.
        languages = ["en"] if languages is None else list(languages)

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            if not video_id:
                # Only the first literal is an f-string; "{video_id}" in the
                # following plain literals is a verbatim placeholder.
                raise ValueError(
                    f"Supplied url {link} is not a supported youtube URL. "
                    "Supported formats include:\n"
                    "  youtube.com/watch?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtube.com/embed?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtu.be/{video_id} (never includes www subdomain)"
                )
            transcript_chunks = YouTubeTranscriptApi.get_transcript(
                video_id, languages=languages
            )
            transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
            results.append(
                Document(
                    text=transcript, id_=video_id, extra_info={"video_id": video_id}
                )
            )
        return results

    @staticmethod
    def _extract_video_id(yt_link: str) -> Optional[str]:
        """Return the first capture group of the first matching URL pattern.

        Patterns from ``YOUTUBE_URL_PATTERNS`` are tried in order with
        ``re.search``; ``None`` is returned when none of them match.
        """
        for pattern in YOUTUBE_URL_PATTERNS:
            match = re.search(pattern, yt_link)
            if match:
                return match.group(1)

        # return None if no match is found
        return None
|
class_name
classmethod
Get the name identifier of the class.
Source code in llama-index-integrations/readers/llama-index-readers-youtube-transcript/llama_index/readers/youtube_transcript/base.py
@classmethod
def class_name(cls) -> str:
    """Return the string identifier used for this reader class."""
    reader_name = "YoutubeTranscriptReader"
    return reader_name
|
load_data
load_data(ytlinks: List[str], languages: Optional[List[str]] = ['en'], **load_kwargs: Any) -> List[Document]
Load the transcripts of the given YouTube videos.
Parameters:
Name |
Type |
Description |
Default |
ytlinks |
List[str]
|
List of youtube links for which transcripts are to be read.
|
required
|
Source code in llama-index-integrations/readers/llama-index-readers-youtube-transcript/llama_index/readers/youtube_transcript/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def load_data(
    self,
    ytlinks: List[str],
    languages: Optional[List[str]] = ["en"],
    **load_kwargs: Any,
) -> List[Document]:
    """Load the transcripts of the given YouTube videos.

    Args:
        ytlinks (List[str]): YouTube links for which transcripts are to
            be read.
        languages (Optional[List[str]]): Language codes forwarded to
            ``YouTubeTranscriptApi.get_transcript``. ``None`` is treated
            like the default, ``["en"]``.
        **load_kwargs (Any): Accepted for interface compatibility; this
            method does not use them.

    Returns:
        List[Document]: One document per link, in input order. The text is
        the transcript chunks joined with newlines; the video id is used as
        the document id and stored under ``"video_id"`` in ``extra_info``.

    Raises:
        ValueError: If a link matches none of the supported URL patterns.
    """
    # Work on a private copy: the signature default is a shared list, and
    # ``None`` must not be passed on to the transcript API.
    languages = ["en"] if languages is None else list(languages)

    results = []
    for link in ytlinks:
        video_id = self._extract_video_id(link)
        if not video_id:
            # Only the first literal is an f-string; "{video_id}" in the
            # following plain literals is a verbatim placeholder.
            raise ValueError(
                f"Supplied url {link} is not a supported youtube URL. "
                "Supported formats include:\n"
                "  youtube.com/watch?v={video_id} "
                "(with or without 'www.')\n"
                "  youtube.com/embed?v={video_id} "
                "(with or without 'www.')\n"
                "  youtu.be/{video_id} (never includes www subdomain)"
            )
        transcript_chunks = YouTubeTranscriptApi.get_transcript(
            video_id, languages=languages
        )
        transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
        results.append(
            Document(
                text=transcript, id_=video_id, extra_info={"video_id": video_id}
            )
        )
    return results
|