Bases: BasePydanticReader
General reader for any S3 file or directory.
If key is not set, the entire bucket (filtered by prefix) is parsed.
Args:
bucket (str): the name of your S3 bucket
key (Optional[str]): the name of the specific file. If none is provided,
this loader will iterate through the entire bucket.
prefix (Optional[str]): the prefix to filter by in the case that the loader
iterates through the entire bucket. Defaults to empty string.
recursive (bool): Whether to recursively search in subdirectories.
True by default.
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
extension to a BaseReader class that specifies how to convert that file
to text. See SimpleDirectoryReader
for more details.
required_exts (Optional[List[str]]): List of required extensions.
Default is None.
num_files_limit (Optional[int]): Maximum number of files to read.
Default is None.
file_metadata (Optional[Callable[[str], Dict]]): A function that takes
in a filename and returns a Dict of metadata for the Document.
Default is None.
aws_access_id (Optional[str]): provide AWS access key ID directly.
aws_access_secret (Optional[str]): provide AWS secret access key directly.
s3_endpoint_url (Optional[str]): provide S3 endpoint URL directly.
Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class S3Reader(BasePydanticReader):
    """
    General reader for any S3 file or directory.

    If key is not set, the entire bucket (filtered by prefix) is parsed.

    Args:
        bucket (str): the name of your S3 bucket
        key (Optional[str]): the name of the specific file. If none is provided,
            this loader will iterate through the entire bucket.
        prefix (Optional[str]): the prefix to filter by in the case that the loader
            iterates through the entire bucket. Defaults to empty string.
        recursive (bool): Whether to recursively search in subdirectories.
            True by default.
        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
            extension to a BaseReader class that specifies how to convert that file
            to text. See `SimpleDirectoryReader` for more details.
        required_exts (Optional[List[str]]): List of required extensions.
            Default is None.
        filename_as_id (bool): Forwarded to `SimpleDirectoryReader`.
            True by default.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Default is None.
        file_metadata (Optional[Callable[[str], Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            Default is None.
        aws_access_id (Optional[str]): provide AWS access key ID directly.
        aws_access_secret (Optional[str]): provide AWS secret access key directly.
        aws_session_token (Optional[str]): provide AWS session token directly.
        s3_endpoint_url (Optional[str]): provide S3 endpoint URL directly.
    """

    is_remote: bool = True

    bucket: str
    key: Optional[str] = None
    prefix: Optional[str] = ""
    recursive: bool = True
    # Excluded from serialization (exclude=True).
    file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field(
        default=None, exclude=True
    )
    required_exts: Optional[List[str]] = None
    filename_as_id: bool = True
    num_files_limit: Optional[int] = None
    # Excluded from serialization (exclude=True).
    file_metadata: Optional[Callable[[str], Dict]] = Field(default=None, exclude=True)
    aws_access_id: Optional[str] = None
    aws_access_secret: Optional[str] = None
    aws_session_token: Optional[str] = None
    s3_endpoint_url: Optional[str] = "https://s3.amazonaws.com"
    # NOTE(review): not read by any method in this class -- confirm whether it is still needed.
    custom_reader_path: Optional[str] = None

    @classmethod
    def class_name(cls) -> str:
        """Return the name of this reader class."""
        return "S3Reader"

    def load_s3_files_as_docs(self, temp_dir=None) -> List[Document]:
        """
        Load file(s) from S3.

        Args:
            temp_dir: Not used by this method; kept for backward compatibility.

        Returns:
            List[Document]: The documents produced by `SimpleDirectoryReader`.
        """
        # Imported lazily so s3fs is only needed when data is actually loaded.
        from s3fs import S3FileSystem

        s3fs = S3FileSystem(
            key=self.aws_access_id,
            endpoint_url=self.s3_endpoint_url,
            secret=self.aws_access_secret,
            token=self.aws_session_token,
        )

        input_dir = self.bucket
        input_files = None

        if self.key:
            # A specific object was requested: read only that file.
            input_files = [f"{self.bucket}/{self.key}"]
        elif self.prefix:
            # No key: restrict the bucket listing to the given prefix.
            input_dir = f"{input_dir}/{self.prefix}"

        loader = SimpleDirectoryReader(
            input_dir=input_dir,
            input_files=input_files,
            file_extractor=self.file_extractor,
            required_exts=self.required_exts,
            filename_as_id=self.filename_as_id,
            num_files_limit=self.num_files_limit,
            file_metadata=self.file_metadata,
            recursive=self.recursive,
            fs=s3fs,
        )

        return loader.load_data()

    def load_data(self, custom_temp_subdir: str = None) -> List[Document]:
        """
        Load the file(s) from S3.

        Args:
            custom_temp_subdir (str, optional): This parameter is deprecated and unused. Defaults to None.

        Returns:
            List[Document]: A list of documents loaded from S3. Each document's
                ``id_`` is prefixed with ``<s3_endpoint_url>_``, or with ``s3_``
                when no endpoint URL is configured.
        """
        if custom_temp_subdir is not None:
            warnings.warn(
                "The `custom_temp_subdir` parameter is deprecated and unused. Please remove it from your code.",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the caller's line
            )

        # `s3_endpoint_url` is Optional: concatenating None with a str would
        # raise TypeError, so fall back to a fixed prefix in that case only.
        id_prefix = "s3" if self.s3_endpoint_url is None else self.s3_endpoint_url

        documents = self.load_s3_files_as_docs()
        for doc in documents:
            doc.id_ = id_prefix + "_" + doc.id_

        return documents
|
load_s3_files_as_docs
load_s3_files_as_docs(temp_dir=None) -> List[Document]
Load file(s) from S3.
Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def load_s3_files_as_docs(self, temp_dir=None) -> List[Document]:
    """
    Read the configured S3 object(s) and return them as documents.

    A single file is read when ``key`` is set; otherwise the bucket is read,
    narrowed to ``prefix`` when one is given.

    Args:
        temp_dir: Not used by this method.

    Returns:
        List[Document]: The documents produced by `SimpleDirectoryReader`.
    """
    from s3fs import S3FileSystem

    filesystem = S3FileSystem(
        key=self.aws_access_id,
        secret=self.aws_access_secret,
        token=self.aws_session_token,
        endpoint_url=self.s3_endpoint_url,
    )

    # Work out what to read: one explicit object, a prefixed listing,
    # or the whole bucket.
    if self.key:
        target_dir = self.bucket
        target_files = [f"{self.bucket}/{self.key}"]
    elif self.prefix:
        target_dir = f"{self.bucket}/{self.prefix}"
        target_files = None
    else:
        target_dir = self.bucket
        target_files = None

    reader_options = {
        "input_dir": target_dir,
        "input_files": target_files,
        "file_extractor": self.file_extractor,
        "required_exts": self.required_exts,
        "filename_as_id": self.filename_as_id,
        "num_files_limit": self.num_files_limit,
        "file_metadata": self.file_metadata,
        "recursive": self.recursive,
        "fs": filesystem,
    }
    return SimpleDirectoryReader(**reader_options).load_data()
|
load_data
load_data(custom_temp_subdir: str = None) -> List[Document]
Load the file(s) from S3.
Parameters:
Name |
Type |
Description |
Default |
custom_temp_subdir |
str
|
This parameter is deprecated and unused. Defaults to None.
|
None
|
Returns:
Type |
Description |
List[Document]
|
List[Document]: A list of documents loaded from S3.
|
Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def load_data(self, custom_temp_subdir: str = None) -> List[Document]:
    """
    Load the file(s) from S3.

    Args:
        custom_temp_subdir (str, optional): This parameter is deprecated and unused. Defaults to None.

    Returns:
        List[Document]: A list of documents loaded from S3. Each document's
            ``id_`` is prefixed with ``<s3_endpoint_url>_``, or with ``s3_``
            when no endpoint URL is configured.
    """
    if custom_temp_subdir is not None:
        warnings.warn(
            "The `custom_temp_subdir` parameter is deprecated and unused. Please remove it from your code.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )

    # `s3_endpoint_url` is Optional: concatenating None with a str would
    # raise TypeError, so fall back to a fixed prefix in that case only.
    id_prefix = "s3" if self.s3_endpoint_url is None else self.s3_endpoint_url

    documents = self.load_s3_files_as_docs()
    for doc in documents:
        doc.id_ = id_prefix + "_" + doc.id_

    return documents
|