Skip to content

Mongodb

SimpleMongoReader #

Bases: BaseReader

Simple mongo reader.

Concatenates each Mongo doc into Document used by LlamaIndex.

Parameters:

Name Type Description Default
host str

Mongo host.

None
port int

Mongo port.

None
Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
class SimpleMongoReader(BaseReader):
    """Simple mongo reader.

    Turns each Mongo document of a collection into a Document used by
    LlamaIndex by concatenating the values of selected fields.

    Either ``uri`` or both ``host`` and ``port`` must be supplied. When
    ``uri`` is given it takes precedence and ``host``/``port`` are ignored.

    Args:
        host (Optional[str]): Mongo host. Used together with ``port``.
        port (Optional[int]): Mongo port. Used together with ``host``.
        uri (Optional[str]): Mongo connection URI.

    Raises:
        ImportError: If the ``pymongo`` package is not installed.
        ValueError: If neither ``uri`` nor both ``host`` and ``port`` are
            provided.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None,
    ) -> None:
        """Create the underlying ``MongoClient`` from ``uri`` or ``host``/``port``."""
        # Imported lazily so the module can be imported without pymongo.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            client = MongoClient(uri)
        elif host and port:
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]:
        """Flatten one level of nesting.

        List items are spliced into the result; every other item is kept
        as a single element.
        """
        result: List[str] = []
        for text in texts:
            if isinstance(text, list):
                result.extend(text)
            else:
                result.append(text)
        return result

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        field_names: List[str] = ["text"],
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Lazily load documents from a Mongo collection.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (List[str]): names of the fields to be concatenated.
                Defaults to ["text"]. The default list is only read, never
                mutated.
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
                at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
                Defaults to None, which is sent as an empty filter.
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document. Fields missing from
                a Mongo document are stored as None. When this argument is
                given, a "collection" entry holding ``collection_name`` is
                also added to the metadata. Defaults to None

        Yields:
            Document: one Document per Mongo document returned by the query,
            with the Mongo ``_id`` (as a string) used as the Document id.

        Raises:
            ValueError: If one of ``field_names`` is missing from a returned
                Mongo document.

        """
        db = self.client[db_name]
        # Only request the fields that are actually used below.
        cursor = db[collection_name].find(
            filter=query_dict or {},
            limit=max_docs,
            projection={name: 1 for name in field_names + (metadata_names or [])},
        )

        for item in cursor:
            try:
                texts = [str(item[name]) for name in field_names]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            # NOTE(review): every value is already a str at this point, so
            # this flatten pass leaves the list unchanged; list-valued fields
            # end up as their str() representation. Confirm whether flattening
            # before stringifying was intended.
            texts = self._flatten(texts)
            text = separator.join(texts)

            if metadata_names is None:
                yield Document(text=text, id_=str(item["_id"]))
            else:
                # `.get` never raises KeyError, so no error handling is needed
                # here: absent metadata fields simply map to None.
                metadata = {name: item.get(name) for name in metadata_names}
                metadata["collection"] = collection_name
                yield Document(text=text, id_=str(item["_id"]), metadata=metadata)

lazy_load_data #

lazy_load_data(db_name: str, collection_name: str, field_names: List[str] = ['text'], separator: str = '', query_dict: Optional[Dict] = None, max_docs: int = 0, metadata_names: Optional[List[str]] = None) -> Iterable[Document]

Load data from a Mongo collection.

Parameters:

Name Type Description Default
db_name str

name of the database.

required
collection_name str

name of the collection.

required
field_names List[str]

names of the fields to be concatenated. Defaults to ["text"]

['text']
separator str

separator to be used between fields. Defaults to ""

''
query_dict Optional[Dict]

query to filter documents. Read more at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query). Defaults to None

None
max_docs int

maximum number of documents to load. Defaults to 0 (no limit)

0
metadata_names Optional[List[str]]

names of the fields to be added to the metadata attribute of the Document. Defaults to None

None

Returns:

Type Description
Iterable[Document]

Documents yielded lazily, one per Mongo document returned by the query.

Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def lazy_load_data(
    self,
    db_name: str,
    collection_name: str,
    field_names: List[str] = ["text"],
    separator: str = "",
    query_dict: Optional[Dict] = None,
    max_docs: int = 0,
    metadata_names: Optional[List[str]] = None,
) -> Iterable[Document]:
    """Lazily load documents from a Mongo collection.

    Args:
        db_name (str): name of the database.
        collection_name (str): name of the collection.
        field_names (List[str]): names of the fields to be concatenated.
            Defaults to ["text"]. The default list is only read, never
            mutated.
        separator (str): separator to be used between fields.
            Defaults to ""
        query_dict (Optional[Dict]): query to filter documents. Read more
            at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
            Defaults to None, which is sent as an empty filter.
        max_docs (int): maximum number of documents to load.
            Defaults to 0 (no limit)
        metadata_names (Optional[List[str]]): names of the fields to be added
            to the metadata attribute of the Document. Fields missing from
            a Mongo document are stored as None. When this argument is
            given, a "collection" entry holding ``collection_name`` is
            also added to the metadata. Defaults to None

    Yields:
        Document: one Document per Mongo document returned by the query,
        with the Mongo ``_id`` (as a string) used as the Document id.

    Raises:
        ValueError: If one of ``field_names`` is missing from a returned
            Mongo document.

    """
    db = self.client[db_name]
    # Only request the fields that are actually used below.
    cursor = db[collection_name].find(
        filter=query_dict or {},
        limit=max_docs,
        projection={name: 1 for name in field_names + (metadata_names or [])},
    )

    for item in cursor:
        try:
            texts = [str(item[name]) for name in field_names]
        except KeyError as err:
            raise ValueError(
                f"{err.args[0]} field not found in Mongo document."
            ) from err

        # NOTE(review): every value is already a str at this point, so this
        # flatten pass leaves the list unchanged; list-valued fields end up
        # as their str() representation. Confirm whether flattening before
        # stringifying was intended.
        texts = self._flatten(texts)
        text = separator.join(texts)

        if metadata_names is None:
            yield Document(text=text, id_=str(item["_id"]))
        else:
            # `.get` never raises KeyError, so no error handling is needed
            # here: absent metadata fields simply map to None.
            metadata = {name: item.get(name) for name in metadata_names}
            metadata["collection"] = collection_name
            yield Document(text=text, id_=str(item["_id"]), metadata=metadata)