Mongodb

SimpleMongoReader #

Bases: BaseReader

Simple mongo reader.

Concatenates each Mongo doc into Document used by LlamaIndex.

Parameters:

Name	Type	Description	Default
`host`	`str`	Mongo host.	`None`
`port`	`int`	Mongo port.	`None`
`uri`	`str`	Mongo connection URI. Takes precedence over `host` and `port` when provided.	`None`

Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py

class SimpleMongoReader(BaseReader):
    """Simple mongo reader.

    Concatenates each Mongo doc into Document used by LlamaIndex.

    Args:
        host (Optional[str]): Mongo host. Used together with ``port`` when
            no ``uri`` is given.
        port (Optional[int]): Mongo port. Used together with ``host`` when
            no ``uri`` is given.
        uri (Optional[str]): Mongo connection URI. Takes precedence over
            ``host`` and ``port`` when provided.

    Raises:
        ImportError: if the ``pymongo`` package is not installed.
        ValueError: if neither ``uri`` nor both ``host`` and ``port`` are
            provided.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None,
    ) -> None:
        """Initialize with parameters."""
        # Imported lazily so that pymongo is only required when this reader
        # is actually instantiated.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            # A URI wins over host/port when both are supplied.
            client = MongoClient(uri)
        elif host and port:
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]:
        """Flatten one level of nesting: list items are spliced in, others kept."""
        result = []
        for text in texts:
            result += text if isinstance(text, list) else [text]
        return result

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        field_names: List[str] = ["text"],
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Lazily load documents from a Mongo collection.

        Each matching Mongo document becomes one Document whose text is the
        string form of the requested fields joined with ``separator`` and
        whose id is the string form of the Mongo ``_id``.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (List[str]): names of the fields to be concatenated.
                Defaults to ["text"]
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
                at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
                Defaults to None
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document. Defaults to None

        Yields:
            Document: one document per Mongo document returned by the query.

        Raises:
            ValueError: if one of ``field_names`` is missing from a Mongo
                document.

        """
        # Only fetch the fields that are actually used for text or metadata.
        projection = {name: 1 for name in field_names + (metadata_names or [])}
        collection = self.client[db_name][collection_name]
        cursor = collection.find(
            filter=query_dict or {},
            limit=max_docs,
            projection=projection,
        )

        for item in cursor:
            try:
                texts = [str(item[name]) for name in field_names]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            # NOTE(review): every element is already a str at this point, so
            # _flatten returns the list unchanged; kept to preserve behavior.
            texts = self._flatten(texts)
            text = separator.join(texts)

            if metadata_names is None:
                yield Document(text=text, id_=str(item["_id"]))
            else:
                # `.get` never raises KeyError: a metadata field missing from
                # the Mongo document is stored as None.
                metadata = {name: item.get(name) for name in metadata_names}
                metadata["collection"] = collection_name
                yield Document(text=text, id_=str(item["_id"]), metadata=metadata)

lazy_load_data #

lazy_load_data(db_name: str, collection_name: str, field_names: List[str] = ['text'], separator: str = '', query_dict: Optional[Dict] = None, max_docs: int = 0, metadata_names: Optional[List[str]] = None) -> Iterable[Document]

Lazily load documents from a Mongo collection.

Parameters:

Name	Type	Description	Default
`db_name`	`str`	name of the database.	required
`collection_name`	`str`	name of the collection.	required
`field_names`	`List[str]`	names of the fields to be concatenated. Defaults to ["text"]	`['text']`
`separator`	`str`	separator to be used between fields. Defaults to ""	`''`
`query_dict`	`Optional[Dict]`	query to filter documents. Read more at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query). Defaults to None	`None`
`max_docs`	`int`	maximum number of documents to load. Defaults to 0 (no limit)	`0`
`metadata_names`	`Optional[List[str]]`	names of the fields to be added to the metadata attribute of the Document. Defaults to None	`None`

Returns:

Type	Description
`Iterable[Document]`	An iterable that yields one Document per Mongo document returned by the query.

Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py

def lazy_load_data(
    self,
    db_name: str,
    collection_name: str,
    field_names: List[str] = ["text"],
    separator: str = "",
    query_dict: Optional[Dict] = None,
    max_docs: int = 0,
    metadata_names: Optional[List[str]] = None,
) -> Iterable[Document]:
    """Lazily load documents from a Mongo collection.

    Each matching Mongo document becomes one Document whose text is the
    string form of the requested fields joined with ``separator`` and
    whose id is the string form of the Mongo ``_id``.

    Args:
        db_name (str): name of the database.
        collection_name (str): name of the collection.
        field_names (List[str]): names of the fields to be concatenated.
            Defaults to ["text"]
        separator (str): separator to be used between fields.
            Defaults to ""
        query_dict (Optional[Dict]): query to filter documents. Read more
            at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
            Defaults to None
        max_docs (int): maximum number of documents to load.
            Defaults to 0 (no limit)
        metadata_names (Optional[List[str]]): names of the fields to be added
            to the metadata attribute of the Document. Defaults to None

    Yields:
        Document: one document per Mongo document returned by the query.

    Raises:
        ValueError: if one of ``field_names`` is missing from a Mongo
            document.

    """
    # Only fetch the fields that are actually used for text or metadata.
    projection = {name: 1 for name in field_names + (metadata_names or [])}
    collection = self.client[db_name][collection_name]
    cursor = collection.find(
        filter=query_dict or {},
        limit=max_docs,
        projection=projection,
    )

    for item in cursor:
        try:
            texts = [str(item[name]) for name in field_names]
        except KeyError as err:
            raise ValueError(
                f"{err.args[0]} field not found in Mongo document."
            ) from err

        # NOTE(review): every element is already a str at this point, so
        # self._flatten presumably returns the list unchanged; kept to
        # preserve behavior.
        texts = self._flatten(texts)
        text = separator.join(texts)

        if metadata_names is None:
            yield Document(text=text, id_=str(item["_id"]))
        else:
            # `.get` never raises KeyError: a metadata field missing from the
            # Mongo document is stored as None.
            metadata = {name: item.get(name) for name in metadata_names}
            metadata["collection"] = collection_name
            yield Document(text=text, id_=str(item["_id"]), metadata=metadata)