
Abstractions for document processing #2833

Closed
wants to merge 17 commits
56 changes: 53 additions & 3 deletions langchain/document_loaders/base.py
@@ -1,14 +1,17 @@
"""Base loader class."""
from __future__ import annotations

import abc
Contributor: del

from abc import ABC, abstractmethod
- from typing import List, Optional
+ from typing import List, Optional, Generator

from langchain.docstore.document import Document
from langchain.document_loaders.blob_loaders import Blob
Contributor: is it weird that base imports from something more nested

Contributor: perhaps BlobParser should live in blob_loaders?

Author: Yeah, we can definitely move the BlobParser; we'll need to decide on the most logical place.

One thing I wasn't so sure about is that the document loaders right now are basically content fetchers + parsers, and the majority of their work should be focused on parsing, so it felt like a parser abstraction could be accommodated at this level of the hierarchy. I'll start diffing smaller, more careful changes now, and we can iron out all the namespaces to make sure everything is in the most logical place.

base aka typedefs ideally don't import much at all (except for other typedefs), so there's that...

IMO it's generally OK to import more nested code, since the nested code is owned by the importing code. The dangerous imports are when reaching into parent or sibling paths, since that means reaching into nested code that doesn't belong to the given module/package.

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter


class BaseLoader(ABC):
"""Base loader class."""
"""Base loader for documents."""

@abstractmethod
def load(self) -> List[Document]:
@@ -24,3 +27,50 @@ def load_and_split(
_text_splitter = text_splitter
docs = self.load()
return _text_splitter.split_documents(docs)

@abstractmethod
def lazy_load(
self,
) -> Generator[Document, None, None]:
"""A lazy loader for document content."""
raise NotImplementedError()


class BaseBlobParser(ABC):
"""Abstract interface for blob parsers.

* A blob is a representation of raw data
* A blob parser provides a way to parse a blob into one or more documents
"""

@abc.abstractmethod
Contributor: @abstractmethod

def lazy_parse(self, blob: Blob) -> Generator[Document, None, None]:
"""Lazy parsing interface.

Subclasses should implement this method.

Args:
blob: Blob instance

Returns:
Generator of documents
"""
raise NotImplementedError()

def parse(self, blob: Blob) -> List[Document]:
"""Eagerly parse the blob into a document or documents.

This is a convenience method when prototyping interactively.

For serious use, the lazy_parse method should be used instead as it allows
for lazy loading of content.

Subclasses should generally not override this parse method.

Args:
blob: Blob instance

Returns:
List of documents
"""
return list(self.lazy_parse(blob))
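
Taken together, the two abstractions are meant to compose: a blob loader fetches raw content and a blob parser turns each blob into documents. Below is a minimal sketch of how a concrete loader could wire them together; GenericLoader and TextParser are hypothetical names used for illustration, not part of this diff:

    from typing import Generator, List

    from langchain.docstore.document import Document
    from langchain.document_loaders.base import BaseBlobParser, BaseLoader
    from langchain.document_loaders.blob_loaders import Blob, BlobLoader


    class TextParser(BaseBlobParser):
        """Hypothetical parser: decode each blob into a single text document."""

        def lazy_parse(self, blob: Blob) -> Generator[Document, None, None]:
            yield Document(
                page_content=blob.as_string(),
                metadata={"source": blob.source},
            )


    class GenericLoader(BaseLoader):
        """Hypothetical loader that composes a blob loader with a blob parser."""

        def __init__(self, blob_loader: BlobLoader, blob_parser: BaseBlobParser):
            self.blob_loader = blob_loader
            self.blob_parser = blob_parser

        def lazy_load(self) -> Generator[Document, None, None]:
            # Stream documents end to end: blobs are fetched lazily and each
            # blob is parsed lazily, so nothing is held in memory unnecessarily.
            for blob in self.blob_loader.yield_blobs():
                yield from self.blob_parser.lazy_parse(blob)

        def load(self) -> List[Document]:
            return list(self.lazy_load())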
5 changes: 5 additions & 0 deletions langchain/document_loaders/blob_loaders/__init__.py
@@ -0,0 +1,5 @@
from langchain.document_loaders.blob_loaders.file_system import FileSystemLoader
from langchain.document_loaders.blob_loaders.gcs import GCSBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader

__all__ = ["BlobLoader", "GCSBlobLoader", "FileSystemLoader", "Blob"]
1 change: 1 addition & 0 deletions langchain/document_loaders/blob_loaders/azure.py
@@ -0,0 +1 @@
"""Placeholder"""
46 changes: 46 additions & 0 deletions langchain/document_loaders/blob_loaders/file_system.py
@@ -0,0 +1,46 @@
"""Code to load blobs from a file system."""
from pathlib import Path
from typing import Generator

from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader


def _is_visible(p: Path) -> bool:
"""Is the given path visible."""
parts = p.parts
for _p in parts:
if _p.startswith("."):
return False
return True


# PUBLIC API


class FileSystemLoader(BlobLoader):
Contributor: FileSystemBlobLoader might be clearer

"""Loading logic for loading documents from a directory."""

def __init__(
Contributor: @hwchase17 how do we think about when to use pydantic (and let it handle constructors) vs not?

Author: Good catch -- this should probably have been pydantic; I was moving code around carelessly.

self,
path: str,
glob: str = "**/[!.]*",
*,
Contributor: what are the pros of including this?

Author: this being * or the default values?

Contributor: *

load_hidden: bool = False,
recursive: bool = False,
):
"""Initialize with path to directory and how to glob over it."""
self.path = path
self.glob = glob
self.load_hidden = load_hidden
self.recursive = recursive

def yield_blobs(
self,
) -> Generator[Blob, None, None]:
"""Yield blobs that match the requested pattern."""
p = Path(self.path)
items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
for item in items:
if item.is_file():
if _is_visible(item.relative_to(p)) or self.load_hidden:
Contributor: can we flip the check, since checking load_hidden first is (ever so slightly) faster

Author: yeah, can update -- this was copied from an existing implementation

yield Blob.from_path(str(item))
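
For illustration, a quick usage sketch of the loader above; the directory path and glob are placeholders:

    from langchain.document_loaders.blob_loaders import FileSystemLoader

    # Find markdown files under /tmp/docs; hidden files are skipped by default.
    loader = FileSystemLoader("/tmp/docs", glob="*.md", recursive=True)
    for blob in loader.yield_blobs():
        print(blob.source)  # filesystem path of the matched file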
33 changes: 33 additions & 0 deletions langchain/document_loaders/blob_loaders/gcs.py
@@ -0,0 +1,33 @@
"""Loading logic for loading documents from an GCS directory."""
from io import BytesIO
from typing import Generator

from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader


class GCSBlobLoader(BlobLoader):
"""GSC blob loader."""

def __init__(self, project_name: str, bucket: str, prefix: str = "") -> None:
"""Initialize with bucket and key name."""
self.project_name = project_name
self.bucket = bucket
self.prefix = prefix

def yield_blobs(
self,
) -> Generator[Blob, None, None]:
"""Yield blobs matching the given pattern."""

try:
from google.cloud import storage
except ImportError:
raise ValueError(
"Could not import google-cloud-storage python package. "
"Please install it with `pip install google-cloud-storage`."
)
client = storage.Client(project=self.project_name)
bytes_io = BytesIO()
for blob in client.list_blobs(self.bucket, prefix=self.prefix):
client.download_blob_to_file(blob, bytes_io)
yield Blob(path_like=blob.name, data=bytes_io.seek(0))
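
Usage would look something along these lines; it assumes google-cloud-storage is installed, and the project, bucket, and prefix names are placeholders:

    from langchain.document_loaders.blob_loaders import GCSBlobLoader

    loader = GCSBlobLoader(project_name="my-project", bucket="my-bucket", prefix="reports/")
    for blob in loader.yield_blobs():
        # Each blob's data was downloaded into memory; source is the object name.
        print(blob.source, len(blob.as_bytes()))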
1 change: 1 addition & 0 deletions langchain/document_loaders/blob_loaders/s3.py
@@ -0,0 +1 @@
"""Placeholder"""
129 changes: 129 additions & 0 deletions langchain/document_loaders/blob_loaders/schema.py
@@ -0,0 +1,129 @@
import contextlib
import mimetypes
from abc import ABC, abstractmethod
from io import BufferedReader, BytesIO
from pathlib import PurePath
from typing import Generator, Optional, Union

from pydantic import BaseModel

PathLike = Union[str, PurePath]


class Blob(BaseModel):
"""A blob is used to represent raw data by either reference or value.

Provides an interface to materialize the blob in different representations.

This is inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob

"""

data: Union[bytes, str, None]
mimetype: Optional[str] = None
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string
# Location where the original content was found.
# Represents a location on the local file system.
# Useful for situations where downstream code assumes it must work with file paths
# rather than in-memory content.
path: Optional[PathLike] = None
Contributor: So if this was pulled from e.g. a URL, this would be None? A temp dir?

Author: Undecided -- we can set it to None, or to the source URL if known, or to a temp location on disk if the content was downloaded to the file system rather than stored in memory.

Any opinions?

We could extend it to support URLs, but supporting a driver for loading content feels a bit complex... e.g., there's more than one way to fetch an HTML file (one with requests, and another with something like playwright to execute the js).

Contributor: I don't have opinions here. You're right, that sounds complex.


class Config:
arbitrary_types_allowed = True
frozen = True

@property
def source(self) -> Optional[str]:
"""The source location of the blob as string if known otherwise none."""
return str(self.path) if self.path else None

def as_string(self) -> str:
"""Read data as a string."""
encoding = self.encoding or "utf-8"
if self.data is None and self.path:
with open(str(self.path), "r", encoding=encoding) as f:
return f.read()
elif isinstance(self.data, bytes):
return self.data.decode(encoding)
elif isinstance(self.data, str):
return self.data
else:
raise ValueError(f"Unable to get string for blob {self}")

def as_bytes(self) -> bytes:
"""Read data as bytes."""
if isinstance(self.data, bytes):
return self.data
elif self.data is None and self.path:
with open(str(self.path), "rb") as f:
return f.read()
else:
raise NotImplementedError(f"Unable to get bytes for blob {self}")

@contextlib.contextmanager
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
"""Read data as a byte stream."""
if isinstance(self.data, bytes):
yield BytesIO(self.data)
elif self.data is None and self.path:
with open(str(self.path), "rb") as f:
yield f
else:
raise NotImplementedError(f"Unable to convert blob {self}")

@classmethod
def from_path(
cls,
path: Union[str, PurePath],
*,
encoding: str = "utf-8",
guess_type: bool = True,
) -> "Blob":
"""Load the blob from a path like object.

Args:
path: path like object to file to be read
encoding: Encoding to use if decoding the bytes into a string
guess_type: If True, the mimetype will be guessed from the file extension

Returns:
Blob instance
"""
mimetype = mimetypes.guess_type(path)[0] if guess_type else None
Contributor: ooc: would it make sense to make the mimetype required and use a pattern like

    @classmethod
    def from_path(
        cls,
        path: Union[str, PurePath],
        *,
        encoding: str = "utf-8",
        mimetype: Optional[str] = None,
    ) -> "Blob":
        ...
        if mimetype is None:
            mimetype = mimetypes.guess_type(path)[0]
        ...

?

Author: Good idea

# We do not load the data immediately!
# Instead we treat the blob as containing a reference to the underlying data.
return cls(data=None, mimetype=mimetype, encoding=encoding, path=path)

@classmethod
def from_data(
cls,
data: Union[str, bytes],
*,
encoding: str = "utf-8",
mime_type: Optional[str] = None,
path: Optional[str] = None,
) -> "Blob":
"""Initialize the blob from in-memory data.
Args:
data: the in-memory data associated with the blob
encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data
path: if provided, will be set as the source from which the data came

Returns:
Blob instance
"""
return cls(data=data, mime_type=mime_type, encoding=encoding, path=path)

def __repr__(self) -> str:
"""Define the blob representation."""
str_repr = f"Blob {id(self)}"
if self.source:
str_repr += f" {self.source}"
return str_repr


class BlobLoader(ABC):
@abstractmethod
def yield_blobs(
self,
) -> Generator[Blob, None, None]:
"""A lazy loader for raw data represented by LangChain's Blob object."""
raise NotImplementedError()
Contributor: no need for this if it's abstract
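
To round out the schema, here is how a Blob could be constructed and materialized in its different representations; this is a minimal sketch with placeholder paths:

    from langchain.document_loaders.blob_loaders import Blob

    # By reference: nothing is read from disk until a representation is requested.
    blob = Blob.from_path("/tmp/example.txt")
    text = blob.as_string()   # decoded using blob.encoding (utf-8 by default)
    raw = blob.as_bytes()     # raw bytes
    with blob.as_bytes_io() as f:
        first_line = f.readline()

    # By value: data is held in memory, with an optional source attached.
    in_memory = Blob.from_data("hello world", path="inline-example")
    assert in_memory.as_string() == "hello world"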
