-
Notifications
You must be signed in to change notification settings - Fork 16.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Abstractions for document processing #2833
Changes from all commits
31f92cf
b3f3ed0
7e4d811
2b5a29f
c2dae41
5e93db3
1e359ee
99e40bd
73e11e6
3c46c2d
919532f
905d0c3
5572747
b7f98dc
da7c6c6
fa3be97
ac2fe24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,17 @@ | ||
"""Base loader class.""" | ||
from __future__ import annotations | ||
|
||
import abc | ||
from abc import ABC, abstractmethod | ||
from typing import List, Optional | ||
from typing import List, Optional, Generator | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.blob_loaders import Blob | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it weird that base imports from something more nested There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. perhaps BlobParser should live in blob_loaders? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah we can definitely move the One thing I wasn't so sure about is that the document loaders right now are basically content fetchers + parsers and the majority of their work should be focused on parsing, so it felt like a parser abstraction could be accommodated at this level of the hierarchy. I'll start diffing smaller more careful changes now, and we can iron out all the namespaces to make sure everything is in the most logical place.
IMO it's generally OK to import more nested code, since the nested code is owned by the importing code. The dangerous imports are when reaching into parent paths or sibling paths since that means that we're reaching into nested code that doesn't belong to the given module/package |
||
from langchain.schema import Document | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter | ||
|
||
|
||
class BaseLoader(ABC): | ||
"""Base loader class.""" | ||
"""Base loader for documents.""" | ||
|
||
@abstractmethod | ||
def load(self) -> List[Document]: | ||
|
@@ -24,3 +27,50 @@ def load_and_split( | |
_text_splitter = text_splitter | ||
docs = self.load() | ||
return _text_splitter.split_documents(docs) | ||
|
||
@abstractmethod | ||
def lazy_load( | ||
self, | ||
) -> Generator[Document, None, None]: | ||
"""A lazy loader for document content.""" | ||
raise NotImplementedError() | ||
|
||
|
||
class BaseBlobParser(ABC):
    """Abstract interface for blob parsers.

    * A blob is a representation of raw data
    * A blob parser provides a way to parse a blob into one or more documents
    """

    # NOTE: use the bare `@abstractmethod` form for consistency with the
    # other abstract classes in this module (previously `@abc.abstractmethod`).
    @abstractmethod
    def lazy_parse(self, blob: Blob) -> Generator[Document, None, None]:
        """Lazy parsing interface.

        Subclasses should implement this method and lazily parse the blob's
        content into documents, yielding them one at a time.

        Args:
            blob: Blob instance

        Returns:
            Generator of documents
        """
        raise NotImplementedError()

    def parse(self, blob: Blob) -> List[Document]:
        """Eagerly parse the blob into a document or documents.

        This is a convenience method when prototyping interactively.

        For serious use, the lazy_parse method should be used instead as it allows
        for lazy loading of content.

        Subclasses should generally not over-ride this parse method.

        Args:
            blob: Blob instance

        Returns:
            List of documents
        """
        return list(self.lazy_parse(blob))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from langchain.document_loaders.blob_loaders.file_system import FileSystemLoader
from langchain.document_loaders.blob_loaders.gcs import GCSBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader

# Public API of the blob_loaders package.
__all__ = [
    "BlobLoader",
    "GCSBlobLoader",
    "FileSystemLoader",
    "Blob",
]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Placeholder""" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Code to load blobs from a file system.""" | ||
from pathlib import Path | ||
from typing import Generator | ||
|
||
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader | ||
|
||
|
||
def _is_visible(p: Path) -> bool: | ||
"""Is the given path visible.""" | ||
parts = p.parts | ||
for _p in parts: | ||
if _p.startswith("."): | ||
return False | ||
return True | ||
|
||
|
||
# PUBLIC API | ||
|
||
|
||
class FileSystemLoader(BlobLoader):
    """Load blobs from the local file system by globbing over a directory.

    Note: despite the class docstring in the original saying "documents",
    this loader yields raw Blob objects, not parsed documents.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*",
        *,
        load_hidden: bool = False,
        recursive: bool = False,
    ) -> None:
        """Initialize with path to directory and how to glob over it.

        Args:
            path: Directory to load blobs from.
            glob: Glob pattern, relative to path, used to select files.
            load_hidden: If True, also yield files under hidden (dot-prefixed)
                path components.
            recursive: If True, use Path.rglob instead of Path.glob.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive

    def yield_blobs(
        self,
    ) -> Generator[Blob, None, None]:
        """Yield blobs that match the requested pattern."""
        root = Path(self.path)
        items = root.rglob(self.glob) if self.recursive else root.glob(self.glob)
        for item in items:
            # Check the cheap boolean flag first so the per-path visibility
            # computation is skipped entirely when hidden files are allowed
            # (ordering suggested in review).
            if item.is_file() and (
                self.load_hidden or _is_visible(item.relative_to(root))
            ):
                yield Blob.from_path(str(item))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Loading logic for loading documents from an GCS directory.""" | ||
from io import BytesIO | ||
from typing import Generator | ||
|
||
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader | ||
|
||
|
||
class GCSBlobLoader(BlobLoader):
    """GCS blob loader: loads blobs from a Google Cloud Storage bucket."""

    def __init__(self, project_name: str, bucket: str, prefix: str = "") -> None:
        """Initialize with bucket and key name.

        Args:
            project_name: Name of the GCP project that owns the bucket.
            bucket: Name of the bucket to list blobs from.
            prefix: Only blobs whose names start with this prefix are yielded.
        """
        self.project_name = project_name
        self.bucket = bucket
        self.prefix = prefix

    def yield_blobs(
        self,
    ) -> Generator[Blob, None, None]:
        """Yield blobs matching the given pattern."""
        try:
            from google.cloud import storage
        except ImportError:
            # Kept as ValueError for backward compatibility with callers
            # catching it, matching the message in the original code.
            raise ValueError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )
        client = storage.Client(project=self.project_name)
        for remote_blob in client.list_blobs(self.bucket, prefix=self.prefix):
            # BUG FIX: use a fresh buffer per blob — reusing a single BytesIO
            # appended every download after the previous one's bytes.
            buffer = BytesIO()
            client.download_blob_to_file(remote_blob, buffer)
            # BUG FIX: the original passed `data=bytes_io.seek(0)` (the int 0,
            # not the content) and used `path_like=`, which is not a Blob
            # field (the field is `path`).
            yield Blob(data=buffer.getvalue(), path=remote_blob.name)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Placeholder""" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import contextlib | ||
import mimetypes | ||
from abc import ABC, abstractmethod | ||
from io import BytesIO | ||
from pathlib import PurePath | ||
from pydantic import BaseModel | ||
from typing import Union, Optional, Generator | ||
|
||
PathLike = Union[str, PurePath] | ||
|
||
|
||
class Blob(BaseModel):
    """A blob is used to represent raw data by either reference or value.

    Provides an interface to materialize the blob in different representations.

    This is inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    # In-memory content; None means the content must be read from `path`.
    data: Union[bytes, str, None]
    # Mime-type of the content, if known.
    mimetype: Optional[str] = None
    encoding: str = "utf-8"  # Use utf-8 as default encoding, if decoding to string
    # Location where the original content was found.
    # Represents a location on the local file system.
    # Useful for situations where downstream code assumes it must work with file
    # paths rather than in-memory content.
    path: Optional[PathLike] = None

    class Config:
        arbitrary_types_allowed = True
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none."""
        return str(self.path) if self.path else None

    def as_string(self) -> str:
        """Read data as a string."""
        encoding = self.encoding or "utf-8"
        if self.data is None and self.path:
            # Use the normalized local `encoding` (original inconsistently
            # passed self.encoding here after computing the fallback).
            with open(str(self.path), "r", encoding=encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            # BUG FIX: the original fell through and returned None despite the
            # `-> str` annotation when data was None and no path was set.
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise NotImplementedError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[BytesIO, None, None]:
        """Read data as a byte stream (context manager).

        BUG FIX: annotated as a Generator — the original's `-> BytesIO`
        annotation was wrong for a @contextmanager-decorated generator.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: Union[str, PurePath],
        *,
        encoding: str = "utf-8",
        guess_type: bool = True,
    ) -> "Blob":
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            guess_type: If True, the mimetype will be guessed from the file extension

        Returns:
            Blob instance
        """
        mimetype = mimetypes.guess_type(path)[0] if guess_type else None
        # We do not load the data immediately!
        # Instead we treat the blob as containing a reference to the
        # underlying data.
        return cls(data=None, mimetype=mimetype, encoding=encoding, path=path)

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
    ) -> "Blob":
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came

        Returns:
            Blob instance
        """
        # BUG FIX: the model field is named `mimetype`; the original passed
        # `mime_type=` which pydantic silently ignored, losing the value.
        return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
||
|
||
class BlobLoader(ABC):
    """Abstract interface for loaders that lazily yield Blob objects."""

    @abstractmethod
    def yield_blobs(self) -> Generator[Blob, None, None]:
        """A lazy loader for raw data represented by LangChain's Blob object."""
        raise NotImplementedError()
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need for this if its abstract |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
del