Skip to content

Commit

Permalink
community: better support of pathlib paths in document loaders (#18396)
Browse files Browse the repository at this point in the history
This change arose from issue
#18397: document
loaders did not support `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support Pathlib,
the argument is immediately converted to `str`
- if `self.file_path` is used in a way that allows it to
stay a `pathlib.Path` without conversion, it is only converted to `str` for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
  • Loading branch information
mmajewsk authored Mar 26, 2024
1 parent 94b869a commit f7a1fd9
Show file tree
Hide file tree
Showing 32 changed files with 147 additions and 80 deletions.
7 changes: 5 additions & 2 deletions libs/community/langchain_community/document_loaders/acreom.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union

from langchain_core.documents import Document

Expand All @@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
"""Regex to match front matter metadata in markdown files."""

def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
self,
path: Union[str, Path],
encoding: str = "UTF-8",
collect_metadata: bool = True,
):
"""Initialize the loader."""
self.file_path = path
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from typing import List
from pathlib import Path
from typing import List, Union

from langchain_core.documents import Document
from langchain_core.utils import stringify_dict
Expand All @@ -10,7 +11,7 @@
class AirbyteJSONLoader(BaseLoader):
"""Load local `Airbyte` json files."""

def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
self.file_path = file_path
"""Path to the directory containing the json files."""
Expand All @@ -20,5 +21,5 @@ def load(self) -> List[Document]:
for line in open(self.file_path, "r"):
data = json.loads(line)["_airbyte_data"]
text += stringify_dict(data)
metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Iterator, Optional
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union

import requests
from langchain_core.documents import Document
Expand Down Expand Up @@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):

def __init__(
self,
file_path: str,
file_path: Union[str, Path],
*,
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
config: Optional[assemblyai.TranscriptionConfig] = None,
Expand All @@ -71,7 +72,7 @@ def __init__(
if api_key is not None:
assemblyai.settings.api_key = api_key

self.file_path = file_path
self.file_path = str(file_path)
self.transcript_format = transcript_format
self.transcriber = assemblyai.Transcriber(config=config)

Expand Down
7 changes: 4 additions & 3 deletions libs/community/langchain_community/document_loaders/conllu.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
from typing import List
from pathlib import Path
from typing import List, Union

from langchain_core.documents import Document

Expand All @@ -9,7 +10,7 @@
class CoNLLULoader(BaseLoader):
"""Load `CoNLL-U` files."""

def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path."""
self.file_path = file_path

Expand All @@ -29,5 +30,5 @@ def load(self) -> List[Document]:
else:
text += line[1] + " "

metadata = {"source": self.file_path}
metadata = {"source": str(self.file_path)}
return [Document(page_content=text, metadata=metadata)]
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
from io import TextIOWrapper
from typing import Any, Dict, Iterator, List, Optional, Sequence
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):

def __init__(
self,
file_path: str,
file_path: Union[str, Path],
source_column: Optional[str] = None,
metadata_columns: Sequence[str] = (),
csv_args: Optional[Dict] = None,
Expand Down Expand Up @@ -89,7 +90,7 @@ def __read_file(self, csvfile: TextIOWrapper) -> Iterator[Document]:
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
else str(self.file_path)
)
except KeyError:
raise ValueError(
Expand Down
14 changes: 9 additions & 5 deletions libs/community/langchain_community/document_loaders/email.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from typing import Any, Iterator, List
from pathlib import Path
from typing import Any, Iterator, List, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
"""

def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
process_attachments = unstructured_kwargs.get("process_attachments")
attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
Expand Down Expand Up @@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
https://github.com/TeamMsgExtractor/msg-extractor
"""

def __init__(self, file_path: str):
def __init__(self, file_path: Union[str, Path]):
"""Initialize with a file path.
Args:
file_path: The path to the Outlook Message file.
"""

self.file_path = file_path
self.file_path = str(file_path)

if not os.path.isfile(self.file_path):
raise ValueError("File path %s is not a valid file" % self.file_path)
raise ValueError(f"File path {self.file_path} is not a valid file")

try:
import extract_msg # noqa:F401
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import hashlib
import logging
from base64 import b64decode
from pathlib import Path
from time import strptime
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
the 'source' which contains the file name of the export.
""" # noqa: E501

def __init__(self, file_path: str, load_single_document: bool = True):
def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
"""Initialize with file path."""
self.file_path = file_path
self.file_path = str(file_path)
self.load_single_document = load_single_document

def _lazy_load(self) -> Iterator[Document]:
Expand Down
8 changes: 6 additions & 2 deletions libs/community/langchain_community/document_loaders/excel.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Loads Microsoft Excel files."""
from typing import Any, List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
Expand Down Expand Up @@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
"""

def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, Path],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
import json
from pathlib import Path
from typing import Iterator
from typing import Iterator, Union

from langchain_core.documents import Document

Expand All @@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
class FacebookChatLoader(BaseLoader):
"""Load `Facebook Chat` messages directory dump."""

def __init__(self, path: str):
def __init__(self, path: Union[str, Path]):
"""Initialize with a path."""
self.file_path = path

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""Document loader helpers."""

import concurrent.futures
from typing import List, NamedTuple, Optional, cast
from pathlib import Path
from typing import List, NamedTuple, Optional, Union, cast


class FileEncoding(NamedTuple):
Expand All @@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
"""The language of the file."""


def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
def detect_file_encodings(
file_path: Union[str, Path], timeout: int = 5
) -> List[FileEncoding]:
"""Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered
Expand All @@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
"""
import chardet

file_path = str(file_path)

def read_and_detect(file_path: str) -> List[dict]:
with open(file_path, "rb") as f:
rawdata = f.read()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import Dict, Iterator, Union

from langchain_core.documents import Document
Expand All @@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):

def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
Expand Down Expand Up @@ -57,7 +58,7 @@ def lazy_load(self) -> Iterator[Document]:
title = ""

metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from io import BytesIO
from pathlib import Path
from typing import Any, List, Tuple, Union

import requests
Expand All @@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):

def __init__(
self,
images: Union[str, bytes, List[Union[str, bytes]]],
images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
blip_processor: str = "Salesforce/blip-image-captioning-base",
blip_model: str = "Salesforce/blip-image-captioning-base",
):
Expand All @@ -29,7 +30,7 @@ def __init__(
blip_processor: The name of the pre-trained BLIP processor.
blip_model: The name of the pre-trained BLIP model.
"""
if isinstance(images, (str, bytes)):
if isinstance(images, (str, Path, bytes)):
self.images = [images]
else:
self.images = images
Expand Down Expand Up @@ -61,7 +62,7 @@ def load(self) -> List[Document]:
return results

def _get_captions_and_metadata(
self, model: Any, processor: Any, image: Union[str, bytes]
self, model: Any, processor: Any, image: Union[str, Path, bytes]
) -> Tuple[str, dict]:
"""Helper function for getting the captions and metadata of an image."""
try:
Expand All @@ -76,7 +77,9 @@ def _get_captions_and_metadata(
try:
if isinstance(image, bytes):
image = Image.open(BytesIO(image)).convert("RGB")
elif image.startswith("http://") or image.startswith("https://"):
elif isinstance(image, str) and (
image.startswith("http://") or image.startswith("https://")
):
image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
else:
image = Image.open(image).convert("RGB")
Expand All @@ -94,6 +97,6 @@ def _get_captions_and_metadata(
if isinstance(image_source, bytes):
metadata: dict = {"image_source": "Image bytes provided"}
else:
metadata = {"image_path": image_source}
metadata = {"image_path": str(image_source)}

return caption, metadata
5 changes: 3 additions & 2 deletions libs/community/langchain_community/document_loaders/mhtml.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import email
import logging
from pathlib import Path
from typing import Dict, Iterator, Union

from langchain_core.documents import Document
Expand All @@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):

def __init__(
self,
file_path: str,
file_path: Union[str, Path],
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
get_text_separator: str = "",
Expand Down Expand Up @@ -69,7 +70,7 @@ def lazy_load(self) -> Iterator[Document]:
title = ""

metadata: Dict[str, Union[str, None]] = {
"source": self.file_path,
"source": str(self.file_path),
"title": title,
}
yield Document(page_content=text, metadata=metadata)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Loads .ipynb notebook files."""
import json
from pathlib import Path
from typing import Any, List
from typing import Any, List, Union

from langchain_core.documents import Document

Expand Down Expand Up @@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):

def __init__(
self,
path: str,
path: Union[str, Path],
include_outputs: bool = False,
max_output_length: int = 10,
remove_newline: bool = False,
Expand Down
4 changes: 2 additions & 2 deletions libs/community/langchain_community/document_loaders/notion.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import List
from typing import List, Union

from langchain_core.documents import Document

Expand All @@ -9,7 +9,7 @@
class NotionDirectoryLoader(BaseLoader):
"""Load `Notion directory` dump."""

def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
"""Initialize with a file path."""
self.file_path = path
self.encoding = encoding
Expand Down
Loading

0 comments on commit f7a1fd9

Please sign in to comment.