community: better support of pathlib paths in document loaders (#18396)

This arose from the #18397 problem of document loaders not supporting `pathlib.Path`. This pull request provides more uniform support for `pathlib.Path` as an argument. The core ideas of this upgrade: - if a local file path is used as an argument, it should be accepted as a `pathlib.Path` - if there are external calls that may or may not support `pathlib.Path`, the argument is immediately converted to `str` - if `self.file_path` is used in a way that allows it to stay a `pathlib.Path` without conversion, it is only converted to `str` for the metadata. Twitter handle: https://twitter.com/mwmajewsk
langchain-ai · Mar 26, 2024 · f7a1fd9 · f7a1fd9
1 parent 94b869a
commit f7a1fd9
Show file tree

Hide file tree

Showing 32 changed files with 147 additions and 80 deletions.
diff --git a/libs/community/langchain_community/document_loaders/acreom.py b/libs/community/langchain_community/document_loaders/acreom.py
@@ -1,6 +1,6 @@
 import re
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, Union
 
 from langchain_core.documents import Document
 
@@ -14,7 +14,10 @@ class AcreomLoader(BaseLoader):
     """Regex to match front matter metadata in markdown files."""
 
     def __init__(
-        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
+        self,
+        path: Union[str, Path],
+        encoding: str = "UTF-8",
+        collect_metadata: bool = True,
     ):
         """Initialize the loader."""
         self.file_path = path

diff --git a/libs/community/langchain_community/document_loaders/airbyte_json.py b/libs/community/langchain_community/document_loaders/airbyte_json.py
@@ -1,5 +1,6 @@
 import json
-from typing import List
+from pathlib import Path
+from typing import List, Union
 
 from langchain_core.documents import Document
 from langchain_core.utils import stringify_dict
@@ -10,7 +11,7 @@
 class AirbyteJSONLoader(BaseLoader):
     """Load local `Airbyte` json files."""
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
         """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
         self.file_path = file_path
         """Path to the directory containing the json files."""
@@ -20,5 +21,5 @@ def load(self) -> List[Document]:
         for line in open(self.file_path, "r"):
             data = json.loads(line)["_airbyte_data"]
             text += stringify_dict(data)
-        metadata = {"source": self.file_path}
+        metadata = {"source": str(self.file_path)}
         return [Document(page_content=text, metadata=metadata)]
diff --git a/libs/community/langchain_community/document_loaders/assemblyai.py b/libs/community/langchain_community/document_loaders/assemblyai.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import TYPE_CHECKING, Iterator, Optional
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterator, Optional, Union
 
 import requests
 from langchain_core.documents import Document
@@ -44,7 +45,7 @@ class AssemblyAIAudioTranscriptLoader(BaseLoader):
 
     def __init__(
         self,
-        file_path: str,
+        file_path: Union[str, Path],
         *,
         transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
         config: Optional[assemblyai.TranscriptionConfig] = None,
@@ -71,7 +72,7 @@ def __init__(
         if api_key is not None:
             assemblyai.settings.api_key = api_key
 
-        self.file_path = file_path
+        self.file_path = str(file_path)
         self.transcript_format = transcript_format
         self.transcriber = assemblyai.Transcriber(config=config)
 

diff --git a/libs/community/langchain_community/document_loaders/conllu.py b/libs/community/langchain_community/document_loaders/conllu.py
@@ -1,5 +1,6 @@
 import csv
-from typing import List
+from pathlib import Path
+from typing import List, Union
 
 from langchain_core.documents import Document
 
@@ -9,7 +10,7 @@
 class CoNLLULoader(BaseLoader):
     """Load `CoNLL-U` files."""
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
         """Initialize with a file path."""
         self.file_path = file_path
 
@@ -29,5 +30,5 @@ def load(self) -> List[Document]:
             else:
                 text += line[1] + " "
 
-        metadata = {"source": self.file_path}
+        metadata = {"source": str(self.file_path)}
         return [Document(page_content=text, metadata=metadata)]
diff --git a/libs/community/langchain_community/document_loaders/csv_loader.py b/libs/community/langchain_community/document_loaders/csv_loader.py
@@ -1,6 +1,7 @@
 import csv
 from io import TextIOWrapper
-from typing import Any, Dict, Iterator, List, Optional, Sequence
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
 
 from langchain_core.documents import Document
 
@@ -35,7 +36,7 @@ class CSVLoader(BaseLoader):
 
     def __init__(
         self,
-        file_path: str,
+        file_path: Union[str, Path],
         source_column: Optional[str] = None,
         metadata_columns: Sequence[str] = (),
         csv_args: Optional[Dict] = None,
@@ -89,7 +90,7 @@ def __read_file(self, csvfile: TextIOWrapper) -> Iterator[Document]:
                 source = (
                     row[self.source_column]
                     if self.source_column is not None
-                    else self.file_path
+                    else str(self.file_path)
                 )
             except KeyError:
                 raise ValueError(

diff --git a/libs/community/langchain_community/document_loaders/email.py b/libs/community/langchain_community/document_loaders/email.py
@@ -1,5 +1,6 @@
 import os
-from typing import Any, Iterator, List
+from pathlib import Path
+from typing import Any, Iterator, List, Union
 
 from langchain_core.documents import Document
 
@@ -41,7 +42,10 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
     """
 
     def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
     ):
         process_attachments = unstructured_kwargs.get("process_attachments")
         attachment_partitioner = unstructured_kwargs.get("attachment_partitioner")
@@ -79,17 +83,17 @@ class OutlookMessageLoader(BaseLoader):
     https://github.com/TeamMsgExtractor/msg-extractor
     """
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: Union[str, Path]):
         """Initialize with a file path.
 
         Args:
             file_path: The path to the Outlook Message file.
         """
 
-        self.file_path = file_path
+        self.file_path = str(file_path)
 
         if not os.path.isfile(self.file_path):
-            raise ValueError("File path %s is not a valid file" % self.file_path)
+            raise ValueError(f"File path {self.file_path} is not a valid file")
 
         try:
             import extract_msg  # noqa:F401

diff --git a/libs/community/langchain_community/document_loaders/evernote.py b/libs/community/langchain_community/document_loaders/evernote.py
@@ -5,8 +5,9 @@
 import hashlib
 import logging
 from base64 import b64decode
+from pathlib import Path
 from time import strptime
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, List, Optional, Union
 
 from langchain_core.documents import Document
 
@@ -35,9 +36,9 @@ class EverNoteLoader(BaseLoader):
             the 'source' which contains the file name of the export.
     """  # noqa: E501
 
-    def __init__(self, file_path: str, load_single_document: bool = True):
+    def __init__(self, file_path: Union[str, Path], load_single_document: bool = True):
         """Initialize with file path."""
-        self.file_path = file_path
+        self.file_path = str(file_path)
         self.load_single_document = load_single_document
 
     def _lazy_load(self) -> Iterator[Document]:

diff --git a/libs/community/langchain_community/document_loaders/excel.py b/libs/community/langchain_community/document_loaders/excel.py
@@ -1,5 +1,6 @@
 """Loads Microsoft Excel files."""
-from typing import Any, List
+from pathlib import Path
+from typing import Any, List, Union
 
 from langchain_community.document_loaders.unstructured import (
     UnstructuredFileLoader,
@@ -27,7 +28,10 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
     """
 
     def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
     ):
         """
 

diff --git a/libs/community/langchain_community/document_loaders/facebook_chat.py b/libs/community/langchain_community/document_loaders/facebook_chat.py
@@ -1,7 +1,7 @@
 import datetime
 import json
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, Union
 
 from langchain_core.documents import Document
 
@@ -25,7 +25,7 @@ def concatenate_rows(row: dict) -> str:
 class FacebookChatLoader(BaseLoader):
     """Load `Facebook Chat` messages directory dump."""
 
-    def __init__(self, path: str):
+    def __init__(self, path: Union[str, Path]):
         """Initialize with a path."""
         self.file_path = path
 

diff --git a/libs/community/langchain_community/document_loaders/helpers.py b/libs/community/langchain_community/document_loaders/helpers.py
@@ -1,7 +1,8 @@
 """Document loader helpers."""
 
 import concurrent.futures
-from typing import List, NamedTuple, Optional, cast
+from pathlib import Path
+from typing import List, NamedTuple, Optional, Union, cast
 
 
 class FileEncoding(NamedTuple):
@@ -15,7 +16,9 @@ class FileEncoding(NamedTuple):
     """The language of the file."""
 
 
-def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
+def detect_file_encodings(
+    file_path: Union[str, Path], timeout: int = 5
+) -> List[FileEncoding]:
     """Try to detect the file encoding.
 
     Returns a list of `FileEncoding` tuples with the detected encodings ordered
@@ -27,6 +30,8 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
     """
     import chardet
 
+    file_path = str(file_path)
+
     def read_and_detect(file_path: str) -> List[dict]:
         with open(file_path, "rb") as f:
             rawdata = f.read()

diff --git a/libs/community/langchain_community/document_loaders/html_bs.py b/libs/community/langchain_community/document_loaders/html_bs.py
@@ -1,4 +1,5 @@
 import logging
+from pathlib import Path
 from typing import Dict, Iterator, Union
 
 from langchain_core.documents import Document
@@ -13,7 +14,7 @@ class BSHTMLLoader(BaseLoader):
 
     def __init__(
         self,
-        file_path: str,
+        file_path: Union[str, Path],
         open_encoding: Union[str, None] = None,
         bs_kwargs: Union[dict, None] = None,
         get_text_separator: str = "",
@@ -57,7 +58,7 @@ def lazy_load(self) -> Iterator[Document]:
             title = ""
 
         metadata: Dict[str, Union[str, None]] = {
-            "source": self.file_path,
+            "source": str(self.file_path),
             "title": title,
         }
         yield Document(page_content=text, metadata=metadata)
diff --git a/libs/community/langchain_community/document_loaders/image_captions.py b/libs/community/langchain_community/document_loaders/image_captions.py
@@ -1,4 +1,5 @@
 from io import BytesIO
+from pathlib import Path
 from typing import Any, List, Tuple, Union
 
 import requests
@@ -17,7 +18,7 @@ class ImageCaptionLoader(BaseLoader):
 
     def __init__(
         self,
-        images: Union[str, bytes, List[Union[str, bytes]]],
+        images: Union[str, Path, bytes, List[Union[str, bytes, Path]]],
         blip_processor: str = "Salesforce/blip-image-captioning-base",
         blip_model: str = "Salesforce/blip-image-captioning-base",
     ):
@@ -29,7 +30,7 @@ def __init__(
             blip_processor: The name of the pre-trained BLIP processor.
             blip_model: The name of the pre-trained BLIP model.
         """
-        if isinstance(images, (str, bytes)):
+        if isinstance(images, (str, Path, bytes)):
             self.images = [images]
         else:
             self.images = images
@@ -61,7 +62,7 @@ def load(self) -> List[Document]:
         return results
 
     def _get_captions_and_metadata(
-        self, model: Any, processor: Any, image: Union[str, bytes]
+        self, model: Any, processor: Any, image: Union[str, Path, bytes]
     ) -> Tuple[str, dict]:
         """Helper function for getting the captions and metadata of an image."""
         try:
@@ -76,7 +77,9 @@ def _get_captions_and_metadata(
         try:
             if isinstance(image, bytes):
                 image = Image.open(BytesIO(image)).convert("RGB")
-            elif image.startswith("http://") or image.startswith("https://"):
+            elif isinstance(image, str) and (
+                image.startswith("http://") or image.startswith("https://")
+            ):
                 image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
             else:
                 image = Image.open(image).convert("RGB")
@@ -94,6 +97,6 @@ def _get_captions_and_metadata(
         if isinstance(image_source, bytes):
             metadata: dict = {"image_source": "Image bytes provided"}
         else:
-            metadata = {"image_path": image_source}
+            metadata = {"image_path": str(image_source)}
 
         return caption, metadata
diff --git a/libs/community/langchain_community/document_loaders/mhtml.py b/libs/community/langchain_community/document_loaders/mhtml.py
@@ -1,5 +1,6 @@
 import email
 import logging
+from pathlib import Path
 from typing import Dict, Iterator, Union
 
 from langchain_core.documents import Document
@@ -14,7 +15,7 @@ class MHTMLLoader(BaseLoader):
 
     def __init__(
         self,
-        file_path: str,
+        file_path: Union[str, Path],
         open_encoding: Union[str, None] = None,
         bs_kwargs: Union[dict, None] = None,
         get_text_separator: str = "",
@@ -69,7 +70,7 @@ def lazy_load(self) -> Iterator[Document]:
                         title = ""
 
                     metadata: Dict[str, Union[str, None]] = {
-                        "source": self.file_path,
+                        "source": str(self.file_path),
                         "title": title,
                     }
                     yield Document(page_content=text, metadata=metadata)

diff --git a/libs/community/langchain_community/document_loaders/notebook.py b/libs/community/langchain_community/document_loaders/notebook.py
@@ -1,7 +1,7 @@
 """Loads .ipynb notebook files."""
 import json
 from pathlib import Path
-from typing import Any, List
+from typing import Any, List, Union
 
 from langchain_core.documents import Document
 
@@ -75,7 +75,7 @@ class NotebookLoader(BaseLoader):
 
     def __init__(
         self,
-        path: str,
+        path: Union[str, Path],
         include_outputs: bool = False,
         max_output_length: int = 10,
         remove_newline: bool = False,

diff --git a/libs/community/langchain_community/document_loaders/notion.py b/libs/community/langchain_community/document_loaders/notion.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List
+from typing import List, Union
 
 from langchain_core.documents import Document
 
@@ -9,7 +9,7 @@
 class NotionDirectoryLoader(BaseLoader):
     """Load `Notion directory` dump."""
 
-    def __init__(self, path: str, *, encoding: str = "utf-8") -> None:
+    def __init__(self, path: Union[str, Path], *, encoding: str = "utf-8") -> None:
         """Initialize with a file path."""
         self.file_path = path
         self.encoding = encoding