Skip to content

Commit

Permalink
Merge pull request #1166 from LavX/master
Browse files: browse the repository at this point in the history
feat: Add BSHTMLLoader support and enhance error handling for document loading
  • Loading branch information
ElishaKay authored Feb 19, 2025
2 parents 64a57fd + 65b9799 commit 9a06536
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions gpt_researcher/document/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
import os

from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
UnstructuredCSVLoader,
PyMuPDFLoader,
TextLoader,
UnstructuredCSVLoader,
UnstructuredExcelLoader,
UnstructuredMarkdownLoader,
UnstructuredMarkdownLoader,
UnstructuredPowerPointLoader,
UnstructuredWordDocumentLoader
)
from langchain_community.document_loaders import BSHTMLLoader


class DocumentLoader:
Expand Down Expand Up @@ -52,12 +53,18 @@ async def _load_document(self, file_path: str, file_extension: str) -> list:
"csv": UnstructuredCSVLoader(file_path, mode="elements"),
"xls": UnstructuredExcelLoader(file_path, mode="elements"),
"xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
"md": UnstructuredMarkdownLoader(file_path)
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path)
}

loader = loader_dict.get(file_extension, None)
if loader:
ret_data = loader.load()
try:
ret_data = loader.load()
except Exception as e:
print(f"Failed to load HTML document : {file_path}")
print(e)

except Exception as e:
print(f"Failed to load document : {file_path}")
Expand Down

0 comments on commit 9a06536

Please sign in to comment.