From 24b6926499567faa1331c2215402acd656a6a830 Mon Sep 17 00:00:00 2001 From: Laszlo Toth Date: Tue, 18 Feb 2025 14:07:38 +0100 Subject: [PATCH] Add BSHTMLLoader support and improve error handling in DocumentLoader --- gpt_researcher/document/document.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/gpt_researcher/document/document.py b/gpt_researcher/document/document.py index 08922b301..25f882790 100644 --- a/gpt_researcher/document/document.py +++ b/gpt_researcher/document/document.py @@ -2,14 +2,15 @@ import os from langchain_community.document_loaders import ( - PyMuPDFLoader, - TextLoader, - UnstructuredCSVLoader, + PyMuPDFLoader, + TextLoader, + UnstructuredCSVLoader, UnstructuredExcelLoader, - UnstructuredMarkdownLoader, + UnstructuredMarkdownLoader, UnstructuredPowerPointLoader, UnstructuredWordDocumentLoader ) +from langchain_community.document_loaders import BSHTMLLoader class DocumentLoader: @@ -52,12 +53,18 @@ async def _load_document(self, file_path: str, file_extension: str) -> list: "csv": UnstructuredCSVLoader(file_path, mode="elements"), "xls": UnstructuredExcelLoader(file_path, mode="elements"), "xlsx": UnstructuredExcelLoader(file_path, mode="elements"), - "md": UnstructuredMarkdownLoader(file_path) + "md": UnstructuredMarkdownLoader(file_path), + "html": BSHTMLLoader(file_path), + "htm": BSHTMLLoader(file_path) } loader = loader_dict.get(file_extension, None) if loader: - ret_data = loader.load() + try: + ret_data = loader.load() + except Exception as e: + print(f"Failed to load HTML document : {file_path}") + print(e) except Exception as e: print(f"Failed to load document : {file_path}")