From 24b6926499567faa1331c2215402acd656a6a830 Mon Sep 17 00:00:00 2001
From: Laszlo Toth <laszlo.toth@baxter-it.com>
Date: Tue, 18 Feb 2025 14:07:38 +0100
Subject: [PATCH] Add BSHTMLLoader support and improve error handling in
 DocumentLoader

---
 gpt_researcher/document/document.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/gpt_researcher/document/document.py b/gpt_researcher/document/document.py
index 08922b301..25f882790 100644
--- a/gpt_researcher/document/document.py
+++ b/gpt_researcher/document/document.py
@@ -2,14 +2,15 @@
 import os
 
 from langchain_community.document_loaders import (
-    PyMuPDFLoader, 
-    TextLoader, 
-    UnstructuredCSVLoader, 
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredCSVLoader,
     UnstructuredExcelLoader,
-    UnstructuredMarkdownLoader, 
+    UnstructuredMarkdownLoader,
     UnstructuredPowerPointLoader,
     UnstructuredWordDocumentLoader
 )
+from langchain_community.document_loaders import BSHTMLLoader
 
 
 class DocumentLoader:
@@ -52,12 +53,18 @@ async def _load_document(self, file_path: str, file_extension: str) -> list:
                 "csv": UnstructuredCSVLoader(file_path, mode="elements"),
                 "xls": UnstructuredExcelLoader(file_path, mode="elements"),
                 "xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
-                "md": UnstructuredMarkdownLoader(file_path)
+                "md": UnstructuredMarkdownLoader(file_path),
+                "html": BSHTMLLoader(file_path),
+                "htm": BSHTMLLoader(file_path)
             }
 
             loader = loader_dict.get(file_extension, None)
             if loader:
-                ret_data = loader.load()
+                try:
+                    ret_data = loader.load()
+                except Exception as e:
+                    print(f"Failed to load HTML document : {file_path}")
+                    print(e)
 
         except Exception as e:
             print(f"Failed to load document : {file_path}")