Merge pull request #1166 from LavX/master

feat: Add BSHTMLLoader support and enhance error handling for document loading
assafelovic · Feb 19, 2025 · 9a06536
2 parents 64a57fd + 65b9799
commit 9a06536
Showing 1 changed file with 13 additions and 6 deletions.
diff --git a/gpt_researcher/document/document.py b/gpt_researcher/document/document.py
@@ -2,14 +2,15 @@
 import os
 
 from langchain_community.document_loaders import (
-    PyMuPDFLoader, 
-    TextLoader, 
-    UnstructuredCSVLoader, 
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredCSVLoader,
     UnstructuredExcelLoader,
-    UnstructuredMarkdownLoader, 
+    UnstructuredMarkdownLoader,
     UnstructuredPowerPointLoader,
     UnstructuredWordDocumentLoader
 )
+from langchain_community.document_loaders import BSHTMLLoader
 
 
 class DocumentLoader:
@@ -52,12 +53,18 @@ async def _load_document(self, file_path: str, file_extension: str) -> list:
                 "csv": UnstructuredCSVLoader(file_path, mode="elements"),
                 "xls": UnstructuredExcelLoader(file_path, mode="elements"),
                 "xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
-                "md": UnstructuredMarkdownLoader(file_path)
+                "md": UnstructuredMarkdownLoader(file_path),
+                "html": BSHTMLLoader(file_path),
+                "htm": BSHTMLLoader(file_path)
             }
 
             loader = loader_dict.get(file_extension, None)
             if loader:
-                ret_data = loader.load()
+                try:
+                    ret_data = loader.load()
+                except Exception as e:
+                    print(f"Failed to load HTML document : {file_path}")
+                    print(e)
 
         except Exception as e:
             print(f"Failed to load document : {file_path}")