
langchain: Replace lxml and XSLT with BeautifulSoup in HTMLHeaderTextSplitter for Improved Large HTML File Processing #27678

Merged
48 commits merged on Jan 20, 2025
Changes from 1 commit
0771f8e
Update html.py
AhmedTammaa Oct 28, 2024
c52667a
Merge branch 'master' into patch-1
AhmedTammaa Oct 29, 2024
8dc8e46
Merge branch 'master' into patch-1
AhmedTammaa Nov 8, 2024
d4efd97
Update html.py
AhmedTammaa Nov 8, 2024
73c001c
Update html.py
AhmedTammaa Nov 8, 2024
9119fe9
Update html.py
AhmedTammaa Nov 8, 2024
7e0ce8e
Update html.py
AhmedTammaa Nov 8, 2024
d604fd1
Merge branch 'master' into patch-1
AhmedTammaa Nov 8, 2024
dfe4ee4
Merge branch 'master' into patch-1
eyurtsev Dec 13, 2024
6bfc158
Update html.py
AhmedTammaa Dec 16, 2024
b84f13c
Update test_text_splitters.py
AhmedTammaa Dec 16, 2024
6a2f1e9
Merge branch 'master' into patch-1
AhmedTammaa Dec 17, 2024
17ae8b9
added import Tuple
AhmedTammaa Dec 17, 2024
be9de90
Merge branch 'master' into patch-1
AhmedTammaa Dec 17, 2024
851ba7e
Merge branch 'master' into patch-1
AhmedTammaa Dec 17, 2024
0306951
added beautifulsoup4 to poetry depedencies
AhmedTammaa Dec 17, 2024
09e7852
Merge branch 'master' into patch-1
AhmedTammaa Dec 18, 2024
ae50b32
discarded bs4 dependency
AhmedTammaa Dec 18, 2024
f9a93d0
Removed uncessary module docstring, updated docstring of HTMLHeaderTe…
AhmedTammaa Dec 18, 2024
438aedd
improved docstring for the class `HTMLHeaderTextSplitter`
AhmedTammaa Dec 18, 2024
d573723
removed typing from docstring when type is hinted.
AhmedTammaa Dec 18, 2024
405ea70
Merge branch 'master' into patch-1
AhmedTammaa Dec 19, 2024
f6e45e2
Merge branch 'master' into patch-1
AhmedTammaa Dec 19, 2024
617e04a
Merge branch 'master' into patch-1
AhmedTammaa Dec 19, 2024
b82bfc9
added pytest mark require bs4
AhmedTammaa Dec 19, 2024
4297787
added requirement bs4 marker for the test cases
AhmedTammaa Dec 19, 2024
c2107b1
all test function involving HTMLHeaderTextSplitter has bs4 requirment…
AhmedTammaa Dec 19, 2024
4261885
added bs4 import in the split_file_function and removed it from top l…
AhmedTammaa Dec 19, 2024
567318a
fixing linting errors and improved documentation for HTMLHeaderTextSp…
AhmedTammaa Dec 19, 2024
53685eb
fixed docstring issue and sorted imports
AhmedTammaa Dec 19, 2024
9ff0bfa
sorted imports and defined `nodes` in `_generate_documents` docstring
AhmedTammaa Dec 19, 2024
aeae28c
updated import order
AhmedTammaa Dec 19, 2024
e67f6bd
fixed all linting issues with Ruff
AhmedTammaa Dec 20, 2024
3b8a547
Merge branch 'master' into patch-1
AhmedTammaa Dec 20, 2024
cdd62b7
removed extra blank space from `_finalize_chunk`
AhmedTammaa Dec 20, 2024
b4d4e57
added types for untyped function paramters. Typed `stack` variable as…
AhmedTammaa Dec 20, 2024
d7ea998
fixed "line too long" in test_text_splitters
AhmedTammaa Dec 20, 2024
2bf3726
fixed linter issues in test_text_splitter.py
AhmedTammaa Dec 20, 2024
7dd9f15
fixed mypy issues
AhmedTammaa Dec 20, 2024
456c36a
fixed all formatting issues and checked with pre-commit
AhmedTammaa Dec 20, 2024
533bc90
Merge branch 'master' into patch-1
AhmedTammaa Dec 20, 2024
f31e4b7
Merge branch 'master' into patch-1
AhmedTammaa Dec 20, 2024
bbe5616
simplified HTMLHeaderSplitter Logic
AhmedTammaa Dec 21, 2024
5637dc7
improved documentation and formatting
AhmedTammaa Dec 21, 2024
4aaa912
Merge branch 'master' into patch-1
AhmedTammaa Dec 23, 2024
be08dad
Merge branch 'master' into patch-1
AhmedTammaa Jan 6, 2025
e905614
Merge branch 'master' into patch-1
AhmedTammaa Jan 9, 2025
73e2ae2
Merge branch 'master' into patch-1
eyurtsev Jan 20, 2025
Update html.py
updated according to linter tests
AhmedTammaa authored Nov 8, 2024
commit d4efd97db21e071f72284ac67b54767a23266634
175 changes: 87 additions & 88 deletions libs/text-splitters/langchain_text_splitters/html.py
@@ -9,7 +9,9 @@
 from langchain_core.documents import Document

 from langchain_text_splitters.character import RecursiveCharacterTextSplitter

+from bs4 import BeautifulSoup
+from bs4.element import Tag
+from langchain.docstore.document import Document
class ElementType(TypedDict):
"""Element type as typed dict."""
@@ -91,104 +93,101 @@ def split_text(self, text: str) -> List[Document]:
return self.split_text_from_file(StringIO(text))


     def split_text_from_file(self, file: Any) -> List[Document]:
         """Split HTML file using BeautifulSoup.

         Args:
             file: HTML file path or file-like object.

         Returns:
             List of Document objects with page_content and metadata.
         """
-        from bs4 import BeautifulSoup
-        from langchain.docstore.document import Document
-        import bs4

         # Read the HTML content from the file or file-like object
         if isinstance(file, str):
             with open(file, 'r', encoding='utf-8') as f:
                 html_content = f.read()
         else:
             # Assuming file is a file-like object
             html_content = file.read()

         # Parse the HTML content using BeautifulSoup
         soup = BeautifulSoup(html_content, 'html.parser')

         # Extract the header tags and their corresponding metadata keys
         headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
         header_mapping = dict(self.headers_to_split_on)

         documents = []

         # Find the body of the document
         body = soup.body if soup.body else soup

         # Find all header tags in the order they appear
         all_headers = body.find_all(headers_to_split_on)

         # If there's content before the first header, collect it
         first_header = all_headers[0] if all_headers else None
         if first_header:
             pre_header_content = ''
             for elem in first_header.find_all_previous():
-                if isinstance(elem, bs4.Tag):
+                if isinstance(elem, Tag):
                     text = elem.get_text(separator=' ', strip=True)
                     if text:
                         pre_header_content = text + ' ' + pre_header_content
             if pre_header_content.strip():
                 documents.append(Document(
                     page_content=pre_header_content.strip(),
                     metadata={}  # No metadata since there's no header
                 ))
         else:
             # If no headers are found, return the whole content
             full_text = body.get_text(separator=' ', strip=True)
             if full_text.strip():
                 documents.append(Document(
                     page_content=full_text.strip(),
                     metadata={}
                 ))
             return documents

         # Process each header and its associated content
         for header in all_headers:
             current_metadata = {}
             header_name = header.name
             header_text = header.get_text(separator=' ', strip=True)
             current_metadata[header_mapping[header_name]] = header_text

             # Collect all sibling elements until the next header of the same or higher level
             content_elements = []
             for sibling in header.find_next_siblings():
                 if sibling.name in headers_to_split_on:
                     # Stop at the next header
                     break
-                if isinstance(sibling, bs4.Tag):
+                if isinstance(sibling, Tag):
                     content_elements.append(sibling)

             # Get the text content of the collected elements
             current_content = ''
             for elem in content_elements:
                 text = elem.get_text(separator=' ', strip=True)
                 if text:
                     current_content += text + ' '

             # Create a Document if there is content
             if current_content.strip():
                 documents.append(Document(
                     page_content=current_content.strip(),
                     metadata=current_metadata.copy()
                 ))
             else:
                 # If there's no content, but we have metadata, still create a Document
                 documents.append(Document(
                     page_content='',
                     metadata=current_metadata.copy()
                 ))

         return documents

class HTMLSectionSplitter:
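
For context, the header-splitting strategy in this diff can be sketched with only the standard library. `HeaderSplitter` below is a hypothetical illustration, not part of langchain: it starts a new chunk at each configured header tag, records the header text as metadata, and attaches the following text to that chunk.

```python
# Stdlib-only sketch of the header-splitting strategy the PR implements with
# BeautifulSoup. HeaderSplitter is a hypothetical illustration, not langchain API.
from html.parser import HTMLParser


class HeaderSplitter(HTMLParser):
    def __init__(self, headers_to_split_on):
        super().__init__()
        self.headers = dict(headers_to_split_on)  # e.g. {"h1": "Header 1"}
        self.chunks = []            # finished (metadata, text) pairs
        self.current_meta = {}
        self.current_text = []
        self.in_header = None

    def _flush(self):
        text = " ".join(self.current_text).strip()
        if text:
            self.chunks.append((dict(self.current_meta), text))
        self.current_text = []

    def handle_starttag(self, tag, attrs):
        if tag in self.headers:
            self._flush()               # close the previous chunk
            self.in_header = tag
            self.current_meta = {}      # per-header metadata, as in the diff

    def handle_endtag(self, tag):
        if tag == self.in_header:
            self.in_header = None

    def handle_data(self, data):
        data = data.strip()
        if not data:
            return
        if self.in_header:
            self.current_meta[self.headers[self.in_header]] = data
        else:
            self.current_text.append(data)

    def close(self):
        super().close()
        self._flush()                   # emit the trailing chunk


html_doc = "<h1>Intro</h1><p>Welcome.</p><h2>Setup</h2><p>Install it.</p>"
splitter = HeaderSplitter([("h1", "Header 1"), ("h2", "Header 2")])
splitter.feed(html_doc)
splitter.close()
print(splitter.chunks)
# → [({'Header 1': 'Intro'}, 'Welcome.'), ({'Header 2': 'Setup'}, 'Install it.')]
```

The PR's actual implementation uses BeautifulSoup's `find_all` and `find_next_siblings` on a fully parsed tree rather than a streaming parser, which is also what lets it collect content that appears before the first header.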