Merge branch 'fix/bug-81-custom-html-handling2' of github.com:weblyzard/inscriptis into fix/bug-81-custom-html-handling2
weblyzard · Mar 5, 2024 · 504863d
2 parents 0261f13 + db012a4
commit 504863d
Show file tree

Hide file tree

Showing 29 changed files with 358 additions and 87 deletions.
diff --git a/README.rst b/README.rst
@@ -534,20 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
 
 .. code-block:: python
 
+    from inscriptis import ParserConfig
     from inscriptis.html_engine import Inscriptis
-    from functools import partial
-
-    inscriptis = Inscriptis(html_tree, config)
-
-    inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis)
-    inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis)
+    from inscriptis.model.tag import CustomHtmlTagHandlerMapping
+
+    my_mapping = CustomHtmlTagHandlerMapping(
+        start_tag_mapping={'a': my_handle_start_a},
+        end_tag_mapping={'a': my_handle_end_a}
+    )
+    inscriptis = Inscriptis(html_tree, 
+                            ParserConfig(custom_html_tag_handler_mapping=my_mapping))
     text = inscriptis.get_text()
 		
 
 In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
-You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``. 
+You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
 
-Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
+Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example. 
+The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
 
 Optimizing memory consumption
 -----------------------------

diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py
@@ -9,27 +9,35 @@
 Example:
     "Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
 """
+from typing import Dict
 
-
+from inscriptis import ParserConfig
 from inscriptis.html_engine import Inscriptis
-from functools import partial
+from inscriptis.model.html_document_state import HtmlDocumentState
+from inscriptis.model.tag import CustomHtmlTagHandlerMapping
 from lxml.html import fromstring
 
 
-def my_handle_start_b(self, attrs):
+def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
     """Handle the opening <b> tag."""
-    self.tags[-1].write("**")
+    state.tags[-1].write("**")
 
 
-def my_handle_end_b(self):
+def my_handle_end_b(state: HtmlDocumentState) -> None:
     """Handle the closing </b> tag."""
-    self.tags[-1].write("**")
+    state.tags[-1].write("**")
+
+
+MY_MAPPING = CustomHtmlTagHandlerMapping(
+    start_tag_mapping={"b": my_handle_start_b},
+    end_tag_mapping={"b": my_handle_end_b},
+)
 
 
 HTML = "Welcome to <b>Chur</b>"
 
 html_tree = fromstring(HTML)
-inscriptis = Inscriptis(html_tree)
-inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis)
-inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis)
+inscriptis = Inscriptis(
+    html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING)
+)
 print(inscriptis.get_text())
diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py
@@ -60,12 +60,12 @@
 """
 
 import re
-from lxml.html import fromstring, HtmlElement
-from lxml.etree import ParserError
-
 from typing import Dict, Optional, Any
-
 from inscriptis.model.config import ParserConfig
+
+from lxml.etree import ParserError
+from lxml.html import fromstring, HtmlElement
+
 from inscriptis.html_engine import Inscriptis
 
 RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")

diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py
@@ -1,7 +1,7 @@
 """The model used for saving annotations."""
 
-from typing import NamedTuple, Tuple
 from typing import List
+from typing import NamedTuple
 
 from inscriptis.html_properties import HorizontalAlignment
 
@@ -25,8 +25,8 @@ class Annotation(NamedTuple):
     """the annotation's start index within the text output."""
     end: int
     """the annotation's end index within the text output."""
-    metadata: Tuple[str]
-    """a tuple of tags to be attached to the annotation."""
+    metadata: str
+    """the tag to be attached to the annotation."""
 
 
 def horizontal_shift(

diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py
@@ -18,6 +18,7 @@
 """
 from collections import defaultdict
 from copy import copy
+from typing import Dict, Tuple, List
 
 from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT
 
@@ -85,7 +86,7 @@ def __init__(self, css_profile, model: dict):
         self.css = css_profile
 
     @staticmethod
-    def _parse(model: dict) -> "AnnotationModel":
+    def _parse(model: dict) -> Tuple[Dict, List]:
         """Compute the AnnotationModel from a model dictionary.
 
         Returns:

diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py
@@ -5,14 +5,14 @@
 import argparse
 import sys
 from json import load, dumps
-from typing import Optional
 from pathlib import Path
+from typing import Optional
 
 import requests
 
 from inscriptis import get_text, get_annotated_text
-from inscriptis.metadata import __version__, __copyright__, __license__
 from inscriptis.css_profiles import CSS_PROFILES
+from inscriptis.metadata import __version__, __copyright__, __license__
 from inscriptis.model.config import ParserConfig
 
 DEFAULT_ENCODING = "utf8"
@@ -148,24 +148,23 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
     Args:
         url: URL to the HTML content, or None if the content is obtained from stdin.
         encoding: used encoding.
+        timeout: timeout in seconds for retrieving the URL.
 
     Returns:
         The html_content or None, if no content could be extracted.
 
     """
     if not url:
         return sys.stdin.read()
-    elif Path(url).is_file():
-        with Path(url).open(
-            encoding=encoding or DEFAULT_ENCODING, errors="ignore"
-        ) as f:
+    elif (p := Path(url)).is_file():
+        with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
             return f.read()
     elif url.startswith("http://") or url.startswith("https://"):
         req = requests.get(url, timeout=timeout)
         return req.content.decode(encoding or req.encoding)
 
 
-def cli():
+def cli() -> None:
     """Run the inscript command line client."""
     args = parse_command_line()
     if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):

diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py
@@ -8,8 +8,8 @@
              preventing cases where two words stick together.
 """
 
-from inscriptis.model.html_element import HtmlElement
 from inscriptis.html_properties import Display, WhiteSpace
+from inscriptis.model.html_element import HtmlElement
 
 STRICT_CSS_PROFILE = {
     "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),

diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # coding:utf-8
 """The HTML Engine is responsible for converting HTML to text."""
-from typing import List
+from typing import List, Dict, Callable
 
 import lxml.html
 from lxml.etree import Comment
@@ -56,7 +56,9 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
         config = config or ParserConfig()
 
         # setup start and end tag call tables
-        self.start_tag_handler_dict = {
+        self.start_tag_handler_dict: Dict[
+            str, Callable[[HtmlDocumentState, Dict], None]
+        ] = {
             "table": table_start_handler,
             "tr": tr_start_handler,
             "td": td_start_handler,
@@ -68,7 +70,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
             "a": a_start_handler if config.parse_a() else None,
             "img": img_start_handler if config.display_images else None,
         }
-        self.end_tag_handler_dict = {
+        self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = {
             "table": table_end_handler,
             "ul": ul_end_handler,
             "ol": ol_end_handler,
@@ -77,6 +79,14 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
             "a": a_end_handler if config.parse_a() else None,
         }
 
+        if config.custom_html_tag_handler_mapping:
+            self.start_tag_handler_dict.update(
+                config.custom_html_tag_handler_mapping.start_tag_mapping
+            )
+            self.end_tag_handler_dict.update(
+                config.custom_html_tag_handler_mapping.end_tag_mapping
+            )
+
         # parse the HTML tree
         self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)
 

diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py
@@ -66,7 +66,7 @@ def apply_attributes(
             self.attribute_mapping[attr_name](attr_value, html_element)
         return html_element
 
-    def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
+    def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
         attributes = copy(self.attribute_mapping)
         for a in annotations:
             attributes[a.attr] = (

diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py
@@ -17,8 +17,8 @@
 from inscriptis.annotation import Annotation
 from inscriptis.html_properties import WhiteSpace, Display
 from inscriptis.model.canvas.block import Block
-from inscriptis.model.html_element import HtmlElement
 from inscriptis.model.canvas.prefix import Prefix
+from inscriptis.model.html_element import HtmlElement
 
 
 class Canvas:
@@ -64,10 +64,10 @@ def open_tag(self, tag: HtmlElement) -> None:
         if tag.display == Display.block:
             self.open_block(tag)
 
-    def open_block(self, tag: HtmlElement):
+    def open_block(self, tag: HtmlElement) -> None:
         """Open an HTML block element."""
         # write missing bullets, if no content has been written
-        if not self._flush_inline() and tag.list_bullet:
+        if not self.flush_inline() and tag.list_bullet:
             self.write_unconsumed_bullet()
         self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet)
 
@@ -79,7 +79,7 @@ def open_block(self, tag: HtmlElement):
             self.blocks.append("\n" * (required_newlines - 1))
             self.margin = required_margin
 
-    def write_unconsumed_bullet(self):
+    def write_unconsumed_bullet(self) -> None:
         """Write unconsumed bullets to the blocks list."""
         bullet = self.current_block.prefix.unconsumed_bullet
         if bullet:
@@ -100,7 +100,7 @@ def close_tag(self, tag: HtmlElement) -> None:
         """
         if tag.display == Display.block:
             # write missing bullets, if no content has been written so far.
-            if not self._flush_inline() and tag.list_bullet:
+            if not self.flush_inline() and tag.list_bullet:
                 self.write_unconsumed_bullet()
             self.current_block.prefix.remove_last_prefix()
             self.close_block(tag)
@@ -116,7 +116,7 @@ def close_tag(self, tag: HtmlElement) -> None:
                     Annotation(start_idx, self.current_block.idx, annotation)
                 )
 
-    def close_block(self, tag: HtmlElement):
+    def close_block(self, tag: HtmlElement) -> None:
         """Close the given HtmlElement by writing its bottom margin.
 
         Args:
@@ -128,17 +128,17 @@ def close_block(self, tag: HtmlElement):
             self.blocks.append("\n" * (required_newlines - 1))
             self.margin = tag.margin_after
 
-    def write_newline(self):
-        if not self._flush_inline():
+    def write_newline(self) -> None:
+        if not self.flush_inline():
             self.blocks.append("")
             self.current_block = self.current_block.new_block()
 
     def get_text(self) -> str:
         """Provide a text representation of the Canvas."""
-        self._flush_inline()
+        self.flush_inline()
         return "\n".join(self.blocks)
 
-    def _flush_inline(self) -> bool:
+    def flush_inline(self) -> bool:
         """Attempt to flush the content in self.current_block into a new block.
 
         Notes:

diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py
@@ -1,7 +1,14 @@
 """Representation of a text block within the HTML canvas."""
+from __future__ import annotations
+
 from html import unescape
+from typing import TYPE_CHECKING
+
 from inscriptis.html_properties import WhiteSpace
 
+if TYPE_CHECKING:
+    from inscriptis.model.canvas import Prefix
+
 
 class Block:
     """The current block of text.
@@ -19,7 +26,7 @@ class Block:
 
     __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")
 
-    def __init__(self, idx: int, prefix: str):
+    def __init__(self, idx: int, prefix: Prefix):
         self.idx = idx
         self.prefix = prefix
         self._content = ""

diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py
@@ -22,7 +22,7 @@ def __init__(self):
         self.bullets = []
         self.consumed = False
 
-    def register_prefix(self, padding_inline, bullet):
+    def register_prefix(self, padding_inline: int, bullet: str) -> None:
         """Register the given prefix.
 
         Args:
@@ -33,13 +33,13 @@ def register_prefix(self, padding_inline, bullet):
         self.paddings.append(padding_inline)
         self.bullets.append(bullet if bullet else "")
 
-    def remove_last_prefix(self):
+    def remove_last_prefix(self) -> None:
         """Remove the last prefix from the list."""
         with suppress(IndexError):
             self.current_padding -= self.paddings.pop()
             del self.bullets[-1]
 
-    def pop_next_bullet(self):
+    def pop_next_bullet(self) -> str:
         """Pop the next bullet to use, if any bullet is available."""
         next_bullet_idx = (
             next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
@@ -53,7 +53,7 @@ def pop_next_bullet(self):
         return bullet
 
     @property
-    def first(self):
+    def first(self) -> str:
         """Return the prefix used at the beginning of a tag.
 
         Note::
@@ -69,7 +69,7 @@ def first(self):
         return " " * (self.current_padding - len(bullet)) + bullet
 
     @property
-    def unconsumed_bullet(self):
+    def unconsumed_bullet(self) -> str:
         """Yield any yet unconsumed bullet.
 
         Note::
@@ -87,7 +87,7 @@ def unconsumed_bullet(self):
         return " " * (padding - len(bullet)) + bullet
 
     @property
-    def rest(self):
+    def rest(self) -> str:
         """Return the prefix used for new lines within a block.
 
         This prefix is used for pre-text that contains newlines. The lines