From 8f5ff2c6c4f4f3e59f17f134c0eb92be92538f90 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 13:19:34 +0100 Subject: [PATCH 01/15] chg: improved type consistency. --- src/inscriptis/annotation/__init__.py | 6 +++--- src/inscriptis/annotation/parser.py | 3 ++- src/inscriptis/cli/inscript.py | 1 + src/inscriptis/html_engine.py | 16 +++++++++++++--- src/inscriptis/model/canvas/__init__.py | 10 +++++----- src/inscriptis/model/canvas/block.py | 8 +++++++- src/inscriptis/model/config.py | 11 ++++++++--- src/inscriptis/model/table.py | 2 +- src/inscriptis/model/tag/__init__.py | 15 +++++++++++++++ src/inscriptis/model/tag/table_tag.py | 2 +- tests/test_model_html_element_canvas.py | 2 +- 11 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 src/inscriptis/model/tag/__init__.py diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index acf3d09..653c82f 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -1,6 +1,6 @@ """The model used for saving annotations.""" -from typing import NamedTuple, Tuple +from typing import NamedTuple from typing import List from inscriptis.html_properties import HorizontalAlignment @@ -25,8 +25,8 @@ class Annotation(NamedTuple): """the annotation's start index within the text output.""" end: int """the annotation's end index within the text output.""" - metadata: Tuple[str] - """a tuple of tags to be attached to the annotation.""" + metadata: str + """the tag to be attached to the annotation.""" def horizontal_shift( diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py index 56bdf61..a246aee 100644 --- a/src/inscriptis/annotation/parser.py +++ b/src/inscriptis/annotation/parser.py @@ -18,6 +18,7 @@ """ from collections import defaultdict from copy import copy +from typing import Dict, Tuple, List from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT @@ -85,7 +86,7 
@@ def __init__(self, css_profile, model: dict): self.css = css_profile @staticmethod - def _parse(model: dict) -> "AnnotationModel": + def _parse(model: dict) -> Tuple[Dict, List]: """Compute the AnnotationModel from a model dictionary. Returns: diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index c2861dc..42cb891 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -148,6 +148,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s Args: url: URL to the HTML content, or None if the content is obtained from stdin. encoding: used encoding. + timeout: timeout in seconds for retrieving the URL. Returns: The html_content or None, if no content could be extracted. diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 684cfd8..3d0638a 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding:utf-8 """The HTML Engine is responsible for converting HTML to text.""" -from typing import List +from typing import List, Dict, Callable, Any import lxml.html from lxml.etree import Comment @@ -56,7 +56,9 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None config = config or ParserConfig() # setup start and end tag call tables - self.start_tag_handler_dict = { + self.start_tag_handler_dict: Dict[ + str, Callable[[HtmlDocumentState, Any], None] + ] = { "table": table_start_handler, "tr": tr_start_handler, "td": td_start_handler, @@ -68,7 +70,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None "a": a_start_handler if config.parse_a() else None, "img": img_start_handler if config.display_images else None, } - self.end_tag_handler_dict = { + self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = { "table": table_end_handler, "ul": ul_end_handler, "ol": ol_end_handler, @@ -77,6 +79,14 @@ def __init__(self, html_tree: 
lxml.html.HtmlElement, config: ParserConfig = None "a": a_end_handler if config.parse_a() else None, } + if config.custom_html_tag_handler_mapping: + self.start_tag_handler_dict.update( + config.custom_html_tag_handler_mapping.start_tag_handler_mapping + ) + self.end_tag_handler_dict.update( + config.custom_html_tag_handler_mapping.end_tag_handler_mapping + ) + # parse the HTML tree self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree) diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py index 7cf5ca4..5334858 100644 --- a/src/inscriptis/model/canvas/__init__.py +++ b/src/inscriptis/model/canvas/__init__.py @@ -67,7 +67,7 @@ def open_tag(self, tag: HtmlElement) -> None: def open_block(self, tag: HtmlElement): """Open an HTML block element.""" # write missing bullets, if no content has been written - if not self._flush_inline() and tag.list_bullet: + if not self.flush_inline() and tag.list_bullet: self.write_unconsumed_bullet() self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet) @@ -100,7 +100,7 @@ def close_tag(self, tag: HtmlElement) -> None: """ if tag.display == Display.block: # write missing bullets, if no content has been written so far. - if not self._flush_inline() and tag.list_bullet: + if not self.flush_inline() and tag.list_bullet: self.write_unconsumed_bullet() self.current_block.prefix.remove_last_prefix() self.close_block(tag) @@ -129,16 +129,16 @@ def close_block(self, tag: HtmlElement): self.margin = tag.margin_after def write_newline(self): - if not self._flush_inline(): + if not self.flush_inline(): self.blocks.append("") self.current_block = self.current_block.new_block() def get_text(self) -> str: """Provide a text representation of the Canvas.""" - self._flush_inline() + self.flush_inline() return "\n".join(self.blocks) - def _flush_inline(self) -> bool: + def flush_inline(self) -> bool: """Attempt to flush the content in self.current_block into a new block. 
Notes: diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 6dc1361..99b8b3a 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -1,7 +1,13 @@ """Representation of a text block within the HTML canvas.""" +from __future__ import annotations from html import unescape +from typing import TYPE_CHECKING + from inscriptis.html_properties import WhiteSpace +if TYPE_CHECKING: + from inscriptis.model.canvas import Prefix + class Block: """The current block of text. @@ -19,7 +25,7 @@ class Block: __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace") - def __init__(self, idx: int, prefix: str): + def __init__(self, idx: int, prefix: Prefix): self.idx = idx self.prefix = prefix self._content = "" diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index 0aaeb7a..31f6778 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -1,13 +1,14 @@ #!/usr/bin/env python """Provide configuration objects for the Inscriptis HTML to text converter.""" - +from __future__ import annotations from copy import deepcopy -from typing import Dict +from typing import Dict, List from inscriptis.css_profiles import CSS_PROFILES from inscriptis.annotation.parser import AnnotationModel from inscriptis.model.attribute import Attribute from inscriptis.model.html_element import HtmlElement +from inscriptis.model.tag import CustomHtmlTagHandlerMapping DEFAULT_CSS_PROFILE_NAME = "relaxed" @@ -22,8 +23,9 @@ def __init__( deduplicate_captions: bool = False, display_links: bool = False, display_anchors: bool = False, - annotation_rules: Attribute = None, + annotation_rules: Dict[str, List[str]] = None, table_cell_separator: str = " ", + custom_html_tag_handler_mapping: CustomHtmlTagHandlerMapping = None, ): """Create a ParserConfig configuration. 
@@ -39,6 +41,7 @@ def __init__( annotation_rules: an optional dictionary of annotation rules which specify tags and attributes to annotation. table_cell_separator: separator to use between table cells. + custom_html_tag_handler_mapping: an optional CustomHtmlTagHandler """ self.display_images = display_images self.deduplicate_captions = deduplicate_captions @@ -47,6 +50,8 @@ def __init__( self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME] self.attribute_handler = Attribute() self.table_cell_separator = table_cell_separator + self.custom_html_tag_handler_mapping = custom_html_tag_handler_mapping + if annotation_rules: # ensure that we do not modify the original model or its # members. diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 75a2cd3..624ff0f 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -48,7 +48,7 @@ def normalize_blocks(self) -> int: Returns: The height of the normalized cell. """ - self._flush_inline() + self.flush_inline() self.blocks = list(chain(*(line.split("\n") for line in self.blocks))) if not self.blocks: self.blocks = [""] diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py new file mode 100644 index 0000000..d329a2e --- /dev/null +++ b/src/inscriptis/model/tag/__init__.py @@ -0,0 +1,15 @@ +"""HTML Tag handlers and classes for designing custom HTML tag handlers.""" +from __future__ import annotations +from typing import Dict, Callable, NamedTuple + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from inscriptis.model.html_document_state import HtmlDocumentState + + +class CustomHtmlTagHandlerMapping(NamedTuple): + """Provide a custom HTML Tag handler mapping.""" + + start_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] + end_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState], None]] diff --git a/src/inscriptis/model/tag/table_tag.py b/src/inscriptis/model/tag/table_tag.py index 2533f7d..fe917f5 
100644 --- a/src/inscriptis/model/tag/table_tag.py +++ b/src/inscriptis/model/tag/table_tag.py @@ -52,7 +52,7 @@ def table_end_handler(state: HtmlDocumentState): start_idx = state.tags[-2].canvas.current_block.idx state.tags[-2].write_verbatim_text(table.get_text()) - state.tags[-2].canvas._flush_inline() + state.tags[-2].canvas.flush_inline() # transfer annotations from the current tag if state.tags[-1].annotation: diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py index e0d8c66..4488ffc 100644 --- a/tests/test_model_html_element_canvas.py +++ b/tests/test_model_html_element_canvas.py @@ -25,7 +25,7 @@ def _get_text(html_element): c.close_tag(html_element) HtmlElement().set_canvas(c).write("last") - c._flush_inline() + c.flush_inline() return "\n".join(c.blocks) From 8d9861ffa6b85e3f6e0d8c057b16be9bf8110d7d Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 19:12:02 +0100 Subject: [PATCH 02/15] chg: adapt example to new, improved handling. --- examples/custom-html-handling.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index 00df8b5..f215fb5 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -9,27 +9,33 @@ Example: "Welcome to Chur" is rendered as "Welcome to **Chur**". 
""" - - +from inscriptis import ParserConfig from inscriptis.html_engine import Inscriptis -from functools import partial +from inscriptis.model.html_document_state import HtmlDocumentState +from inscriptis.model.tag import CustomHtmlTagHandlerMapping from lxml.html import fromstring -def my_handle_start_b(self, attrs): +def my_handle_start_b(state: HtmlDocumentState, _): """Handle the opening tag.""" - self.tags[-1].write("**") + state.tags[-1].write("**") -def my_handle_end_b(self): +def my_handle_end_b(state: HtmlDocumentState): """Handle the closing tag.""" - self.tags[-1].write("**") + state.tags[-1].write("**") + + +MY_MAPPING = CustomHtmlTagHandlerMapping( + start_tag_handler_mapping={"b": my_handle_start_b}, + end_tag_handler_mapping={"b": my_handle_end_b}, +) HTML = "Welcome to Chur" html_tree = fromstring(HTML) -inscriptis = Inscriptis(html_tree) -inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis) -inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis) +inscriptis = Inscriptis( + html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING) +) print(inscriptis.get_text()) From 0f3280d8728bef78a9f2e7a96d2ecf6dd0e550a0 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 20:21:26 +0100 Subject: [PATCH 03/15] chg: improved documentation and code cleanup. 
--- examples/custom-html-handling.py | 4 ++-- src/inscriptis/html_engine.py | 8 ++++---- src/inscriptis/model/tag/__init__.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index f215fb5..03253a4 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -27,8 +27,8 @@ def my_handle_end_b(state: HtmlDocumentState): MY_MAPPING = CustomHtmlTagHandlerMapping( - start_tag_handler_mapping={"b": my_handle_start_b}, - end_tag_handler_mapping={"b": my_handle_end_b}, + start_tag_mapping={"b": my_handle_start_b}, + end_tag_mapping={"b": my_handle_end_b}, ) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 3d0638a..42d849e 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding:utf-8 """The HTML Engine is responsible for converting HTML to text.""" -from typing import List, Dict, Callable, Any +from typing import List, Dict, Callable import lxml.html from lxml.etree import Comment @@ -57,7 +57,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None # setup start and end tag call tables self.start_tag_handler_dict: Dict[ - str, Callable[[HtmlDocumentState, Any], None] + str, Callable[[HtmlDocumentState, Dict], None] ] = { "table": table_start_handler, "tr": tr_start_handler, @@ -81,10 +81,10 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None if config.custom_html_tag_handler_mapping: self.start_tag_handler_dict.update( - config.custom_html_tag_handler_mapping.start_tag_handler_mapping + config.custom_html_tag_handler_mapping.start_tag_mapping ) self.end_tag_handler_dict.update( - config.custom_html_tag_handler_mapping.end_tag_handler_mapping + config.custom_html_tag_handler_mapping.end_tag_mapping ) # parse the HTML tree diff --git a/src/inscriptis/model/tag/__init__.py 
b/src/inscriptis/model/tag/__init__.py index d329a2e..c0d29c2 100644 --- a/src/inscriptis/model/tag/__init__.py +++ b/src/inscriptis/model/tag/__init__.py @@ -9,7 +9,12 @@ class CustomHtmlTagHandlerMapping(NamedTuple): - """Provide a custom HTML Tag handler mapping.""" + """Refine the standard HTML Tag handling with the provided mapping. - start_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] - end_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState], None]] + Attributes: + start_tag_mapping: a dictionary of custom start tag handlers. + end_tag_mapping: a dictionary of custom end tag handlers. + """ + + start_tag_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] + end_tag_mapping: Dict[str, Callable[[HtmlDocumentState], None]] From aa1b4cb63424ebac9a0de5382d703f1bd30c304e Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 20:21:42 +0100 Subject: [PATCH 04/15] fix: documentation on custom html tag handlers. --- README.rst | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index c273c48..5559e1a 100644 --- a/README.rst +++ b/README.rst @@ -535,19 +535,22 @@ If the fine-tuning options discussed above are not sufficient, you may even over .. 
code-block:: python from inscriptis.html_engine import Inscriptis - from functools import partial - - inscriptis = Inscriptis(html_tree, config) - - inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis) - inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis) + from inscriptis import ParserConfig + + my_mapping = CustomHtmlTagHandlerMapping( + start_tag_mapping={'a': my_handle_start_a}, + end_tag_mapping={'a': my_handle_end_a} + ) + inscriptis = Inscriptis(html_tree, + ParserConfig(custom_html_tag_handler_mapping=my_mapping)) text = inscriptis.get_text() In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``). -You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``. +You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping. -Please refer to `custom-html-handling.py `_ for a working example. +Please refer to `custom-html-handling.py `_ for a working example. +The standard HTML tag handlers can be found in the `inscriptis.model.tag` package. Optimizing memory consumption ----------------------------- From 56d92fe6f6ba530b861cf2133fc85e93ee8b74ce Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 20:24:02 +0100 Subject: [PATCH 05/15] fix: link to standard HTML tag handlers. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 5559e1a..3d32703 100644 --- a/README.rst +++ b/README.rst @@ -550,7 +550,7 @@ In the example the standard HTML handlers for the ``a`` tag are overwritten with You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping. Please refer to `custom-html-handling.py `_ for a working example. 
-The standard HTML tag handlers can be found in the `inscriptis.model.tag` package. +The standard HTML tag handlers can be found in the `inscriptis.model.tag `_ package. Optimizing memory consumption ----------------------------- From 1283e03a0b96fe2cd28efc83822e1cc9001f9e59 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 20:26:49 +0100 Subject: [PATCH 06/15] chg: add missing import in example. --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 3d32703..361191a 100644 --- a/README.rst +++ b/README.rst @@ -534,8 +534,9 @@ If the fine-tuning options discussed above are not sufficient, you may even over .. code-block:: python - from inscriptis.html_engine import Inscriptis from inscriptis import ParserConfig + from inscriptis.html_engine import Inscriptis + from inscriptis.model.tag import CustomHtmlTagHandlerMapping my_mapping = CustomHtmlTagHandlerMapping( start_tag_mapping={'a': my_handle_start_a}, From 5f010585063d6fa65f01b782b5e192cf5ed46a2e Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 04:27:40 +0100 Subject: [PATCH 07/15] add: test for custom HTML tag handling. 
--- tests/test_custom_html_tag_handling.py | 31 ++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/test_custom_html_tag_handling.py diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py new file mode 100644 index 0000000..d050e6a --- /dev/null +++ b/tests/test_custom_html_tag_handling.py @@ -0,0 +1,31 @@ +"""Test the custom HTML tag handling.""" +from lxml.html import fromstring + +from inscriptis import Inscriptis, ParserConfig +from inscriptis.model.html_document_state import HtmlDocumentState +from inscriptis.model.tag import CustomHtmlTagHandlerMapping + + +def test_custom_html_handler(): + def my_handle_start_b(state: HtmlDocumentState, _): + """Handle the opening tag.""" + state.tags[-1].write("**") + + def my_handle_end_b(state: HtmlDocumentState): + """Handle the closing tag.""" + state.tags[-1].write("**") + + custom_mapping = CustomHtmlTagHandlerMapping( + start_tag_mapping={"b": my_handle_start_b}, + end_tag_mapping={"b": my_handle_end_b}, + ) + + html_tree = fromstring("Welcome to Chur") + inscriptis = Inscriptis( + html_tree, ParserConfig(custom_html_tag_handler_mapping=custom_mapping) + ) + + # custom HTML Handler + assert inscriptis.get_text().strip() == "Welcome to **Chur**" + # standard HTML handler + assert Inscriptis(html_tree).get_text().strip() == "Welcome to Chur" From 80c8dd197f5f3fc4e4517b9c58cbfe54c2c8c2cb Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 15:12:19 +0100 Subject: [PATCH 08/15] add: unittests for the command line client. 
--- src/inscriptis/cli/inscript.py | 6 +- tests/test_cli.py | 104 +++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 tests/test_cli.py diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index 42cb891..e83fefa 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -156,10 +156,8 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s """ if not url: return sys.stdin.read() - elif Path(url).is_file(): - with Path(url).open( - encoding=encoding or DEFAULT_ENCODING, errors="ignore" - ) as f: + elif (p := Path(url)).is_file(): + with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f: return f.read() elif url.startswith("http://") or url.startswith("https://"): req = requests.get(url, timeout=timeout) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..7aa32e3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,104 @@ +""" +Tests the Inscriptis CLI client. +""" +from io import StringIO +from pathlib import Path +from json import loads +from unittest.mock import Mock, mock_open, patch, call + +import pytest + +from inscriptis.cli.inscript import cli + +INPUT_DATA = """Hello World!""" + + +def test_cli_read_from_stdin(monkeypatch, capsys): + """Test converting HTML from standard input with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" 
+ + +def test_cli_read_from_stdin_write_to_file(monkeypatch, capsys): + """Test converting HTML from standard input with the command line client and + writing it to a file.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "--output", "test.txt"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + with patch("pathlib.Path.open", create=True) as mock_file: + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "" + # Capture the test written to the mock output file + assert call().__enter__().write("Hello World!") in mock_file.mock_calls + + +def test_cli_read_from_file(monkeypatch, capsys): + """Test converting HTML from a file with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) + monkeypatch.setattr("pathlib.Path.is_file", lambda _: True) + monkeypatch.setattr("pathlib.Path.open", mock_open(read_data=INPUT_DATA)) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" + + +def test_cli_read_from_url(monkeypatch, capsys): + """Test converting HTML from an URL with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "https://www.fhgr.ch/test.html"]) + + mock_request = Mock() + mock_request.content = INPUT_DATA.encode("utf8") + mock_request.encoding = "utf-8" + monkeypatch.setattr("requests.get", lambda url, timeout=0: mock_request) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" 
+ + +def test_cli_annotations(monkeypatch, capsys): + """Test annotation handling in the command line client.""" + # Prepare input data for the test + annotation_rule_path = ( + Path(__file__).parent / "data" / "annotation-profile-unittest.json" + ) + + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr( + "sys.argv", ["inscript", "-p", "surface", "-r", str(annotation_rule_path)] + ) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + cli() + + # Capture the printed json data and convert it to an object + captured = loads(capsys.readouterr().out.strip()) + assert captured["text"].strip() == "Hello World!" + assert captured["label"] == [[6, 11, "emphasis"]] + assert captured["surface"] == [["emphasis", "World"]] + + +def test_help(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "--version"]) + + # the cli should exit with exit code 0 + with pytest.raises(SystemExit) as exit_info: + cli() + assert exit_info.value.code == 0 + + captured = capsys.readouterr().out + assert captured.startswith("Inscript HTML to text conversion") + assert "Inscript comes with ABSOLUTELY NO WARRANTY." in captured From 35626dd79e3a46ced3a9f5cc55135388a71e69e7 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 15:24:53 +0100 Subject: [PATCH 09/15] chg: fully cover the inscript client with unittests. --- tests/test_cli.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7aa32e3..4e4cfc4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -102,3 +102,24 @@ def test_help(monkeypatch, capsys): captured = capsys.readouterr().out assert captured.startswith("Inscript HTML to text conversion") assert "Inscript comes with ABSOLUTELY NO WARRANTY." 
in captured + + +def test_missing_input_file(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) + with pytest.raises(SystemExit) as exit_info: + cli() + + captured = capsys.readouterr() + assert exit_info.value.code == -1 + assert captured.out.strip().startswith("ERROR: Cannot open input file") + + +def test_missing_annotation_file(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "--annotation-rules", "rules.json"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + with pytest.raises(SystemExit) as exit_info: + cli() + + captured = capsys.readouterr() + assert exit_info.value.code == -1 + assert captured.out.strip().startswith("ERROR: Cannot open annotation rule file") From 942c4c77822e1182967c2bb55f4083307631d556 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 16:48:46 +0100 Subject: [PATCH 10/15] fix: Exception handling for annotation rule files. --- src/inscriptis/cli/inscript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index e83fefa..d57deaf 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -175,7 +175,7 @@ def cli(): try: with Path(args.annotation_rules).open() as f: annotation_rules = load(f) - except IOError: + except (IOError, FileNotFoundError, PermissionError): print( "ERROR: Cannot open annotation rule file '{0}'.".format( args.annotation_rules From 1fe8da779bdbf90d47c2cc130b633ed9bc36c476 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 17:07:47 +0100 Subject: [PATCH 11/15] add: annotation profiles required for the unittests. 
--- tests/data/annotation-profile-unittest.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/data/annotation-profile-unittest.json diff --git a/tests/data/annotation-profile-unittest.json b/tests/data/annotation-profile-unittest.json new file mode 100644 index 0000000..48a58ec --- /dev/null +++ b/tests/data/annotation-profile-unittest.json @@ -0,0 +1,7 @@ +{ + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"] +} From 16ba471142f73d66785a7bdf9aae479ebb7789e7 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 17:13:14 +0100 Subject: [PATCH 12/15] add: unittesting for the inscriptis web service. --- src/inscriptis/service/web.py | 6 ++--- tests/test_web_service.py | 47 +++++++++++++++++++++++++++++++++++ tox.ini | 2 ++ 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 tests/test_web_service.py diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py index fdf47ca..358d73d 100755 --- a/src/inscriptis/service/web.py +++ b/src/inscriptis/service/web.py @@ -22,15 +22,15 @@ @app.get("/") def index(): """Print a short status message for the Web service's base URL.""" - return "Inscriptis text to HTML Web service." 
+ return PlainTextResponse("Inscriptis text to HTML Web service.") @app.post("/get_text", response_class=PlainTextResponse) async def get_text_call(request: Request): """Return the text representation of the given HTML content.""" content_type = request.headers.get("Content-type") - if "; encoding=" in content_type: - encoding = content_type.split("; encoding=")[1] + if "; charset=" in content_type: + encoding = content_type.split("; charset=")[1] else: encoding = "UTF-8" html_content = await request.body() diff --git a/tests/test_web_service.py b/tests/test_web_service.py new file mode 100644 index 0000000..508c1c5 --- /dev/null +++ b/tests/test_web_service.py @@ -0,0 +1,47 @@ +import pytest +from fastapi.testclient import TestClient +from inscriptis.service.web import app +from inscriptis.metadata import __version__ + +# Replace "your_module" with the actual module name where your FastAPI app is defined. + + +@pytest.fixture +def client(): + return TestClient(app) + + +def test_index(client): + response = client.get("/") + assert response.status_code == 200 + assert response.text == "Inscriptis text to HTML Web service." + + +def test_get_text_call_with_content_type(client): + html_content = "Ă–sterliche Freuden!" + response = client.post( + "/get_text", + content=html_content, + headers={"Content-type": "text/html; charset=UTF-8"}, + ) + assert response.status_code == 200 + assert response.text == "Ă–sterliche Freuden!" + + +def test_get_text_call_without_content_type(client): + html_content = "Hello World!" + response = client.post( + "/get_text", + content=html_content, + headers={"Content-type": "text/html"}, + ) + assert response.status_code == 200 + assert response.text == "Hello World!" 
+ + +def test_get_version_call(client): + response = client.get("/version") + assert response.status_code == 200 + assert ( + response.text == __version__ + ) # Assuming your ParserConfig has a version attribute diff --git a/tox.ini b/tox.ini index 8dc0683..0105747 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,8 @@ envlist = pytest, pyroma, flake8 [testenv:pytest] deps = pytest ~= 7.4.4 pytest-cov ~= 4.1.0 + fastapi ~= 0.109.2 + httpx ~= 0.26.0 commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices From 9e31c6cd8c67472b95ff1e6ea48dc3bc8563940e Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 17:14:23 +0100 Subject: [PATCH 13/15] chg: code cleanup. --- tests/test_web_service.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_web_service.py b/tests/test_web_service.py index 508c1c5..282fa75 100644 --- a/tests/test_web_service.py +++ b/tests/test_web_service.py @@ -3,8 +3,6 @@ from inscriptis.service.web import app from inscriptis.metadata import __version__ -# Replace "your_module" with the actual module name where your FastAPI app is defined. - @pytest.fixture def client(): @@ -42,6 +40,4 @@ def test_get_text_call_without_content_type(client): def test_get_version_call(client): response = client.get("/version") assert response.status_code == 200 - assert ( - response.text == __version__ - ) # Assuming your ParserConfig has a version attribute + assert response.text == __version__ From f138377d3cab66323f5379a3b55e5bdf92c894d5 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 17:18:23 +0100 Subject: [PATCH 14/15] chg: code cleanup. 
--- src/inscriptis/cli/inscript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index d57deaf..e83fefa 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -175,7 +175,7 @@ def cli(): try: with Path(args.annotation_rules).open() as f: annotation_rules = load(f) - except (IOError, FileNotFoundError, PermissionError): + except IOError: print( "ERROR: Cannot open annotation rule file '{0}'.".format( args.annotation_rules From db012a45d1acd965d76b9c4a9170652aab40db62 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 17 Feb 2024 17:54:28 +0100 Subject: [PATCH 15/15] chg: optimized imports and additional type hints. --- examples/custom-html-handling.py | 6 ++++-- src/inscriptis/__init__.py | 8 ++++---- src/inscriptis/annotation/__init__.py | 2 +- src/inscriptis/cli/inscript.py | 6 +++--- src/inscriptis/css_profiles.py | 2 +- src/inscriptis/model/attribute.py | 2 +- src/inscriptis/model/canvas/__init__.py | 10 +++++----- src/inscriptis/model/canvas/block.py | 1 + src/inscriptis/model/canvas/prefix.py | 12 ++++++------ src/inscriptis/model/config.py | 3 ++- src/inscriptis/model/css.py | 1 + src/inscriptis/model/html_element.py | 8 ++++---- src/inscriptis/model/table.py | 8 ++++---- src/inscriptis/model/tag/__init__.py | 2 +- src/inscriptis/model/tag/a_tag.py | 5 +++-- src/inscriptis/model/tag/br_tag.py | 4 +++- src/inscriptis/model/tag/img_tag.py | 3 ++- src/inscriptis/model/tag/list_tag.py | 11 ++++++----- src/inscriptis/model/tag/table_tag.py | 12 +++++++----- src/inscriptis/service/web.py | 2 +- tox.ini | 2 +- 21 files changed, 61 insertions(+), 49 deletions(-) diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index 03253a4..42412fa 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -9,6 +9,8 @@ Example: "Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
""" +from typing import Dict + from inscriptis import ParserConfig from inscriptis.html_engine import Inscriptis from inscriptis.model.html_document_state import HtmlDocumentState @@ -16,12 +18,12 @@ from lxml.html import fromstring -def my_handle_start_b(state: HtmlDocumentState, _): +def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None: """Handle the opening <b> tag.""" state.tags[-1].write("**") -def my_handle_end_b(state: HtmlDocumentState): +def my_handle_end_b(state: HtmlDocumentState) -> None: """Handle the closing </b> tag.""" state.tags[-1].write("**") diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 5f71f7c..4e52312 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -60,12 +60,12 @@ """ import re -from lxml.html import fromstring, HtmlElement -from lxml.etree import ParserError - from typing import Dict, Optional, Any - from inscriptis.model.config import ParserConfig + +from lxml.etree import ParserError +from lxml.html import fromstring, HtmlElement + from inscriptis.html_engine import Inscriptis RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>") diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index 653c82f..94e5fe5 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -1,7 +1,7 @@ """The model used for saving annotations.""" -from typing import NamedTuple from typing import List +from typing import NamedTuple from inscriptis.html_properties import HorizontalAlignment diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index e83fefa..be085e0 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -5,14 +5,14 @@ import argparse import sys from json import load, dumps -from typing import Optional from pathlib import Path +from typing import Optional import requests from inscriptis import get_text, get_annotated_text -from inscriptis.metadata import
__version__, __copyright__, __license__ from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.metadata import __version__, __copyright__, __license__ from inscriptis.model.config import ParserConfig DEFAULT_ENCODING = "utf8" @@ -164,7 +164,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s return req.content.decode(encoding or req.encoding) -def cli(): +def cli() -> None: """Run the inscript command line client.""" args = parse_command_line() if not (html_content := get_html_content(args.input, args.timeout, args.encoding)): diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 51889b3..6f680e8 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -8,8 +8,8 @@ preventing cases where two words stick together. """ -from inscriptis.model.html_element import HtmlElement from inscriptis.html_properties import Display, WhiteSpace +from inscriptis.model.html_element import HtmlElement STRICT_CSS_PROFILE = { "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py index d8cf3f6..a66e9bd 100644 --- a/src/inscriptis/model/attribute.py +++ b/src/inscriptis/model/attribute.py @@ -66,7 +66,7 @@ def apply_attributes( self.attribute_mapping[attr_name](attr_value, html_element) return html_element - def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None): + def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None: attributes = copy(self.attribute_mapping) for a in annotations: attributes[a.attr] = ( diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py index 5334858..bf9c61f 100644 --- a/src/inscriptis/model/canvas/__init__.py +++ b/src/inscriptis/model/canvas/__init__.py @@ -17,8 +17,8 @@ from inscriptis.annotation import Annotation from inscriptis.html_properties import WhiteSpace, Display from 
inscriptis.model.canvas.block import Block -from inscriptis.model.html_element import HtmlElement from inscriptis.model.canvas.prefix import Prefix +from inscriptis.model.html_element import HtmlElement class Canvas: @@ -64,7 +64,7 @@ def open_tag(self, tag: HtmlElement) -> None: if tag.display == Display.block: self.open_block(tag) - def open_block(self, tag: HtmlElement): + def open_block(self, tag: HtmlElement) -> None: """Open an HTML block element.""" # write missing bullets, if no content has been written if not self.flush_inline() and tag.list_bullet: @@ -79,7 +79,7 @@ def open_block(self, tag: HtmlElement): self.blocks.append("\n" * (required_newlines - 1)) self.margin = required_margin - def write_unconsumed_bullet(self): + def write_unconsumed_bullet(self) -> None: """Write unconsumed bullets to the blocks list.""" bullet = self.current_block.prefix.unconsumed_bullet if bullet: @@ -116,7 +116,7 @@ def close_tag(self, tag: HtmlElement) -> None: Annotation(start_idx, self.current_block.idx, annotation) ) - def close_block(self, tag: HtmlElement): + def close_block(self, tag: HtmlElement) -> None: """Close the given HtmlElement by writing its bottom margin. 
Args: @@ -128,7 +128,7 @@ def close_block(self, tag: HtmlElement): self.blocks.append("\n" * (required_newlines - 1)) self.margin = tag.margin_after - def write_newline(self): + def write_newline(self) -> None: if not self.flush_inline(): self.blocks.append("") self.current_block = self.current_block.new_block() diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 99b8b3a..5013233 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -1,5 +1,6 @@ """Representation of a text block within the HTML canvas.""" from __future__ import annotations + from html import unescape from typing import TYPE_CHECKING diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py index 8a68066..2e6963d 100644 --- a/src/inscriptis/model/canvas/prefix.py +++ b/src/inscriptis/model/canvas/prefix.py @@ -22,7 +22,7 @@ def __init__(self): self.bullets = [] self.consumed = False - def register_prefix(self, padding_inline, bullet): + def register_prefix(self, padding_inline: int, bullet: str) -> None: """Register the given prefix. Args: @@ -33,13 +33,13 @@ def register_prefix(self, padding_inline, bullet): self.paddings.append(padding_inline) self.bullets.append(bullet if bullet else "") - def remove_last_prefix(self): + def remove_last_prefix(self) -> None: """Remove the last prefix from the list.""" with suppress(IndexError): self.current_padding -= self.paddings.pop() del self.bullets[-1] - def pop_next_bullet(self): + def pop_next_bullet(self) -> str: """Pop the next bullet to use, if any bullet is available.""" next_bullet_idx = ( next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1 @@ -53,7 +53,7 @@ def pop_next_bullet(self): return bullet @property - def first(self): + def first(self) -> str: """Return the prefix used at the beginning of a tag. 
Note:: @@ -69,7 +69,7 @@ def first(self): return " " * (self.current_padding - len(bullet)) + bullet @property - def unconsumed_bullet(self): + def unconsumed_bullet(self) -> str: """Yield any yet unconsumed bullet. Note:: @@ -87,7 +87,7 @@ def unconsumed_bullet(self): return " " * (padding - len(bullet)) + bullet @property - def rest(self): + def rest(self) -> str: """Return the prefix used for new lines within a block. This prefix is used for pre-text that contains newlines. The lines diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index 31f6778..fe06897 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -1,11 +1,12 @@ #!/usr/bin/env python """Provide configuration objects for the Inscriptis HTML to text converter.""" from __future__ import annotations + from copy import deepcopy from typing import Dict, List -from inscriptis.css_profiles import CSS_PROFILES from inscriptis.annotation.parser import AnnotationModel +from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.attribute import Attribute from inscriptis.model.html_element import HtmlElement from inscriptis.model.tag import CustomHtmlTagHandlerMapping diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py index d9efa44..f52c41b 100644 --- a/src/inscriptis/model/css.py +++ b/src/inscriptis/model/css.py @@ -7,6 +7,7 @@ """ from contextlib import suppress from re import compile as re_compile + from inscriptis.html_properties import ( Display, WhiteSpace, diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 91e9585..5c72bbc 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -52,9 +52,9 @@ class HtmlElement: def __init__( self, - tag="default", - prefix="", - suffix="", + tag: str = "default", + prefix: str = "", + suffix: str = "", display: Display = Display.inline, margin_before: int = 0, margin_after: int = 0, @@ -156,7 +156,7 @@ 
def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement": return new - def __str__(self): + def __str__(self) -> str: return ( f"<{self.tag} prefix={self.prefix}, suffix={self.suffix}, " f"display={self.display}, margin_before={self.margin_before}, " diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 624ff0f..9207df7 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -2,11 +2,11 @@ # encoding: utf-8 """Classes used for representing Tables, TableRows and TableCells.""" -from typing import List from itertools import chain, accumulate +from typing import List -from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment from inscriptis.annotation import Annotation, horizontal_shift +from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment from inscriptis.model.canvas import Canvas @@ -168,7 +168,7 @@ class TableRow: __slots__ = ("columns", "cell_separator") - def __init__(self, cell_separator): + def __init__(self, cell_separator: str): self.columns: List[TableCell] = [] self.cell_separator = cell_separator @@ -205,7 +205,7 @@ class Table: __slots__ = ("rows", "left_margin_len", "cell_separator") - def __init__(self, left_margin_len: int, cell_separator): + def __init__(self, left_margin_len: int, cell_separator: str): self.rows = [] self.left_margin_len = left_margin_len self.cell_separator = cell_separator diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py index c0d29c2..e877f80 100644 --- a/src/inscriptis/model/tag/__init__.py +++ b/src/inscriptis/model/tag/__init__.py @@ -1,7 +1,7 @@ """HTML Tag handlers and classes for designing custom HTML tag handlers.""" from __future__ import annotations -from typing import Dict, Callable, NamedTuple +from typing import Dict, Callable, NamedTuple from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/src/inscriptis/model/tag/a_tag.py 
b/src/inscriptis/model/tag/a_tag.py index b9c145a..f435377 100644 --- a/src/inscriptis/model/tag/a_tag.py +++ b/src/inscriptis/model/tag/a_tag.py @@ -1,9 +1,10 @@ """Handle the <a> tag.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState -def a_start_handler(state: HtmlDocumentState, attrs): +def a_start_handler(state: HtmlDocumentState, attrs: Dict) -> None: """Handle the <a> tag.""" state.link_target = "" if state.config.display_links: @@ -15,7 +16,7 @@ def a_start_handler(state: HtmlDocumentState, attrs): state.tags[-1].write("[") -def a_end_handler(state: HtmlDocumentState): +def a_end_handler(state: HtmlDocumentState) -> None: """Handle the </a> tag.""" if state.link_target: state.tags[-1].write(f"]({state.link_target})") diff --git a/src/inscriptis/model/tag/br_tag.py b/src/inscriptis/model/tag/br_tag.py index b7d5062..6a354d1 100644 --- a/src/inscriptis/model/tag/br_tag.py +++ b/src/inscriptis/model/tag/br_tag.py @@ -1,7 +1,9 @@ """Handle the <br> tag.""" +from typing import Dict + from inscriptis.model.html_document_state import HtmlDocumentState -def br_start_handler(state: HtmlDocumentState, _): +def br_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <br> tag.""" state.tags[-1].canvas.write_newline() diff --git a/src/inscriptis/model/tag/img_tag.py b/src/inscriptis/model/tag/img_tag.py index 27404b4..51848af 100644 --- a/src/inscriptis/model/tag/img_tag.py +++ b/src/inscriptis/model/tag/img_tag.py @@ -1,9 +1,10 @@ """Handle the <img> tag.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState -def img_start_handler(state: HtmlDocumentState, attrs): +def img_start_handler(state: HtmlDocumentState, attrs: Dict) -> None: """Handle the <img> tag.""" image_text = attrs.get("alt", "") or attrs.get("title", "") if image_text and not ( diff --git a/src/inscriptis/model/tag/list_tag.py b/src/inscriptis/model/tag/list_tag.py index 7286fe9..08fc553 100644 --- a/src/inscriptis/model/tag/list_tag.py +++ b/src/inscriptis/model/tag/list_tag.py @@ -1,4 +1,5 @@ """Handle the
<li>, <ol>, <ul> tags.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState @@ -11,7 +12,7 @@ def get_bullet(state: HtmlDocumentState) -> str: return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN] -def li_start_handler(state: HtmlDocumentState, _): +def li_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <li> tag.""" bullet = state.li_counter[-1] if state.li_counter else "* " if isinstance(bullet, int): @@ -23,21 +24,21 @@ def li_start_handler(state: HtmlDocumentState, _): state.tags[-1].write("") -def ul_start_handler(state: HtmlDocumentState, _): +def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <ul> tag.""" state.li_counter.append(get_bullet(state)) -def ul_end_handler(state: HtmlDocumentState): +def ul_end_handler(state: HtmlDocumentState) -> None: """Handle the </ul> tag.""" state.li_counter.pop() -def ol_start_handler(state: HtmlDocumentState, _): +def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <ol> tag.""" state.li_counter.append(1) -def ol_end_handler(state: HtmlDocumentState): +def ol_end_handler(state: HtmlDocumentState) -> None: """Handle the </ol> tag.""" state.li_counter.pop() diff --git a/src/inscriptis/model/tag/table_tag.py b/src/inscriptis/model/tag/table_tag.py index fe917f5..3e6cf34 100644 --- a/src/inscriptis/model/tag/table_tag.py +++ b/src/inscriptis/model/tag/table_tag.py @@ -1,11 +1,13 @@ """Handle the <table>, <tr> and <td> tags.""" +from typing import Dict + from inscriptis.annotation import Annotation from inscriptis.model.canvas import Canvas from inscriptis.model.html_document_state import HtmlDocumentState from inscriptis.model.table import Table, TableCell -def td_start_handler(state: HtmlDocumentState, _): +def td_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <td> tag.""" if state.current_table: # open td tag @@ -14,13 +16,13 @@ def td_start_handler(state: HtmlDocumentState, _): state.current_table[-1].add_cell(table_cell) -def tr_start_handler(state: HtmlDocumentState, _): +def tr_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <tr> tag.""" if state.current_table: state.current_table[-1].add_row() -def table_start_handler(state: HtmlDocumentState, _): +def table_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the <table> tag.""" state.tags[-1].set_canvas(Canvas()) state.current_table.append( @@ -31,13 +33,13 @@ def table_start_handler(state: HtmlDocumentState, _): ) -def td_end_handler(state: HtmlDocumentState): +def td_end_handler(state: HtmlDocumentState) -> None: """Handle the </td> tag.""" if state.current_table: state.tags[-1].canvas.close_tag(state.tags[-1]) -def table_end_handler(state: HtmlDocumentState): +def table_end_handler(state: HtmlDocumentState) -> None: """Handle the </table> tag.""" if state.current_table: td_end_handler(state) diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py index 358d73d..902d1a4 100755 --- a/src/inscriptis/service/web.py +++ b/src/inscriptis/service/web.py @@ -6,8 +6,8 @@ from fastapi.responses import PlainTextResponse from inscriptis import get_text -from inscriptis.metadata import __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE +from inscriptis.metadata import __version__ from inscriptis.model.config import ParserConfig app = FastAPI() diff --git a/tox.ini b/tox.ini index 0105747..26f1cce 100644 --- a/tox.ini +++ b/tox.ini @@ -19,7 +19,7 @@ deps = flake8 ~= 7.0.0 dlint ~= 0.14.1 flake8-bandit ~= 4.1.1 flake8-blind-except ~= 0.2.1 - flake8-bugbear ~= 23.12.2 + flake8-bugbear ~= 24.2.6 flake8-builtins ~= 2.2.0 flake8-cognitive-complexity ~= 0.1.0 flake8-colors ~= 0.1.9