diff --git a/README.rst b/README.rst index c273c48..361191a 100644 --- a/README.rst +++ b/README.rst @@ -534,20 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over .. code-block:: python + from inscriptis import ParserConfig from inscriptis.html_engine import Inscriptis - from functools import partial - - inscriptis = Inscriptis(html_tree, config) - - inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis) - inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis) + from inscriptis.model.tag import CustomHtmlTagHandlerMapping + + my_mapping = CustomHtmlTagHandlerMapping( + start_tag_mapping={'a': my_handle_start_a}, + end_tag_mapping={'a': my_handle_end_a} + ) + inscriptis = Inscriptis(html_tree, + ParserConfig(custom_html_tag_handler_mapping=my_mapping)) text = inscriptis.get_text() In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``). -You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``. +You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping. -Please refer to `custom-html-handling.py `_ for a working example. +Please refer to `custom-html-handling.py `_ for a working example. +The standard HTML tag handlers can be found in the `inscriptis.model.tag `_ package. Optimizing memory consumption ----------------------------- diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index 00df8b5..42412fa 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -9,27 +9,35 @@ Example: "Welcome to Chur" is rendered as "Welcome to **Chur**". 
""" +from typing import Dict - +from inscriptis import ParserConfig from inscriptis.html_engine import Inscriptis -from functools import partial +from inscriptis.model.html_document_state import HtmlDocumentState +from inscriptis.model.tag import CustomHtmlTagHandlerMapping from lxml.html import fromstring -def my_handle_start_b(self, attrs): +def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None: """Handle the opening tag.""" - self.tags[-1].write("**") + state.tags[-1].write("**") -def my_handle_end_b(self): +def my_handle_end_b(state: HtmlDocumentState) -> None: """Handle the closing tag.""" - self.tags[-1].write("**") + state.tags[-1].write("**") + + +MY_MAPPING = CustomHtmlTagHandlerMapping( + start_tag_mapping={"b": my_handle_start_b}, + end_tag_mapping={"b": my_handle_end_b}, +) HTML = "Welcome to Chur" html_tree = fromstring(HTML) -inscriptis = Inscriptis(html_tree) -inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis) -inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis) +inscriptis = Inscriptis( + html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING) +) print(inscriptis.get_text()) diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 5f71f7c..4e52312 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -60,12 +60,12 @@ """ import re -from lxml.html import fromstring, HtmlElement -from lxml.etree import ParserError - from typing import Dict, Optional, Any - from inscriptis.model.config import ParserConfig + +from lxml.etree import ParserError +from lxml.html import fromstring, HtmlElement + from inscriptis.html_engine import Inscriptis RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>") diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index acf3d09..94e5fe5 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -1,7 +1,7 @@ """The model used for 
saving annotations.""" -from typing import NamedTuple, Tuple from typing import List +from typing import NamedTuple from inscriptis.html_properties import HorizontalAlignment @@ -25,8 +25,8 @@ class Annotation(NamedTuple): """the annotation's start index within the text output.""" end: int """the annotation's end index within the text output.""" - metadata: Tuple[str] - """a tuple of tags to be attached to the annotation.""" + metadata: str + """the tag to be attached to the annotation.""" def horizontal_shift( diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py index 56bdf61..a246aee 100644 --- a/src/inscriptis/annotation/parser.py +++ b/src/inscriptis/annotation/parser.py @@ -18,6 +18,7 @@ """ from collections import defaultdict from copy import copy +from typing import Dict, Tuple, List from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT @@ -85,7 +86,7 @@ def __init__(self, css_profile, model: dict): self.css = css_profile @staticmethod - def _parse(model: dict) -> "AnnotationModel": + def _parse(model: dict) -> Tuple[Dict, List]: """Compute the AnnotationModel from a model dictionary. 
Returns: diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index c2861dc..be085e0 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -5,14 +5,14 @@ import argparse import sys from json import load, dumps -from typing import Optional from pathlib import Path +from typing import Optional import requests from inscriptis import get_text, get_annotated_text -from inscriptis.metadata import __version__, __copyright__, __license__ from inscriptis.css_profiles import CSS_PROFILES +from inscriptis.metadata import __version__, __copyright__, __license__ from inscriptis.model.config import ParserConfig DEFAULT_ENCODING = "utf8" @@ -148,6 +148,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s Args: url: URL to the HTML content, or None if the content is obtained from stdin. encoding: used encoding. + timeout: timeout in seconds for retrieving the URL. Returns: The html_content or None, if no content could be extracted. @@ -155,17 +156,15 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s """ if not url: return sys.stdin.read() - elif Path(url).is_file(): - with Path(url).open( - encoding=encoding or DEFAULT_ENCODING, errors="ignore" - ) as f: + elif (p := Path(url)).is_file(): + with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f: return f.read() elif url.startswith("http://") or url.startswith("https://"): req = requests.get(url, timeout=timeout) return req.content.decode(encoding or req.encoding) -def cli(): +def cli() -> None: """Run the inscript command line client.""" args = parse_command_line() if not (html_content := get_html_content(args.input, args.timeout, args.encoding)): diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 51889b3..6f680e8 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -8,8 +8,8 @@ preventing cases where two words stick together. 
""" -from inscriptis.model.html_element import HtmlElement from inscriptis.html_properties import Display, WhiteSpace +from inscriptis.model.html_element import HtmlElement STRICT_CSS_PROFILE = { "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 684cfd8..42d849e 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding:utf-8 """The HTML Engine is responsible for converting HTML to text.""" -from typing import List +from typing import List, Dict, Callable import lxml.html from lxml.etree import Comment @@ -56,7 +56,9 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None config = config or ParserConfig() # setup start and end tag call tables - self.start_tag_handler_dict = { + self.start_tag_handler_dict: Dict[ + str, Callable[[HtmlDocumentState, Dict], None] + ] = { "table": table_start_handler, "tr": tr_start_handler, "td": td_start_handler, @@ -68,7 +70,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None "a": a_start_handler if config.parse_a() else None, "img": img_start_handler if config.display_images else None, } - self.end_tag_handler_dict = { + self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = { "table": table_end_handler, "ul": ul_end_handler, "ol": ol_end_handler, @@ -77,6 +79,14 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None "a": a_end_handler if config.parse_a() else None, } + if config.custom_html_tag_handler_mapping: + self.start_tag_handler_dict.update( + config.custom_html_tag_handler_mapping.start_tag_mapping + ) + self.end_tag_handler_dict.update( + config.custom_html_tag_handler_mapping.end_tag_mapping + ) + # parse the HTML tree self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree) diff --git a/src/inscriptis/model/attribute.py 
b/src/inscriptis/model/attribute.py index d8cf3f6..a66e9bd 100644 --- a/src/inscriptis/model/attribute.py +++ b/src/inscriptis/model/attribute.py @@ -66,7 +66,7 @@ def apply_attributes( self.attribute_mapping[attr_name](attr_value, html_element) return html_element - def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None): + def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None: attributes = copy(self.attribute_mapping) for a in annotations: attributes[a.attr] = ( diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py index 7cf5ca4..bf9c61f 100644 --- a/src/inscriptis/model/canvas/__init__.py +++ b/src/inscriptis/model/canvas/__init__.py @@ -17,8 +17,8 @@ from inscriptis.annotation import Annotation from inscriptis.html_properties import WhiteSpace, Display from inscriptis.model.canvas.block import Block -from inscriptis.model.html_element import HtmlElement from inscriptis.model.canvas.prefix import Prefix +from inscriptis.model.html_element import HtmlElement class Canvas: @@ -64,10 +64,10 @@ def open_tag(self, tag: HtmlElement) -> None: if tag.display == Display.block: self.open_block(tag) - def open_block(self, tag: HtmlElement): + def open_block(self, tag: HtmlElement) -> None: """Open an HTML block element.""" # write missing bullets, if no content has been written - if not self._flush_inline() and tag.list_bullet: + if not self.flush_inline() and tag.list_bullet: self.write_unconsumed_bullet() self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet) @@ -79,7 +79,7 @@ def open_block(self, tag: HtmlElement): self.blocks.append("\n" * (required_newlines - 1)) self.margin = required_margin - def write_unconsumed_bullet(self): + def write_unconsumed_bullet(self) -> None: """Write unconsumed bullets to the blocks list.""" bullet = self.current_block.prefix.unconsumed_bullet if bullet: @@ -100,7 +100,7 @@ def close_tag(self, tag: HtmlElement) -> None: """ if 
tag.display == Display.block: # write missing bullets, if no content has been written so far. - if not self._flush_inline() and tag.list_bullet: + if not self.flush_inline() and tag.list_bullet: self.write_unconsumed_bullet() self.current_block.prefix.remove_last_prefix() self.close_block(tag) @@ -116,7 +116,7 @@ def close_tag(self, tag: HtmlElement) -> None: Annotation(start_idx, self.current_block.idx, annotation) ) - def close_block(self, tag: HtmlElement): + def close_block(self, tag: HtmlElement) -> None: """Close the given HtmlElement by writing its bottom margin. Args: @@ -128,17 +128,17 @@ def close_block(self, tag: HtmlElement): self.blocks.append("\n" * (required_newlines - 1)) self.margin = tag.margin_after - def write_newline(self): - if not self._flush_inline(): + def write_newline(self) -> None: + if not self.flush_inline(): self.blocks.append("") self.current_block = self.current_block.new_block() def get_text(self) -> str: """Provide a text representation of the Canvas.""" - self._flush_inline() + self.flush_inline() return "\n".join(self.blocks) - def _flush_inline(self) -> bool: + def flush_inline(self) -> bool: """Attempt to flush the content in self.current_block into a new block. Notes: diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 6dc1361..5013233 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -1,7 +1,14 @@ """Representation of a text block within the HTML canvas.""" +from __future__ import annotations + from html import unescape +from typing import TYPE_CHECKING + from inscriptis.html_properties import WhiteSpace +if TYPE_CHECKING: + from inscriptis.model.canvas import Prefix + class Block: """The current block of text. 
@@ -19,7 +26,7 @@ class Block: __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace") - def __init__(self, idx: int, prefix: str): + def __init__(self, idx: int, prefix: Prefix): self.idx = idx self.prefix = prefix self._content = "" diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py index 8a68066..2e6963d 100644 --- a/src/inscriptis/model/canvas/prefix.py +++ b/src/inscriptis/model/canvas/prefix.py @@ -22,7 +22,7 @@ def __init__(self): self.bullets = [] self.consumed = False - def register_prefix(self, padding_inline, bullet): + def register_prefix(self, padding_inline: int, bullet: str) -> None: """Register the given prefix. Args: @@ -33,13 +33,13 @@ def register_prefix(self, padding_inline, bullet): self.paddings.append(padding_inline) self.bullets.append(bullet if bullet else "") - def remove_last_prefix(self): + def remove_last_prefix(self) -> None: """Remove the last prefix from the list.""" with suppress(IndexError): self.current_padding -= self.paddings.pop() del self.bullets[-1] - def pop_next_bullet(self): + def pop_next_bullet(self) -> str: """Pop the next bullet to use, if any bullet is available.""" next_bullet_idx = ( next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1 @@ -53,7 +53,7 @@ def pop_next_bullet(self): return bullet @property - def first(self): + def first(self) -> str: """Return the prefix used at the beginning of a tag. Note:: @@ -69,7 +69,7 @@ def first(self): return " " * (self.current_padding - len(bullet)) + bullet @property - def unconsumed_bullet(self): + def unconsumed_bullet(self) -> str: """Yield any yet unconsumed bullet. Note:: @@ -87,7 +87,7 @@ def unconsumed_bullet(self): return " " * (padding - len(bullet)) + bullet @property - def rest(self): + def rest(self) -> str: """Return the prefix used for new lines within a block. This prefix is used for pre-text that contains newlines. 
The lines diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index 0aaeb7a..fe06897 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -1,13 +1,15 @@ #!/usr/bin/env python """Provide configuration objects for the Inscriptis HTML to text converter.""" +from __future__ import annotations from copy import deepcopy -from typing import Dict +from typing import Dict, List -from inscriptis.css_profiles import CSS_PROFILES from inscriptis.annotation.parser import AnnotationModel +from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.attribute import Attribute from inscriptis.model.html_element import HtmlElement +from inscriptis.model.tag import CustomHtmlTagHandlerMapping DEFAULT_CSS_PROFILE_NAME = "relaxed" @@ -22,8 +24,9 @@ def __init__( deduplicate_captions: bool = False, display_links: bool = False, display_anchors: bool = False, - annotation_rules: Attribute = None, + annotation_rules: Dict[str, List[str]] = None, table_cell_separator: str = " ", + custom_html_tag_handler_mapping: CustomHtmlTagHandlerMapping = None, ): """Create a ParserConfig configuration. @@ -39,6 +42,7 @@ def __init__( annotation_rules: an optional dictionary of annotation rules which specify tags and attributes to annotation. table_cell_separator: separator to use between table cells. + custom_html_tag_handler_mapping: an optional CustomHtmlTagHandler """ self.display_images = display_images self.deduplicate_captions = deduplicate_captions @@ -47,6 +51,8 @@ def __init__( self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME] self.attribute_handler = Attribute() self.table_cell_separator = table_cell_separator + self.custom_html_tag_handler_mapping = custom_html_tag_handler_mapping + if annotation_rules: # ensure that we do not modify the original model or its # members. 
diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py index d9efa44..f52c41b 100644 --- a/src/inscriptis/model/css.py +++ b/src/inscriptis/model/css.py @@ -7,6 +7,7 @@ """ from contextlib import suppress from re import compile as re_compile + from inscriptis.html_properties import ( Display, WhiteSpace, diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 91e9585..5c72bbc 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -52,9 +52,9 @@ class HtmlElement: def __init__( self, - tag="default", - prefix="", - suffix="", + tag: str = "default", + prefix: str = "", + suffix: str = "", display: Display = Display.inline, margin_before: int = 0, margin_after: int = 0, @@ -156,7 +156,7 @@ def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement": return new - def __str__(self): + def __str__(self) -> str: return ( f"<{self.tag} prefix={self.prefix}, suffix={self.suffix}, " f"display={self.display}, margin_before={self.margin_before}, " diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 75a2cd3..9207df7 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -2,11 +2,11 @@ # encoding: utf-8 """Classes used for representing Tables, TableRows and TableCells.""" -from typing import List from itertools import chain, accumulate +from typing import List -from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment from inscriptis.annotation import Annotation, horizontal_shift +from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment from inscriptis.model.canvas import Canvas @@ -48,7 +48,7 @@ def normalize_blocks(self) -> int: Returns: The height of the normalized cell. 
""" - self._flush_inline() + self.flush_inline() self.blocks = list(chain(*(line.split("\n") for line in self.blocks))) if not self.blocks: self.blocks = [""] @@ -168,7 +168,7 @@ class TableRow: __slots__ = ("columns", "cell_separator") - def __init__(self, cell_separator): + def __init__(self, cell_separator: str): self.columns: List[TableCell] = [] self.cell_separator = cell_separator @@ -205,7 +205,7 @@ class Table: __slots__ = ("rows", "left_margin_len", "cell_separator") - def __init__(self, left_margin_len: int, cell_separator): + def __init__(self, left_margin_len: int, cell_separator: str): self.rows = [] self.left_margin_len = left_margin_len self.cell_separator = cell_separator diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py new file mode 100644 index 0000000..e877f80 --- /dev/null +++ b/src/inscriptis/model/tag/__init__.py @@ -0,0 +1,20 @@ +"""HTML Tag handlers and classes for designing custom HTML tag handlers.""" +from __future__ import annotations + +from typing import Dict, Callable, NamedTuple +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from inscriptis.model.html_document_state import HtmlDocumentState + + +class CustomHtmlTagHandlerMapping(NamedTuple): + """Refine the standard HTML Tag handling with the provided mapping. + + Attributes: + start_tag_mapping: a dictionary of custom start tag handlers. + end_tag_mapping: a dictionary of custom end tag handlers. 
+ """ + + start_tag_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] + end_tag_mapping: Dict[str, Callable[[HtmlDocumentState], None]] diff --git a/src/inscriptis/model/tag/a_tag.py b/src/inscriptis/model/tag/a_tag.py index b9c145a..f435377 100644 --- a/src/inscriptis/model/tag/a_tag.py +++ b/src/inscriptis/model/tag/a_tag.py @@ -1,9 +1,10 @@ """Handle the tag.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState -def a_start_handler(state: HtmlDocumentState, attrs): +def a_start_handler(state: HtmlDocumentState, attrs: Dict) -> None: """Handle the tag.""" state.link_target = "" if state.config.display_links: @@ -15,7 +16,7 @@ def a_start_handler(state: HtmlDocumentState, attrs): state.tags[-1].write("[") -def a_end_handler(state: HtmlDocumentState): +def a_end_handler(state: HtmlDocumentState) -> None: """Handle the tag.""" if state.link_target: state.tags[-1].write(f"]({state.link_target})") diff --git a/src/inscriptis/model/tag/br_tag.py b/src/inscriptis/model/tag/br_tag.py index b7d5062..6a354d1 100644 --- a/src/inscriptis/model/tag/br_tag.py +++ b/src/inscriptis/model/tag/br_tag.py @@ -1,7 +1,9 @@ """Handle the
tag.""" +from typing import Dict + from inscriptis.model.html_document_state import HtmlDocumentState -def br_start_handler(state: HtmlDocumentState, _): +def br_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
tag.""" state.tags[-1].canvas.write_newline() diff --git a/src/inscriptis/model/tag/img_tag.py b/src/inscriptis/model/tag/img_tag.py index 27404b4..51848af 100644 --- a/src/inscriptis/model/tag/img_tag.py +++ b/src/inscriptis/model/tag/img_tag.py @@ -1,9 +1,10 @@ """Handle the tag.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState -def img_start_handler(state: HtmlDocumentState, attrs): +def img_start_handler(state: HtmlDocumentState, attrs: Dict) -> None: """Handle the tag.""" image_text = attrs.get("alt", "") or attrs.get("title", "") if image_text and not ( diff --git a/src/inscriptis/model/tag/list_tag.py b/src/inscriptis/model/tag/list_tag.py index 7286fe9..08fc553 100644 --- a/src/inscriptis/model/tag/list_tag.py +++ b/src/inscriptis/model/tag/list_tag.py @@ -1,4 +1,5 @@ """Handle the
  <li>, <ul>, <ol>
        tags.""" +from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState @@ -11,7 +12,7 @@ def get_bullet(state: HtmlDocumentState) -> str: return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN] -def li_start_handler(state: HtmlDocumentState, _): +def li_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
      <li> tag.""" bullet = state.li_counter[-1] if state.li_counter else "* " if isinstance(bullet, int): @@ -23,21 +24,21 @@ def li_start_handler(state: HtmlDocumentState, _): state.tags[-1].write("") -def ul_start_handler(state: HtmlDocumentState, _): +def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
          <ul> tag.""" state.li_counter.append(get_bullet(state)) -def ul_end_handler(state: HtmlDocumentState): +def ul_end_handler(state: HtmlDocumentState) -> None: """Handle the
        </ul> tag.""" state.li_counter.pop() -def ol_start_handler(state: HtmlDocumentState, _): +def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
          <ol> tag.""" state.li_counter.append(1) -def ol_end_handler(state: HtmlDocumentState): +def ol_end_handler(state: HtmlDocumentState) -> None: """Handle the
        tag.""" state.li_counter.pop() diff --git a/src/inscriptis/model/tag/table_tag.py b/src/inscriptis/model/tag/table_tag.py index 2533f7d..3e6cf34 100644 --- a/src/inscriptis/model/tag/table_tag.py +++ b/src/inscriptis/model/tag/table_tag.py @@ -1,11 +1,13 @@ """Handle the , and tag.""" if state.current_table: state.current_table[-1].add_row() -def table_start_handler(state: HtmlDocumentState, _): +def table_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
        tags.""" +from typing import Dict + from inscriptis.annotation import Annotation from inscriptis.model.canvas import Canvas from inscriptis.model.html_document_state import HtmlDocumentState from inscriptis.model.table import Table, TableCell -def td_start_handler(state: HtmlDocumentState, _): +def td_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the tag.""" if state.current_table: # open td tag @@ -14,13 +16,13 @@ def td_start_handler(state: HtmlDocumentState, _): state.current_table[-1].add_cell(table_cell) -def tr_start_handler(state: HtmlDocumentState, _): +def tr_start_handler(state: HtmlDocumentState, _: Dict) -> None: """Handle the
        tag.""" state.tags[-1].set_canvas(Canvas()) state.current_table.append( @@ -31,13 +33,13 @@ def table_start_handler(state: HtmlDocumentState, _): ) -def td_end_handler(state: HtmlDocumentState): +def td_end_handler(state: HtmlDocumentState) -> None: """Handle the tag.""" if state.current_table: state.tags[-1].canvas.close_tag(state.tags[-1]) -def table_end_handler(state: HtmlDocumentState): +def table_end_handler(state: HtmlDocumentState) -> None: """Handle the
        tag.""" if state.current_table: td_end_handler(state) @@ -52,7 +54,7 @@ def table_end_handler(state: HtmlDocumentState): start_idx = state.tags[-2].canvas.current_block.idx state.tags[-2].write_verbatim_text(table.get_text()) - state.tags[-2].canvas._flush_inline() + state.tags[-2].canvas.flush_inline() # transfer annotations from the current tag if state.tags[-1].annotation: diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py index fdf47ca..902d1a4 100755 --- a/src/inscriptis/service/web.py +++ b/src/inscriptis/service/web.py @@ -6,8 +6,8 @@ from fastapi.responses import PlainTextResponse from inscriptis import get_text -from inscriptis.metadata import __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE +from inscriptis.metadata import __version__ from inscriptis.model.config import ParserConfig app = FastAPI() @@ -22,15 +22,15 @@ @app.get("/") def index(): """Print a short status message for the Web service's base URL.""" - return "Inscriptis text to HTML Web service." 
+ return PlainTextResponse("Inscriptis text to HTML Web service.") @app.post("/get_text", response_class=PlainTextResponse) async def get_text_call(request: Request): """Return the text representation of the given HTML content.""" content_type = request.headers.get("Content-type") - if "; encoding=" in content_type: - encoding = content_type.split("; encoding=")[1] + if "; charset=" in content_type: + encoding = content_type.split("; charset=")[1] else: encoding = "UTF-8" html_content = await request.body() diff --git a/tests/data/annotation-profile-unittest.json b/tests/data/annotation-profile-unittest.json new file mode 100644 index 0000000..48a58ec --- /dev/null +++ b/tests/data/annotation-profile-unittest.json @@ -0,0 +1,7 @@ +{ + "h1": ["heading"], + "h2": ["heading"], + "h3": ["heading"], + "b": ["emphasis"], + "table": ["table"] +} diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..4e4cfc4 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,125 @@ +""" +Tests the Inscriptis CLI client. +""" +from io import StringIO +from pathlib import Path +from json import loads +from unittest.mock import Mock, mock_open, patch, call + +import pytest + +from inscriptis.cli.inscript import cli + +INPUT_DATA = """Hello World!""" + + +def test_cli_read_from_stdin(monkeypatch, capsys): + """Test converting HTML from standard input with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" 
+ + +def test_cli_read_from_stdin_write_to_file(monkeypatch, capsys): + """Test converting HTML from standard input with the command line client and + writing it to a file.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "--output", "test.txt"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + with patch("pathlib.Path.open", create=True) as mock_file: + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "" + # Capture the test written to the mock output file + assert call().__enter__().write("Hello World!") in mock_file.mock_calls + + +def test_cli_read_from_file(monkeypatch, capsys): + """Test converting HTML from a file with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) + monkeypatch.setattr("pathlib.Path.is_file", lambda _: True) + monkeypatch.setattr("pathlib.Path.open", mock_open(read_data=INPUT_DATA)) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" + + +def test_cli_read_from_url(monkeypatch, capsys): + """Test converting HTML from an URL with the command line client.""" + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr("sys.argv", ["inscript", "https://www.fhgr.ch/test.html"]) + + mock_request = Mock() + mock_request.content = INPUT_DATA.encode("utf8") + mock_request.encoding = "utf-8" + monkeypatch.setattr("requests.get", lambda url, timeout=0: mock_request) + cli() + + # Capture the printed output + captured = capsys.readouterr() + assert captured.out.strip() == "Hello World!" 
+ + +def test_cli_annotations(monkeypatch, capsys): + """Test annotation handling in the command line client.""" + # Prepare input data for the test + annotation_rule_path = ( + Path(__file__).parent / "data" / "annotation-profile-unittest.json" + ) + + # Use monkeypatch to replace the 'input' function + monkeypatch.setattr( + "sys.argv", ["inscript", "-p", "surface", "-r", str(annotation_rule_path)] + ) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + cli() + + # Capture the printed json data and convert it to an object + captured = loads(capsys.readouterr().out.strip()) + assert captured["text"].strip() == "Hello World!" + assert captured["label"] == [[6, 11, "emphasis"]] + assert captured["surface"] == [["emphasis", "World"]] + + +def test_help(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "--version"]) + + # the cli should exit with exit code 0 + with pytest.raises(SystemExit) as exit_info: + cli() + assert exit_info.value.code == 0 + + captured = capsys.readouterr().out + assert captured.startswith("Inscript HTML to text conversion") + assert "Inscript comes with ABSOLUTELY NO WARRANTY." 
in captured + + +def test_missing_input_file(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "test.html"]) + with pytest.raises(SystemExit) as exit_info: + cli() + + captured = capsys.readouterr() + assert exit_info.value.code == -1 + assert captured.out.strip().startswith("ERROR: Cannot open input file") + + +def test_missing_annotation_file(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["inscript", "--annotation-rules", "rules.json"]) + monkeypatch.setattr("sys.stdin", StringIO(INPUT_DATA)) + with pytest.raises(SystemExit) as exit_info: + cli() + + captured = capsys.readouterr() + assert exit_info.value.code == -1 + assert captured.out.strip().startswith("ERROR: Cannot open annotation rule file") diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py new file mode 100644 index 0000000..d050e6a --- /dev/null +++ b/tests/test_custom_html_tag_handling.py @@ -0,0 +1,31 @@ +"""Test the custom HTML tag handling.""" +from lxml.html import fromstring + +from inscriptis import Inscriptis, ParserConfig +from inscriptis.model.html_document_state import HtmlDocumentState +from inscriptis.model.tag import CustomHtmlTagHandlerMapping + + +def test_custom_html_handler(): + def my_handle_start_b(state: HtmlDocumentState, _): + """Handle the opening tag.""" + state.tags[-1].write("**") + + def my_handle_end_b(state: HtmlDocumentState): + """Handle the closing tag.""" + state.tags[-1].write("**") + + custom_mapping = CustomHtmlTagHandlerMapping( + start_tag_mapping={"b": my_handle_start_b}, + end_tag_mapping={"b": my_handle_end_b}, + ) + + html_tree = fromstring("Welcome to Chur") + inscriptis = Inscriptis( + html_tree, ParserConfig(custom_html_tag_handler_mapping=custom_mapping) + ) + + # custom HTML Handler + assert inscriptis.get_text().strip() == "Welcome to **Chur**" + # standard HTML handler + assert Inscriptis(html_tree).get_text().strip() == "Welcome to Chur" diff --git 
a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py index e0d8c66..4488ffc 100644 --- a/tests/test_model_html_element_canvas.py +++ b/tests/test_model_html_element_canvas.py @@ -25,7 +25,7 @@ def _get_text(html_element): c.close_tag(html_element) HtmlElement().set_canvas(c).write("last") - c._flush_inline() + c.flush_inline() return "\n".join(c.blocks) diff --git a/tests/test_web_service.py b/tests/test_web_service.py new file mode 100644 index 0000000..282fa75 --- /dev/null +++ b/tests/test_web_service.py @@ -0,0 +1,43 @@ +import pytest +from fastapi.testclient import TestClient +from inscriptis.service.web import app +from inscriptis.metadata import __version__ + + +@pytest.fixture +def client(): + return TestClient(app) + + +def test_index(client): + response = client.get("/") + assert response.status_code == 200 + assert response.text == "Inscriptis text to HTML Web service." + + +def test_get_text_call_with_content_type(client): + html_content = "Ă–sterliche Freuden!" + response = client.post( + "/get_text", + content=html_content, + headers={"Content-type": "text/html; charset=UTF-8"}, + ) + assert response.status_code == 200 + assert response.text == "Ă–sterliche Freuden!" + + +def test_get_text_call_without_content_type(client): + html_content = "Hello World!" + response = client.post( + "/get_text", + content=html_content, + headers={"Content-type": "text/html"}, + ) + assert response.status_code == 200 + assert response.text == "Hello World!" 
+ + +def test_get_version_call(client): + response = client.get("/version") + assert response.status_code == 200 + assert response.text == __version__ diff --git a/tox.ini b/tox.ini index 8dc0683..26f1cce 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,8 @@ envlist = pytest, pyroma, flake8 [testenv:pytest] deps = pytest ~= 7.4.4 pytest-cov ~= 4.1.0 + fastapi ~= 0.109.2 + httpx ~= 0.26.0 commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices @@ -17,7 +19,7 @@ deps = flake8 ~= 7.0.0 dlint ~= 0.14.1 flake8-bandit ~= 4.1.1 flake8-blind-except ~= 0.2.1 - flake8-bugbear ~= 23.12.2 + flake8-bugbear ~= 24.2.6 flake8-builtins ~= 2.2.0 flake8-cognitive-complexity ~= 0.1.0 flake8-colors ~= 0.1.9