Skip to content

Commit

Permalink
Merge branch 'fix/bug-81-custom-html-handling2' of github.com:weblyza…
Browse files Browse the repository at this point in the history
…rd/inscriptis into fix/bug-81-custom-html-handling2
  • Loading branch information
AlbertWeichselbraun committed Mar 5, 2024
2 parents 0261f13 + db012a4 commit 504863d
Show file tree
Hide file tree
Showing 29 changed files with 358 additions and 87 deletions.
20 changes: 12 additions & 8 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -534,20 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
.. code-block:: python
from inscriptis import ParserConfig
from inscriptis.html_engine import Inscriptis
from functools import partial
inscriptis = Inscriptis(html_tree, config)
inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis)
inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis)
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
my_mapping = CustomHtmlTagHandlerMapping(
start_tag_mapping={'a': my_handle_start_a},
end_tag_mapping={'a': my_handle_end_a}
)
inscriptis = Inscriptis(html_tree,
ParserConfig(custom_html_tag_handler_mapping=my_mapping))
text = inscriptis.get_text()
In this example, the standard HTML handlers for the ``a`` tag are overridden with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.
You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
Optimizing memory consumption
-----------------------------
Expand Down
26 changes: 17 additions & 9 deletions examples/custom-html-handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,35 @@
Example:
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
"""
from typing import Dict


from inscriptis import ParserConfig
from inscriptis.html_engine import Inscriptis
from functools import partial
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
from lxml.html import fromstring


def my_handle_start_b(self, attrs):
def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the opening <b> tag."""
self.tags[-1].write("**")
state.tags[-1].write("**")


def my_handle_end_b(self):
def my_handle_end_b(state: HtmlDocumentState) -> None:
"""Handle the closing </b> tag."""
self.tags[-1].write("**")
state.tags[-1].write("**")


MY_MAPPING = CustomHtmlTagHandlerMapping(
start_tag_mapping={"b": my_handle_start_b},
end_tag_mapping={"b": my_handle_end_b},
)


HTML = "Welcome to <b>Chur</b>"

html_tree = fromstring(HTML)
inscriptis = Inscriptis(html_tree)
inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis)
inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis)
inscriptis = Inscriptis(
html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING)
)
print(inscriptis.get_text())
8 changes: 4 additions & 4 deletions src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@
"""

import re
from lxml.html import fromstring, HtmlElement
from lxml.etree import ParserError

from typing import Dict, Optional, Any

from inscriptis.model.config import ParserConfig

from lxml.etree import ParserError
from lxml.html import fromstring, HtmlElement

from inscriptis.html_engine import Inscriptis

RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
Expand Down
6 changes: 3 additions & 3 deletions src/inscriptis/annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The model used for saving annotations."""

from typing import NamedTuple, Tuple
from typing import List
from typing import NamedTuple

from inscriptis.html_properties import HorizontalAlignment

Expand All @@ -25,8 +25,8 @@ class Annotation(NamedTuple):
"""the annotation's start index within the text output."""
end: int
"""the annotation's end index within the text output."""
metadata: Tuple[str]
"""a tuple of tags to be attached to the annotation."""
metadata: str
"""the tag to be attached to the annotation."""


def horizontal_shift(
Expand Down
3 changes: 2 additions & 1 deletion src/inscriptis/annotation/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""
from collections import defaultdict
from copy import copy
from typing import Dict, Tuple, List

from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT

Expand Down Expand Up @@ -85,7 +86,7 @@ def __init__(self, css_profile, model: dict):
self.css = css_profile

@staticmethod
def _parse(model: dict) -> "AnnotationModel":
def _parse(model: dict) -> Tuple[Dict, List]:
"""Compute the AnnotationModel from a model dictionary.
Returns:
Expand Down
13 changes: 6 additions & 7 deletions src/inscriptis/cli/inscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import argparse
import sys
from json import load, dumps
from typing import Optional
from pathlib import Path
from typing import Optional

import requests

from inscriptis import get_text, get_annotated_text
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.model.config import ParserConfig

DEFAULT_ENCODING = "utf8"
Expand Down Expand Up @@ -148,24 +148,23 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
Args:
url: URL to the HTML content, or None if the content is obtained from stdin.
encoding: used encoding.
timeout: timeout in seconds for retrieving the URL.
Returns:
The html_content or None, if no content could be extracted.
"""
if not url:
return sys.stdin.read()
elif Path(url).is_file():
with Path(url).open(
encoding=encoding or DEFAULT_ENCODING, errors="ignore"
) as f:
elif (p := Path(url)).is_file():
with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
return f.read()
elif url.startswith("http://") or url.startswith("https://"):
req = requests.get(url, timeout=timeout)
return req.content.decode(encoding or req.encoding)


def cli():
def cli() -> None:
"""Run the inscript command line client."""
args = parse_command_line()
if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/css_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
preventing cases where two words stick together.
"""

from inscriptis.model.html_element import HtmlElement
from inscriptis.html_properties import Display, WhiteSpace
from inscriptis.model.html_element import HtmlElement

STRICT_CSS_PROFILE = {
"body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),
Expand Down
16 changes: 13 additions & 3 deletions src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# coding:utf-8
"""The HTML Engine is responsible for converting HTML to text."""
from typing import List
from typing import List, Dict, Callable

import lxml.html
from lxml.etree import Comment
Expand Down Expand Up @@ -56,7 +56,9 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
config = config or ParserConfig()

# setup start and end tag call tables
self.start_tag_handler_dict = {
self.start_tag_handler_dict: Dict[
str, Callable[[HtmlDocumentState, Dict], None]
] = {
"table": table_start_handler,
"tr": tr_start_handler,
"td": td_start_handler,
Expand All @@ -68,7 +70,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
"a": a_start_handler if config.parse_a() else None,
"img": img_start_handler if config.display_images else None,
}
self.end_tag_handler_dict = {
self.end_tag_handler_dict: Dict[str, Callable[[HtmlDocumentState], None]] = {
"table": table_end_handler,
"ul": ul_end_handler,
"ol": ol_end_handler,
Expand All @@ -77,6 +79,14 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
"a": a_end_handler if config.parse_a() else None,
}

if config.custom_html_tag_handler_mapping:
self.start_tag_handler_dict.update(
config.custom_html_tag_handler_mapping.start_tag_mapping
)
self.end_tag_handler_dict.update(
config.custom_html_tag_handler_mapping.end_tag_mapping
)

# parse the HTML tree
self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)

Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/model/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def apply_attributes(
self.attribute_mapping[attr_name](attr_value, html_element)
return html_element

def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
attributes = copy(self.attribute_mapping)
for a in annotations:
attributes[a.attr] = (
Expand Down
20 changes: 10 additions & 10 deletions src/inscriptis/model/canvas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from inscriptis.annotation import Annotation
from inscriptis.html_properties import WhiteSpace, Display
from inscriptis.model.canvas.block import Block
from inscriptis.model.html_element import HtmlElement
from inscriptis.model.canvas.prefix import Prefix
from inscriptis.model.html_element import HtmlElement


class Canvas:
Expand Down Expand Up @@ -64,10 +64,10 @@ def open_tag(self, tag: HtmlElement) -> None:
if tag.display == Display.block:
self.open_block(tag)

def open_block(self, tag: HtmlElement):
def open_block(self, tag: HtmlElement) -> None:
"""Open an HTML block element."""
# write missing bullets, if no content has been written
if not self._flush_inline() and tag.list_bullet:
if not self.flush_inline() and tag.list_bullet:
self.write_unconsumed_bullet()
self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet)

Expand All @@ -79,7 +79,7 @@ def open_block(self, tag: HtmlElement):
self.blocks.append("\n" * (required_newlines - 1))
self.margin = required_margin

def write_unconsumed_bullet(self):
def write_unconsumed_bullet(self) -> None:
"""Write unconsumed bullets to the blocks list."""
bullet = self.current_block.prefix.unconsumed_bullet
if bullet:
Expand All @@ -100,7 +100,7 @@ def close_tag(self, tag: HtmlElement) -> None:
"""
if tag.display == Display.block:
# write missing bullets, if no content has been written so far.
if not self._flush_inline() and tag.list_bullet:
if not self.flush_inline() and tag.list_bullet:
self.write_unconsumed_bullet()
self.current_block.prefix.remove_last_prefix()
self.close_block(tag)
Expand All @@ -116,7 +116,7 @@ def close_tag(self, tag: HtmlElement) -> None:
Annotation(start_idx, self.current_block.idx, annotation)
)

def close_block(self, tag: HtmlElement):
def close_block(self, tag: HtmlElement) -> None:
"""Close the given HtmlElement by writing its bottom margin.
Args:
Expand All @@ -128,17 +128,17 @@ def close_block(self, tag: HtmlElement):
self.blocks.append("\n" * (required_newlines - 1))
self.margin = tag.margin_after

def write_newline(self):
if not self._flush_inline():
def write_newline(self) -> None:
if not self.flush_inline():
self.blocks.append("")
self.current_block = self.current_block.new_block()

def get_text(self) -> str:
"""Provide a text representation of the Canvas."""
self._flush_inline()
self.flush_inline()
return "\n".join(self.blocks)

def _flush_inline(self) -> bool:
def flush_inline(self) -> bool:
"""Attempt to flush the content in self.current_block into a new block.
Notes:
Expand Down
9 changes: 8 additions & 1 deletion src/inscriptis/model/canvas/block.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
"""Representation of a text block within the HTML canvas."""
from __future__ import annotations

from html import unescape
from typing import TYPE_CHECKING

from inscriptis.html_properties import WhiteSpace

if TYPE_CHECKING:
from inscriptis.model.canvas import Prefix


class Block:
"""The current block of text.
Expand All @@ -19,7 +26,7 @@ class Block:

__slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")

def __init__(self, idx: int, prefix: str):
def __init__(self, idx: int, prefix: Prefix):
self.idx = idx
self.prefix = prefix
self._content = ""
Expand Down
12 changes: 6 additions & 6 deletions src/inscriptis/model/canvas/prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self):
self.bullets = []
self.consumed = False

def register_prefix(self, padding_inline, bullet):
def register_prefix(self, padding_inline: int, bullet: str) -> None:
"""Register the given prefix.
Args:
Expand All @@ -33,13 +33,13 @@ def register_prefix(self, padding_inline, bullet):
self.paddings.append(padding_inline)
self.bullets.append(bullet if bullet else "")

def remove_last_prefix(self):
def remove_last_prefix(self) -> None:
"""Remove the last prefix from the list."""
with suppress(IndexError):
self.current_padding -= self.paddings.pop()
del self.bullets[-1]

def pop_next_bullet(self):
def pop_next_bullet(self) -> str:
"""Pop the next bullet to use, if any bullet is available."""
next_bullet_idx = (
next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
Expand All @@ -53,7 +53,7 @@ def pop_next_bullet(self):
return bullet

@property
def first(self):
def first(self) -> str:
"""Return the prefix used at the beginning of a tag.
Note::
Expand All @@ -69,7 +69,7 @@ def first(self):
return " " * (self.current_padding - len(bullet)) + bullet

@property
def unconsumed_bullet(self):
def unconsumed_bullet(self) -> str:
"""Yield any yet unconsumed bullet.
Note::
Expand All @@ -87,7 +87,7 @@ def unconsumed_bullet(self):
return " " * (padding - len(bullet)) + bullet

@property
def rest(self):
def rest(self) -> str:
"""Return the prefix used for new lines within a block.
This prefix is used for pre-text that contains newlines. The lines
Expand Down
Loading

0 comments on commit 504863d

Please sign in to comment.