Skip to content

Commit

Permalink
chg: optimized imports and additional type hints.
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlbertWeichselbraun committed Feb 17, 2024
1 parent f138377 commit db012a4
Show file tree
Hide file tree
Showing 21 changed files with 61 additions and 49 deletions.
6 changes: 4 additions & 2 deletions examples/custom-html-handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,21 @@
Example:
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
"""
from typing import Dict

from inscriptis import ParserConfig
from inscriptis.html_engine import Inscriptis
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
from lxml.html import fromstring


def my_handle_start_b(state: HtmlDocumentState, _):
def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the opening <b> tag."""
state.tags[-1].write("**")


def my_handle_end_b(state: HtmlDocumentState):
def my_handle_end_b(state: HtmlDocumentState) -> None:
"""Handle the closing </b> tag."""
state.tags[-1].write("**")

Expand Down
8 changes: 4 additions & 4 deletions src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@
"""

import re
from lxml.html import fromstring, HtmlElement
from lxml.etree import ParserError

from typing import Dict, Optional, Any

from inscriptis.model.config import ParserConfig

from lxml.etree import ParserError
from lxml.html import fromstring, HtmlElement

from inscriptis.html_engine import Inscriptis

RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The model used for saving annotations."""

from typing import NamedTuple
from typing import List
from typing import NamedTuple

from inscriptis.html_properties import HorizontalAlignment

Expand Down
6 changes: 3 additions & 3 deletions src/inscriptis/cli/inscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
import argparse
import sys
from json import load, dumps
from typing import Optional
from pathlib import Path
from typing import Optional

import requests

from inscriptis import get_text, get_annotated_text
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.metadata import __version__, __copyright__, __license__
from inscriptis.model.config import ParserConfig

DEFAULT_ENCODING = "utf8"
Expand Down Expand Up @@ -164,7 +164,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
return req.content.decode(encoding or req.encoding)


def cli():
def cli() -> None:
"""Run the inscript command line client."""
args = parse_command_line()
if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/css_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
preventing cases where two words stick together.
"""

from inscriptis.model.html_element import HtmlElement
from inscriptis.html_properties import Display, WhiteSpace
from inscriptis.model.html_element import HtmlElement

STRICT_CSS_PROFILE = {
"body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/model/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def apply_attributes(
self.attribute_mapping[attr_name](attr_value, html_element)
return html_element

def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None) -> None:
attributes = copy(self.attribute_mapping)
for a in annotations:
attributes[a.attr] = (
Expand Down
10 changes: 5 additions & 5 deletions src/inscriptis/model/canvas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from inscriptis.annotation import Annotation
from inscriptis.html_properties import WhiteSpace, Display
from inscriptis.model.canvas.block import Block
from inscriptis.model.html_element import HtmlElement
from inscriptis.model.canvas.prefix import Prefix
from inscriptis.model.html_element import HtmlElement


class Canvas:
Expand Down Expand Up @@ -64,7 +64,7 @@ def open_tag(self, tag: HtmlElement) -> None:
if tag.display == Display.block:
self.open_block(tag)

def open_block(self, tag: HtmlElement):
def open_block(self, tag: HtmlElement) -> None:
"""Open an HTML block element."""
# write missing bullets, if no content has been written
if not self.flush_inline() and tag.list_bullet:
Expand All @@ -79,7 +79,7 @@ def open_block(self, tag: HtmlElement):
self.blocks.append("\n" * (required_newlines - 1))
self.margin = required_margin

def write_unconsumed_bullet(self):
def write_unconsumed_bullet(self) -> None:
"""Write unconsumed bullets to the blocks list."""
bullet = self.current_block.prefix.unconsumed_bullet
if bullet:
Expand Down Expand Up @@ -116,7 +116,7 @@ def close_tag(self, tag: HtmlElement) -> None:
Annotation(start_idx, self.current_block.idx, annotation)
)

def close_block(self, tag: HtmlElement):
def close_block(self, tag: HtmlElement) -> None:
"""Close the given HtmlElement by writing its bottom margin.
Args:
Expand All @@ -128,7 +128,7 @@ def close_block(self, tag: HtmlElement):
self.blocks.append("\n" * (required_newlines - 1))
self.margin = tag.margin_after

def write_newline(self):
def write_newline(self) -> None:
if not self.flush_inline():
self.blocks.append("")
self.current_block = self.current_block.new_block()
Expand Down
1 change: 1 addition & 0 deletions src/inscriptis/model/canvas/block.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Representation of a text block within the HTML canvas."""
from __future__ import annotations

from html import unescape
from typing import TYPE_CHECKING

Expand Down
12 changes: 6 additions & 6 deletions src/inscriptis/model/canvas/prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self):
self.bullets = []
self.consumed = False

def register_prefix(self, padding_inline, bullet):
def register_prefix(self, padding_inline: int, bullet: str) -> None:
"""Register the given prefix.
Args:
Expand All @@ -33,13 +33,13 @@ def register_prefix(self, padding_inline, bullet):
self.paddings.append(padding_inline)
self.bullets.append(bullet if bullet else "")

def remove_last_prefix(self):
def remove_last_prefix(self) -> None:
"""Remove the last prefix from the list."""
with suppress(IndexError):
self.current_padding -= self.paddings.pop()
del self.bullets[-1]

def pop_next_bullet(self):
def pop_next_bullet(self) -> str:
"""Pop the next bullet to use, if any bullet is available."""
next_bullet_idx = (
next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
Expand All @@ -53,7 +53,7 @@ def pop_next_bullet(self):
return bullet

@property
def first(self):
def first(self) -> str:
"""Return the prefix used at the beginning of a tag.
Note::
Expand All @@ -69,7 +69,7 @@ def first(self):
return " " * (self.current_padding - len(bullet)) + bullet

@property
def unconsumed_bullet(self):
def unconsumed_bullet(self) -> str:
"""Yield any yet unconsumed bullet.
Note::
Expand All @@ -87,7 +87,7 @@ def unconsumed_bullet(self):
return " " * (padding - len(bullet)) + bullet

@property
def rest(self):
def rest(self) -> str:
"""Return the prefix used for new lines within a block.
This prefix is used for pre-text that contains newlines. The lines
Expand Down
3 changes: 2 additions & 1 deletion src/inscriptis/model/config.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#!/usr/bin/env python
"""Provide configuration objects for the Inscriptis HTML to text converter."""
from __future__ import annotations

from copy import deepcopy
from typing import Dict, List

from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.annotation.parser import AnnotationModel
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.attribute import Attribute
from inscriptis.model.html_element import HtmlElement
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
Expand Down
1 change: 1 addition & 0 deletions src/inscriptis/model/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""
from contextlib import suppress
from re import compile as re_compile

from inscriptis.html_properties import (
Display,
WhiteSpace,
Expand Down
8 changes: 4 additions & 4 deletions src/inscriptis/model/html_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ class HtmlElement:

def __init__(
self,
tag="default",
prefix="",
suffix="",
tag: str = "default",
prefix: str = "",
suffix: str = "",
display: Display = Display.inline,
margin_before: int = 0,
margin_after: int = 0,
Expand Down Expand Up @@ -156,7 +156,7 @@ def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement":

return new

def __str__(self):
def __str__(self) -> str:
return (
f"<{self.tag} prefix={self.prefix}, suffix={self.suffix}, "
f"display={self.display}, margin_before={self.margin_before}, "
Expand Down
8 changes: 4 additions & 4 deletions src/inscriptis/model/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
# encoding: utf-8
"""Classes used for representing Tables, TableRows and TableCells."""

from typing import List
from itertools import chain, accumulate
from typing import List

from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment
from inscriptis.annotation import Annotation, horizontal_shift
from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment
from inscriptis.model.canvas import Canvas


Expand Down Expand Up @@ -168,7 +168,7 @@ class TableRow:

__slots__ = ("columns", "cell_separator")

def __init__(self, cell_separator):
def __init__(self, cell_separator: str):
self.columns: List[TableCell] = []
self.cell_separator = cell_separator

Expand Down Expand Up @@ -205,7 +205,7 @@ class Table:

__slots__ = ("rows", "left_margin_len", "cell_separator")

def __init__(self, left_margin_len: int, cell_separator):
def __init__(self, left_margin_len: int, cell_separator: str):
self.rows = []
self.left_margin_len = left_margin_len
self.cell_separator = cell_separator
Expand Down
2 changes: 1 addition & 1 deletion src/inscriptis/model/tag/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""HTML Tag handlers and classes for designing custom HTML tag handlers."""
from __future__ import annotations
from typing import Dict, Callable, NamedTuple

from typing import Dict, Callable, NamedTuple
from typing import TYPE_CHECKING

if TYPE_CHECKING:
Expand Down
5 changes: 3 additions & 2 deletions src/inscriptis/model/tag/a_tag.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Handle the <a> tag."""
from typing import Dict

from inscriptis.model.html_document_state import HtmlDocumentState


def a_start_handler(state: HtmlDocumentState, attrs):
def a_start_handler(state: HtmlDocumentState, attrs: Dict) -> None:
"""Handle the <a> tag."""
state.link_target = ""
if state.config.display_links:
Expand All @@ -15,7 +16,7 @@ def a_start_handler(state: HtmlDocumentState, attrs):
state.tags[-1].write("[")


def a_end_handler(state: HtmlDocumentState):
def a_end_handler(state: HtmlDocumentState) -> None:
"""Handle the </a> tag."""
if state.link_target:
state.tags[-1].write(f"]({state.link_target})")
4 changes: 3 additions & 1 deletion src/inscriptis/model/tag/br_tag.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Handle the <br> tag."""
from typing import Dict

from inscriptis.model.html_document_state import HtmlDocumentState


def br_start_handler(state: HtmlDocumentState, _):
def br_start_handler(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the <br> tag."""
state.tags[-1].canvas.write_newline()
3 changes: 2 additions & 1 deletion src/inscriptis/model/tag/img_tag.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Handle the <img> tag."""
from typing import Dict

from inscriptis.model.html_document_state import HtmlDocumentState


def img_start_handler(state: HtmlDocumentState, attrs):
def img_start_handler(state: HtmlDocumentState, attrs: Dict) -> None:
"""Handle the <img> tag."""
image_text = attrs.get("alt", "") or attrs.get("title", "")
if image_text and not (
Expand Down
11 changes: 6 additions & 5 deletions src/inscriptis/model/tag/list_tag.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Handle the <li>, <ol>, <ul> tags."""
from typing import Dict

from inscriptis.model.html_document_state import HtmlDocumentState

Expand All @@ -11,7 +12,7 @@ def get_bullet(state: HtmlDocumentState) -> str:
return UL_COUNTER[len(state.li_counter) % UL_COUNTER_LEN]


def li_start_handler(state: HtmlDocumentState, _):
def li_start_handler(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the <li> tag."""
bullet = state.li_counter[-1] if state.li_counter else "* "
if isinstance(bullet, int):
Expand All @@ -23,21 +24,21 @@ def li_start_handler(state: HtmlDocumentState, _):
state.tags[-1].write("")


def ul_start_handler(state: HtmlDocumentState, _):
def ul_start_handler(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the <ul> tag."""
state.li_counter.append(get_bullet(state))


def ul_end_handler(state: HtmlDocumentState):
def ul_end_handler(state: HtmlDocumentState) -> None:
"""Handle the </ul> tag."""
state.li_counter.pop()


def ol_start_handler(state: HtmlDocumentState, _):
def ol_start_handler(state: HtmlDocumentState, _: Dict) -> None:
"""Handle the <ol> tag."""
state.li_counter.append(1)


def ol_end_handler(state: HtmlDocumentState):
def ol_end_handler(state: HtmlDocumentState) -> None:
"""Handle the </ol> tag."""
state.li_counter.pop()
Loading

0 comments on commit db012a4

Please sign in to comment.