diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..e768d6d --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index bb76eb2..0e79ee4 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,10 +13,7 @@ name: "CodeQL" on: push: - branches: [ master ] pull_request: - # The branches below must be a subset of the branches above - branches: [ master ] schedule: - cron: '26 5 * * 2' diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7d2aff2..c2cb2d5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -2,18 +2,16 @@ name: build on: push: - branches: [ master ] pull_request: - branches: [ master ] jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: fail-fast: false matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 @@ -21,18 +19,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install build environment run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov codecov - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - python setup.py install - - name: Lint with flake8 + python -m pip install tox setuptools pytest pytest-cov codecov + - name: Build and test with tox. run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=80 --statistics - - name: Test with pytest - run: | - py.test --cov=inscripits ./tests && codecov + tox diff --git a/.gitignore b/.gitignore index 18e246b..53ece72 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ tests/reference.txt *.c docs/paper/*.pdf htmlcov/ +poetry.lock diff --git a/Dockerfile b/Dockerfile index 70e30e5..089e929 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,10 +4,8 @@ FROM python:3.11-slim-bullseye AS builder WORKDIR /inscriptis -COPY requirements.txt . RUN python -m venv .venv && .venv/bin/python -m pip install --upgrade pip -RUN .venv/bin/pip install --no-cache-dir -r requirements.txt && \ - .venv/bin/pip install --no-cache-dir Flask waitress && \ +RUN .venv/bin/pip install --no-cache-dir inscriptis[web-service] && \ find /inscriptis/.venv \( -type d -a -name test -o -name tests \) -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' \+ # @@ -18,10 +16,9 @@ LABEL maintainer="albert@weichselbraun.net" # Note: only copy the src directory, to prevent bloating the image with # irrelevant files from the project directory. -WORKDIR /inscriptis/src +WORKDIR /inscriptis COPY --from=builder /inscriptis /inscriptis -COPY ./src /inscriptis/src ENV PATH="/inscriptis/.venv/bin:$PATH" -CMD ["waitress-serve", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"] +CMD ["uvicorn", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"] EXPOSE 5000 diff --git a/README.rst b/README.rst index a6fac94..87eca1e 100644 --- a/README.rst +++ b/README.rst @@ -131,9 +131,9 @@ the corresponding text representation. Command line parameters ----------------------- -The inscript.py command line client supports the following parameters:: +The inscript command line client supports the following parameters:: - usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION] + usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION] [--table-cell-separator TABLE_CELL_SEPARATOR] [-v] [input] @@ -172,19 +172,19 @@ HTML to text conversion ----------------------- convert the given page to text and output the result to the screen:: - $ inscript.py https://www.fhgr.ch + $ inscript https://www.fhgr.ch convert the file to text and save the output to fhgr.txt:: - $ inscript.py fhgr.html -o fhgr.txt + $ inscript fhgr.html -o fhgr.txt convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt:: - $ inscript.py --indentation strict fhgr.html -o fhgr-layout-optimized.txt + $ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt convert HTML provided via stdin and save the output to output.txt:: - $ echo "
Make it so!
" | inscript.py -o output.txt + $ echo "Make it so!
" | inscript -o output.txt HTML to annotated text conversion @@ -193,7 +193,7 @@ convert and annotate HTML from a Web page using the provided annotation rules. Download the example `annotation-profile.json" + html_content + "") def get_text(html_content: str, config: ParserConfig = None) -> str: @@ -105,12 +105,12 @@ def get_text(html_content: str, config: ParserConfig = None) -> str: The text representation of the HTML content. """ html_tree = _get_html_tree(html_content) - return Inscriptis(html_tree, config).get_text() if html_tree is not None \ - else '' + return Inscriptis(html_tree, config).get_text() if html_tree is not None else "" -def get_annotated_text(html_content: str, - config: ParserConfig = None) -> Dict[str, Any]: +def get_annotated_text( + html_content: str, config: ParserConfig = None +) -> Dict[str, Any]: """Return a dictionary of the extracted text and annotations. Notes: @@ -132,7 +132,5 @@ def get_annotated_text(html_content: str, return {} inscriptis = Inscriptis(html_tree, config) - labels = [(a.start, a.end, a.metadata) - for a in inscriptis.get_annotations()] - return {'text': inscriptis.get_text(), - 'label': labels} + labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()] + return {"text": inscriptis.get_text(), "label": labels} diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index 3d2b626..acf3d09 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -29,9 +29,13 @@ class Annotation(NamedTuple): """a tuple of tags to be attached to the annotation.""" -def horizontal_shift(annotations: List[Annotation], content_width: int, - line_width: int, align: HorizontalAlignment, - shift: int = 0) -> List[Annotation]: +def horizontal_shift( + annotations: List[Annotation], + content_width: int, + line_width: int, + align: HorizontalAlignment, + shift: int = 0, +) -> List[Annotation]: r"""Shift annotations based on the given line's formatting. Adjusts the start and end indices of annotations based on the line's @@ -56,5 +60,6 @@ def horizontal_shift(annotations: List[Annotation], content_width: int, else: h_align = shift + (line_width - content_width) // 2 - return [Annotation(a.start + h_align, a.end + h_align, a.metadata) - for a in annotations] + return [ + Annotation(a.start + h_align, a.end + h_align, a.metadata) for a in annotations + ] diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index 310f935..f7da4a8 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -5,8 +5,7 @@ from inscriptis.annotation.output import AnnotationProcessor -COLOR_SCHEMA = ('#D8115980', '#8F2D5680', '#21838080', - '#FBB13C80', '#73D2DE80') +COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80") class HtmlExtractor(AnnotationProcessor): @@ -21,39 +20,43 @@ class HtmlExtractor(AnnotationProcessor): def __call__(self, annotated_text: Dict[str, Any]) -> str: tag_indices = defaultdict(list) - for start, end, label in sorted(annotated_text['label']): + for start, end, label in sorted(annotated_text["label"]): tag_indices[start].append(label) - tag_indices[end].append('/' + label) + tag_indices[end].append("/" + label) open_tags = [] - tagged_content = ['
'] - for idx, ch in enumerate(annotated_text['text']): + tagged_content = [ + "", + ] + for idx, ch in enumerate(annotated_text["text"]): if idx in tag_indices: tags = tag_indices[idx] # close tags: - for _ in (t for t in sorted(tags, reverse=True) - if t.startswith('/')): + for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")): open_tags.pop() - tagged_content.append('
')
- tagged_content.extend([''.format(tag=tag)
- for tag in open_tags])
+ ''.format(tag=tag)
+ )
+
+ if ch == "\n":
+ tagged_content.extend(["" for _ in open_tags])
+ tagged_content.append("
\n")
+ tagged_content.extend(
+ [''.format(tag=tag) for tag in open_tags]
+ )
else:
tagged_content.append(ch)
- return ''.join(tagged_content) + '
'
+ return "".join(tagged_content) + ""
@staticmethod
def _get_label_colors(labels: List[str]) -> Dict[str, str]:
@@ -68,9 +71,7 @@ def _get_label_colors(labels: List[str]) -> Dict[str, str]:
A mapping between the available labels and the corresponding color
from the COLOR_SCHEMA.
"""
- return {label: color
- for label, color in zip({a[2] for a in sorted(labels)},
- cycle(COLOR_SCHEMA))}
+ return dict(zip({a[2] for a in sorted(labels)}, cycle(COLOR_SCHEMA)))
def _get_css(self, labels: List[str]) -> str:
"""Compute the CSS to be included into the HTML output.
@@ -86,18 +87,18 @@ def _get_css(self, labels: List[str]) -> str:
css = []
for label, color in sorted(self._get_label_colors(labels).items()):
css.append(
- 'pre{{'
- ' position: relative;\n'
- '}}\n'
- '.{label} {{\n'
- ' background-color: {color};\n'
- ' border-radius: 0.4em;\n'
- '}}\n'
- '.{label}-label {{\n'
- ' top: -1.0em;\n'
+ "pre{{"
+ " position: relative;\n"
+ "}}\n"
+ ".{label} {{\n"
+ " background-color: {color};\n"
+ " border-radius: 0.4em;\n"
+ "}}\n"
+ ".{label}-label {{\n"
+ " top: -1.0em;\n"
' content: "{label}";\n'
- ' position: absolute;\n'
- ' background-color: {color};\n'
- ' font-size: 75%; }}\n'.format(label=label,
- color=color))
- return '\n'.join(css)
+ " position: absolute;\n"
+ " background-color: {color};\n"
+ " font-size: 75%; }}\n".format(label=label, color=color)
+ )
+ return "\n".join(css)
diff --git a/src/inscriptis/annotation/output/surface.py b/src/inscriptis/annotation/output/surface.py
index 52472d4..e4e5252 100644
--- a/src/inscriptis/annotation/output/surface.py
+++ b/src/inscriptis/annotation/output/surface.py
@@ -21,7 +21,9 @@ def __call__(self, annotated_text: Dict[str, Any]) -> Dict[str, Any]:
An extended dictionary which contains the extracted surface-forms
of the annotations under the key 'surface'.
"""
- surface_forms = [(label, annotated_text['text'][start:end])
- for start, end, label in annotated_text['label']]
- annotated_text['surface'] = surface_forms
+ surface_forms = [
+ (label, annotated_text["text"][start:end])
+ for start, end, label in annotated_text["label"]
+ ]
+ annotated_text["surface"] = surface_forms
return annotated_text
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
index 9c983f7..c31aa06 100644
--- a/src/inscriptis/annotation/output/xml.py
+++ b/src/inscriptis/annotation/output/xml.py
@@ -22,23 +22,28 @@ def __call__(self, annotated_text: Dict[str, Any]) -> str:
"""
tag_indices = defaultdict(list)
- for start, end, label in sorted(annotated_text['label']):
+ for start, end, label in sorted(annotated_text["label"]):
tag_indices[start].append(label)
- tag_indices[end].append('/' + label)
+ tag_indices[end].append("/" + label)
current_idx = 0
tagged_content = ['\n']
- text = annotated_text['text']
+ text = annotated_text["text"]
for index, tags in sorted(tag_indices.items()):
tagged_content.append(text[current_idx:index])
# close tags
- tagged_content.extend(['<' + tag + '>'
- for tag in sorted(tags, reverse=True)
- if tag.startswith('/')])
+ tagged_content.extend(
+ [
+ "<" + tag + ">"
+ for tag in sorted(tags, reverse=True)
+ if tag.startswith("/")
+ ]
+ )
# open tags
- tagged_content.extend(['<' + tag + '>' for tag in sorted(tags)
- if not tag.startswith('/')])
+ tagged_content.extend(
+ ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
+ )
current_idx = index
tagged_content.append(text[current_idx:])
- return ''.join(tagged_content)
+ return "".join(tagged_content)
diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py
index 500df4f..56bdf61 100644
--- a/src/inscriptis/annotation/parser.py
+++ b/src/inscriptis/annotation/parser.py
@@ -34,10 +34,15 @@ class ApplyAnnotation:
match_value.
"""
- __slots__ = ('annotations', 'match_tag', 'match_value', 'attr', 'matcher')
-
- def __init__(self, annotations: tuple, attr: str, match_tag: str = None,
- match_value: str = None):
+ __slots__ = ("annotations", "match_tag", "match_value", "attr", "matcher")
+
+ def __init__(
+ self,
+ annotations: tuple,
+ attr: str,
+ match_tag: str = None,
+ match_value: str = None,
+ ):
self.annotations = tuple(annotations)
self.attr = attr
self.match_tag = match_tag
@@ -46,17 +51,18 @@ def __init__(self, annotations: tuple, attr: str, match_tag: str = None,
def apply(self, attr_value: str, html_element: HtmlElement):
"""Apply the annotation to HtmlElements with matching tags."""
if (self.match_tag and self.match_tag != html_element.tag) or (
- self.match_value and self.match_value
- not in attr_value.split()):
+ self.match_value and self.match_value not in attr_value.split()
+ ):
return
html_element.annotation += self.annotations
def __str__(self):
- return '- 'pre': HtmlElement(display=Display.block, - whitespace=WhiteSpace.pre), - 'xmp': HtmlElement(display=Display.block, - whitespace=WhiteSpace.pre), - 'listing': HtmlElement(display=Display.block, - whitespace=WhiteSpace.pre), - 'plaintext': HtmlElement(display=Display.block, - whitespace=WhiteSpace.pre), + "pre": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), + "xmp": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), + "listing": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), + "plaintext": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), } RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy() -RELAXED_CSS_PROFILE['div'] = HtmlElement(display=Display.block, - padding_inline=2) -RELAXED_CSS_PROFILE['span'] = HtmlElement(display=Display.inline, - prefix=' ', suffix=' ', - limit_whitespace_affixes=True) +RELAXED_CSS_PROFILE["div"] = HtmlElement(display=Display.block, padding_inline=2) +RELAXED_CSS_PROFILE["span"] = HtmlElement( + display=Display.inline, prefix=" ", suffix=" ", limit_whitespace_affixes=True +) -CSS_PROFILES = {'strict': STRICT_CSS_PROFILE, - 'relaxed': RELAXED_CSS_PROFILE} +CSS_PROFILES = {"strict": STRICT_CSS_PROFILE, "relaxed": RELAXED_CSS_PROFILE} diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 85664b7..35496fb 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -35,34 +35,33 @@ class Inscriptis: text = parser.get_text() """ - UL_COUNTER = ('* ', '+ ', 'o ', '- ') + UL_COUNTER = ("* ", "+ ", "o ", "- ") UL_COUNTER_LEN = len(UL_COUNTER) - def __init__(self, html_tree: lxml.html.HtmlElement, - config: ParserConfig = None): + def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None): # use the default configuration, if no config object is provided self.config = config or ParserConfig() # setup start and end tag call tables self.start_tag_handler_dict = { - 'table': self._start_table, - 'tr': self._start_tr, - 'td': self._start_td, - 'th': self._start_td, - 'ul': self._start_ul, - 'ol': self._start_ol, - 'li': self._start_li, - 'br': self._newline, - 'a': self._start_a if self.config.parse_a() else None, - 'img': self._start_img if self.config.display_images else None, + "table": self._start_table, + "tr": self._start_tr, + "td": self._start_td, + "th": self._start_td, + "ul": self._start_ul, + "ol": self._start_ol, + "li": self._start_li, + "br": self._newline, + "a": self._start_a if self.config.parse_a() else None, + "img": self._start_img if self.config.display_images else None, } self.end_tag_handler_dict = { - 'table': self._end_table, - 'ul': self._end_ul, - 'ol': self._end_ol, - 'td': self._end_td, - 'th': self._end_td, - 'a': self._end_a if self.config.parse_a() else None, + "table": self._end_table, + "ul": self._end_ul, + "ol": self._end_ol, + "td": self._end_td, + "th": self._end_td, + "a": self._end_a if self.config.parse_a() else None, } # instance variables @@ -70,13 +69,13 @@ def __init__(self, html_tree: lxml.html.HtmlElement, self.css = self.config.css self.apply_attributes = self.config.attribute_handler.apply_attributes - self.tags = [self.css['body'].set_canvas(self.canvas)] + self.tags = [self.css["body"].set_canvas(self.canvas)] self.current_table = [] self.li_counter = [] self.last_caption = None # used if display_links is enabled - self.link_target = '' + self.link_target = "" # crawl the html tree self._parse_html_tree(html_tree) @@ -133,11 +132,16 @@ def handle_starttag(self, tag, attrs): """ # use the css to handle tags known to it :) cur = self.tags[-1].get_refined_html_element( - self.apply_attributes(attrs, html_element=self.css.get( - tag, DEFAULT_HTML_ELEMENT).__copy__().set_tag(tag))) + self.apply_attributes( + attrs, + html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT) + .__copy__() + .set_tag(tag), + ) + ) self.tags.append(cur) - handler = self.start_tag_handler_dict.get(tag, None) + handler = self.start_tag_handler_dict.get(tag) if handler: handler(attrs) @@ -150,7 +154,7 @@ def handle_endtag(self, tag): Args: tag: the HTML end tag to process. """ - handler = self.end_tag_handler_dict.get(tag, None) + handler = self.end_tag_handler_dict.get(tag) if handler: handler() @@ -161,25 +165,26 @@ def _end_ul(self): self.li_counter.pop() def _start_img(self, attrs): - image_text = attrs.get('alt', '') or attrs.get('title', '') - if image_text and not (self.config.deduplicate_captions - and image_text == self.last_caption): - self.tags[-1].write('[{0}]'.format(image_text)) + image_text = attrs.get("alt", "") or attrs.get("title", "") + if image_text and not ( + self.config.deduplicate_captions and image_text == self.last_caption + ): + self.tags[-1].write(f"[{image_text}]") self.last_caption = image_text def _start_a(self, attrs): - self.link_target = '' + self.link_target = "" if self.config.display_links: - self.link_target = attrs.get('href', '') + self.link_target = attrs.get("href", "") if self.config.display_anchors: - self.link_target = self.link_target or attrs.get('name', '') + self.link_target = self.link_target or attrs.get("name", "") if self.link_target: - self.tags[-1].write('[') + self.tags[-1].write("[") def _end_a(self): if self.link_target: - self.tags[-1].write(']({0})'.format(self.link_target)) + self.tags[-1].write(f"]({self.link_target})") def _start_ol(self, _): self.li_counter.append(1) @@ -188,20 +193,23 @@ def _end_ol(self): self.li_counter.pop() def _start_li(self, _): - bullet = self.li_counter[-1] if self.li_counter else '* ' + bullet = self.li_counter[-1] if self.li_counter else "* " if isinstance(bullet, int): self.li_counter[-1] += 1 - self.tags[-1].list_bullet = '{0}. '.format(bullet) + self.tags[-1].list_bullet = f"{bullet}. " else: self.tags[-1].list_bullet = bullet - self.tags[-1].write('') + self.tags[-1].write("") def _start_table(self, _): self.tags[-1].set_canvas(Canvas()) - self.current_table.append(Table( - left_margin_len=self.tags[-1].canvas.left_margin, - cell_separator=self.config.table_cell_separator)) + self.current_table.append( + Table( + left_margin_len=self.tags[-1].canvas.left_margin, + cell_separator=self.config.table_cell_separator, + ) + ) def _start_tr(self, _): if self.current_table: @@ -210,8 +218,9 @@ def _start_tr(self, _): def _start_td(self, _): if self.current_table: # open td tag - table_cell = TableCell(align=self.tags[-1].align, - valign=self.tags[-1].valign) + table_cell = TableCell( + align=self.tags[-1].align, valign=self.tags[-1].valign + ) self.tags[-1].canvas = table_cell self.current_table[-1].add_cell(table_cell) @@ -239,17 +248,18 @@ def _end_table(self): if self.tags[-1].annotation: end_idx = self.tags[-2].canvas.current_block.idx for a in self.tags[-1].annotation: - self.tags[-2].canvas.annotations.append(Annotation( - start_idx, end_idx, a)) + self.tags[-2].canvas.annotations.append( + Annotation(start_idx, end_idx, a) + ) # transfer in-table annotations self.tags[-2].canvas.annotations.extend( - table.get_annotations(start_idx, self.tags[-2].canvas.left_margin)) + table.get_annotations(start_idx, self.tags[-2].canvas.left_margin) + ) def _newline(self, _): self.tags[-1].canvas.write_newline() def get_bullet(self) -> str: """Return the bullet that correspond to the given index.""" - return Inscriptis.UL_COUNTER[ - len(self.li_counter) % Inscriptis.UL_COUNTER_LEN] + return Inscriptis.UL_COUNTER[len(self.li_counter) % Inscriptis.UL_COUNTER_LEN] diff --git a/src/inscriptis/html_properties.py b/src/inscriptis/html_properties.py index b1d24ea..4dc9dea 100644 --- a/src/inscriptis/html_properties.py +++ b/src/inscriptis/html_properties.py @@ -39,11 +39,11 @@ class WhiteSpace(Enum): class HorizontalAlignment(Enum): """Specify the content's horizontal alignment.""" - left = '<' + left = "<" """Left alignment of the block's content.""" - right = '>' + right = ">" """Right alignment of the block's content.""" - center = '^' + center = "^" """Center the block's content.""" diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index ff06c8d..c86c482 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -1,7 +1,14 @@ """Inscriptis metadata information.""" -__author__ = 'Albert Weichselbraun, Fabian Odoni' -__author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch' -__copyright__ = '2016-2023 Albert Weichselbraun, Fabian Odoni' -__license__ = 'Apache 2.0' -__version__ = '2.3.2' +import importlib.metadata as metadata + +PACKAGE = "inscriptis" + +__author__ = "Albert Weichselbraun, Fabian Odoni" +__author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch" +__copyright__ = ( + f"{metadata.metadata(PACKAGE)['Name']} " + + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2023 {__author__}" +) +__license__ = metadata.metadata(PACKAGE)["License"] +__version__ = metadata.metadata(PACKAGE)["Version"] diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py index 0102e3f..d8cf3f6 100644 --- a/src/inscriptis/model/attribute.py +++ b/src/inscriptis/model/attribute.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # encoding: utf-8 """HTML attribute handling.""" @@ -10,9 +10,9 @@ from inscriptis.model.html_element import HtmlElement DEFAULT_ATTRIBUTE_MAP = { - 'style': CssParse.attr_style, - 'align': CssParse.attr_horizontal_align, - 'valign': CssParse.attr_vertical_align + "style": CssParse.attr_style, + "align": CssParse.attr_horizontal_align, + "valign": CssParse.attr_vertical_align, } @@ -26,9 +26,11 @@ def merge_function(func1, func2): func1: the first function func2: the second function """ + def merged(*args): func1(*args) func2(*args) + return merged @@ -46,16 +48,20 @@ class Attribute: def __init__(self): self.attribute_mapping = DEFAULT_ATTRIBUTE_MAP - def apply_attributes(self, attributes: Dict[str, str], - html_element: HtmlElement) -> HtmlElement: + def apply_attributes( + self, attributes: Dict[str, str], html_element: HtmlElement + ) -> HtmlElement: """Apply the attributes to the given HTML element. Args: attributes: the list of attributes html_element: the HTML element for which the attributes are parsed """ - supported_attributes = ((name, val) for name, val in attributes.items() - if name in self.attribute_mapping) + supported_attributes = ( + (name, val) + for name, val in attributes.items() + if name in self.attribute_mapping + ) for attr_name, attr_value in supported_attributes: self.attribute_mapping[attr_name](attr_value, html_element) return html_element @@ -63,6 +69,9 @@ def apply_attributes(self, attributes: Dict[str, str], def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None): attributes = copy(self.attribute_mapping) for a in annotations: - attributes[a.attr] = a.apply if a.attr not in attributes \ + attributes[a.attr] = ( + a.apply + if a.attr not in attributes else merge_function(attributes[a.attr], a.apply) + ) self.attribute_mapping = attributes diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py index ef41254..7cf5ca4 100644 --- a/src/inscriptis/model/canvas/__init__.py +++ b/src/inscriptis/model/canvas/__init__.py @@ -37,8 +37,13 @@ class Canvas: _open_annotations: a map of open tags that contain annotations. """ - __slots__ = ('annotations', 'blocks', 'current_block', '_open_annotations', - 'margin') + __slots__ = ( + "annotations", + "blocks", + "current_block", + "_open_annotations", + "margin", + ) def __init__(self): self.margin = 1000 # margin to the previous block @@ -64,15 +69,14 @@ def open_block(self, tag: HtmlElement): # write missing bullets, if no content has been written if not self._flush_inline() and tag.list_bullet: self.write_unconsumed_bullet() - self.current_block.prefix.register_prefix(tag.padding_inline, - tag.list_bullet) + self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet) # write the block margin required_margin = max(tag.previous_margin_after, tag.margin_before) if required_margin > self.margin: required_newlines = required_margin - self.margin self.current_block.idx += required_newlines - self.blocks.append('\n' * (required_newlines - 1)) + self.blocks.append("\n" * (required_newlines - 1)) self.margin = required_margin def write_unconsumed_bullet(self): @@ -84,8 +88,7 @@ def write_unconsumed_bullet(self): self.current_block = self.current_block.new_block() self.margin = 0 - def write(self, tag: HtmlElement, text: str, - whitespace: WhiteSpace = None) -> None: + def write(self, tag: HtmlElement, text: str, whitespace: WhiteSpace = None) -> None: """Write the given text to the current block.""" self.current_block.merge(text, whitespace or tag.whitespace) @@ -110,7 +113,8 @@ def close_tag(self, tag: HtmlElement) -> None: for annotation in tag.annotation: self.annotations.append( - Annotation(start_idx, self.current_block.idx, annotation)) + Annotation(start_idx, self.current_block.idx, annotation) + ) def close_block(self, tag: HtmlElement): """Close the given HtmlElement by writing its bottom margin. @@ -121,18 +125,18 @@ def close_block(self, tag: HtmlElement): if tag.margin_after > self.margin: required_newlines = tag.margin_after - self.margin self.current_block.idx += required_newlines - self.blocks.append('\n' * (required_newlines - 1)) + self.blocks.append("\n" * (required_newlines - 1)) self.margin = tag.margin_after def write_newline(self): if not self._flush_inline(): - self.blocks.append('') + self.blocks.append("") self.current_block = self.current_block.new_block() def get_text(self) -> str: """Provide a text representation of the Canvas.""" self._flush_inline() - return '\n'.join(self.blocks) + return "\n".join(self.blocks) def _flush_inline(self) -> bool: """Attempt to flush the content in self.current_block into a new block. diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 23c6906..6dc1361 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -17,12 +17,12 @@ class Block: prefix: prefix used within the current block. """ - __slots__ = ('idx', 'prefix', '_content', 'collapsable_whitespace') + __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace") def __init__(self, idx: int, prefix: str): self.idx = idx self.prefix = prefix - self._content = '' + self._content = "" self.collapsable_whitespace = True def merge(self, text: str, whitespace: WhiteSpace) -> None: @@ -42,6 +42,10 @@ def merge_normal_text(self, text: str) -> None: Args: text: the text to merge + + Note: + If the previous text ended with a whitespace and text starts with one, both + will automatically collapse into a single whitespace. """ normalized_text = [] @@ -50,12 +54,15 @@ def merge_normal_text(self, text: str) -> None: normalized_text.append(ch) self.collapsable_whitespace = False elif not self.collapsable_whitespace: - normalized_text.append(' ') + normalized_text.append(" ") self.collapsable_whitespace = True if normalized_text: - text = ''.join((self.prefix.first, *normalized_text)) if not \ - self._content else ''.join(normalized_text) + text = ( + "".join((self.prefix.first, *normalized_text)) + if not self._content + else "".join(normalized_text) + ) text = unescape(text) self._content += text self.idx += len(text) @@ -66,8 +73,7 @@ def merge_pre_text(self, text: str) -> None: Args: text: the text to merge """ - text = ''.join((self.prefix.first, - text.replace('\n', '\n' + self.prefix.rest))) + text = "".join((self.prefix.first, text.replace("\n", "\n" + self.prefix.rest))) text = unescape(text) self._content += text self.idx += len(text) @@ -81,12 +87,12 @@ def content(self): if not self.collapsable_whitespace: return self._content - if self._content.endswith(' '): + if self._content.endswith(" "): self._content = self._content[:-1] self.idx -= 1 return self._content - def new_block(self) -> 'Block': + def new_block(self) -> "Block": """Return a new Block based on the current one.""" self.prefix.consumed = False return Block(idx=self.idx + 1, prefix=self.prefix) diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py index ca0b768..8a68066 100644 --- a/src/inscriptis/model/canvas/prefix.py +++ b/src/inscriptis/model/canvas/prefix.py @@ -14,7 +14,7 @@ class Prefix: consumed: whether the current bullet has already been consumed. """ - __slots__ = ('current_padding', 'paddings', 'bullets', 'consumed') + __slots__ = ("current_padding", "paddings", "bullets", "consumed") def __init__(self): self.current_padding = 0 @@ -31,7 +31,7 @@ def register_prefix(self, padding_inline, bullet): """ self.current_padding += padding_inline self.paddings.append(padding_inline) - self.bullets.append(bullet if bullet else '') + self.bullets.append(bullet if bullet else "") def remove_last_prefix(self): """Remove the last prefix from the list.""" @@ -41,15 +41,15 @@ def remove_last_prefix(self): def pop_next_bullet(self): """Pop the next bullet to use, if any bullet is available.""" - next_bullet_idx = next((-idx for idx, val - in enumerate(reversed(self.bullets)) - if val), 1) - 1 + next_bullet_idx = ( + next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1 + ) if not next_bullet_idx: - return '' + return "" bullet = self.bullets[next_bullet_idx] - self.bullets[next_bullet_idx] = '' + self.bullets[next_bullet_idx] = "" return bullet @property @@ -62,12 +62,11 @@ def first(self): further prefixes should be used for a line. """ if self.consumed: - return '' + return "" self.consumed = True bullet = self.pop_next_bullet() - return ' ' * (self.current_padding - len(bullet)) \ - + bullet + return " " * (self.current_padding - len(bullet)) + bullet @property def unconsumed_bullet(self): @@ -78,15 +77,14 @@ def unconsumed_bullet(self): not been consumed yet. """ if self.consumed: - return '' + return "" bullet = self.pop_next_bullet() if not bullet: - return '' + return "" padding = self.current_padding - self.paddings[-1] - return ' ' * (padding - len(bullet)) \ - + bullet + return " " * (padding - len(bullet)) + bullet @property def rest(self): @@ -96,4 +94,4 @@ def rest(self): need to be prefixed with the right padding to preserver the indentation. """ - return ' ' * self.current_padding + return " " * self.current_padding diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index 9bc216d..0aaeb7a 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -9,19 +9,22 @@ from inscriptis.model.attribute import Attribute from inscriptis.model.html_element import HtmlElement -DEFAULT_CSS_PROFILE_NAME = 'relaxed' +DEFAULT_CSS_PROFILE_NAME = "relaxed" class ParserConfig: """Encapsulate configuration options and CSS definitions.""" - def __init__(self, css: Dict[str, HtmlElement] = None, - display_images: bool = False, - deduplicate_captions: bool = False, - display_links: bool = False, - display_anchors: bool = False, - annotation_rules: Attribute = None, - table_cell_separator: str = ' '): + def __init__( + self, + css: Dict[str, HtmlElement] = None, + display_images: bool = False, + deduplicate_captions: bool = False, + display_links: bool = False, + display_anchors: bool = False, + annotation_rules: Attribute = None, + table_cell_separator: str = " ", + ): """Create a ParserConfig configuration. Args: @@ -47,13 +50,11 @@ def __init__(self, css: Dict[str, HtmlElement] = None, if annotation_rules: # ensure that we do not modify the original model or its # members. - annotation_model = AnnotationModel(deepcopy(self.css), - annotation_rules) + annotation_model = AnnotationModel(deepcopy(self.css), annotation_rules) # css with annotation support self.css = annotation_model.css # attribute handler with annotation support - self.attribute_handler.merge_attribute_map( - annotation_model.css_attr) + self.attribute_handler.merge_attribute_map(annotation_model.css_attr) def parse_a(self) -> bool: """Indicate whether the text output should contain links or anchors. diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py index 1610055..d9efa44 100644 --- a/src/inscriptis/model/css.py +++ b/src/inscriptis/model/css.py @@ -7,8 +7,12 @@ """ from contextlib import suppress from re import compile as re_compile -from inscriptis.html_properties import (Display, WhiteSpace, - HorizontalAlignment, VerticalAlignment) +from inscriptis.html_properties import ( + Display, + WhiteSpace, + HorizontalAlignment, + VerticalAlignment, +) from inscriptis.model.html_element import HtmlElement @@ -20,7 +24,7 @@ class CssParse: """ # used to separate value and unit from each other - RE_UNIT = re_compile(r'(-?[0-9.]+)(\w+)') + RE_UNIT = re_compile(r"(-?[0-9.]+)(\w+)") @staticmethod def attr_style(style_attribute: str, html_element: HtmlElement): @@ -31,15 +35,15 @@ def attr_style(style_attribute: str, html_element: HtmlElement): Example: display: none html_element: The HtmlElement to which the given style is applied. """ - for style_directive in style_attribute.lower().split(';'): - if ':' not in style_directive: + for style_directive in style_attribute.lower().split(";"): + if ":" not in style_directive: continue - key, value = (s.strip() for s in style_directive.split(':', 1)) + key, value = (s.strip() for s in style_directive.split(":", 1)) try: - apply_style = getattr(CssParse, 'attr_' - + key.replace('-webkit-', '') - .replace('-', '_')) + apply_style = getattr( + CssParse, "attr_" + key.replace("-webkit-", "").replace("-", "_") + ) apply_style(value, html_element) except AttributeError: pass @@ -61,7 +65,7 @@ def _get_em(length: str) -> int: value = float(_m.group(1)) unit = _m.group(2) - if unit not in ('em', 'qem', 'rem'): + if unit not in ("em", "qem", "rem"): return int(round(value / 8)) return int(round(value)) @@ -75,9 +79,9 @@ def attr_display(value: str, html_element: HtmlElement): if html_element.display == Display.none: return - if value == 'block': + if value == "block": html_element.display = Display.block - elif value == 'none': + elif value == "none": html_element.display = Display.none else: html_element.display = Display.inline @@ -85,9 +89,9 @@ def attr_display(value: str, html_element: HtmlElement): @staticmethod def attr_white_space(value: str, html_element: HtmlElement): """Apply the given white-space value.""" - if value in ('normal', 'nowrap'): + if value in ("normal", "nowrap"): html_element.whitespace = WhiteSpace.normal - elif value in ('pre', 'pre-line', 'pre-wrap'): + elif value in ("pre", "pre-line", "pre-wrap"): html_element.whitespace = WhiteSpace.pre @staticmethod diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 3ea95fe..91e9585 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -1,8 +1,12 @@ """Data structures for handling HTML Elements.""" from typing import Tuple -from inscriptis.html_properties import Display, HorizontalAlignment, \ - VerticalAlignment, WhiteSpace +from inscriptis.html_properties import ( + Display, + HorizontalAlignment, + VerticalAlignment, + WhiteSpace, +) class HtmlElement: @@ -28,22 +32,40 @@ class HtmlElement: - annotation: annotations associated with the HtmlElement. """ - __slots__ = ('canvas', 'tag', 'prefix', 'suffix', 'display', - 'margin_before', 'margin_after', 'padding_inline', - 'list_bullet', 'whitespace', 'limit_whitespace_affixes', - 'align', 'valign', 'previous_margin_after', 'annotation') - - def __init__(self, tag='default', prefix='', suffix='', - display: Display = Display.inline, - margin_before: int = 0, - margin_after: int = 0, - padding_inline: int = 0, - list_bullet: str = '', - whitespace: WhiteSpace = None, - limit_whitespace_affixes: bool = False, - align: HorizontalAlignment = HorizontalAlignment.left, - valign: VerticalAlignment = VerticalAlignment.middle, - annotation: Tuple[str] = ()): + __slots__ = ( + "canvas", + "tag", + "prefix", + "suffix", + "display", + "margin_before", + "margin_after", + "padding_inline", + "list_bullet", + "whitespace", + "limit_whitespace_affixes", + "align", + "valign", + "previous_margin_after", + "annotation", + ) + + def __init__( + self, + tag="default", + prefix="", + suffix="", + display: Display = Display.inline, + margin_before: int = 0, + margin_after: int = 0, + padding_inline: int = 0, + list_bullet: str = "", + whitespace: WhiteSpace = None, + limit_whitespace_affixes: bool = False, + align: HorizontalAlignment = HorizontalAlignment.left, + valign: VerticalAlignment = VerticalAlignment.middle, + annotation: Tuple[str] = (), + ): self.canvas = None self.tag = tag self.prefix = prefix @@ -60,7 +82,7 @@ def __init__(self, tag='default', prefix='', suffix='', self.previous_margin_after = 0 self.annotation = annotation - def __copy__(self) -> 'HtmlElement': + def __copy__(self) -> "HtmlElement": """Performance-optimized copy implementation.""" copy = self.__class__.__new__(self.__class__) for attr in self.__slots__: @@ -71,14 +93,13 @@ def write(self, text: str): """Write the given HTML text to the element's canvas.""" if not text or self.display == Display.none: return - self.canvas.write(self, ''.join( - (self.prefix, text, self.suffix))) + self.canvas.write(self, "".join((self.prefix, text, self.suffix))) - def set_canvas(self, canvas) -> 'HtmlElement': + def set_canvas(self, canvas) -> "HtmlElement": self.canvas = canvas return self - def set_tag(self, tag: str) -> 'HtmlElement': + def set_tag(self, tag: str) -> "HtmlElement": self.tag = tag return self @@ -99,7 +120,7 @@ def write_verbatim_text(self, text: str): if self.display == Display.block: self.canvas.close_block(self) - def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement': + def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement": """Compute the new HTML element based on the previous one. Adaptations: @@ -124,12 +145,11 @@ def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement': # do not display whitespace only affixes in Whitespace.pre areas # if `limit_whitespace_affixes` is set. - if (new.limit_whitespace_affixes - and self.whitespace == WhiteSpace.pre): + if new.limit_whitespace_affixes and self.whitespace == WhiteSpace.pre: if new.prefix.isspace(): - new.prefix = '' + new.prefix = "" if new.suffix.isspace(): - new.suffix = '' + new.suffix = "" if new.display == Display.block and self.display == Display.block: new.previous_margin_after = self.margin_after @@ -138,14 +158,14 @@ def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement': def __str__(self): return ( - '<{self.tag} prefix={self.prefix}, suffix={self.suffix}, ' - 'display={self.display}, margin_before={self.margin_before}, ' - 'margin_after={self.margin_after}, ' - 'padding_inline={self.padding_inline}, ' - 'list_bullet={self.list_bullet}, ' - 'whitespace={self.whitespace}, align={self.align}, ' - 'valign={self.valign}, annotation={self.annotation}>' - ).format(self=self) + f"<{self.tag} prefix={self.prefix}, suffix={self.suffix}, " + f"display={self.display}, margin_before={self.margin_before}, " + f"margin_after={self.margin_after}, " + f"padding_inline={self.padding_inline}, " + f"list_bullet={self.list_bullet}, " + f"whitespace={self.whitespace}, align={self.align}, " + f"valign={self.valign}, annotation={self.annotation}>" + ) __repr__ = __str__ diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 559aa79..75a2cd3 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -20,9 +20,19 @@ class TableCell(Canvas): vertical formatting rules. """ - __slots__ = ('annotations', 'block_annotations', 'blocks', 'current_block', - 'margin', 'annotation_counter', 'align', 'valign', '_width', - 'line_width', 'vertical_padding') + __slots__ = ( + "annotations", + "block_annotations", + "blocks", + "current_block", + "margin", + "annotation_counter", + "align", + "valign", + "_width", + "line_width", + "vertical_padding", + ) def __init__(self, align: HorizontalAlignment, valign: VerticalAlignment): super().__init__() @@ -39,13 +49,13 @@ def normalize_blocks(self) -> int: The height of the normalized cell. """ self._flush_inline() - self.blocks = list(chain(*(line.split('\n') for line in self.blocks))) + self.blocks = list(chain(*(line.split("\n") for line in self.blocks))) if not self.blocks: - self.blocks = [''] + self.blocks = [""] return len(self.blocks) @property - def height(self): + def height(self) -> int: """Compute the table cell's height. Returns: @@ -54,7 +64,7 @@ def height(self): return max(1, len(self.blocks)) @property - def width(self): + def width(self) -> int: """Compute the table cell's width. Returns: @@ -62,8 +72,9 @@ def width(self): """ if self._width: return self._width - return max((len(line) for line in chain(*(block.split('\n') - for block in self.blocks)))) + return max( + (len(line) for line in chain(*(block.split("\n") for block in self.blocks))) + ) @width.setter def width(self, width): @@ -77,8 +88,7 @@ def width(self, width): # record new width and start reformatting self._width = width - format_spec = '{{:{align}{width}}}'.format(align=self.align.value, - width=width) + format_spec = "{{:{align}{width}}}".format(align=self.align.value, width=width) self.blocks = [format_spec.format(b) for b in self.blocks] @height.setter @@ -91,14 +101,17 @@ def height(self, height: int): """ rows = len(self.blocks) if rows < height: - empty_line = [''] + empty_line = [""] if self.valign == VerticalAlignment.bottom: - self.vertical_padding = (height - rows) + self.vertical_padding = height - rows self.blocks = self.vertical_padding * empty_line + self.blocks elif self.valign == VerticalAlignment.middle: self.vertical_padding = (height - rows) // 2 - self.blocks = self.vertical_padding * empty_line + \ - self.blocks + ((height - rows + 1) // 2 * empty_line) + self.blocks = ( + self.vertical_padding * empty_line + + self.blocks + + ((height - rows + 1) // 2 * empty_line) + ) else: self.blocks = self.blocks + ((height - rows) * empty_line) @@ -116,9 +129,9 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]: # the easy case - the cell has only one line :) if len(self.blocks) == 1: self.line_width[0] = self.width - return horizontal_shift(self.annotations, - self.line_width[0], - self.width, self.align, idx) + return horizontal_shift( + self.annotations, self.line_width[0], self.width, self.align, idx + ) # the more challenging one - multiple cell lines line_break_pos = list(accumulate(self.line_width)) @@ -127,17 +140,19 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]: # assign annotations to the corresponding line for a in self.annotations: for no, line_break in enumerate(line_break_pos): - if a.start <= (line_break + no): # consider newline + if a.start <= (line_break + no): # consider newline annotation_lines[no + self.vertical_padding].append(a) break # compute the annotation index based on its line and delta :) result = [] - idx += self.vertical_padding # newlines introduced by the padding - for line_annotations, line_len in zip(annotation_lines, - self.line_width): - result.extend(horizontal_shift(line_annotations, line_len, - self.width, self.align, idx)) + idx += self.vertical_padding # newlines introduced by the padding + for line_annotations, line_len in zip(annotation_lines, self.line_width): + result.extend( + horizontal_shift( + line_annotations, line_len, self.width, self.align, idx + ) + ) idx += row_width - line_len self.line_width = [self.width for _ in self.line_width] return result @@ -151,7 +166,7 @@ class TableRow: cell_separator: string used for separating columns from each other. """ - __slots__ = ('columns', 'cell_separator') + __slots__ = ("columns", "cell_separator") def __init__(self, cell_separator): self.columns: List[TableCell] = [] @@ -162,19 +177,21 @@ def __len__(self): def get_text(self) -> str: """Return a text representation of the TableRow.""" - row_lines = [self.cell_separator.join(line) - for line in zip(*[column.blocks - for column in self.columns])] - return '\n'.join(row_lines) + row_lines = [ + self.cell_separator.join(line) + for line in zip(*[column.blocks for column in self.columns]) + ] + return "\n".join(row_lines) @property - def width(self): + def width(self) -> int: """Compute and return the width of the current row.""" if not self.columns: return 0 - return sum((cell.width for cell in self.columns)) + len( - self.cell_separator) * (len(self.columns) - 1) + return sum((cell.width for cell in self.columns)) + len(self.cell_separator) * ( + len(self.columns) - 1 + ) class Table: @@ -186,7 +203,7 @@ class Table: cell_separator: string used for separating cells from each other. """ - __slots__ = ('rows', 'left_margin_len', 'cell_separator') + __slots__ = ("rows", "left_margin_len", "cell_separator") def __init__(self, left_margin_len: int, cell_separator): self.rows = [] @@ -210,9 +227,11 @@ def add_cell(self, table_cell: TableCell): def _set_row_height(self): """Set the cell height for all :class:`TableCell`s in the table.""" for row in self.rows: - max_row_height = max((cell.normalize_blocks() - for cell in row.columns)) \ - if row.columns else 0 + max_row_height = ( + max((cell.normalize_blocks() for cell in row.columns)) + if row.columns + else 0 + ) for cell in row.columns: cell.height = max_row_height @@ -223,26 +242,29 @@ def _set_column_width(self): for cur_column_idx in range(max_columns): # determine the required column width for the current column - max_column_width = max((row.columns[cur_column_idx].width - for row in self.rows - if len(row) > cur_column_idx)) + max_column_width = max( + ( + row.columns[cur_column_idx].width + for row in self.rows + if len(row) > cur_column_idx + ) + ) # set column width for all TableCells in the current column for row in self.rows: if len(row) > cur_column_idx: row.columns[cur_column_idx].width = max_column_width - def get_text(self): + def get_text(self) -> str: """Return and render the text of the given table.""" if not self.rows: - return '\n' + return "\n" self._set_row_height() self._set_column_width() - return '\n'.join((row.get_text() for row in self.rows)) + '\n' + return "\n".join((row.get_text() for row in self.rows)) + "\n" - def get_annotations(self, idx: int, - left_margin_len: int) -> List[Annotation]: + def get_annotations(self, idx: int, left_margin_len: int) -> List[Annotation]: r"""Return all annotations in the given table. Args: @@ -270,6 +292,6 @@ def get_annotations(self, idx: int, annotations += cell.get_annotations(cell_idx, row_width) cell_idx += cell.width + len(row.cell_separator) - idx += (row_width + 1) * row_height # linebreak + idx += (row_width + 1) * row_height # linebreak return annotations diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py index f654b26..fdf47ca 100755 --- a/src/inscriptis/service/web.py +++ b/src/inscriptis/service/web.py @@ -2,43 +2,54 @@ # coding:utf-8 """Inscriptis Web Service.""" -from flask import request, Response, Flask +from fastapi import FastAPI, Request +from fastapi.responses import PlainTextResponse from inscriptis import get_text from inscriptis.metadata import __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE from inscriptis.model.config import ParserConfig -app = Flask(__name__) -CONFIG = ParserConfig(css=RELAXED_CSS_PROFILE, display_images=True, - deduplicate_captions=True, display_links=False) +app = FastAPI() +CONFIG = ParserConfig( + css=RELAXED_CSS_PROFILE, + display_images=True, + deduplicate_captions=True, + display_links=False, +) -@app.route('/') +@app.get("/") def index(): """Print a short status message for the Web service's base URL.""" - return 'Inscriptis text to HTML Web service.' + return "Inscriptis text to HTML Web service." -@app.route('/get_text', methods=['POST']) -def get_text_call(): +@app.post("/get_text", response_class=PlainTextResponse) +async def get_text_call(request: Request): """Return the text representation of the given HTML content.""" - content_type = request.headers['Content-type'] - if '; encoding=' in content_type: - encoding = content_type.split('; encoding=')[1] + content_type = request.headers.get("Content-type") + if "; encoding=" in content_type: + encoding = content_type.split("; encoding=")[1] else: - encoding = 'UTF-8' - html_content = request.data.decode(encoding, errors='ignore') - text = get_text(html_content, CONFIG) - return Response(text, mimetype='text/plain') + encoding = "UTF-8" + html_content = await request.body() + return get_text(html_content.decode(encoding, errors="ignore"), CONFIG) -@app.route('/version', methods=['GET']) +@app.get("/version", response_class=PlainTextResponse) def get_version_call(): """Return the used inscriptis version.""" - return Response(__version__ + '\n', mimetype='text/plain') + return __version__ -if __name__ == '__main__': - print('Starting Web service based on Inscriptis', __version__) - app.run(threaded=True, host='127.0.0.1', port=5000) +def start(): + """Start the webservice.""" + import uvicorn + + print("Starting Web service based on Inscriptis", __version__) + uvicorn.run(app, host="127.0.0.1", port=5000) + + +if __name__ == "__main__": + start() diff --git a/tests/test_annotation.py b/tests/test_annotation.py index b19ddeb..c3518b3 100644 --- a/tests/test_annotation.py +++ b/tests/test_annotation.py @@ -11,57 +11,61 @@ def test_horizontal_shift(): - a = [Annotation(0, 4, 'test')] + a = [Annotation(0, 4, "test")] # no shift - assert horizontal_shift(a, - content_width=5, - line_width=10, - align=HorizontalAlignment.left, - shift=0).pop() == Annotation(0, 4, 'test') + assert horizontal_shift( + a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0 + ).pop() == Annotation(0, 4, "test") # shift - assert horizontal_shift(a, - content_width=5, - line_width=10, - align=HorizontalAlignment.left, - shift=3).pop() == Annotation(3, 7, 'test') + assert horizontal_shift( + a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3 + ).pop() == Annotation(3, 7, "test") # realignment to the right - assert horizontal_shift(a, - content_width=len('test'), - line_width=10, - align=HorizontalAlignment.right, - shift=0).pop() == Annotation(6, 10, 'test') - assert '{:>10}'.format('test')[6:10] == 'test' - + assert horizontal_shift( + a, + content_width=len("test"), + line_width=10, + align=HorizontalAlignment.right, + shift=0, + ).pop() == Annotation(6, 10, "test") + assert "{:>10}".format("test")[6:10] == "test" # shift + realignment to the right - assert horizontal_shift(a, - content_width=len('test'), - line_width=10, - align=HorizontalAlignment.right, - shift=3).pop() == Annotation(9, 13, 'test') + assert horizontal_shift( + a, + content_width=len("test"), + line_width=10, + align=HorizontalAlignment.right, + shift=3, + ).pop() == Annotation(9, 13, "test") # realignment to the center - assert horizontal_shift(a, - content_width=len('test'), - line_width=10, - align=HorizontalAlignment.center, - shift=0).pop() == Annotation(3, 7, 'test') - assert '{:^10}'.format('test')[3:7] == 'test' + assert horizontal_shift( + a, + content_width=len("test"), + line_width=10, + align=HorizontalAlignment.center, + shift=0, + ).pop() == Annotation(3, 7, "test") + assert "{:^10}".format("test")[3:7] == "test" - assert horizontal_shift(a, - content_width=len('test'), - line_width=11, - align=HorizontalAlignment.center, - shift=0).pop() == Annotation(3, 7, 'test') - assert '{:^11}'.format('test')[3:7] == 'test' + assert horizontal_shift( + a, + content_width=len("test"), + line_width=11, + align=HorizontalAlignment.center, + shift=0, + ).pop() == Annotation(3, 7, "test") + assert "{:^11}".format("test")[3:7] == "test" # realignment + shift - assert horizontal_shift(a, - content_width=len('test'), - line_width=11, - align=HorizontalAlignment.center, - shift=7).pop() == Annotation(10, 14, 'test') - + assert horizontal_shift( + a, + content_width=len("test"), + line_width=11, + align=HorizontalAlignment.center, + shift=7, + ).pop() == Annotation(10, 14, "test") diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py index c80a654..82fdc7a 100644 --- a/tests/test_annotation_output_processor.py +++ b/tests/test_annotation_output_processor.py @@ -11,12 +11,12 @@ from inscriptis.annotation.output.surface import SurfaceExtractor from inscriptis.annotation.output.xml import XmlExtractor -EXAMPLE_OUTPUT = {'text': 'Chur\n\nChur is the capital and largest town of ' - 'the Swiss canton of the Grisons and lies in the ' - 'Grisonian Rhine Valley.', - 'label': [[0, 4, 'heading'], - [0, 4, 'h1'], - [6, 10, 'emphasis']]} +EXAMPLE_OUTPUT = { + "text": "Chur\n\nChur is the capital and largest town of " + "the Swiss canton of the Grisons and lies in the " + "Grisonian Rhine Valley.", + "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]], +} def test_abstract_class(): @@ -31,13 +31,15 @@ def test_surface_annotator(): result = processor(EXAMPLE_OUTPUT) # the old keys haven't been changed - assert 'text' in result - assert 'label' in result + assert "text" in result + assert "label" in result # and we have additional information on surface forms :) - assert result['surface'] == [('heading', 'Chur'), - ('h1', 'Chur'), - ('emphasis', 'Chur')] + assert result["surface"] == [ + ("heading", "Chur"), + ("h1", "Chur"), + ("emphasis", "Chur"), + ] def test_xml_annotator(): @@ -45,35 +47,40 @@ def test_xml_annotator(): result = processor(EXAMPLE_OUTPUT) # and we have additional information on surface forms :) - assert result == ('\n' - '\n\n
Chur ' - 'Chur is the capital and largest town ' - 'of the Swiss canton of the Grisons and lies in ' - 'the Grisonian Rhine Valley.') + assert result == ( + '\n' + "\n\n
Chur " + "Chur is the capital and largest town " + "of the Swiss canton of the Grisons and lies in " + "the Grisonian Rhine Valley." + ) def test_html_annotator(): processor = HtmlExtractor() result = processor(EXAMPLE_OUTPUT) - assert result.startswith('' - 'heading' - '' - 'h1' - 'Chur\n' - '\n' - 'emphasis' - 'Chur is the capital ' - 'and largest town of the Swiss canton of the ' - 'Grisons and lies in the Grisonian Rhine Valley.' - '') + assert result.startswith("" + 'heading' + '' + 'h1' + "Chur\n" + "\n" + 'emphasis' + 'Chur is the capital ' + "and largest town of the Swiss canton of the " + "Grisons and lies in the Grisonian Rhine Valley." + "" + ) def test_trailing_tag_annotation(): processor = XmlExtractor() - result = processor({'text': 'Ehre sei Gott!', - 'label': [[9, 14, 'emphasis']]}) + result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}) - assert result == ('\n' - 'Ehre seiGott! ') + assert result == ( + '\n' + "Ehre seiGott! " + ) diff --git a/tests/test_annotation_rule_parsing.py b/tests/test_annotation_rule_parsing.py index fef265a..5893831 100644 --- a/tests/test_annotation_rule_parsing.py +++ b/tests/test_annotation_rule_parsing.py @@ -18,59 +18,58 @@ def test_parse(): """ basic rule parsing. """ - rules = {'table#border=1': ['table'], - 'hr': ['horizontal-line']} + rules = {"table#border=1": ["table"], "hr": ["horizontal-line"]} tags, attrs = AnnotationModel._parse(rules) - assert tags == {'hr': ['horizontal-line']} + assert tags == {"hr": ["horizontal-line"]} - apply_annotation= attrs[0] - assert apply_annotation.match_tag == 'table' - assert apply_annotation.match_value == '1' - assert apply_annotation.attr == 'border' + apply_annotation = attrs[0] + assert apply_annotation.match_tag == "table" + assert apply_annotation.match_value == "1" + assert apply_annotation.attr == "border" - e = HtmlElement(tag='table') - apply_annotation.apply('1', e) - assert e.annotation == ('table', ) + e = HtmlElement(tag="table") + apply_annotation.apply("1", e) + assert e.annotation == ("table",) def test_apply_annotation(): """ rule application. """ - rules = {'table#border=1': ['table'], - 'hr': ['horizontal-line'], - '#color=red': ['red'], - '#bgcolor': ['bgcolor']} - - css = deepcopy(CSS_PROFILES['strict']) + rules = { + "table#border=1": ["table"], + "hr": ["horizontal-line"], + "#color=red": ["red"], + "#bgcolor": ["bgcolor"], + } + + css = deepcopy(CSS_PROFILES["strict"]) annotation_model = AnnotationModel(css, rules) - assert annotation_model.css['hr'].annotation == ('horizontal-line', ) + assert annotation_model.css["hr"].annotation == ("horizontal-line",) attribute_handler = Attribute() attribute_handler.merge_attribute_map(annotation_model.css_attr) - assert 'table#border=1' in str(attribute_handler.attribute_mapping['border']) - assert '{any}#color=red' in str(attribute_handler.attribute_mapping['color']) - assert '{any}#bgcolor={any}' in str(attribute_handler.attribute_mapping['bgcolor']) + assert "table#border=1" in str(attribute_handler.attribute_mapping["border"]) + assert "{any}#color=red" in str(attribute_handler.attribute_mapping["color"]) + assert "{any}#bgcolor={any}" in str(attribute_handler.attribute_mapping["bgcolor"]) + def test_merged_attribute(): """ test multiple rules per attribute """ - rules = {'#color=white': ['white'], - '#color=yellow': ['yellow']} - css = deepcopy(CSS_PROFILES['strict']) + rules = {"#color=white": ["white"], "#color=yellow": ["yellow"]} + css = deepcopy(CSS_PROFILES["strict"]) annotation_model = AnnotationModel(css, rules) attribute_handler = Attribute() attribute_handler.merge_attribute_map(annotation_model.css_attr) e = HtmlElement() - attribute_handler.attribute_mapping['color']('green', e) + attribute_handler.attribute_mapping["color"]("green", e) assert e.annotation == () - attribute_handler.attribute_mapping['color']('yellow', e) - assert e.annotation == ('yellow', ) - attribute_handler.attribute_mapping['color']('white', e) - assert e.annotation == ('yellow', 'white') - - + attribute_handler.attribute_mapping["color"]("yellow", e) + assert e.annotation == ("yellow",) + attribute_handler.attribute_mapping["color"]("white", e) + assert e.annotation == ("yellow", "white") diff --git a/tests/test_block.py b/tests/test_block.py index 21ac592..8aacc93 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -11,25 +11,25 @@ def test_merge_normal_text_collapsable_whitespaces(): """ b = Block(0, Prefix()) b.merge_normal_text("Hallo") - assert b._content == 'Hallo' + assert b._content == "Hallo" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.merge_normal_text(" Hallo ") - assert b._content == 'Hallo ' + assert b._content == "Hallo " assert b.collapsable_whitespace b = Block(0, Prefix()) - b.merge_normal_text('') - assert b._content == '' + b.merge_normal_text("") + assert b._content == "" assert b.collapsable_whitespace - b.merge_normal_text(' ') - assert b._content == '' + b.merge_normal_text(" ") + assert b._content == "" assert b.collapsable_whitespace - b.merge_normal_text(' ') - assert b._content == '' + b.merge_normal_text(" ") + assert b._content == "" assert b.collapsable_whitespace @@ -37,29 +37,29 @@ def test_merge_normal_non_collapsable_whitespaces(): b = Block(0, Prefix()) b.collapsable_whitespace = False b.merge_normal_text("Hallo") - assert b._content == 'Hallo' + assert b._content == "Hallo" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False b.merge_normal_text(" Hallo ") - assert b._content == ' Hallo ' + assert b._content == " Hallo " assert b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text('') - assert b._content == '' + b.merge_normal_text("") + assert b._content == "" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text(' ') - assert b._content == ' ' + b.merge_normal_text(" ") + assert b._content == " " assert b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text(' ') - assert b._content == ' ' + b.merge_normal_text(" ") + assert b._content == " " assert b.collapsable_whitespace diff --git a/tests/test_broken_table_handling.py b/tests/test_broken_table_handling.py index bd210e9..dee75f3 100644 --- a/tests/test_broken_table_handling.py +++ b/tests/test_broken_table_handling.py @@ -9,21 +9,16 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_forgotten_td_close_tag(): # one line (i.e., missing before the nextand the next - html = ('hallo before the' - '
echo') + html = "hallo' - ' 1 2 " "
echo" print(html) # assert get_text(html, config) == u'hallo\n1 2\necho' # two lines (i.e. missing" " 1 2 and before the - html = ('hallo ' - '
echo') + html = "hallo1 2' - ' 3 4' - ' " "
echo" print(html) - assert get_text(html, config) == u'hallo\n1 2\n3 4\n\necho' + assert get_text(html, config) == "hallo\n1 2\n3 4\n\necho" diff --git a/tests/test_double_a.py b/tests/test_double_a.py index 24623bd..a16ceb7 100644 --- a/tests/test_double_a.py +++ b/tests/test_double_a.py @@ -9,10 +9,14 @@ def test_successive_a(): - html = 'first' \ - 'second' - assert get_text(html) == 'firstsecond' + html = ( + 'first' + 'second' + ) + assert get_text(html) == "firstsecond" - html = 'first\n' \ - 'second' - assert get_text(html) == 'first second' + html = ( + 'first\n' + 'second' + ) + assert get_text(html) == "first second" diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py index dd46353..9f7987c 100644 --- a/tests/test_empty_string.py +++ b/tests/test_empty_string.py @@ -9,9 +9,8 @@ def test_empty_and_corrupt(): - assert get_text('test').strip() == 'test' - assert get_text(' ') == '' - assert get_text('') == '' + assert get_text("test").strip() == "test" + assert get_text(" ") == "" + assert get_text("") == "" # test for the behaviour of older and recent lxml versions. - assert get_text('<<<').strip() in ('<<<', '<<', '') - + assert get_text("<<<").strip() in ("<<<", "<<", "") diff --git a/tests/test_engine.py b/tests/test_engine.py index 728191b..519c1ee 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -4,8 +4,8 @@ def test_text_from_empty_content(): - assert get_text('') == '' + assert get_text("") == "" def test_annotations_from_empty_content(): - assert get_annotated_text('') == {} + assert get_annotated_text("") == {} diff --git a/tests/test_html_conversion_options.py b/tests/test_html_conversion_options.py index fdc69be..c9bb878 100644 --- a/tests/test_html_conversion_options.py +++ b/tests/test_html_conversion_options.py @@ -9,69 +9,70 @@ def test_display_links(): - html = ''' + html = """ first second third - ''' + """ config = ParserConfig(display_links=True) - assert get_text(html, config).strip() == \ - '[first](first) [second](second) third' + assert get_text(html, config).strip() == "[first](first) [second](second) third" def test_display_anchors(): - html = ''' + html = """ first second - ''' + """ config = ParserConfig(display_anchors=True) - assert get_text(html, config).strip() == \ - '[first](first) second' + assert get_text(html, config).strip() == "[first](first) second" def test_display_links_and_anchors(): - html = ''' + html = """ first second third - ''' + """ config = ParserConfig(display_links=True, display_anchors=True) - assert get_text(html, config).strip() == \ - '[first](first) [second](second) [third](third)' + assert ( + get_text(html, config).strip() + == "[first](first) [second](second) [third](third)" + ) def test_display_images(): - html = ''' + html = """ - ''' + """ config = ParserConfig(display_images=True) - assert get_text(html, config).strip() == \ - '[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]' + assert ( + get_text(html, config).strip() + == "[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]" + ) def test_display_images_deduplicated(): - html = ''' + html = """ - ''' + """ config = ParserConfig(display_images=True, deduplicate_captions=True) - assert get_text(html, config).strip() == \ - '[Ein Test Bild] [Ein zweites Bild]' + assert get_text(html, config).strip() == "[Ein Test Bild] [Ein zweites Bild]" diff --git a/tests/test_html_snippets.py b/tests/test_html_snippets.py index 9e7197f..9df864d 100644 --- a/tests/test_html_snippets.py +++ b/tests/test_html_snippets.py @@ -11,10 +11,10 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -TESTCASE_PATTERN = join(dirname(__file__), 'html/*.txt') +TESTCASE_PATTERN = join(dirname(__file__), "html/*.txt") -def test_html_snippets(filter_str=''): +def test_html_snippets(filter_str=""): for testcase_txt in glob(TESTCASE_PATTERN): if filter_str not in testcase_txt: continue @@ -22,26 +22,30 @@ def test_html_snippets(filter_str=''): with open(testcase_txt) as f: reference_txt = f.read().rstrip() - with open(testcase_txt.replace('.txt', '.html')) as f: + with open(testcase_txt.replace(".txt", ".html")) as f: print(f.name) - html = '{}'.format(f.read()) + html = "{}".format(f.read()) - converted_txt = get_text(html, ParserConfig( - css=CSS_PROFILES['strict'])).rstrip() + converted_txt = get_text( + html, ParserConfig(css=CSS_PROFILES["strict"]) + ).rstrip() if converted_txt != reference_txt: - print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}' - .format(testcase_txt, html, reference_txt, converted_txt)) - print('HTML file:', testcase_txt.replace('.txt', '.html')) - print("Visualize differences with `vimdiff reference.txt " - "converted.txt`") + print( + "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format( + testcase_txt, html, reference_txt, converted_txt + ) + ) + print("HTML file:", testcase_txt.replace(".txt", ".html")) + print("Visualize differences with `vimdiff reference.txt " "converted.txt`") open("reference.txt", "w").write(reference_txt) open("converted.txt", "w").write(converted_txt) assert converted_txt == reference_txt -if __name__ == '__main__': +if __name__ == "__main__": from sys import argv - filter_str = argv[1] if len(argv) > 1 else '' + + filter_str = argv[1] if len(argv) > 1 else "" test_html_snippets(filter_str) diff --git a/tests/test_html_snippets_annotations.py b/tests/test_html_snippets_annotations.py index 9655afa..6c481a1 100644 --- a/tests/test_html_snippets_annotations.py +++ b/tests/test_html_snippets_annotations.py @@ -12,18 +12,18 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), 'html/*.json') +TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), "html/*.json") -def assert_equal_ignoring_whitespace(reference: List[str], - converted: List[str]) -> bool: +def assert_equal_ignoring_whitespace( + reference: List[str], converted: List[str] +) -> bool: for (ref_tag, ref_str), (conv_tag, conv_str) in zip(reference, converted): - assert ref_tag == conv_tag - assert ''.join(ref_str.split()) == ''.join(conv_str.split()) + assert "".join(ref_str.split()) == "".join(conv_str.split()) -def test_html_annotations(filter_str=''): +def test_html_annotations(filter_str=""): for annotation_file in glob(TESTCASE_PATTERN): if filter_str not in annotation_file: continue @@ -31,33 +31,39 @@ def test_html_annotations(filter_str=''): with open(annotation_file) as f: reference = load(f) - with open(annotation_file.replace('.json', '.html')) as f: + with open(annotation_file.replace(".json", ".html")) as f: print(f.name) - html = '{}'.format(f.read()) + html = "{}".format(f.read()) - for indentation_strategy in ('strict', 'relaxed'): - result = get_annotated_text(html, ParserConfig( - css=CSS_PROFILES[indentation_strategy], - annotation_rules=reference['annotation_rules'])) + for indentation_strategy in ("strict", "relaxed"): + result = get_annotated_text( + html, + ParserConfig( + css=CSS_PROFILES[indentation_strategy], + annotation_rules=reference["annotation_rules"], + ), + ) - converted = [[a[2], result['text'][a[0]:a[1]]] - for a in result['label']] + converted = [[a[2], result["text"][a[0] : a[1]]] for a in result["label"]] - if reference['result'] != converted: + if reference["result"] != converted: print("Reference:") - print(reference['result']) - print("\nConverted (indentation strategy: {})".format(indentation_strategy)) + print(reference["result"]) + print( + "\nConverted (indentation strategy: {})".format( + indentation_strategy + ) + ) print(converted) - if indentation_strategy == 'strict': - assert reference['result'] == converted + if indentation_strategy == "strict": + assert reference["result"] == converted else: - assert_equal_ignoring_whitespace(reference['result'], - converted) + assert_equal_ignoring_whitespace(reference["result"], converted) -if __name__ == '__main__': +if __name__ == "__main__": from sys import argv - filter_str = argv[1] if len(argv) > 1 else '' + filter_str = argv[1] if len(argv) > 1 else "" test_html_annotations(filter_str) diff --git a/tests/test_limit_whitespace_affixes.py b/tests/test_limit_whitespace_affixes.py index 20d6666..53e97fd 100644 --- a/tests/test_limit_whitespace_affixes.py +++ b/tests/test_limit_whitespace_affixes.py @@ -13,36 +13,41 @@ def test_html_element_refinement(): - new = HtmlElement('span', display=Display.inline, prefix=' ', suffix=' ', - limit_whitespace_affixes=True) - pre = HtmlElement('pre', display=Display.block, whitespace=WhiteSpace.pre) - code = HtmlElement('code') + new = HtmlElement( + "span", + display=Display.inline, + prefix=" ", + suffix=" ", + limit_whitespace_affixes=True, + ) + pre = HtmlElement("pre", display=Display.block, whitespace=WhiteSpace.pre) + code = HtmlElement("code") # refinement with pre and whitespaces refined = pre.get_refined_html_element(copy(new)) - assert refined.prefix == '' - assert refined.suffix == '' + assert refined.prefix == "" + assert refined.suffix == "" # refinement with code and whitespaces refined = code.get_refined_html_element(copy(new)) - assert refined.prefix == ' ' - assert refined.suffix == ' ' + assert refined.prefix == " " + assert refined.suffix == " " # refinement with pre and non-whitespaces - new.prefix = ' 1. ' - new.suffix = '<' + new.prefix = " 1. " + new.suffix = "<" refined = pre.get_refined_html_element(copy(new)) - assert refined.prefix == ' 1. ' - assert refined.suffix == '<' + assert refined.prefix == " 1. " + assert refined.suffix == "<" # refinement with code and non-whitespaces refined = code.get_refined_html_element(copy(new)) - assert refined.prefix == ' 1. ' - assert refined.suffix == '<' + assert refined.prefix == " 1. " + assert refined.suffix == "<" def test_limit_whitespace_affixes(): - html = ''' + html = """ halloecho1 2" " 3 4" " @@ -51,9 +56,10 @@ def hallo():
- ''' + """ config = ParserConfig(css=RELAXED_CSS_PROFILE) - assert get_text(html, config).strip() == \ - 'hallo echo\n\n' \ - 'def hallo():\n' \ + assert ( + get_text(html, config).strip() == "hallo echo\n\n" + "def hallo():\n" ' print("echo")' + ) diff --git a/tests/test_list_div.py b/tests/test_list_div.py index 07ae5d1..44c1ef5 100644 --- a/tests/test_list_div.py +++ b/tests/test_list_div.py @@ -10,21 +10,21 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_divs(): - html = u'ThomasAntonMaria' - assert get_text(html, config) == u'Thomas\nAnton\nMaria' + html = "ThomasAntonMaria" + assert get_text(html, config) == "Thomas\nAnton\nMaria" - html = u'ThomasAnna läuft weit weg.' - assert get_text(html, config) == u'Thomas\nAnna läuft weit weg.' + html = "ThomasAnna läuft weit weg." + assert get_text(html, config) == "Thomas\nAnna läuft weit weg." - html = u'Thomas' - assert get_text(html, config) == u'Thomas\n * Anton\n Maria' + html = "Thomas
AntonMaria" + assert get_text(html, config) == "Thomas\n * Anton\n Maria" - html = u'Thomas
AntonMaria' - assert get_text(html, config) == u'Thomas\n * Anton\n Maria' + html = "Thomas
AntonMaria" + assert get_text(html, config) == "Thomas\n * Anton\n Maria" - html = u'Thomas
AntonMaria' - assert get_text(html, config) == u'Thomas\n * a\n Anton\n Maria' + html = "Thomas
- a
AntonMaria" + assert get_text(html, config) == "Thomas\n * a\n Anton\n Maria" diff --git a/tests/test_margin_before_at_start.py b/tests/test_margin_before_at_start.py index bcadbc5..870c076 100644 --- a/tests/test_margin_before_at_start.py +++ b/tests/test_margin_before_at_start.py @@ -9,20 +9,18 @@ def test_content(): - html = 'first' - assert get_text(html) == 'first' + html = "first" + assert get_text(html) == "first" def test_margin_before(): - html = '
- a
AntonMariafirst
' - assert get_text(html) == 'first\n' + html = "first
" + assert get_text(html) == "first\n" - html = 'first' \ - 'second
' - assert get_text(html) == 'first\n\nsecond\n' + html = "first" "second
" + assert get_text(html) == "first\n\nsecond\n" def test_br(): - html = '
' \ - 'first' - assert get_text(html) == '\nfirst' + html = "
" "first" + assert get_text(html) == "\nfirst" diff --git a/tests/test_margin_handling.py b/tests/test_margin_handling.py index c09d944..c6a9906 100644 --- a/tests/test_margin_handling.py +++ b/tests/test_margin_handling.py @@ -9,29 +9,29 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_margin_handling(): - html = u'''Hallo + html = """HalloEchosei Gott - ''' - assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\n\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\n\nsei Gott" - html = u'''Hallo + html = """HalloMechoEchoMechosei Gott - ''' - assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\nsei Gott" - html = u'''Hallo + html = """Hallosei Gott - ''' - assert get_text(html, config) == u'Hallo\n\n\nEhre\n\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\n\nEhre\n\nsei Gott" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 9ffe217..d0f5a9a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,19 +1,21 @@ -from inscriptis.metadata import (__author__, __author_email__, __copyright__, - __license__, __version__) +from inscriptis.metadata import ( + __author__, + __author_email__, + __copyright__, + __license__, + __version__, +) def test_metadata(): """Test inscriptis package metadata.""" - assert 'Albert Weichselbraun' in __author__ - assert 'Fabian Odoni' in __author__ + assert "Albert Weichselbraun" in __author__ + assert "Fabian Odoni" in __author__ - assert '@' in __author_email__ - - assert '2016-' in __copyright__ - assert 'Albert Weichselbraun' in __copyright__ - assert 'Fabian Odoni' in __copyright__ - - assert __license__ == 'Apache 2.0' + assert "Albert Weichselbraun" in __copyright__ + assert "Fabian Odoni" in __copyright__ + assert "@" in __author_email__ + assert __license__ == "Apache-2.0" assert __version__[0].isnumeric() - assert '.' in __version__ + assert "." in __version__ diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py index 574c047..e0d8c66 100644 --- a/tests/test_model_html_element_canvas.py +++ b/tests/test_model_html_element_canvas.py @@ -26,31 +26,31 @@ def _get_text(html_element): HtmlElement().set_canvas(c).write("last") c._flush_inline() - return '\n'.join(c.blocks) + return "\n".join(c.blocks) def test_formatting(): # standard line h = HtmlElement() - assert _get_text(h) == 'firstEhre sei Gott!last' + assert _get_text(h) == "firstEhre sei Gott!last" h.display = Display.block h.margin_before = 1 h.margin_after = 2 print(h) print(_get_text(h)) - assert _get_text(h) == 'first\n\nEhre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\nEhre sei Gott!\n\n\nlast" # list bullet without padding_inline h.list_bullet = "* " - assert _get_text(h) == 'first\n\n* Ehre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\n* Ehre sei Gott!\n\n\nlast" # add a padding_inline h.padding_inline = 3 - assert _get_text(h) == 'first\n\n * Ehre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\n * Ehre sei Gott!\n\n\nlast" # and prefixes + suffixes - h.prefix = '>>' - h.suffix = '<<' - assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast' + h.prefix = ">>" + h.suffix = "<<" + assert _get_text(h) == "first\n\n * >>Ehre sei Gott!<<\n\n\nlast" diff --git a/tests/test_model_prefix.py b/tests/test_model_prefix.py index 6682bbb..f5e3f8c 100644 --- a/tests/test_model_prefix.py +++ b/tests/test_model_prefix.py @@ -11,46 +11,45 @@ def test_simple_prefix(): p = Prefix() - p.register_prefix(5, '1. ') + p.register_prefix(5, "1. ") # first use - assert p.first == ' 1. ' + assert p.first == " 1. " # the prefix has been consumed - assert p.first == '' + assert p.first == "" # prefix used to indent lines separated with newlines - assert p.rest == ' ' + assert p.rest == " " def test_combined_prefix(): p = Prefix() - p.register_prefix(5, '1. ') - p.register_prefix(2, '') + p.register_prefix(5, "1. ") + p.register_prefix(2, "") - assert p.first == ' 1. ' - assert p.first == '' + assert p.first == " 1. " + assert p.first == "" p.remove_last_prefix() - assert p.first == '' + assert p.first == "" p.remove_last_prefix() # final consumption - no prefix - assert p.first == '' + assert p.first == "" # ensure that there are no interactions between different runs with # bullets p.consumed = False - p.register_prefix(5, '2. ') - p.register_prefix(2, '- ') + p.register_prefix(5, "2. ") + p.register_prefix(2, "- ") - assert p.first == ' - ' - assert p.first == '' - assert p.rest == ' ' + assert p.first == " - " + assert p.first == "" + assert p.rest == " " p.consumed = False p.remove_last_prefix() - assert p.first == ' 2. ' - assert p.rest == ' ' - + assert p.first == " 2. " + assert p.rest == " " diff --git a/tests/test_parse_css.py b/tests/test_parse_css.py index 9822644..8b26bf5 100644 --- a/tests/test_parse_css.py +++ b/tests/test_parse_css.py @@ -7,54 +7,61 @@ from copy import copy from inscriptis.css_profiles import CSS_PROFILES -from inscriptis.html_properties import (Display, WhiteSpace, VerticalAlignment, - HorizontalAlignment) +from inscriptis.html_properties import ( + Display, + WhiteSpace, + VerticalAlignment, + HorizontalAlignment, +) from inscriptis.model.css import CssParse from inscriptis.model.html_element import HtmlElement def test_css_parsing(): - html_element = copy(CSS_PROFILES['strict']['div']) - CssParse.attr_style('padding_left: 8px; display: block', html_element) + html_element = copy(CSS_PROFILES["strict"]["div"]) + CssParse.attr_style("padding_left: 8px; display: block", html_element) assert html_element.padding_inline == 1 assert html_element.display == Display.block - CssParse.attr_style('margin_before: 8em; display: inline', html_element) + CssParse.attr_style("margin_before: 8em; display: inline", html_element) assert html_element.margin_before == 8 assert html_element.display == Display.inline def test_html_element_str(): - ''' + """ Tests the string representation of an HtmlElement. - ''' - html_element = HtmlElement('div', '', '', Display.inline, 0, 0, 0, - '', WhiteSpace.pre) - assert str(html_element) == ('Ehre') + """ + html_element = HtmlElement( + "div", "", "", Display.inline, 0, 0, 0, "", WhiteSpace.pre + ) + assert str(html_element) == ( + "" + ) def test_parse_vertical_align(): html_element = HtmlElement() - CssParse.attr_vertical_align('top', html_element) + CssParse.attr_vertical_align("top", html_element) assert html_element.valign == VerticalAlignment.top # invalid value - CssParse.attr_vertical_align('unknown', html_element) + CssParse.attr_vertical_align("unknown", html_element) assert html_element.valign == VerticalAlignment.top def test_parse_horizontal_align(): html_element = HtmlElement() - CssParse.attr_horizontal_align('center', html_element) + CssParse.attr_horizontal_align("center", html_element) assert html_element.align == HorizontalAlignment.center # invalid value - CssParse.attr_horizontal_align('unknown', html_element) + CssParse.attr_horizontal_align("unknown", html_element) assert html_element.align == HorizontalAlignment.center diff --git a/tests/test_strip_xml_header.py b/tests/test_strip_xml_header.py index b2e8e44..cc28f05 100644 --- a/tests/test_strip_xml_header.py +++ b/tests/test_strip_xml_header.py @@ -4,7 +4,7 @@ from inscriptis import get_text -def test_successive_a(): - html = u' Hallo?>' - assert get_text(html).strip() == 'Hallo?>' +def test_successive_a(): + html = ' Hallo?>' + assert get_text(html).strip() == "Hallo?>" diff --git a/tests/test_style_parsing.py b/tests/test_style_parsing.py index 8efce8f..d23ae48 100644 --- a/tests/test_style_parsing.py +++ b/tests/test_style_parsing.py @@ -10,7 +10,8 @@ def test_style_unit_parsing(): html_element = HtmlElement() - CssParse.attr_style("margin-top:2.666666667em;margin-bottom: 2.666666667em", - html_element) + CssParse.attr_style( + "margin-top:2.666666667em;margin-bottom: 2.666666667em", html_element + ) assert html_element.margin_before == 3 assert html_element.margin_after == 3 diff --git a/tests/test_table_cell.py b/tests/test_table_cell.py index 8c728b2..597af19 100644 --- a/tests/test_table_cell.py +++ b/tests/test_table_cell.py @@ -9,39 +9,40 @@ from inscriptis.model.table import TableCell from inscriptis.html_properties import HorizontalAlignment, VerticalAlignment + def test_height(): cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) - cell.blocks = ['hallo'] + cell.blocks = ["hallo"] cell.normalize_blocks() - assert cell.height == len('\n'.join(cell.blocks).split('\n')) + assert cell.height == len("\n".join(cell.blocks).split("\n")) - cell.blocks = ['hallo', 'echo'] + cell.blocks = ["hallo", "echo"] cell.normalize_blocks() assert cell.height == 2 - cell.blocks = ['hallo\necho'] + cell.blocks = ["hallo\necho"] cell.normalize_blocks() assert cell.height == 2 - cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] cell.normalize_blocks() assert cell.height == 9 - assert cell.height == len('\n'.join(cell.blocks).split('\n')) + assert cell.height == len("\n".join(cell.blocks).split("\n")) + def test_width(): cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) - cell.blocks = ['hallo'] + cell.blocks = ["hallo"] cell.normalize_blocks() assert cell.width == len(cell.blocks[0]) - cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] cell.normalize_blocks() - assert cell.width == len('Ehre sei Gott') + assert cell.width == len("Ehre sei Gott") # fixed set width cell.width = 95 cell.normalize_blocks() assert cell.width == 95 - diff --git a/tests/test_table_cell_formatting.py b/tests/test_table_cell_formatting.py index 7062d78..f8d6de8 100644 --- a/tests/test_table_cell_formatting.py +++ b/tests/test_table_cell_formatting.py @@ -11,50 +11,38 @@ def test_horizontal_cell_formatting(): - - cell = TableCell(align=HorizontalAlignment.left, - valign=VerticalAlignment.top) + cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) # left alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 - assert cell.blocks == ['Ehre sei Gott! '] + assert cell.blocks == ["Ehre sei Gott! "] # right alignment cell.align = HorizontalAlignment.right - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 - assert cell.blocks == [' Ehre sei Gott!'] + assert cell.blocks == [" Ehre sei Gott!"] def test_vertical_cell_formatting(): - cell = TableCell(align=HorizontalAlignment.left, - valign=VerticalAlignment.top) + cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) # default top alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 cell.height = 4 - assert cell.blocks == ['Ehre sei Gott! ', - '', - '', - ''] + assert cell.blocks == ["Ehre sei Gott! ", "", "", ""] # bottom alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.valign = VerticalAlignment.bottom cell.width = 16 cell.height = 4 - assert cell.blocks == ['', - '', - '', - 'Ehre sei Gott! '] + assert cell.blocks == ["", "", "", "Ehre sei Gott! "] # middle alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.valign = VerticalAlignment.middle cell.width = 16 cell.height = 4 - assert cell.blocks == ['', - 'Ehre sei Gott! ', - '', - ''] + assert cell.blocks == ["", "Ehre sei Gott! ", "", ""] diff --git a/tests/test_table_row.py b/tests/test_table_row.py index dc2f5f6..b0ea143 100644 --- a/tests/test_table_row.py +++ b/tests/test_table_row.py @@ -11,17 +11,17 @@ def test_empty_row(): - tr = TableRow(cell_separator=' ') + tr = TableRow(cell_separator=" ") assert tr.width == 0 - assert tr.get_text() == '' + assert tr.get_text() == "" def test_table_cell_separator(): - html = '' + html = "
Hallo
EinsEcho
Zwei" config = ParserConfig() - assert get_text(html, config) == 'Hallo Echo\nEins Zwei\n' + assert get_text(html, config) == "Hallo Echo\nEins Zwei\n" - config = ParserConfig(table_cell_separator='\t') - assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n' + config = ParserConfig(table_cell_separator="\t") + assert get_text(html, config) == "Hallo\tEcho\nEins \tZwei\n" diff --git a/tests/test_white_space_handling.py b/tests/test_white_space_handling.py index cf43d4d..b8b8e28 100644 --- a/tests/test_white_space_handling.py +++ b/tests/test_white_space_handling.py @@ -9,29 +9,24 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_white_space(): - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12 3' + html = '12\n3' "" + assert get_text(html, config) == "12 3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12 3' + html = '12\n3' "" + assert get_text(html, config) == "12 3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" def test_borderline_cases(): @@ -41,39 +36,38 @@ def test_borderline_cases(): """ # change of whitespace handling between terms; no whitespace # between the terms - html = u'Halloecho versus' - assert get_text(html, config) == u'Halloecho versus' + html = 'Halloecho versus' + assert get_text(html, config) == "Halloecho versus" # change of whitespace handling between terms; one whitespace # between the terms; option 1 - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; one whitespace # between the terms; option 2 - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; two whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; multiple whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; multiple whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" def test_tail(): """ ensure that the tail elements are formated based on the container element. """ - html = (u'Hi 1 3 ' - u' versus 1 3') - assert get_text(html, config) == u'Hi 1 3 versus 1 3' + html = 'Hi 1 3 ' " versus 1 3" + assert get_text(html, config) == "Hi 1 3 versus 1 3" diff --git a/tox.ini b/tox.ini index 63c1093..8dc0683 100644 --- a/tox.ini +++ b/tox.ini @@ -1,47 +1,42 @@ +[tox] +envlist = pytest, pyroma, flake8 + # standard unit tests [testenv:pytest] -deps = pytest ~= 7.1.2 - pytest-cov ~= 3.0.0 -commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests +deps = pytest ~= 7.4.4 + pytest-cov ~= 4.1.0 +commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices [testenv:pyroma] deps = pyroma commands = pyroma . -# checks compatible with flake 4 -[testenv:flake8-4] -deps = flake8 ~= 4.0.1 +[testenv:flake8] +deps = flake8 ~= 7.0.0 + dlint ~= 0.14.1 + flake8-bandit ~= 4.1.1 flake8-blind-except ~= 0.2.1 - flake8-bandit ~= 3.0.0 - flake8-bugbear ~= 22.7.1 - flake8-builtins ~= 1.5.3 + flake8-bugbear ~= 23.12.2 + flake8-builtins ~= 2.2.0 flake8-cognitive-complexity ~= 0.1.0 flake8-colors ~= 0.1.9 - flake8-comprehensions ~= 3.10.0 - flake8-docstrings ~= 1.6.0 - flake8-encodings ~= 0.5.0.post1 - flake8-eradicate ~= 1.2.1 + flake8-comprehensions ~= 3.14.0 + flake8-docstrings ~= 1.7.0 + flake8-eradicate ~= 1.5.0 + flake8-encodings ~= 0.5.1 flake8-expression-complexity ~= 0.0.11 + flake8-logging-format ~= 0.9.0 + flake8-mutable ~= 1.2.0 + flake8-pie ~= 0.16.0 + flake8-pytest ~= 1.4 + flake8-raise ~= 0.0.5 + flake8-simplify ~= 0.21.0 flake8-string-format ~= 0.3.0 flake8-tuple ~= 0.4.1 - flake8-logging-format ~= 0.6.0 - flake8-pytest ~= 1.3 - flake8-quotes ~= 3.3.1 - flake8-raise ~= 0.0.5 - flake8-simplify ~= 0.19.2 - pep8-naming ~= 0.13.1 - flake8-mutable ~= 1.2.0 - flake8-use-pathlib ~= 0.2.1 -commands = flake8 - -[flake8] -exclude = .tox - docs - benchmarking - setup.py - tests - venv + flake8-use-pathlib ~= 0.3.0 + flake8-warnings ~= 0.4.1 + pep8-naming ~= 0.13.3 # S104 - do not cleanup XML data prior to processing # S410 - bind to all IPs is okay in the case of the Web service, since it is @@ -50,12 +45,13 @@ exclude = .tox # D102 - missing docstring in public method # D105 - missing docstring in magic method (e.g., __str__) # D107 - missing docstring in __init__ -ignore = S104, S410, W503, D107, D105, D102 -show-source = true -enable-extensions=G -application-import-names = inscriptis - -# flake8 cognitive complexity -max-cognitive-complexity=13 +# E203, E704 black +commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \ + --show-source \ + --max-line-length=88 \ + --ignore="DUO107, W503, D107, D105, D102, S104, S410, E203, E708" \ + --max-cognitive-complexity=13 -# +# --ignore="S104, S410, W503, D107, D105, D102" \ +# --enable-extensions=G \ +# --max-cognitive-complexity=13
Hallo
EinsEcho
Zwei