From 74161ea277c75a29ba6970d12d15b73e6d0f6050 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 16:03:20 +0100 Subject: [PATCH 01/29] chg: add python 3.12 support. --- .github/workflows/python-package.yml | 4 ++-- setup.py | 5 ++--- src/inscriptis/metadata.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7d2aff2..afd8ae4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -9,11 +9,11 @@ on: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: fail-fast: false matrix: - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 diff --git a/setup.py b/setup.py index 9ef7d76..d3ec856 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ long_description=long_description, author=__author__, author_email=__author_email__, - python_requires='>=3.6', + python_requires='>=3.8', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', @@ -33,12 +33,11 @@ 'Topic :: Text Processing :: Markup :: HTML', 'Topic :: Utilities', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', ], keywords='HTML,converter,text', url='https://github.com/weblyzard/inscriptis', diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index ff06c8d..2b2a771 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -2,6 +2,6 @@ __author__ = 'Albert Weichselbraun, Fabian Odoni' __author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch' -__copyright__ = 
'2016-2023 Albert Weichselbraun, Fabian Odoni' +__copyright__ = '2016-2024 Albert Weichselbraun, Fabian Odoni' __license__ = 'Apache 2.0' -__version__ = '2.3.2' +__version__ = '2.3.3' From 176040374eedf4786f4c0508ff70a36090935566 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 16:08:37 +0100 Subject: [PATCH 02/29] chg: run codeql and build on all branches. --- .github/workflows/codeql-analysis.yml | 3 --- .github/workflows/python-package.yml | 2 -- 2 files changed, 5 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index bb76eb2..0e79ee4 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,10 +13,7 @@ name: "CodeQL" on: push: - branches: [ master ] pull_request: - # The branches below must be a subset of the branches above - branches: [ master ] schedule: - cron: '26 5 * * 2' diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index afd8ae4..c313db5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -2,9 +2,7 @@ name: build on: push: - branches: [ master ] pull_request: - branches: [ master ] jobs: build: From 3014cdae9684c407c144493504f802a1af4f4dff Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 17:24:12 +0100 Subject: [PATCH 03/29] add: read timeout. 
--- scripts/inscript.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/inscript.py b/scripts/inscript.py index 6821445..0697e44 100755 --- a/scripts/inscript.py +++ b/scripts/inscript.py @@ -15,6 +15,7 @@ from inscriptis.model.config import ParserConfig DEFAULT_ENCODING = 'utf8' +DEFAULT_TIMEOUT = 5 # default timeout in seconds def get_postprocessor(name): @@ -69,6 +70,9 @@ def get_parser(): parser.add_argument('--table-cell-separator', default=' ', help='Separator to use between table cells (default: ' 'three spaces).') + parser.add_argument('--timeout', default=DEFAULT_TIMEOUT, + help='Request timeout in seconds (default: ' + f'{DEFAULT_TIMEOUT}).') parser.add_argument('-v', '--version', action='store_true', default=False, help='display version information') @@ -95,7 +99,7 @@ def get_parser(): errors='ignore') as f: html_content = f.read() elif args.input.startswith('http://') or args.input.startswith('https://'): - req = requests.get(args.input) + req = requests.get(args.input, timeout=args.timeout) html_content = req.content.decode(args.encoding or req.encoding) else: print("ERROR: Cannot open input file '{0}'.\n".format(args.input)) From 263bf997075a763be61a699c56356050aaa1754a Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 17:24:41 +0100 Subject: [PATCH 04/29] add: improved tox.ini --- tox.ini | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tox.ini b/tox.ini index 63c1093..aba12a6 100644 --- a/tox.ini +++ b/tox.ini @@ -1,8 +1,11 @@ +[tox] +envlist = pytest, pyroma, flake8-4 + # standard unit tests [testenv:pytest] deps = pytest ~= 7.1.2 pytest-cov ~= 3.0.0 -commands = py.test --cov-config=.coveragerc --cov=inscriptis ./tests +commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices [testenv:pyroma] @@ -33,15 +36,6 @@ deps = flake8 ~= 4.0.1 pep8-naming ~= 0.13.1 flake8-mutable ~= 1.2.0 flake8-use-pathlib ~= 
0.2.1 -commands = flake8 - -[flake8] -exclude = .tox - docs - benchmarking - setup.py - tests - venv # S104 - do not cleanup XML data prior to processing # S410 - bind to all IPs is okay in the case of the Web service, since it is @@ -50,12 +44,9 @@ exclude = .tox # D102 - missing docstring in public method # D105 - missing docstring in magic method (e.g., __str__) # D107 - missing docstring in __init__ -ignore = S104, S410, W503, D107, D105, D102 -show-source = true -enable-extensions=G -application-import-names = inscriptis - # flake8 cognitive complexity -max-cognitive-complexity=13 - -# +commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \ + --ignore="S104, S410, W503, D107, D105, D102" \ + --show-source \ + --enable-extensions=G \ + --max-cognitive-complexity=13 From e74c24ffd4089ab1ea54123ac42004e5fb1a41ce Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 17:24:50 +0100 Subject: [PATCH 05/29] chg: use f-strings. --- src/inscriptis/html_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 85664b7..a39ea3f 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -164,7 +164,7 @@ def _start_img(self, attrs): image_text = attrs.get('alt', '') or attrs.get('title', '') if image_text and not (self.config.deduplicate_captions and image_text == self.last_caption): - self.tags[-1].write('[{0}]'.format(image_text)) + self.tags[-1].write(f'[{image_text}]') self.last_caption = image_text def _start_a(self, attrs): @@ -179,7 +179,7 @@ def _start_a(self, attrs): def _end_a(self): if self.link_target: - self.tags[-1].write(']({0})'.format(self.link_target)) + self.tags[-1].write(f']({self.link_target})') def _start_ol(self, _): self.li_counter.append(1) @@ -191,7 +191,7 @@ def _start_li(self, _): bullet = self.li_counter[-1] if self.li_counter else '* ' if isinstance(bullet, int): 
self.li_counter[-1] += 1 - self.tags[-1].list_bullet = '{0}. '.format(bullet) + self.tags[-1].list_bullet = f'{bullet}. ' else: self.tags[-1].list_bullet = bullet From ffef3f91549fe321f1e28b41376f17c1397d7ef1 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 18:32:15 +0100 Subject: [PATCH 06/29] chg: optimized tox config. --- tox.ini | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/tox.ini b/tox.ini index aba12a6..22b7f68 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,10 @@ [tox] -envlist = pytest, pyroma, flake8-4 +envlist = pytest, pyroma, flake8 # standard unit tests [testenv:pytest] -deps = pytest ~= 7.1.2 - pytest-cov ~= 3.0.0 +deps = pytest ~= 7.4.4 + pytest-cov ~= 4.1.0 commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests # python packaging best practices @@ -12,30 +12,32 @@ commands = pytest --cov-config=.coveragerc --cov=inscriptis ./tests deps = pyroma commands = pyroma . 
-# checks compatible with flake 4 -[testenv:flake8-4] -deps = flake8 ~= 4.0.1 +[testenv:flake8] +deps = flake8 ~= 7.0.0 + dlint ~= 0.14.1 + flake8-bandit ~= 4.1.1 flake8-blind-except ~= 0.2.1 - flake8-bandit ~= 3.0.0 - flake8-bugbear ~= 22.7.1 - flake8-builtins ~= 1.5.3 + flake8-bugbear ~= 23.12.2 + flake8-builtins ~= 2.2.0 flake8-cognitive-complexity ~= 0.1.0 flake8-colors ~= 0.1.9 - flake8-comprehensions ~= 3.10.0 - flake8-docstrings ~= 1.6.0 - flake8-encodings ~= 0.5.0.post1 - flake8-eradicate ~= 1.2.1 + flake8-comprehensions ~= 3.14.0 + flake8-docstrings ~= 1.7.0 + flake8-eradicate ~= 1.5.0 + flake8-encodings ~= 0.5.1 flake8-expression-complexity ~= 0.0.11 + flake8-logging-format ~= 0.9.0 + flake8-mutable ~= 1.2.0 + flake8-pie ~= 0.16.0 + flake8-pytest ~= 1.4 + flake8-quotes ~= 3.3.2 + flake8-raise ~= 0.0.5 + flake8-simplify ~= 0.21.0 flake8-string-format ~= 0.3.0 flake8-tuple ~= 0.4.1 - flake8-logging-format ~= 0.6.0 - flake8-pytest ~= 1.3 - flake8-quotes ~= 3.3.1 - flake8-raise ~= 0.0.5 - flake8-simplify ~= 0.19.2 - pep8-naming ~= 0.13.1 - flake8-mutable ~= 1.2.0 - flake8-use-pathlib ~= 0.2.1 + flake8-use-pathlib ~= 0.3.0 + flake8-warnings ~= 0.4.1 + pep8-naming ~= 0.13.3 # S104 - do not cleanup XML data prior to processing # S410 - bind to all IPs is okay in the case of the Web service, since it is @@ -44,9 +46,11 @@ deps = flake8 ~= 4.0.1 # D102 - missing docstring in public method # D105 - missing docstring in magic method (e.g., __str__) # D107 - missing docstring in __init__ -# flake8 cognitive complexity commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \ - --ignore="S104, S410, W503, D107, D105, D102" \ --show-source \ - --enable-extensions=G \ + --ignore="DUO107, W503, D107, D105, D102, S104, S410" \ --max-cognitive-complexity=13 + +# --ignore="S104, S410, W503, D107, D105, D102" \ +# --enable-extensions=G \ +# --max-cognitive-complexity=13 From 57120cea447938a6e7ed855c78b13233c610af6f Mon Sep 17 00:00:00 2001 From: 
Albert Weichselbraun Date: Mon, 8 Jan 2024 18:32:48 +0100 Subject: [PATCH 07/29] chg: code cleanup. --- src/inscriptis/annotation/output/html.py | 4 +--- src/inscriptis/html_engine.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index 310f935..2ea498b 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -68,9 +68,7 @@ def _get_label_colors(labels: List[str]) -> Dict[str, str]: A mapping between the available labels and the corresponding color from the COLOR_SCHEMA. """ - return {label: color - for label, color in zip({a[2] for a in sorted(labels)}, - cycle(COLOR_SCHEMA))} + return dict(zip({a[2] for a in sorted(labels)}, cycle(COLOR_SCHEMA))) def _get_css(self, labels: List[str]) -> str: """Compute the CSS to be included into the HTML output. diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index a39ea3f..e2c7f3e 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -137,7 +137,7 @@ def handle_starttag(self, tag, attrs): tag, DEFAULT_HTML_ELEMENT).__copy__().set_tag(tag))) self.tags.append(cur) - handler = self.start_tag_handler_dict.get(tag, None) + handler = self.start_tag_handler_dict.get(tag) if handler: handler(attrs) @@ -150,7 +150,7 @@ def handle_endtag(self, tag): Args: tag: the HTML end tag to process. """ - handler = self.end_tag_handler_dict.get(tag, None) + handler = self.end_tag_handler_dict.get(tag) if handler: handler() From 5cb6c3c8794083dc36563acd805cdd71c7aa4226 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 18:34:36 +0100 Subject: [PATCH 08/29] chg: improved build. 
--- .github/workflows/python-package.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c313db5..08ee6b2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,15 +22,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov codecov + python -m pip install tox if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python setup.py install - - name: Lint with flake8 + - name: Lint with tox run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=80 --statistics + tox - name: Test with pytest run: | py.test --cov=inscripits ./tests && codecov From d346a111cbe29515bdc197708fb0c607709fa30d Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 18:34:46 +0100 Subject: [PATCH 09/29] chg: improved build. --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 08ee6b2..f7229d4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,7 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install tox + python -m pip install tox setuptools if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python setup.py install - name: Lint with tox From a9d1a2602eee477a72e741b663f816cff75b6a71 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 8 Jan 2024 18:39:28 +0100 Subject: [PATCH 10/29] chg: improve buid process. 
--- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f7229d4..1498480 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,7 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install tox setuptools + python -m pip install tox setuptools pytest pytest-cov codecov if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python setup.py install - name: Lint with tox From 74c6347df76c242c45a0306bddbdb3576eccc304 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Wed, 10 Jan 2024 19:54:25 +0100 Subject: [PATCH 11/29] add: black configuration. --- pyproject.toml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8992400 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[tool.poetry] +name = "inscriptis" +version = "2.4.0" +description = "inscriptis - HTML to text converter." +authors = ["Albert Weichselbraun ", "Fabian Odoni "] +license = "Apache 2.0" +readme = "README.rst" + +[tool.poetry.dependencies] +python = "^3.8, ^3.9, ^3.10, ^3.11, ^3.12" +requests = "^2.31.0" +lxml = "^5.1.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +# code formatting with black +[tool.black] +line-length = 88 +target-version = ["py38", "py39", "py310", "py311", "py312"] +include = "\.pyi?$" +# 'extend-exclude' excludes files or directories in addition to the defaults +extend-exclude = """ +# A regex preceded with ^/ will apply only to files and directories +# in the root of the project. 
+( + ^/foo.py # exclude a file named foo.py in the root of the project + | .*_pb2.py # exclude autogenerated Protocol Buffer files anywhere in the project +) +""" From 9f995fe6f31c07a65a58e27bdf733e94ce2bf606 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Thu, 11 Jan 2024 14:16:21 +0100 Subject: [PATCH 12/29] fix: specification of the supported python versions. --- pyproject.toml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8992400..2ed5029 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ license = "Apache 2.0" readme = "README.rst" [tool.poetry.dependencies] -python = "^3.8, ^3.9, ^3.10, ^3.11, ^3.12" +python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12" requests = "^2.31.0" lxml = "^5.1.0" @@ -19,13 +19,8 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 88 target-version = ["py38", "py39", "py310", "py311", "py312"] -include = "\.pyi?$" -# 'extend-exclude' excludes files or directories in addition to the defaults -extend-exclude = """ -# A regex preceded with ^/ will apply only to files and directories -# in the root of the project. -( - ^/foo.py # exclude a file named foo.py in the root of the project - | .*_pb2.py # exclude autogenerated Protocol Buffer files anywhere in the project -) -""" +include = ''' + ^scripts/ + ^src/ + ^tests/ +''' From 8ea3468ac537c2c8e55d82be7015899f8dab6c27 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Thu, 11 Jan 2024 17:28:03 +0100 Subject: [PATCH 13/29] chg: improved tox and pyproject configs. 
--- pyproject.toml | 5 ++--- tox.ini | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2ed5029..53f64cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,7 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 88 target-version = ["py38", "py39", "py310", "py311", "py312"] +extend-exclude = '\.html$|\.json$|\.txt$' include = ''' - ^scripts/ - ^src/ - ^tests/ + ^/scripts/|^/src/|^/tests/ ''' diff --git a/tox.ini b/tox.ini index 22b7f68..8dc0683 100644 --- a/tox.ini +++ b/tox.ini @@ -30,7 +30,6 @@ deps = flake8 ~= 7.0.0 flake8-mutable ~= 1.2.0 flake8-pie ~= 0.16.0 flake8-pytest ~= 1.4 - flake8-quotes ~= 3.3.2 flake8-raise ~= 0.0.5 flake8-simplify ~= 0.21.0 flake8-string-format ~= 0.3.0 @@ -46,9 +45,11 @@ deps = flake8 ~= 7.0.0 # D102 - missing docstring in public method # D105 - missing docstring in magic method (e.g., __str__) # D107 - missing docstring in __init__ +# E203, E704 black commands = flake8 --exclude=".tox, setup.py, tests, venv, docs, benchmarking, build" \ --show-source \ - --ignore="DUO107, W503, D107, D105, D102, S104, S410" \ + --max-line-length=88 \ + --ignore="DUO107, W503, D107, D105, D102, S104, S410, E203, E708" \ --max-cognitive-complexity=13 # --ignore="S104, S410, W503, D107, D105, D102" \ From 55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Thu, 11 Jan 2024 17:29:46 +0100 Subject: [PATCH 14/29] chg: apply black formatting. 
--- scripts/inscript.py | 194 +++++++++++++------- src/inscriptis/__init__.py | 22 +-- src/inscriptis/annotation/__init__.py | 15 +- src/inscriptis/annotation/output/html.py | 73 ++++---- src/inscriptis/annotation/output/surface.py | 8 +- src/inscriptis/annotation/output/xml.py | 23 ++- src/inscriptis/annotation/parser.py | 39 ++-- src/inscriptis/css_profiles.py | 108 +++++------ src/inscriptis/html_engine.py | 100 +++++----- src/inscriptis/html_properties.py | 6 +- src/inscriptis/metadata.py | 10 +- src/inscriptis/model/attribute.py | 25 ++- src/inscriptis/model/canvas/__init__.py | 26 +-- src/inscriptis/model/canvas/block.py | 20 +- src/inscriptis/model/canvas/prefix.py | 28 ++- src/inscriptis/model/config.py | 25 +-- src/inscriptis/model/css.py | 32 ++-- src/inscriptis/model/html_element.py | 90 +++++---- src/inscriptis/model/table.py | 104 ++++++----- src/inscriptis/service/web.py | 36 ++-- tests/test_annotation.py | 86 ++++----- tests/test_annotation_output_processor.py | 71 +++---- tests/test_annotation_rule_parsing.py | 59 +++--- tests/test_block.py | 32 ++-- tests/test_broken_table_handling.py | 13 +- tests/test_double_a.py | 16 +- tests/test_empty_string.py | 9 +- tests/test_engine.py | 4 +- tests/test_html_conversion_options.py | 41 +++-- tests/test_html_snippets.py | 30 +-- tests/test_html_snippets_annotations.py | 52 +++--- tests/test_limit_whitespace_affixes.py | 44 +++-- tests/test_list_div.py | 22 +-- tests/test_margin_before_at_start.py | 18 +- tests/test_margin_handling.py | 20 +- tests/test_metadata.py | 25 ++- tests/test_model_html_element_canvas.py | 16 +- tests/test_model_prefix.py | 35 ++-- tests/test_parse_css.py | 49 ++--- tests/test_strip_xml_header.py | 6 +- tests/test_style_parsing.py | 5 +- tests/test_table_cell.py | 21 ++- tests/test_table_cell_formatting.py | 36 ++-- tests/test_table_row.py | 12 +- tests/test_white_space_handling.py | 56 +++--- 45 files changed, 952 insertions(+), 810 deletions(-) diff --git a/scripts/inscript.py 
b/scripts/inscript.py index 0697e44..2e694a5 100755 --- a/scripts/inscript.py +++ b/scripts/inscript.py @@ -14,7 +14,7 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -DEFAULT_ENCODING = 'utf8' +DEFAULT_ENCODING = "utf8" DEFAULT_TIMEOUT = 5 # default timeout in seconds @@ -27,78 +27,125 @@ def get_postprocessor(name): Returns: The matching postprocessing function """ - pp_class = name.capitalize() + 'Extractor' - mod = __import__('inscriptis.annotation.output.' + name, - fromlist=[pp_class]) + pp_class = name.capitalize() + "Extractor" + mod = __import__("inscriptis.annotation.output." + name, fromlist=[pp_class]) return getattr(mod, pp_class)() def get_parser(): """Parse the arguments if script is run via console.""" parser = argparse.ArgumentParser( - description='Convert the given HTML document to text.') - parser.add_argument('input', nargs='?', default=None, - help='Html input either from a file or a URL ' - '(default:stdin).') - parser.add_argument('-o', '--output', type=str, - help='Output file (default:stdout).') - parser.add_argument('-e', '--encoding', type=str, - help='Input encoding to use (default:utf-8 for ' - 'files; detected server encoding for Web URLs).') - parser.add_argument('-i', '--display-image-captions', - action='store_true', default=False, - help='Display image captions (default:false).') - parser.add_argument('-d', '--deduplicate-image-captions', - action='store_true', default=False, - help='Deduplicate image captions (default:false).') - parser.add_argument('-l', '--display-link-targets', - action='store_true', default=False, - help='Display link targets (default:false).') - parser.add_argument('-a', '--display-anchor-urls', - action='store_true', default=False, - help='Display anchor URLs (default:false).') - parser.add_argument('-r', '--annotation-rules', default=None, - help='Path to an optional JSON file containing rules ' - 'for annotating the retrieved text.') - 
parser.add_argument('-p', '--postprocessor', type=get_postprocessor, - default=lambda x: x, - help='Optional component for postprocessing the ' - 'result (html, surface, xml). ') - parser.add_argument('--indentation', default='extended', - help='How to handle indentation (extended or strict;' - ' default: extended).') - parser.add_argument('--table-cell-separator', default=' ', - help='Separator to use between table cells (default: ' - 'three spaces).') - parser.add_argument('--timeout', default=DEFAULT_TIMEOUT, - help='Request timeout in seconds (default: ' - f'{DEFAULT_TIMEOUT}).') - parser.add_argument('-v', '--version', - action='store_true', default=False, - help='display version information') + description="Convert the given HTML document to text." + ) + parser.add_argument( + "input", + nargs="?", + default=None, + help="Html input either from a file or a URL " "(default:stdin).", + ) + parser.add_argument( + "-o", "--output", type=str, help="Output file (default:stdout)." + ) + parser.add_argument( + "-e", + "--encoding", + type=str, + help="Input encoding to use (default:utf-8 for " + "files; detected server encoding for Web URLs).", + ) + parser.add_argument( + "-i", + "--display-image-captions", + action="store_true", + default=False, + help="Display image captions (default:false).", + ) + parser.add_argument( + "-d", + "--deduplicate-image-captions", + action="store_true", + default=False, + help="Deduplicate image captions (default:false).", + ) + parser.add_argument( + "-l", + "--display-link-targets", + action="store_true", + default=False, + help="Display link targets (default:false).", + ) + parser.add_argument( + "-a", + "--display-anchor-urls", + action="store_true", + default=False, + help="Display anchor URLs (default:false).", + ) + parser.add_argument( + "-r", + "--annotation-rules", + default=None, + help="Path to an optional JSON file containing rules " + "for annotating the retrieved text.", + ) + parser.add_argument( + "-p", + 
"--postprocessor", + type=get_postprocessor, + default=lambda x: x, + help="Optional component for postprocessing the " + "result (html, surface, xml). ", + ) + parser.add_argument( + "--indentation", + default="extended", + help="How to handle indentation (extended or strict;" " default: extended).", + ) + parser.add_argument( + "--table-cell-separator", + default=" ", + help="Separator to use between table cells (default: " "three spaces).", + ) + parser.add_argument( + "--timeout", + default=DEFAULT_TIMEOUT, + help="Request timeout in seconds (default: " f"{DEFAULT_TIMEOUT}).", + ) + parser.add_argument( + "-v", + "--version", + action="store_true", + default=False, + help="display version information", + ) return parser -if __name__ == '__main__': +if __name__ == "__main__": parser = get_parser() args = parser.parse_args() if args.version: - print('Inscript HTML to text conversion (based on the inscriptis ' - 'library version {0})'.format(__version__)) - print('Copyright (C)', __copyright__) - print('\nInscript comes with ABSOLUTELY NO WARRANTY.') - print('This is free software and you are welcome to redistribute it ' - 'under the terms of the {0}.'.format(__license__)) + print( + "Inscript HTML to text conversion (based on the inscriptis " + "library version {0})".format(__version__) + ) + print("Copyright (C)", __copyright__) + print("\nInscript comes with ABSOLUTELY NO WARRANTY.") + print( + "This is free software and you are welcome to redistribute it " + "under the terms of the {0}.".format(__license__) + ) sys.exit(0) if not args.input: html_content = sys.stdin.read() elif Path(args.input).is_file(): - with Path(args.input).open(encoding=args.encoding or DEFAULT_ENCODING, - errors='ignore') as f: + with Path(args.input).open( + encoding=args.encoding or DEFAULT_ENCODING, errors="ignore" + ) as f: html_content = f.read() - elif args.input.startswith('http://') or args.input.startswith('https://'): + elif args.input.startswith("http://") or 
args.input.startswith("https://"): req = requests.get(args.input, timeout=args.timeout) html_content = req.content.decode(args.encoding or req.encoding) else: @@ -111,33 +158,38 @@ def get_parser(): with Path(args.annotation_rules).open() as f: annotation_rules = load(f) except IOError: - print("ERROR: Cannot open annotation rule file '{0}'.".format( - args.annotation_rules - )) + print( + "ERROR: Cannot open annotation rule file '{0}'.".format( + args.annotation_rules + ) + ) sys.exit(-1) else: annotation_rules = None - css_profile = CSS_PROFILES['relaxed'] if args.indentation == 'extended' \ - else CSS_PROFILES['strict'] - config = ParserConfig(css=css_profile, - display_images=args.display_image_captions, - deduplicate_captions=args.deduplicate_image_captions, - display_links=args.display_link_targets, - display_anchors=args.display_anchor_urls, - annotation_rules=annotation_rules, - table_cell_separator=args.table_cell_separator) + css_profile = ( + CSS_PROFILES["relaxed"] + if args.indentation == "extended" + else CSS_PROFILES["strict"] + ) + config = ParserConfig( + css=css_profile, + display_images=args.display_image_captions, + deduplicate_captions=args.deduplicate_image_captions, + display_links=args.display_link_targets, + display_anchors=args.display_anchor_urls, + annotation_rules=annotation_rules, + table_cell_separator=args.table_cell_separator, + ) if not annotation_rules: output = get_text(html_content, config) else: - output = args.postprocessor( - get_annotated_text(html_content, config)) - if hasattr(args.postprocessor, 'verbatim') \ - and not args.postprocessor.verbatim: + output = args.postprocessor(get_annotated_text(html_content, config)) + if hasattr(args.postprocessor, "verbatim") and not args.postprocessor.verbatim: output = dumps(output) if args.output: - with Path(args.output).open('w', encoding=DEFAULT_ENCODING) as f: + with Path(args.output).open("w", encoding=DEFAULT_ENCODING) as f: f.write(output) else: print(output) diff --git 
a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 2eb150e..2ca6414 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -68,7 +68,7 @@ from inscriptis.model.config import ParserConfig from inscriptis.html_engine import Inscriptis -RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>') +RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>") def _get_html_tree(html_content: str) -> Optional[HtmlElement]: @@ -85,13 +85,13 @@ def _get_html_tree(html_content: str) -> Optional[HtmlElement]: return None # strip XML declaration, if necessary - if html_content.startswith('' + html_content + '') + return fromstring("
" + html_content + "
") def get_text(html_content: str, config: ParserConfig = None) -> str: @@ -105,12 +105,12 @@ def get_text(html_content: str, config: ParserConfig = None) -> str: The text representation of the HTML content. """ html_tree = _get_html_tree(html_content) - return Inscriptis(html_tree, config).get_text() if html_tree is not None \ - else '' + return Inscriptis(html_tree, config).get_text() if html_tree is not None else "" -def get_annotated_text(html_content: str, - config: ParserConfig = None) -> Dict[str, Any]: +def get_annotated_text( + html_content: str, config: ParserConfig = None +) -> Dict[str, Any]: """Return a dictionary of the extracted text and annotations. Notes: @@ -132,7 +132,5 @@ def get_annotated_text(html_content: str, return {} inscriptis = Inscriptis(html_tree, config) - labels = [(a.start, a.end, a.metadata) - for a in inscriptis.get_annotations()] - return {'text': inscriptis.get_text(), - 'label': labels} + labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()] + return {"text": inscriptis.get_text(), "label": labels} diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py index 3d2b626..acf3d09 100644 --- a/src/inscriptis/annotation/__init__.py +++ b/src/inscriptis/annotation/__init__.py @@ -29,9 +29,13 @@ class Annotation(NamedTuple): """a tuple of tags to be attached to the annotation.""" -def horizontal_shift(annotations: List[Annotation], content_width: int, - line_width: int, align: HorizontalAlignment, - shift: int = 0) -> List[Annotation]: +def horizontal_shift( + annotations: List[Annotation], + content_width: int, + line_width: int, + align: HorizontalAlignment, + shift: int = 0, +) -> List[Annotation]: r"""Shift annotations based on the given line's formatting. 
Adjusts the start and end indices of annotations based on the line's @@ -56,5 +60,6 @@ def horizontal_shift(annotations: List[Annotation], content_width: int, else: h_align = shift + (line_width - content_width) // 2 - return [Annotation(a.start + h_align, a.end + h_align, a.metadata) - for a in annotations] + return [ + Annotation(a.start + h_align, a.end + h_align, a.metadata) for a in annotations + ] diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py index 2ea498b..f7da4a8 100644 --- a/src/inscriptis/annotation/output/html.py +++ b/src/inscriptis/annotation/output/html.py @@ -5,8 +5,7 @@ from inscriptis.annotation.output import AnnotationProcessor -COLOR_SCHEMA = ('#D8115980', '#8F2D5680', '#21838080', - '#FBB13C80', '#73D2DE80') +COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80") class HtmlExtractor(AnnotationProcessor): @@ -21,39 +20,43 @@ class HtmlExtractor(AnnotationProcessor): def __call__(self, annotated_text: Dict[str, Any]) -> str: tag_indices = defaultdict(list) - for start, end, label in sorted(annotated_text['label']): + for start, end, label in sorted(annotated_text["label"]): tag_indices[start].append(label) - tag_indices[end].append('/' + label) + tag_indices[end].append("/" + label) open_tags = [] - tagged_content = ['
']
-        for idx, ch in enumerate(annotated_text['text']):
+        tagged_content = [
+            "
",
+        ]
+        for idx, ch in enumerate(annotated_text["text"]):
             if idx in tag_indices:
                 tags = tag_indices[idx]
                 # close tags:
-                for _ in (t for t in sorted(tags, reverse=True)
-                          if t.startswith('/')):
+                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
                     open_tags.pop()
-                    tagged_content.append('')
+                    tagged_content.append("")
                 # open tags
-                for tag in (t for t in sorted(tags, reverse=True)
-                            if not t.startswith('/')):
+                for tag in (
+                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
+                ):
                     open_tags.append(tag)
                     tagged_content.append(
                         '{tag}'
-                        ''.format(tag=tag))
-
-            if ch == '\n':
-                tagged_content.extend(['' for _ in open_tags])
-                tagged_content.append('
\n
')
-                tagged_content.extend([''.format(tag=tag)
-                                       for tag in open_tags])
+                        ''.format(tag=tag)
+                    )
+
+            if ch == "\n":
+                tagged_content.extend(["" for _ in open_tags])
+                tagged_content.append("
\n
")
+                tagged_content.extend(
+                    [''.format(tag=tag) for tag in open_tags]
+                )
             else:
                 tagged_content.append(ch)
 
-        return ''.join(tagged_content) + '
' + return "".join(tagged_content) + "
" @staticmethod def _get_label_colors(labels: List[str]) -> Dict[str, str]: @@ -84,18 +87,18 @@ def _get_css(self, labels: List[str]) -> str: css = [] for label, color in sorted(self._get_label_colors(labels).items()): css.append( - 'pre{{' - ' position: relative;\n' - '}}\n' - '.{label} {{\n' - ' background-color: {color};\n' - ' border-radius: 0.4em;\n' - '}}\n' - '.{label}-label {{\n' - ' top: -1.0em;\n' + "pre{{" + " position: relative;\n" + "}}\n" + ".{label} {{\n" + " background-color: {color};\n" + " border-radius: 0.4em;\n" + "}}\n" + ".{label}-label {{\n" + " top: -1.0em;\n" ' content: "{label}";\n' - ' position: absolute;\n' - ' background-color: {color};\n' - ' font-size: 75%; }}\n'.format(label=label, - color=color)) - return '\n'.join(css) + " position: absolute;\n" + " background-color: {color};\n" + " font-size: 75%; }}\n".format(label=label, color=color) + ) + return "\n".join(css) diff --git a/src/inscriptis/annotation/output/surface.py b/src/inscriptis/annotation/output/surface.py index 52472d4..e4e5252 100644 --- a/src/inscriptis/annotation/output/surface.py +++ b/src/inscriptis/annotation/output/surface.py @@ -21,7 +21,9 @@ def __call__(self, annotated_text: Dict[str, Any]) -> Dict[str, Any]: An extended dictionary which contains the extracted surface-forms of the annotations under the key 'surface'. 
""" - surface_forms = [(label, annotated_text['text'][start:end]) - for start, end, label in annotated_text['label']] - annotated_text['surface'] = surface_forms + surface_forms = [ + (label, annotated_text["text"][start:end]) + for start, end, label in annotated_text["label"] + ] + annotated_text["surface"] = surface_forms return annotated_text diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py index 9c983f7..c31aa06 100644 --- a/src/inscriptis/annotation/output/xml.py +++ b/src/inscriptis/annotation/output/xml.py @@ -22,23 +22,28 @@ def __call__(self, annotated_text: Dict[str, Any]) -> str: """ tag_indices = defaultdict(list) - for start, end, label in sorted(annotated_text['label']): + for start, end, label in sorted(annotated_text["label"]): tag_indices[start].append(label) - tag_indices[end].append('/' + label) + tag_indices[end].append("/" + label) current_idx = 0 tagged_content = ['\n'] - text = annotated_text['text'] + text = annotated_text["text"] for index, tags in sorted(tag_indices.items()): tagged_content.append(text[current_idx:index]) # close tags - tagged_content.extend(['<' + tag + '>' - for tag in sorted(tags, reverse=True) - if tag.startswith('/')]) + tagged_content.extend( + [ + "<" + tag + ">" + for tag in sorted(tags, reverse=True) + if tag.startswith("/") + ] + ) # open tags - tagged_content.extend(['<' + tag + '>' for tag in sorted(tags) - if not tag.startswith('/')]) + tagged_content.extend( + ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")] + ) current_idx = index tagged_content.append(text[current_idx:]) - return ''.join(tagged_content) + return "".join(tagged_content) diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py index 500df4f..56bdf61 100644 --- a/src/inscriptis/annotation/parser.py +++ b/src/inscriptis/annotation/parser.py @@ -34,10 +34,15 @@ class ApplyAnnotation: match_value. 
""" - __slots__ = ('annotations', 'match_tag', 'match_value', 'attr', 'matcher') - - def __init__(self, annotations: tuple, attr: str, match_tag: str = None, - match_value: str = None): + __slots__ = ("annotations", "match_tag", "match_value", "attr", "matcher") + + def __init__( + self, + annotations: tuple, + attr: str, + match_tag: str = None, + match_value: str = None, + ): self.annotations = tuple(annotations) self.attr = attr self.match_tag = match_tag @@ -46,17 +51,18 @@ def __init__(self, annotations: tuple, attr: str, match_tag: str = None, def apply(self, attr_value: str, html_element: HtmlElement): """Apply the annotation to HtmlElements with matching tags.""" if (self.match_tag and self.match_tag != html_element.tag) or ( - self.match_value and self.match_value - not in attr_value.split()): + self.match_value and self.match_value not in attr_value.split() + ): return html_element.annotation += self.annotations def __str__(self): - return ' 'AnnotationModel': + def _parse(model: dict) -> "AnnotationModel": """Compute the AnnotationModel from a model dictionary. 
Returns: @@ -88,14 +94,13 @@ def _parse(model: dict) -> 'AnnotationModel': tags = defaultdict(list) attrs = [] for key, annotations in model.items(): - if '#' in key: - tag, attr = key.split('#') - if '=' in attr: - attr, value = attr.split('=') + if "#" in key: + tag, attr = key.split("#") + if "=" in attr: + attr, value = attr.split("=") else: value = None - attrs.append(ApplyAnnotation(annotations, attr, - tag, value)) + attrs.append(ApplyAnnotation(annotations, attr, tag, value)) else: tags[key].extend(annotations) return tags, attrs diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 3d08c45..51889b3 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -12,73 +12,53 @@ from inscriptis.html_properties import Display, WhiteSpace STRICT_CSS_PROFILE = { - 'body': HtmlElement(display=Display.inline, - whitespace=WhiteSpace.normal), - 'head': HtmlElement(display=Display.none), - 'link': HtmlElement(display=Display.none), - 'meta': HtmlElement(display=Display.none), - 'script': HtmlElement(display=Display.none), - 'title': HtmlElement(display=Display.none), - 'style': HtmlElement(display=Display.none), - - 'p': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'figure': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - - 'h1': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'h2': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'h3': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'h4': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'h5': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - 'h6': HtmlElement(display=Display.block, margin_before=1, - margin_after=1), - - 'ul': HtmlElement(display=Display.block, margin_before=0, - margin_after=0, padding_inline=4), - 'ol': HtmlElement(display=Display.block, margin_before=0, - margin_after=0, 
padding_inline=4), - 'li': HtmlElement(display=Display.block), - - 'address': HtmlElement(display=Display.block), - 'article': HtmlElement(display=Display.block), - 'aside': HtmlElement(display=Display.block), - 'div': HtmlElement(display=Display.block), - 'footer': HtmlElement(display=Display.block), - 'header': HtmlElement(display=Display.block), - 'hgroup': HtmlElement(display=Display.block), - 'layer': HtmlElement(display=Display.block), - 'main': HtmlElement(display=Display.block), - 'nav': HtmlElement(display=Display.block), - 'figcaption': HtmlElement(display=Display.block), - - 'blockquote': HtmlElement(display=Display.block), - - 'q': HtmlElement(prefix='"', suffix='"'), - + "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), + "head": HtmlElement(display=Display.none), + "link": HtmlElement(display=Display.none), + "meta": HtmlElement(display=Display.none), + "script": HtmlElement(display=Display.none), + "title": HtmlElement(display=Display.none), + "style": HtmlElement(display=Display.none), + "p": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "figure": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h1": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h2": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h3": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h4": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h5": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "h6": HtmlElement(display=Display.block, margin_before=1, margin_after=1), + "ul": HtmlElement( + display=Display.block, margin_before=0, margin_after=0, padding_inline=4 + ), + "ol": HtmlElement( + display=Display.block, margin_before=0, margin_after=0, padding_inline=4 + ), + "li": HtmlElement(display=Display.block), + "address": HtmlElement(display=Display.block), + "article": 
HtmlElement(display=Display.block), + "aside": HtmlElement(display=Display.block), + "div": HtmlElement(display=Display.block), + "footer": HtmlElement(display=Display.block), + "header": HtmlElement(display=Display.block), + "hgroup": HtmlElement(display=Display.block), + "layer": HtmlElement(display=Display.block), + "main": HtmlElement(display=Display.block), + "nav": HtmlElement(display=Display.block), + "figcaption": HtmlElement(display=Display.block), + "blockquote": HtmlElement(display=Display.block), + "q": HtmlElement(prefix='"', suffix='"'), # Handling of
-    'pre': HtmlElement(display=Display.block,
-                       whitespace=WhiteSpace.pre),
-    'xmp': HtmlElement(display=Display.block,
-                       whitespace=WhiteSpace.pre),
-    'listing': HtmlElement(display=Display.block,
-                           whitespace=WhiteSpace.pre),
-    'plaintext': HtmlElement(display=Display.block,
-                             whitespace=WhiteSpace.pre),
+    "pre": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
+    "xmp": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
+    "listing": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
+    "plaintext": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre),
 }
 
 RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy()
-RELAXED_CSS_PROFILE['div'] = HtmlElement(display=Display.block,
-                                         padding_inline=2)
-RELAXED_CSS_PROFILE['span'] = HtmlElement(display=Display.inline,
-                                          prefix=' ', suffix=' ',
-                                          limit_whitespace_affixes=True)
+RELAXED_CSS_PROFILE["div"] = HtmlElement(display=Display.block, padding_inline=2)
+RELAXED_CSS_PROFILE["span"] = HtmlElement(
+    display=Display.inline, prefix=" ", suffix=" ", limit_whitespace_affixes=True
+)
 
 
-CSS_PROFILES = {'strict': STRICT_CSS_PROFILE,
-                'relaxed': RELAXED_CSS_PROFILE}
+CSS_PROFILES = {"strict": STRICT_CSS_PROFILE, "relaxed": RELAXED_CSS_PROFILE}
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
index e2c7f3e..35496fb 100644
--- a/src/inscriptis/html_engine.py
+++ b/src/inscriptis/html_engine.py
@@ -35,34 +35,33 @@ class Inscriptis:
       text = parser.get_text()
     """
 
-    UL_COUNTER = ('* ', '+ ', 'o ', '- ')
+    UL_COUNTER = ("* ", "+ ", "o ", "- ")
     UL_COUNTER_LEN = len(UL_COUNTER)
 
-    def __init__(self, html_tree: lxml.html.HtmlElement,
-                 config: ParserConfig = None):
+    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
         # use the default configuration, if no config object is provided
         self.config = config or ParserConfig()
 
         # setup start and end tag call tables
         self.start_tag_handler_dict = {
-            'table': self._start_table,
-            'tr': self._start_tr,
-            'td': self._start_td,
-            'th': self._start_td,
-            'ul': self._start_ul,
-            'ol': self._start_ol,
-            'li': self._start_li,
-            'br': self._newline,
-            'a': self._start_a if self.config.parse_a() else None,
-            'img': self._start_img if self.config.display_images else None,
+            "table": self._start_table,
+            "tr": self._start_tr,
+            "td": self._start_td,
+            "th": self._start_td,
+            "ul": self._start_ul,
+            "ol": self._start_ol,
+            "li": self._start_li,
+            "br": self._newline,
+            "a": self._start_a if self.config.parse_a() else None,
+            "img": self._start_img if self.config.display_images else None,
         }
         self.end_tag_handler_dict = {
-            'table': self._end_table,
-            'ul': self._end_ul,
-            'ol': self._end_ol,
-            'td': self._end_td,
-            'th': self._end_td,
-            'a': self._end_a if self.config.parse_a() else None,
+            "table": self._end_table,
+            "ul": self._end_ul,
+            "ol": self._end_ol,
+            "td": self._end_td,
+            "th": self._end_td,
+            "a": self._end_a if self.config.parse_a() else None,
         }
 
         # instance variables
@@ -70,13 +69,13 @@ def __init__(self, html_tree: lxml.html.HtmlElement,
         self.css = self.config.css
         self.apply_attributes = self.config.attribute_handler.apply_attributes
 
-        self.tags = [self.css['body'].set_canvas(self.canvas)]
+        self.tags = [self.css["body"].set_canvas(self.canvas)]
         self.current_table = []
         self.li_counter = []
         self.last_caption = None
 
         # used if display_links is enabled
-        self.link_target = ''
+        self.link_target = ""
 
         # crawl the html tree
         self._parse_html_tree(html_tree)
@@ -133,8 +132,13 @@ def handle_starttag(self, tag, attrs):
         """
         # use the css to handle tags known to it :)
         cur = self.tags[-1].get_refined_html_element(
-            self.apply_attributes(attrs, html_element=self.css.get(
-                tag, DEFAULT_HTML_ELEMENT).__copy__().set_tag(tag)))
+            self.apply_attributes(
+                attrs,
+                html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT)
+                .__copy__()
+                .set_tag(tag),
+            )
+        )
         self.tags.append(cur)
 
         handler = self.start_tag_handler_dict.get(tag)
@@ -161,25 +165,26 @@ def _end_ul(self):
         self.li_counter.pop()
 
     def _start_img(self, attrs):
-        image_text = attrs.get('alt', '') or attrs.get('title', '')
-        if image_text and not (self.config.deduplicate_captions
-                               and image_text == self.last_caption):
-            self.tags[-1].write(f'[{image_text}]')
+        image_text = attrs.get("alt", "") or attrs.get("title", "")
+        if image_text and not (
+            self.config.deduplicate_captions and image_text == self.last_caption
+        ):
+            self.tags[-1].write(f"[{image_text}]")
             self.last_caption = image_text
 
     def _start_a(self, attrs):
-        self.link_target = ''
+        self.link_target = ""
         if self.config.display_links:
-            self.link_target = attrs.get('href', '')
+            self.link_target = attrs.get("href", "")
         if self.config.display_anchors:
-            self.link_target = self.link_target or attrs.get('name', '')
+            self.link_target = self.link_target or attrs.get("name", "")
 
         if self.link_target:
-            self.tags[-1].write('[')
+            self.tags[-1].write("[")
 
     def _end_a(self):
         if self.link_target:
-            self.tags[-1].write(f']({self.link_target})')
+            self.tags[-1].write(f"]({self.link_target})")
 
     def _start_ol(self, _):
         self.li_counter.append(1)
@@ -188,20 +193,23 @@ def _end_ol(self):
         self.li_counter.pop()
 
     def _start_li(self, _):
-        bullet = self.li_counter[-1] if self.li_counter else '* '
+        bullet = self.li_counter[-1] if self.li_counter else "* "
         if isinstance(bullet, int):
             self.li_counter[-1] += 1
-            self.tags[-1].list_bullet = f'{bullet}. '
+            self.tags[-1].list_bullet = f"{bullet}. "
         else:
             self.tags[-1].list_bullet = bullet
 
-        self.tags[-1].write('')
+        self.tags[-1].write("")
 
     def _start_table(self, _):
         self.tags[-1].set_canvas(Canvas())
-        self.current_table.append(Table(
-            left_margin_len=self.tags[-1].canvas.left_margin,
-            cell_separator=self.config.table_cell_separator))
+        self.current_table.append(
+            Table(
+                left_margin_len=self.tags[-1].canvas.left_margin,
+                cell_separator=self.config.table_cell_separator,
+            )
+        )
 
     def _start_tr(self, _):
         if self.current_table:
@@ -210,8 +218,9 @@ def _start_tr(self, _):
     def _start_td(self, _):
         if self.current_table:
             # open td tag
-            table_cell = TableCell(align=self.tags[-1].align,
-                                   valign=self.tags[-1].valign)
+            table_cell = TableCell(
+                align=self.tags[-1].align, valign=self.tags[-1].valign
+            )
             self.tags[-1].canvas = table_cell
             self.current_table[-1].add_cell(table_cell)
 
@@ -239,17 +248,18 @@ def _end_table(self):
         if self.tags[-1].annotation:
             end_idx = self.tags[-2].canvas.current_block.idx
             for a in self.tags[-1].annotation:
-                self.tags[-2].canvas.annotations.append(Annotation(
-                    start_idx, end_idx, a))
+                self.tags[-2].canvas.annotations.append(
+                    Annotation(start_idx, end_idx, a)
+                )
 
         # transfer in-table annotations
         self.tags[-2].canvas.annotations.extend(
-            table.get_annotations(start_idx, self.tags[-2].canvas.left_margin))
+            table.get_annotations(start_idx, self.tags[-2].canvas.left_margin)
+        )
 
     def _newline(self, _):
         self.tags[-1].canvas.write_newline()
 
     def get_bullet(self) -> str:
         """Return the bullet that correspond to the given index."""
-        return Inscriptis.UL_COUNTER[
-            len(self.li_counter) % Inscriptis.UL_COUNTER_LEN]
+        return Inscriptis.UL_COUNTER[len(self.li_counter) % Inscriptis.UL_COUNTER_LEN]
diff --git a/src/inscriptis/html_properties.py b/src/inscriptis/html_properties.py
index b1d24ea..4dc9dea 100644
--- a/src/inscriptis/html_properties.py
+++ b/src/inscriptis/html_properties.py
@@ -39,11 +39,11 @@ class WhiteSpace(Enum):
 class HorizontalAlignment(Enum):
     """Specify the content's horizontal alignment."""
 
-    left = '<'
+    left = "<"
     """Left alignment of the block's content."""
-    right = '>'
+    right = ">"
     """Right alignment of the block's content."""
-    center = '^'
+    center = "^"
     """Center the block's content."""
 
 
diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py
index 2b2a771..f7112f0 100644
--- a/src/inscriptis/metadata.py
+++ b/src/inscriptis/metadata.py
@@ -1,7 +1,7 @@
 """Inscriptis metadata information."""
 
-__author__ = 'Albert Weichselbraun, Fabian Odoni'
-__author_email__ = 'albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch'
-__copyright__ = '2016-2024 Albert Weichselbraun, Fabian Odoni'
-__license__ = 'Apache 2.0'
-__version__ = '2.3.3'
+__author__ = "Albert Weichselbraun, Fabian Odoni"
+__author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch"
+__copyright__ = "2016-2024 Albert Weichselbraun, Fabian Odoni"
+__license__ = "Apache 2.0"
+__version__ = "2.3.3"
diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py
index 0102e3f..f4f8efc 100644
--- a/src/inscriptis/model/attribute.py
+++ b/src/inscriptis/model/attribute.py
@@ -10,9 +10,9 @@
 from inscriptis.model.html_element import HtmlElement
 
 DEFAULT_ATTRIBUTE_MAP = {
-    'style': CssParse.attr_style,
-    'align': CssParse.attr_horizontal_align,
-    'valign': CssParse.attr_vertical_align
+    "style": CssParse.attr_style,
+    "align": CssParse.attr_horizontal_align,
+    "valign": CssParse.attr_vertical_align,
 }
 
 
@@ -26,9 +26,11 @@ def merge_function(func1, func2):
         func1: the first function
         func2: the second function
     """
+
     def merged(*args):
         func1(*args)
         func2(*args)
+
     return merged
 
 
@@ -46,16 +48,20 @@ class Attribute:
     def __init__(self):
         self.attribute_mapping = DEFAULT_ATTRIBUTE_MAP
 
-    def apply_attributes(self, attributes: Dict[str, str],
-                         html_element: HtmlElement) -> HtmlElement:
+    def apply_attributes(
+        self, attributes: Dict[str, str], html_element: HtmlElement
+    ) -> HtmlElement:
         """Apply the attributes to the given HTML element.
 
         Args:
             attributes: the list of attributes
             html_element: the HTML element for which the attributes are parsed
         """
-        supported_attributes = ((name, val) for name, val in attributes.items()
-                                if name in self.attribute_mapping)
+        supported_attributes = (
+            (name, val)
+            for name, val in attributes.items()
+            if name in self.attribute_mapping
+        )
         for attr_name, attr_value in supported_attributes:
             self.attribute_mapping[attr_name](attr_value, html_element)
         return html_element
@@ -63,6 +69,9 @@ def apply_attributes(self, attributes: Dict[str, str],
     def merge_attribute_map(self, annotations: List[ApplyAnnotation] = None):
         attributes = copy(self.attribute_mapping)
         for a in annotations:
-            attributes[a.attr] = a.apply if a.attr not in attributes \
+            attributes[a.attr] = (
+                a.apply
+                if a.attr not in attributes
                 else merge_function(attributes[a.attr], a.apply)
+            )
         self.attribute_mapping = attributes
diff --git a/src/inscriptis/model/canvas/__init__.py b/src/inscriptis/model/canvas/__init__.py
index ef41254..7cf5ca4 100644
--- a/src/inscriptis/model/canvas/__init__.py
+++ b/src/inscriptis/model/canvas/__init__.py
@@ -37,8 +37,13 @@ class Canvas:
         _open_annotations: a map of open tags that contain annotations.
     """
 
-    __slots__ = ('annotations', 'blocks', 'current_block', '_open_annotations',
-                 'margin')
+    __slots__ = (
+        "annotations",
+        "blocks",
+        "current_block",
+        "_open_annotations",
+        "margin",
+    )
 
     def __init__(self):
         self.margin = 1000  # margin to the previous block
@@ -64,15 +69,14 @@ def open_block(self, tag: HtmlElement):
         # write missing bullets, if no content has been written
         if not self._flush_inline() and tag.list_bullet:
             self.write_unconsumed_bullet()
-        self.current_block.prefix.register_prefix(tag.padding_inline,
-                                                  tag.list_bullet)
+        self.current_block.prefix.register_prefix(tag.padding_inline, tag.list_bullet)
 
         # write the block margin
         required_margin = max(tag.previous_margin_after, tag.margin_before)
         if required_margin > self.margin:
             required_newlines = required_margin - self.margin
             self.current_block.idx += required_newlines
-            self.blocks.append('\n' * (required_newlines - 1))
+            self.blocks.append("\n" * (required_newlines - 1))
             self.margin = required_margin
 
     def write_unconsumed_bullet(self):
@@ -84,8 +88,7 @@ def write_unconsumed_bullet(self):
             self.current_block = self.current_block.new_block()
             self.margin = 0
 
-    def write(self, tag: HtmlElement, text: str,
-              whitespace: WhiteSpace = None) -> None:
+    def write(self, tag: HtmlElement, text: str, whitespace: WhiteSpace = None) -> None:
         """Write the given text to the current block."""
         self.current_block.merge(text, whitespace or tag.whitespace)
 
@@ -110,7 +113,8 @@ def close_tag(self, tag: HtmlElement) -> None:
 
             for annotation in tag.annotation:
                 self.annotations.append(
-                    Annotation(start_idx, self.current_block.idx, annotation))
+                    Annotation(start_idx, self.current_block.idx, annotation)
+                )
 
     def close_block(self, tag: HtmlElement):
         """Close the given HtmlElement by writing its bottom margin.
@@ -121,18 +125,18 @@ def close_block(self, tag: HtmlElement):
         if tag.margin_after > self.margin:
             required_newlines = tag.margin_after - self.margin
             self.current_block.idx += required_newlines
-            self.blocks.append('\n' * (required_newlines - 1))
+            self.blocks.append("\n" * (required_newlines - 1))
             self.margin = tag.margin_after
 
     def write_newline(self):
         if not self._flush_inline():
-            self.blocks.append('')
+            self.blocks.append("")
             self.current_block = self.current_block.new_block()
 
     def get_text(self) -> str:
         """Provide a text representation of the Canvas."""
         self._flush_inline()
-        return '\n'.join(self.blocks)
+        return "\n".join(self.blocks)
 
     def _flush_inline(self) -> bool:
         """Attempt to flush the content in self.current_block into a new block.
diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py
index 23c6906..59ba05f 100644
--- a/src/inscriptis/model/canvas/block.py
+++ b/src/inscriptis/model/canvas/block.py
@@ -17,12 +17,12 @@ class Block:
         prefix: prefix used within the current block.
     """
 
-    __slots__ = ('idx', 'prefix', '_content', 'collapsable_whitespace')
+    __slots__ = ("idx", "prefix", "_content", "collapsable_whitespace")
 
     def __init__(self, idx: int, prefix: str):
         self.idx = idx
         self.prefix = prefix
-        self._content = ''
+        self._content = ""
         self.collapsable_whitespace = True
 
     def merge(self, text: str, whitespace: WhiteSpace) -> None:
@@ -50,12 +50,15 @@ def merge_normal_text(self, text: str) -> None:
                 normalized_text.append(ch)
                 self.collapsable_whitespace = False
             elif not self.collapsable_whitespace:
-                normalized_text.append(' ')
+                normalized_text.append(" ")
                 self.collapsable_whitespace = True
 
         if normalized_text:
-            text = ''.join((self.prefix.first, *normalized_text)) if not \
-                self._content else ''.join(normalized_text)
+            text = (
+                "".join((self.prefix.first, *normalized_text))
+                if not self._content
+                else "".join(normalized_text)
+            )
             text = unescape(text)
             self._content += text
             self.idx += len(text)
@@ -66,8 +69,7 @@ def merge_pre_text(self, text: str) -> None:
         Args:
             text: the text to merge
         """
-        text = ''.join((self.prefix.first,
-                        text.replace('\n', '\n' + self.prefix.rest)))
+        text = "".join((self.prefix.first, text.replace("\n", "\n" + self.prefix.rest)))
         text = unescape(text)
         self._content += text
         self.idx += len(text)
@@ -81,12 +83,12 @@ def content(self):
         if not self.collapsable_whitespace:
             return self._content
 
-        if self._content.endswith(' '):
+        if self._content.endswith(" "):
             self._content = self._content[:-1]
             self.idx -= 1
         return self._content
 
-    def new_block(self) -> 'Block':
+    def new_block(self) -> "Block":
         """Return a new Block based on the current one."""
         self.prefix.consumed = False
         return Block(idx=self.idx + 1, prefix=self.prefix)
diff --git a/src/inscriptis/model/canvas/prefix.py b/src/inscriptis/model/canvas/prefix.py
index ca0b768..8a68066 100644
--- a/src/inscriptis/model/canvas/prefix.py
+++ b/src/inscriptis/model/canvas/prefix.py
@@ -14,7 +14,7 @@ class Prefix:
         consumed: whether the current bullet has already been consumed.
     """
 
-    __slots__ = ('current_padding', 'paddings', 'bullets', 'consumed')
+    __slots__ = ("current_padding", "paddings", "bullets", "consumed")
 
     def __init__(self):
         self.current_padding = 0
@@ -31,7 +31,7 @@ def register_prefix(self, padding_inline, bullet):
         """
         self.current_padding += padding_inline
         self.paddings.append(padding_inline)
-        self.bullets.append(bullet if bullet else '')
+        self.bullets.append(bullet if bullet else "")
 
     def remove_last_prefix(self):
         """Remove the last prefix from the list."""
@@ -41,15 +41,15 @@ def remove_last_prefix(self):
 
     def pop_next_bullet(self):
         """Pop the next bullet to use, if any bullet is available."""
-        next_bullet_idx = next((-idx for idx, val
-                                in enumerate(reversed(self.bullets))
-                                if val), 1) - 1
+        next_bullet_idx = (
+            next((-idx for idx, val in enumerate(reversed(self.bullets)) if val), 1) - 1
+        )
 
         if not next_bullet_idx:
-            return ''
+            return ""
 
         bullet = self.bullets[next_bullet_idx]
-        self.bullets[next_bullet_idx] = ''
+        self.bullets[next_bullet_idx] = ""
         return bullet
 
     @property
@@ -62,12 +62,11 @@ def first(self):
             further prefixes should be used for a line.
         """
         if self.consumed:
-            return ''
+            return ""
 
         self.consumed = True
         bullet = self.pop_next_bullet()
-        return ' ' * (self.current_padding - len(bullet)) \
-               + bullet
+        return " " * (self.current_padding - len(bullet)) + bullet
 
     @property
     def unconsumed_bullet(self):
@@ -78,15 +77,14 @@ def unconsumed_bullet(self):
             not been consumed yet.
         """
         if self.consumed:
-            return ''
+            return ""
 
         bullet = self.pop_next_bullet()
         if not bullet:
-            return ''
+            return ""
 
         padding = self.current_padding - self.paddings[-1]
-        return ' ' * (padding - len(bullet)) \
-               + bullet
+        return " " * (padding - len(bullet)) + bullet
 
     @property
     def rest(self):
@@ -96,4 +94,4 @@ def rest(self):
         need to be prefixed with the right padding to preserver the
         indentation.
         """
-        return ' ' * self.current_padding
+        return " " * self.current_padding
diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py
index 9bc216d..0aaeb7a 100644
--- a/src/inscriptis/model/config.py
+++ b/src/inscriptis/model/config.py
@@ -9,19 +9,22 @@
 from inscriptis.model.attribute import Attribute
 from inscriptis.model.html_element import HtmlElement
 
-DEFAULT_CSS_PROFILE_NAME = 'relaxed'
+DEFAULT_CSS_PROFILE_NAME = "relaxed"
 
 
 class ParserConfig:
     """Encapsulate configuration options and CSS definitions."""
 
-    def __init__(self, css: Dict[str, HtmlElement] = None,
-                 display_images: bool = False,
-                 deduplicate_captions: bool = False,
-                 display_links: bool = False,
-                 display_anchors: bool = False,
-                 annotation_rules: Attribute = None,
-                 table_cell_separator: str = '  '):
+    def __init__(
+        self,
+        css: Dict[str, HtmlElement] = None,
+        display_images: bool = False,
+        deduplicate_captions: bool = False,
+        display_links: bool = False,
+        display_anchors: bool = False,
+        annotation_rules: Attribute = None,
+        table_cell_separator: str = "  ",
+    ):
         """Create a ParserConfig configuration.
 
         Args:
@@ -47,13 +50,11 @@ def __init__(self, css: Dict[str, HtmlElement] = None,
         if annotation_rules:
             # ensure that we do not modify the original model or its
             # members.
-            annotation_model = AnnotationModel(deepcopy(self.css),
-                                               annotation_rules)
+            annotation_model = AnnotationModel(deepcopy(self.css), annotation_rules)
             # css with annotation support
             self.css = annotation_model.css
             # attribute handler with annotation support
-            self.attribute_handler.merge_attribute_map(
-                annotation_model.css_attr)
+            self.attribute_handler.merge_attribute_map(annotation_model.css_attr)
 
     def parse_a(self) -> bool:
         """Indicate whether the text output should contain links or anchors.
diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py
index 1610055..d9efa44 100644
--- a/src/inscriptis/model/css.py
+++ b/src/inscriptis/model/css.py
@@ -7,8 +7,12 @@
 """
 from contextlib import suppress
 from re import compile as re_compile
-from inscriptis.html_properties import (Display, WhiteSpace,
-                                        HorizontalAlignment, VerticalAlignment)
+from inscriptis.html_properties import (
+    Display,
+    WhiteSpace,
+    HorizontalAlignment,
+    VerticalAlignment,
+)
 from inscriptis.model.html_element import HtmlElement
 
 
@@ -20,7 +24,7 @@ class CssParse:
     """
 
     # used to separate value and unit from each other
-    RE_UNIT = re_compile(r'(-?[0-9.]+)(\w+)')
+    RE_UNIT = re_compile(r"(-?[0-9.]+)(\w+)")
 
     @staticmethod
     def attr_style(style_attribute: str, html_element: HtmlElement):
@@ -31,15 +35,15 @@ def attr_style(style_attribute: str, html_element: HtmlElement):
                            Example: display: none
           html_element: The HtmlElement to which the given style is applied.
         """
-        for style_directive in style_attribute.lower().split(';'):
-            if ':' not in style_directive:
+        for style_directive in style_attribute.lower().split(";"):
+            if ":" not in style_directive:
                 continue
-            key, value = (s.strip() for s in style_directive.split(':', 1))
+            key, value = (s.strip() for s in style_directive.split(":", 1))
 
             try:
-                apply_style = getattr(CssParse, 'attr_'
-                                      + key.replace('-webkit-', '')
-                                      .replace('-', '_'))
+                apply_style = getattr(
+                    CssParse, "attr_" + key.replace("-webkit-", "").replace("-", "_")
+                )
                 apply_style(value, html_element)
             except AttributeError:
                 pass
@@ -61,7 +65,7 @@ def _get_em(length: str) -> int:
         value = float(_m.group(1))
         unit = _m.group(2)
 
-        if unit not in ('em', 'qem', 'rem'):
+        if unit not in ("em", "qem", "rem"):
             return int(round(value / 8))
         return int(round(value))
 
@@ -75,9 +79,9 @@ def attr_display(value: str, html_element: HtmlElement):
         if html_element.display == Display.none:
             return
 
-        if value == 'block':
+        if value == "block":
             html_element.display = Display.block
-        elif value == 'none':
+        elif value == "none":
             html_element.display = Display.none
         else:
             html_element.display = Display.inline
@@ -85,9 +89,9 @@ def attr_display(value: str, html_element: HtmlElement):
     @staticmethod
     def attr_white_space(value: str, html_element: HtmlElement):
         """Apply the given white-space value."""
-        if value in ('normal', 'nowrap'):
+        if value in ("normal", "nowrap"):
             html_element.whitespace = WhiteSpace.normal
-        elif value in ('pre', 'pre-line', 'pre-wrap'):
+        elif value in ("pre", "pre-line", "pre-wrap"):
             html_element.whitespace = WhiteSpace.pre
 
     @staticmethod
diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py
index 3ea95fe..5d16e9d 100644
--- a/src/inscriptis/model/html_element.py
+++ b/src/inscriptis/model/html_element.py
@@ -1,8 +1,12 @@
 """Data structures for handling HTML Elements."""
 from typing import Tuple
 
-from inscriptis.html_properties import Display, HorizontalAlignment, \
-    VerticalAlignment, WhiteSpace
+from inscriptis.html_properties import (
+    Display,
+    HorizontalAlignment,
+    VerticalAlignment,
+    WhiteSpace,
+)
 
 
 class HtmlElement:
@@ -28,22 +32,40 @@ class HtmlElement:
     - annotation: annotations associated with the HtmlElement.
     """
 
-    __slots__ = ('canvas', 'tag', 'prefix', 'suffix', 'display',
-                 'margin_before', 'margin_after', 'padding_inline',
-                 'list_bullet', 'whitespace', 'limit_whitespace_affixes',
-                 'align', 'valign', 'previous_margin_after', 'annotation')
-
-    def __init__(self, tag='default', prefix='', suffix='',
-                 display: Display = Display.inline,
-                 margin_before: int = 0,
-                 margin_after: int = 0,
-                 padding_inline: int = 0,
-                 list_bullet: str = '',
-                 whitespace: WhiteSpace = None,
-                 limit_whitespace_affixes: bool = False,
-                 align: HorizontalAlignment = HorizontalAlignment.left,
-                 valign: VerticalAlignment = VerticalAlignment.middle,
-                 annotation: Tuple[str] = ()):
+    __slots__ = (
+        "canvas",
+        "tag",
+        "prefix",
+        "suffix",
+        "display",
+        "margin_before",
+        "margin_after",
+        "padding_inline",
+        "list_bullet",
+        "whitespace",
+        "limit_whitespace_affixes",
+        "align",
+        "valign",
+        "previous_margin_after",
+        "annotation",
+    )
+
+    def __init__(
+        self,
+        tag="default",
+        prefix="",
+        suffix="",
+        display: Display = Display.inline,
+        margin_before: int = 0,
+        margin_after: int = 0,
+        padding_inline: int = 0,
+        list_bullet: str = "",
+        whitespace: WhiteSpace = None,
+        limit_whitespace_affixes: bool = False,
+        align: HorizontalAlignment = HorizontalAlignment.left,
+        valign: VerticalAlignment = VerticalAlignment.middle,
+        annotation: Tuple[str] = (),
+    ):
         self.canvas = None
         self.tag = tag
         self.prefix = prefix
@@ -60,7 +82,7 @@ def __init__(self, tag='default', prefix='', suffix='',
         self.previous_margin_after = 0
         self.annotation = annotation
 
-    def __copy__(self) -> 'HtmlElement':
+    def __copy__(self) -> "HtmlElement":
         """Performance-optimized copy implementation."""
         copy = self.__class__.__new__(self.__class__)
         for attr in self.__slots__:
@@ -71,14 +93,13 @@ def write(self, text: str):
         """Write the given HTML text to the element's canvas."""
         if not text or self.display == Display.none:
             return
-        self.canvas.write(self, ''.join(
-            (self.prefix, text, self.suffix)))
+        self.canvas.write(self, "".join((self.prefix, text, self.suffix)))
 
-    def set_canvas(self, canvas) -> 'HtmlElement':
+    def set_canvas(self, canvas) -> "HtmlElement":
         self.canvas = canvas
         return self
 
-    def set_tag(self, tag: str) -> 'HtmlElement':
+    def set_tag(self, tag: str) -> "HtmlElement":
         self.tag = tag
         return self
 
@@ -99,7 +120,7 @@ def write_verbatim_text(self, text: str):
         if self.display == Display.block:
             self.canvas.close_block(self)
 
-    def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement':
+    def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement":
         """Compute the new HTML element based on the previous one.
 
         Adaptations:
@@ -124,12 +145,11 @@ def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement':
 
         # do not display whitespace only affixes in Whitespace.pre areas
         # if `limit_whitespace_affixes` is set.
-        if (new.limit_whitespace_affixes
-                and self.whitespace == WhiteSpace.pre):
+        if new.limit_whitespace_affixes and self.whitespace == WhiteSpace.pre:
             if new.prefix.isspace():
-                new.prefix = ''
+                new.prefix = ""
             if new.suffix.isspace():
-                new.suffix = ''
+                new.suffix = ""
 
         if new.display == Display.block and self.display == Display.block:
             new.previous_margin_after = self.margin_after
@@ -138,13 +158,13 @@ def get_refined_html_element(self, new: 'HtmlElement') -> 'HtmlElement':
 
     def __str__(self):
         return (
-            '<{self.tag} prefix={self.prefix}, suffix={self.suffix}, '
-            'display={self.display}, margin_before={self.margin_before}, '
-            'margin_after={self.margin_after}, '
-            'padding_inline={self.padding_inline}, '
-            'list_bullet={self.list_bullet}, '
-            'whitespace={self.whitespace}, align={self.align}, '
-            'valign={self.valign}, annotation={self.annotation}>'
+            "<{self.tag} prefix={self.prefix}, suffix={self.suffix}, "
+            "display={self.display}, margin_before={self.margin_before}, "
+            "margin_after={self.margin_after}, "
+            "padding_inline={self.padding_inline}, "
+            "list_bullet={self.list_bullet}, "
+            "whitespace={self.whitespace}, align={self.align}, "
+            "valign={self.valign}, annotation={self.annotation}>"
         ).format(self=self)
 
     __repr__ = __str__
diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py
index 559aa79..073c626 100644
--- a/src/inscriptis/model/table.py
+++ b/src/inscriptis/model/table.py
@@ -20,9 +20,19 @@ class TableCell(Canvas):
                           vertical formatting rules.
     """
 
-    __slots__ = ('annotations', 'block_annotations', 'blocks', 'current_block',
-                 'margin', 'annotation_counter', 'align', 'valign', '_width',
-                 'line_width', 'vertical_padding')
+    __slots__ = (
+        "annotations",
+        "block_annotations",
+        "blocks",
+        "current_block",
+        "margin",
+        "annotation_counter",
+        "align",
+        "valign",
+        "_width",
+        "line_width",
+        "vertical_padding",
+    )
 
     def __init__(self, align: HorizontalAlignment, valign: VerticalAlignment):
         super().__init__()
@@ -39,9 +49,9 @@ def normalize_blocks(self) -> int:
             The height of the normalized cell.
         """
         self._flush_inline()
-        self.blocks = list(chain(*(line.split('\n') for line in self.blocks)))
+        self.blocks = list(chain(*(line.split("\n") for line in self.blocks)))
         if not self.blocks:
-            self.blocks = ['']
+            self.blocks = [""]
         return len(self.blocks)
 
     @property
@@ -62,8 +72,9 @@ def width(self):
         """
         if self._width:
             return self._width
-        return max((len(line) for line in chain(*(block.split('\n')
-                                                  for block in self.blocks))))
+        return max(
+            (len(line) for line in chain(*(block.split("\n") for block in self.blocks)))
+        )
 
     @width.setter
     def width(self, width):
@@ -77,8 +88,7 @@ def width(self, width):
 
         # record new width and start reformatting
         self._width = width
-        format_spec = '{{:{align}{width}}}'.format(align=self.align.value,
-                                                   width=width)
+        format_spec = "{{:{align}{width}}}".format(align=self.align.value, width=width)
         self.blocks = [format_spec.format(b) for b in self.blocks]
 
     @height.setter
@@ -91,14 +101,17 @@ def height(self, height: int):
         """
         rows = len(self.blocks)
         if rows < height:
-            empty_line = ['']
+            empty_line = [""]
             if self.valign == VerticalAlignment.bottom:
-                self.vertical_padding = (height - rows)
+                self.vertical_padding = height - rows
                 self.blocks = self.vertical_padding * empty_line + self.blocks
             elif self.valign == VerticalAlignment.middle:
                 self.vertical_padding = (height - rows) // 2
-                self.blocks = self.vertical_padding * empty_line + \
-                    self.blocks + ((height - rows + 1) // 2 * empty_line)
+                self.blocks = (
+                    self.vertical_padding * empty_line
+                    + self.blocks
+                    + ((height - rows + 1) // 2 * empty_line)
+                )
             else:
                 self.blocks = self.blocks + ((height - rows) * empty_line)
 
@@ -116,9 +129,9 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
         # the easy case - the cell has only one line :)
         if len(self.blocks) == 1:
             self.line_width[0] = self.width
-            return horizontal_shift(self.annotations,
-                                    self.line_width[0],
-                                    self.width, self.align, idx)
+            return horizontal_shift(
+                self.annotations, self.line_width[0], self.width, self.align, idx
+            )
 
         # the more challenging one - multiple cell lines
         line_break_pos = list(accumulate(self.line_width))
@@ -127,17 +140,19 @@ def get_annotations(self, idx: int, row_width: int) -> List[Annotation]:
         # assign annotations to the corresponding line
         for a in self.annotations:
             for no, line_break in enumerate(line_break_pos):
-                if a.start <= (line_break + no):         # consider newline
+                if a.start <= (line_break + no):  # consider newline
                     annotation_lines[no + self.vertical_padding].append(a)
                     break
 
         # compute the annotation index based on its line and delta :)
         result = []
-        idx += self.vertical_padding   # newlines introduced by the padding
-        for line_annotations, line_len in zip(annotation_lines,
-                                              self.line_width):
-            result.extend(horizontal_shift(line_annotations, line_len,
-                                           self.width, self.align, idx))
+        idx += self.vertical_padding  # newlines introduced by the padding
+        for line_annotations, line_len in zip(annotation_lines, self.line_width):
+            result.extend(
+                horizontal_shift(
+                    line_annotations, line_len, self.width, self.align, idx
+                )
+            )
             idx += row_width - line_len
         self.line_width = [self.width for _ in self.line_width]
         return result
@@ -151,7 +166,7 @@ class TableRow:
         cell_separator: string used for separating columns from each other.
     """
 
-    __slots__ = ('columns', 'cell_separator')
+    __slots__ = ("columns", "cell_separator")
 
     def __init__(self, cell_separator):
         self.columns: List[TableCell] = []
@@ -162,10 +177,11 @@ def __len__(self):
 
     def get_text(self) -> str:
         """Return a text representation of the TableRow."""
-        row_lines = [self.cell_separator.join(line)
-                     for line in zip(*[column.blocks
-                                       for column in self.columns])]
-        return '\n'.join(row_lines)
+        row_lines = [
+            self.cell_separator.join(line)
+            for line in zip(*[column.blocks for column in self.columns])
+        ]
+        return "\n".join(row_lines)
 
     @property
     def width(self):
@@ -173,8 +189,9 @@ def width(self):
         if not self.columns:
             return 0
 
-        return sum((cell.width for cell in self.columns)) + len(
-            self.cell_separator) * (len(self.columns) - 1)
+        return sum((cell.width for cell in self.columns)) + len(self.cell_separator) * (
+            len(self.columns) - 1
+        )
 
 
 class Table:
@@ -186,7 +203,7 @@ class Table:
         cell_separator: string used for separating cells from each other.
     """
 
-    __slots__ = ('rows', 'left_margin_len', 'cell_separator')
+    __slots__ = ("rows", "left_margin_len", "cell_separator")
 
     def __init__(self, left_margin_len: int, cell_separator):
         self.rows = []
@@ -210,9 +227,11 @@ def add_cell(self, table_cell: TableCell):
     def _set_row_height(self):
         """Set the cell height for all :class:`TableCell`s in the table."""
         for row in self.rows:
-            max_row_height = max((cell.normalize_blocks()
-                                  for cell in row.columns)) \
-                if row.columns else 0
+            max_row_height = (
+                max((cell.normalize_blocks() for cell in row.columns))
+                if row.columns
+                else 0
+            )
             for cell in row.columns:
                 cell.height = max_row_height
 
@@ -223,9 +242,13 @@ def _set_column_width(self):
 
         for cur_column_idx in range(max_columns):
             # determine the required column width for the current column
-            max_column_width = max((row.columns[cur_column_idx].width
-                                    for row in self.rows
-                                    if len(row) > cur_column_idx))
+            max_column_width = max(
+                (
+                    row.columns[cur_column_idx].width
+                    for row in self.rows
+                    if len(row) > cur_column_idx
+                )
+            )
 
             # set column width for all TableCells in the current column
             for row in self.rows:
@@ -235,14 +258,13 @@ def _set_column_width(self):
     def get_text(self):
         """Return and render the text of the given table."""
         if not self.rows:
-            return '\n'
+            return "\n"
 
         self._set_row_height()
         self._set_column_width()
-        return '\n'.join((row.get_text() for row in self.rows)) + '\n'
+        return "\n".join((row.get_text() for row in self.rows)) + "\n"
 
-    def get_annotations(self, idx: int,
-                        left_margin_len: int) -> List[Annotation]:
+    def get_annotations(self, idx: int, left_margin_len: int) -> List[Annotation]:
         r"""Return all annotations in the given table.
 
         Args:
@@ -270,6 +292,6 @@ def get_annotations(self, idx: int,
                 annotations += cell.get_annotations(cell_idx, row_width)
                 cell_idx += cell.width + len(row.cell_separator)
 
-            idx += (row_width + 1) * row_height   # linebreak
+            idx += (row_width + 1) * row_height  # linebreak
 
         return annotations
diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py
index f654b26..bb54665 100755
--- a/src/inscriptis/service/web.py
+++ b/src/inscriptis/service/web.py
@@ -10,35 +10,39 @@
 from inscriptis.model.config import ParserConfig
 
 app = Flask(__name__)
-CONFIG = ParserConfig(css=RELAXED_CSS_PROFILE, display_images=True,
-                      deduplicate_captions=True, display_links=False)
+CONFIG = ParserConfig(
+    css=RELAXED_CSS_PROFILE,
+    display_images=True,
+    deduplicate_captions=True,
+    display_links=False,
+)
 
 
-@app.route('/')
+@app.route("/")
 def index():
     """Print a short status message for the Web service's base URL."""
-    return 'Inscriptis text to HTML Web service.'
+    return "Inscriptis text to HTML Web service."
 
 
-@app.route('/get_text', methods=['POST'])
+@app.route("/get_text", methods=["POST"])
 def get_text_call():
     """Return the text representation of the given HTML content."""
-    content_type = request.headers['Content-type']
-    if '; encoding=' in content_type:
-        encoding = content_type.split('; encoding=')[1]
+    content_type = request.headers["Content-type"]
+    if "; encoding=" in content_type:
+        encoding = content_type.split("; encoding=")[1]
     else:
-        encoding = 'UTF-8'
-    html_content = request.data.decode(encoding, errors='ignore')
+        encoding = "UTF-8"
+    html_content = request.data.decode(encoding, errors="ignore")
     text = get_text(html_content, CONFIG)
-    return Response(text, mimetype='text/plain')
+    return Response(text, mimetype="text/plain")
 
 
-@app.route('/version', methods=['GET'])
+@app.route("/version", methods=["GET"])
 def get_version_call():
     """Return the used inscriptis version."""
-    return Response(__version__ + '\n', mimetype='text/plain')
+    return Response(__version__ + "\n", mimetype="text/plain")
 
 
-if __name__ == '__main__':
-    print('Starting Web service based on Inscriptis', __version__)
-    app.run(threaded=True, host='127.0.0.1', port=5000)
+if __name__ == "__main__":
+    print("Starting Web service based on Inscriptis", __version__)
+    app.run(threaded=True, host="127.0.0.1", port=5000)
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
index b19ddeb..c3518b3 100644
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -11,57 +11,61 @@
 
 
 def test_horizontal_shift():
-    a = [Annotation(0, 4, 'test')]
+    a = [Annotation(0, 4, "test")]
 
     # no shift
-    assert horizontal_shift(a,
-                            content_width=5,
-                            line_width=10,
-                            align=HorizontalAlignment.left,
-                            shift=0).pop() == Annotation(0, 4, 'test')
+    assert horizontal_shift(
+        a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=0
+    ).pop() == Annotation(0, 4, "test")
 
     # shift
-    assert horizontal_shift(a,
-                            content_width=5,
-                            line_width=10,
-                            align=HorizontalAlignment.left,
-                            shift=3).pop() == Annotation(3, 7, 'test')
+    assert horizontal_shift(
+        a, content_width=5, line_width=10, align=HorizontalAlignment.left, shift=3
+    ).pop() == Annotation(3, 7, "test")
 
     # realignment to the right
-    assert horizontal_shift(a,
-                            content_width=len('test'),
-                            line_width=10,
-                            align=HorizontalAlignment.right,
-                            shift=0).pop() == Annotation(6, 10, 'test')
-    assert '{:>10}'.format('test')[6:10] == 'test'
-
+    assert horizontal_shift(
+        a,
+        content_width=len("test"),
+        line_width=10,
+        align=HorizontalAlignment.right,
+        shift=0,
+    ).pop() == Annotation(6, 10, "test")
+    assert "{:>10}".format("test")[6:10] == "test"
 
     # shift + realignment to the right
-    assert horizontal_shift(a,
-                            content_width=len('test'),
-                            line_width=10,
-                            align=HorizontalAlignment.right,
-                            shift=3).pop() == Annotation(9, 13, 'test')
+    assert horizontal_shift(
+        a,
+        content_width=len("test"),
+        line_width=10,
+        align=HorizontalAlignment.right,
+        shift=3,
+    ).pop() == Annotation(9, 13, "test")
 
     # realignment to the center
-    assert horizontal_shift(a,
-                            content_width=len('test'),
-                            line_width=10,
-                            align=HorizontalAlignment.center,
-                            shift=0).pop() == Annotation(3, 7, 'test')
-    assert '{:^10}'.format('test')[3:7] == 'test'
+    assert horizontal_shift(
+        a,
+        content_width=len("test"),
+        line_width=10,
+        align=HorizontalAlignment.center,
+        shift=0,
+    ).pop() == Annotation(3, 7, "test")
+    assert "{:^10}".format("test")[3:7] == "test"
 
-    assert horizontal_shift(a,
-                            content_width=len('test'),
-                            line_width=11,
-                            align=HorizontalAlignment.center,
-                            shift=0).pop() == Annotation(3, 7, 'test')
-    assert '{:^11}'.format('test')[3:7] == 'test'
+    assert horizontal_shift(
+        a,
+        content_width=len("test"),
+        line_width=11,
+        align=HorizontalAlignment.center,
+        shift=0,
+    ).pop() == Annotation(3, 7, "test")
+    assert "{:^11}".format("test")[3:7] == "test"
 
     # realignment + shift
-    assert horizontal_shift(a,
-                            content_width=len('test'),
-                            line_width=11,
-                            align=HorizontalAlignment.center,
-                            shift=7).pop() == Annotation(10, 14, 'test')
-
+    assert horizontal_shift(
+        a,
+        content_width=len("test"),
+        line_width=11,
+        align=HorizontalAlignment.center,
+        shift=7,
+    ).pop() == Annotation(10, 14, "test")
diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
index c80a654..82fdc7a 100644
--- a/tests/test_annotation_output_processor.py
+++ b/tests/test_annotation_output_processor.py
@@ -11,12 +11,12 @@
 from inscriptis.annotation.output.surface import SurfaceExtractor
 from inscriptis.annotation.output.xml import XmlExtractor
 
-EXAMPLE_OUTPUT = {'text': 'Chur\n\nChur is the capital and largest town of '
-                          'the Swiss canton of the Grisons and lies in the '
-                          'Grisonian Rhine Valley.',
-                  'label': [[0, 4, 'heading'],
-                            [0, 4, 'h1'],
-                            [6, 10, 'emphasis']]}
+EXAMPLE_OUTPUT = {
+    "text": "Chur\n\nChur is the capital and largest town of "
+    "the Swiss canton of the Grisons and lies in the "
+    "Grisonian Rhine Valley.",
+    "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
+}
 
 
 def test_abstract_class():
@@ -31,13 +31,15 @@ def test_surface_annotator():
     result = processor(EXAMPLE_OUTPUT)
 
     # the old keys haven't been changed
-    assert 'text' in result
-    assert 'label' in result
+    assert "text" in result
+    assert "label" in result
 
     # and we have additional information on surface forms :)
-    assert result['surface'] == [('heading', 'Chur'),
-                                 ('h1', 'Chur'),
-                                 ('emphasis', 'Chur')]
+    assert result["surface"] == [
+        ("heading", "Chur"),
+        ("h1", "Chur"),
+        ("emphasis", "Chur"),
+    ]
 
 
 def test_xml_annotator():
@@ -45,35 +47,40 @@ def test_xml_annotator():
     result = processor(EXAMPLE_OUTPUT)
 
     # and we have additional information on surface forms :)
-    assert result == ('\n'
-                      '

Chur

\n\n' - 'Chur is the capital and largest town ' - 'of the Swiss canton of the Grisons and lies in ' - 'the Grisonian Rhine Valley.') + assert result == ( + '\n' + "

Chur

\n\n" + "Chur is the capital and largest town " + "of the Swiss canton of the Grisons and lies in " + "the Grisonian Rhine Valley." + ) def test_html_annotator(): processor = HtmlExtractor() result = processor(EXAMPLE_OUTPUT) - assert result.startswith('' - '
heading'
-                           ''
-                           'h1'
-                           'Chur
\n' - '
\n'
-                           '
emphasis'
-                           'Chur is the capital '
-                           'and largest town of the Swiss canton of the '
-                            'Grisons and lies in the Grisonian Rhine Valley.'
-                           '
') + assert result.startswith("" + '
heading'
+        ''
+        'h1'
+        "Chur
\n" + "
\n"
+        '
emphasis'
+        'Chur is the capital '
+        "and largest town of the Swiss canton of the "
+        "Grisons and lies in the Grisonian Rhine Valley."
+        "
" + ) def test_trailing_tag_annotation(): processor = XmlExtractor() - result = processor({'text': 'Ehre sei Gott!', - 'label': [[9, 14, 'emphasis']]}) + result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]}) - assert result == ('\n' - 'Ehre sei Gott!') + assert result == ( + '\n' + "Ehre sei Gott!" + ) diff --git a/tests/test_annotation_rule_parsing.py b/tests/test_annotation_rule_parsing.py index fef265a..5893831 100644 --- a/tests/test_annotation_rule_parsing.py +++ b/tests/test_annotation_rule_parsing.py @@ -18,59 +18,58 @@ def test_parse(): """ basic rule parsing. """ - rules = {'table#border=1': ['table'], - 'hr': ['horizontal-line']} + rules = {"table#border=1": ["table"], "hr": ["horizontal-line"]} tags, attrs = AnnotationModel._parse(rules) - assert tags == {'hr': ['horizontal-line']} + assert tags == {"hr": ["horizontal-line"]} - apply_annotation= attrs[0] - assert apply_annotation.match_tag == 'table' - assert apply_annotation.match_value == '1' - assert apply_annotation.attr == 'border' + apply_annotation = attrs[0] + assert apply_annotation.match_tag == "table" + assert apply_annotation.match_value == "1" + assert apply_annotation.attr == "border" - e = HtmlElement(tag='table') - apply_annotation.apply('1', e) - assert e.annotation == ('table', ) + e = HtmlElement(tag="table") + apply_annotation.apply("1", e) + assert e.annotation == ("table",) def test_apply_annotation(): """ rule application. 
""" - rules = {'table#border=1': ['table'], - 'hr': ['horizontal-line'], - '#color=red': ['red'], - '#bgcolor': ['bgcolor']} - - css = deepcopy(CSS_PROFILES['strict']) + rules = { + "table#border=1": ["table"], + "hr": ["horizontal-line"], + "#color=red": ["red"], + "#bgcolor": ["bgcolor"], + } + + css = deepcopy(CSS_PROFILES["strict"]) annotation_model = AnnotationModel(css, rules) - assert annotation_model.css['hr'].annotation == ('horizontal-line', ) + assert annotation_model.css["hr"].annotation == ("horizontal-line",) attribute_handler = Attribute() attribute_handler.merge_attribute_map(annotation_model.css_attr) - assert 'table#border=1' in str(attribute_handler.attribute_mapping['border']) - assert '{any}#color=red' in str(attribute_handler.attribute_mapping['color']) - assert '{any}#bgcolor={any}' in str(attribute_handler.attribute_mapping['bgcolor']) + assert "table#border=1" in str(attribute_handler.attribute_mapping["border"]) + assert "{any}#color=red" in str(attribute_handler.attribute_mapping["color"]) + assert "{any}#bgcolor={any}" in str(attribute_handler.attribute_mapping["bgcolor"]) + def test_merged_attribute(): """ test multiple rules per attribute """ - rules = {'#color=white': ['white'], - '#color=yellow': ['yellow']} - css = deepcopy(CSS_PROFILES['strict']) + rules = {"#color=white": ["white"], "#color=yellow": ["yellow"]} + css = deepcopy(CSS_PROFILES["strict"]) annotation_model = AnnotationModel(css, rules) attribute_handler = Attribute() attribute_handler.merge_attribute_map(annotation_model.css_attr) e = HtmlElement() - attribute_handler.attribute_mapping['color']('green', e) + attribute_handler.attribute_mapping["color"]("green", e) assert e.annotation == () - attribute_handler.attribute_mapping['color']('yellow', e) - assert e.annotation == ('yellow', ) - attribute_handler.attribute_mapping['color']('white', e) - assert e.annotation == ('yellow', 'white') - - + attribute_handler.attribute_mapping["color"]("yellow", e) + assert 
e.annotation == ("yellow",) + attribute_handler.attribute_mapping["color"]("white", e) + assert e.annotation == ("yellow", "white") diff --git a/tests/test_block.py b/tests/test_block.py index 21ac592..8aacc93 100644 --- a/tests/test_block.py +++ b/tests/test_block.py @@ -11,25 +11,25 @@ def test_merge_normal_text_collapsable_whitespaces(): """ b = Block(0, Prefix()) b.merge_normal_text("Hallo") - assert b._content == 'Hallo' + assert b._content == "Hallo" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.merge_normal_text(" Hallo ") - assert b._content == 'Hallo ' + assert b._content == "Hallo " assert b.collapsable_whitespace b = Block(0, Prefix()) - b.merge_normal_text('') - assert b._content == '' + b.merge_normal_text("") + assert b._content == "" assert b.collapsable_whitespace - b.merge_normal_text(' ') - assert b._content == '' + b.merge_normal_text(" ") + assert b._content == "" assert b.collapsable_whitespace - b.merge_normal_text(' ') - assert b._content == '' + b.merge_normal_text(" ") + assert b._content == "" assert b.collapsable_whitespace @@ -37,29 +37,29 @@ def test_merge_normal_non_collapsable_whitespaces(): b = Block(0, Prefix()) b.collapsable_whitespace = False b.merge_normal_text("Hallo") - assert b._content == 'Hallo' + assert b._content == "Hallo" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False b.merge_normal_text(" Hallo ") - assert b._content == ' Hallo ' + assert b._content == " Hallo " assert b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text('') - assert b._content == '' + b.merge_normal_text("") + assert b._content == "" assert not b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text(' ') - assert b._content == ' ' + b.merge_normal_text(" ") + assert b._content == " " assert b.collapsable_whitespace b = Block(0, Prefix()) b.collapsable_whitespace = False - b.merge_normal_text(' ') - 
assert b._content == ' ' + b.merge_normal_text(" ") + assert b._content == " " assert b.collapsable_whitespace diff --git a/tests/test_broken_table_handling.py b/tests/test_broken_table_handling.py index bd210e9..dee75f3 100644 --- a/tests/test_broken_table_handling.py +++ b/tests/test_broken_table_handling.py @@ -9,21 +9,16 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_forgotten_td_close_tag(): # one line (i.e., missing before the next and the next - html = ('hallo' - '' - '
12
echo') + html = "hallo" "" "
12
echo" print(html) # assert get_text(html, config) == u'hallo\n1 2\necho' # two lines (i.e. missing before the and before the - html = ('hallo' - '
12' - '
34' - '
echo') + html = "hallo" "
12" "
34" "
echo" print(html) - assert get_text(html, config) == u'hallo\n1 2\n3 4\n\necho' + assert get_text(html, config) == "hallo\n1 2\n3 4\n\necho" diff --git a/tests/test_double_a.py b/tests/test_double_a.py index 24623bd..a16ceb7 100644 --- a/tests/test_double_a.py +++ b/tests/test_double_a.py @@ -9,10 +9,14 @@ def test_successive_a(): - html = 'first' \ - 'second' - assert get_text(html) == 'firstsecond' + html = ( + 'first' + 'second' + ) + assert get_text(html) == "firstsecond" - html = 'first\n' \ - 'second' - assert get_text(html) == 'first second' + html = ( + 'first\n' + 'second' + ) + assert get_text(html) == "first second" diff --git a/tests/test_empty_string.py b/tests/test_empty_string.py index dd46353..9f7987c 100644 --- a/tests/test_empty_string.py +++ b/tests/test_empty_string.py @@ -9,9 +9,8 @@ def test_empty_and_corrupt(): - assert get_text('test').strip() == 'test' - assert get_text(' ') == '' - assert get_text('') == '' + assert get_text("test").strip() == "test" + assert get_text(" ") == "" + assert get_text("") == "" # test for the behaviour of older and recent lxml versions. 
- assert get_text('<<<').strip() in ('<<<', '<<', '') - + assert get_text("<<<").strip() in ("<<<", "<<", "") diff --git a/tests/test_engine.py b/tests/test_engine.py index 728191b..519c1ee 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -4,8 +4,8 @@ def test_text_from_empty_content(): - assert get_text('') == '' + assert get_text("") == "" def test_annotations_from_empty_content(): - assert get_annotated_text('') == {} + assert get_annotated_text("") == {} diff --git a/tests/test_html_conversion_options.py b/tests/test_html_conversion_options.py index fdc69be..c9bb878 100644 --- a/tests/test_html_conversion_options.py +++ b/tests/test_html_conversion_options.py @@ -9,69 +9,70 @@ def test_display_links(): - html = ''' + html = """ first second third - ''' + """ config = ParserConfig(display_links=True) - assert get_text(html, config).strip() == \ - '[first](first) [second](second) third' + assert get_text(html, config).strip() == "[first](first) [second](second) third" def test_display_anchors(): - html = ''' + html = """ first second - ''' + """ config = ParserConfig(display_anchors=True) - assert get_text(html, config).strip() == \ - '[first](first) second' + assert get_text(html, config).strip() == "[first](first) second" def test_display_links_and_anchors(): - html = ''' + html = """ first second third - ''' + """ config = ParserConfig(display_links=True, display_anchors=True) - assert get_text(html, config).strip() == \ - '[first](first) [second](second) [third](third)' + assert ( + get_text(html, config).strip() + == "[first](first) [second](second) [third](third)" + ) def test_display_images(): - html = ''' + html = """ Ein Test Bild Ein Test Bild Ein zweites Bild - ''' + """ config = ParserConfig(display_images=True) - assert get_text(html, config).strip() == \ - '[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]' + assert ( + get_text(html, config).strip() + == "[Ein Test Bild] [Ein Test Bild] [Ein zweites Bild]" + ) def 
test_display_images_deduplicated(): - html = ''' + html = """ Ein Test Bild Ein Test Bild Ein zweites Bild - ''' + """ config = ParserConfig(display_images=True, deduplicate_captions=True) - assert get_text(html, config).strip() == \ - '[Ein Test Bild] [Ein zweites Bild]' + assert get_text(html, config).strip() == "[Ein Test Bild] [Ein zweites Bild]" diff --git a/tests/test_html_snippets.py b/tests/test_html_snippets.py index 9e7197f..9df864d 100644 --- a/tests/test_html_snippets.py +++ b/tests/test_html_snippets.py @@ -11,10 +11,10 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -TESTCASE_PATTERN = join(dirname(__file__), 'html/*.txt') +TESTCASE_PATTERN = join(dirname(__file__), "html/*.txt") -def test_html_snippets(filter_str=''): +def test_html_snippets(filter_str=""): for testcase_txt in glob(TESTCASE_PATTERN): if filter_str not in testcase_txt: continue @@ -22,26 +22,30 @@ def test_html_snippets(filter_str=''): with open(testcase_txt) as f: reference_txt = f.read().rstrip() - with open(testcase_txt.replace('.txt', '.html')) as f: + with open(testcase_txt.replace(".txt", ".html")) as f: print(f.name) - html = '{}'.format(f.read()) + html = "{}".format(f.read()) - converted_txt = get_text(html, ParserConfig( - css=CSS_PROFILES['strict'])).rstrip() + converted_txt = get_text( + html, ParserConfig(css=CSS_PROFILES["strict"]) + ).rstrip() if converted_txt != reference_txt: - print('File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}' - .format(testcase_txt, html, reference_txt, converted_txt)) - print('HTML file:', testcase_txt.replace('.txt', '.html')) - print("Visualize differences with `vimdiff reference.txt " - "converted.txt`") + print( + "File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format( + testcase_txt, html, reference_txt, converted_txt + ) + ) + print("HTML file:", testcase_txt.replace(".txt", ".html")) + print("Visualize differences with `vimdiff reference.txt " "converted.txt`") 
open("reference.txt", "w").write(reference_txt) open("converted.txt", "w").write(converted_txt) assert converted_txt == reference_txt -if __name__ == '__main__': +if __name__ == "__main__": from sys import argv - filter_str = argv[1] if len(argv) > 1 else '' + + filter_str = argv[1] if len(argv) > 1 else "" test_html_snippets(filter_str) diff --git a/tests/test_html_snippets_annotations.py b/tests/test_html_snippets_annotations.py index 9655afa..6c481a1 100644 --- a/tests/test_html_snippets_annotations.py +++ b/tests/test_html_snippets_annotations.py @@ -12,18 +12,18 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), 'html/*.json') +TESTCASE_PATTERN = os.path.join(os.path.dirname(__file__), "html/*.json") -def assert_equal_ignoring_whitespace(reference: List[str], - converted: List[str]) -> bool: +def assert_equal_ignoring_whitespace( + reference: List[str], converted: List[str] +) -> bool: for (ref_tag, ref_str), (conv_tag, conv_str) in zip(reference, converted): - assert ref_tag == conv_tag - assert ''.join(ref_str.split()) == ''.join(conv_str.split()) + assert "".join(ref_str.split()) == "".join(conv_str.split()) -def test_html_annotations(filter_str=''): +def test_html_annotations(filter_str=""): for annotation_file in glob(TESTCASE_PATTERN): if filter_str not in annotation_file: continue @@ -31,33 +31,39 @@ def test_html_annotations(filter_str=''): with open(annotation_file) as f: reference = load(f) - with open(annotation_file.replace('.json', '.html')) as f: + with open(annotation_file.replace(".json", ".html")) as f: print(f.name) - html = '{}'.format(f.read()) + html = "{}".format(f.read()) - for indentation_strategy in ('strict', 'relaxed'): - result = get_annotated_text(html, ParserConfig( - css=CSS_PROFILES[indentation_strategy], - annotation_rules=reference['annotation_rules'])) + for indentation_strategy in ("strict", "relaxed"): + result = 
get_annotated_text( + html, + ParserConfig( + css=CSS_PROFILES[indentation_strategy], + annotation_rules=reference["annotation_rules"], + ), + ) - converted = [[a[2], result['text'][a[0]:a[1]]] - for a in result['label']] + converted = [[a[2], result["text"][a[0] : a[1]]] for a in result["label"]] - if reference['result'] != converted: + if reference["result"] != converted: print("Reference:") - print(reference['result']) - print("\nConverted (indentation strategy: {})".format(indentation_strategy)) + print(reference["result"]) + print( + "\nConverted (indentation strategy: {})".format( + indentation_strategy + ) + ) print(converted) - if indentation_strategy == 'strict': - assert reference['result'] == converted + if indentation_strategy == "strict": + assert reference["result"] == converted else: - assert_equal_ignoring_whitespace(reference['result'], - converted) + assert_equal_ignoring_whitespace(reference["result"], converted) -if __name__ == '__main__': +if __name__ == "__main__": from sys import argv - filter_str = argv[1] if len(argv) > 1 else '' + filter_str = argv[1] if len(argv) > 1 else "" test_html_annotations(filter_str) diff --git a/tests/test_limit_whitespace_affixes.py b/tests/test_limit_whitespace_affixes.py index 20d6666..53e97fd 100644 --- a/tests/test_limit_whitespace_affixes.py +++ b/tests/test_limit_whitespace_affixes.py @@ -13,36 +13,41 @@ def test_html_element_refinement(): - new = HtmlElement('span', display=Display.inline, prefix=' ', suffix=' ', - limit_whitespace_affixes=True) - pre = HtmlElement('pre', display=Display.block, whitespace=WhiteSpace.pre) - code = HtmlElement('code') + new = HtmlElement( + "span", + display=Display.inline, + prefix=" ", + suffix=" ", + limit_whitespace_affixes=True, + ) + pre = HtmlElement("pre", display=Display.block, whitespace=WhiteSpace.pre) + code = HtmlElement("code") # refinement with pre and whitespaces refined = pre.get_refined_html_element(copy(new)) - assert refined.prefix == '' - assert 
refined.suffix == '' + assert refined.prefix == "" + assert refined.suffix == "" # refinement with code and whitespaces refined = code.get_refined_html_element(copy(new)) - assert refined.prefix == ' ' - assert refined.suffix == ' ' + assert refined.prefix == " " + assert refined.suffix == " " # refinement with pre and non-whitespaces - new.prefix = ' 1. ' - new.suffix = '<' + new.prefix = " 1. " + new.suffix = "<" refined = pre.get_refined_html_element(copy(new)) - assert refined.prefix == ' 1. ' - assert refined.suffix == '<' + assert refined.prefix == " 1. " + assert refined.suffix == "<" # refinement with code and non-whitespaces refined = code.get_refined_html_element(copy(new)) - assert refined.prefix == ' 1. ' - assert refined.suffix == '<' + assert refined.prefix == " 1. " + assert refined.suffix == "<" def test_limit_whitespace_affixes(): - html = ''' + html = """ halloecho
@@ -51,9 +56,10 @@ def hallo():
                    
- ''' + """ config = ParserConfig(css=RELAXED_CSS_PROFILE) - assert get_text(html, config).strip() == \ - 'hallo echo\n\n' \ - 'def hallo():\n' \ + assert ( + get_text(html, config).strip() == "hallo echo\n\n" + "def hallo():\n" ' print("echo")' + ) diff --git a/tests/test_list_div.py b/tests/test_list_div.py index 07ae5d1..44c1ef5 100644 --- a/tests/test_list_div.py +++ b/tests/test_list_div.py @@ -10,21 +10,21 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_divs(): - html = u'Thomas
Anton
Maria' - assert get_text(html, config) == u'Thomas\nAnton\nMaria' + html = "Thomas
Anton
Maria" + assert get_text(html, config) == "Thomas\nAnton\nMaria" - html = u'Thomas
Anna läuft weit weg.
' - assert get_text(html, config) == u'Thomas\nAnna läuft weit weg.' + html = "Thomas
Anna läuft weit weg.
" + assert get_text(html, config) == "Thomas\nAnna läuft weit weg." - html = u'Thomas
  • Anton
    Maria
' - assert get_text(html, config) == u'Thomas\n * Anton\n Maria' + html = "Thomas
  • Anton
    Maria
" + assert get_text(html, config) == "Thomas\n * Anton\n Maria" - html = u'Thomas
  • Anton
    Maria
' - assert get_text(html, config) == u'Thomas\n * Anton\n Maria' + html = "Thomas
  • Anton
    Maria
" + assert get_text(html, config) == "Thomas\n * Anton\n Maria" - html = u'Thomas
  • a
    Anton
    Maria
' - assert get_text(html, config) == u'Thomas\n * a\n Anton\n Maria' + html = "Thomas
  • a
    Anton
    Maria
" + assert get_text(html, config) == "Thomas\n * a\n Anton\n Maria" diff --git a/tests/test_margin_before_at_start.py b/tests/test_margin_before_at_start.py index bcadbc5..870c076 100644 --- a/tests/test_margin_before_at_start.py +++ b/tests/test_margin_before_at_start.py @@ -9,20 +9,18 @@ def test_content(): - html = 'first' - assert get_text(html) == 'first' + html = "first" + assert get_text(html) == "first" def test_margin_before(): - html = '

first

' - assert get_text(html) == 'first\n' + html = "

first

" + assert get_text(html) == "first\n" - html = 'first

' \ - 'second

' - assert get_text(html) == 'first\n\nsecond\n' + html = "first

" "second

" + assert get_text(html) == "first\n\nsecond\n" def test_br(): - html = '
' \ - 'first

' - assert get_text(html) == '\nfirst' + html = "
" "first

" + assert get_text(html) == "\nfirst" diff --git a/tests/test_margin_handling.py b/tests/test_margin_handling.py index c09d944..c6a9906 100644 --- a/tests/test_margin_handling.py +++ b/tests/test_margin_handling.py @@ -9,29 +9,29 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_margin_handling(): - html = u'''Hallo + html = """Hallo
Echo
Mecho
sei Gott - ''' - assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\n\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\n\nsei Gott" - html = u'''Hallo + html = """Hallo
Echo
Mecho
sei Gott - ''' - assert get_text(html, config) == u'Hallo\n\nEcho\n\n\nMecho\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\nEcho\n\n\nMecho\nsei Gott" - html = u'''Hallo + html = """Hallo
Ehre
sei Gott - ''' - assert get_text(html, config) == u'Hallo\n\n\nEhre\n\nsei Gott' + """ + assert get_text(html, config) == "Hallo\n\n\nEhre\n\nsei Gott" diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 9ffe217..2094695 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,19 +1,24 @@ -from inscriptis.metadata import (__author__, __author_email__, __copyright__, - __license__, __version__) +from inscriptis.metadata import ( + __author__, + __author_email__, + __copyright__, + __license__, + __version__, +) def test_metadata(): """Test inscriptis package metadata.""" - assert 'Albert Weichselbraun' in __author__ - assert 'Fabian Odoni' in __author__ + assert "Albert Weichselbraun" in __author__ + assert "Fabian Odoni" in __author__ - assert '@' in __author_email__ + assert "@" in __author_email__ - assert '2016-' in __copyright__ - assert 'Albert Weichselbraun' in __copyright__ - assert 'Fabian Odoni' in __copyright__ + assert "2016-" in __copyright__ + assert "Albert Weichselbraun" in __copyright__ + assert "Fabian Odoni" in __copyright__ - assert __license__ == 'Apache 2.0' + assert __license__ == "Apache 2.0" assert __version__[0].isnumeric() - assert '.' in __version__ + assert "." 
in __version__ diff --git a/tests/test_model_html_element_canvas.py b/tests/test_model_html_element_canvas.py index 574c047..e0d8c66 100644 --- a/tests/test_model_html_element_canvas.py +++ b/tests/test_model_html_element_canvas.py @@ -26,31 +26,31 @@ def _get_text(html_element): HtmlElement().set_canvas(c).write("last") c._flush_inline() - return '\n'.join(c.blocks) + return "\n".join(c.blocks) def test_formatting(): # standard line h = HtmlElement() - assert _get_text(h) == 'firstEhre sei Gott!last' + assert _get_text(h) == "firstEhre sei Gott!last" h.display = Display.block h.margin_before = 1 h.margin_after = 2 print(h) print(_get_text(h)) - assert _get_text(h) == 'first\n\nEhre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\nEhre sei Gott!\n\n\nlast" # list bullet without padding_inline h.list_bullet = "* " - assert _get_text(h) == 'first\n\n* Ehre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\n* Ehre sei Gott!\n\n\nlast" # add a padding_inline h.padding_inline = 3 - assert _get_text(h) == 'first\n\n * Ehre sei Gott!\n\n\nlast' + assert _get_text(h) == "first\n\n * Ehre sei Gott!\n\n\nlast" # and prefixes + suffixes - h.prefix = '>>' - h.suffix = '<<' - assert _get_text(h)== 'first\n\n * >>Ehre sei Gott!<<\n\n\nlast' + h.prefix = ">>" + h.suffix = "<<" + assert _get_text(h) == "first\n\n * >>Ehre sei Gott!<<\n\n\nlast" diff --git a/tests/test_model_prefix.py b/tests/test_model_prefix.py index 6682bbb..f5e3f8c 100644 --- a/tests/test_model_prefix.py +++ b/tests/test_model_prefix.py @@ -11,46 +11,45 @@ def test_simple_prefix(): p = Prefix() - p.register_prefix(5, '1. ') + p.register_prefix(5, "1. ") # first use - assert p.first == ' 1. ' + assert p.first == " 1. " # the prefix has been consumed - assert p.first == '' + assert p.first == "" # prefix used to indent lines separated with newlines - assert p.rest == ' ' + assert p.rest == " " def test_combined_prefix(): p = Prefix() - p.register_prefix(5, '1. 
') - p.register_prefix(2, '') + p.register_prefix(5, "1. ") + p.register_prefix(2, "") - assert p.first == ' 1. ' - assert p.first == '' + assert p.first == " 1. " + assert p.first == "" p.remove_last_prefix() - assert p.first == '' + assert p.first == "" p.remove_last_prefix() # final consumption - no prefix - assert p.first == '' + assert p.first == "" # ensure that there are no interactions between different runs with # bullets p.consumed = False - p.register_prefix(5, '2. ') - p.register_prefix(2, '- ') + p.register_prefix(5, "2. ") + p.register_prefix(2, "- ") - assert p.first == ' - ' - assert p.first == '' - assert p.rest == ' ' + assert p.first == " - " + assert p.first == "" + assert p.rest == " " p.consumed = False p.remove_last_prefix() - assert p.first == ' 2. ' - assert p.rest == ' ' - + assert p.first == " 2. " + assert p.rest == " " diff --git a/tests/test_parse_css.py b/tests/test_parse_css.py index 9822644..8b26bf5 100644 --- a/tests/test_parse_css.py +++ b/tests/test_parse_css.py @@ -7,54 +7,61 @@ from copy import copy from inscriptis.css_profiles import CSS_PROFILES -from inscriptis.html_properties import (Display, WhiteSpace, VerticalAlignment, - HorizontalAlignment) +from inscriptis.html_properties import ( + Display, + WhiteSpace, + VerticalAlignment, + HorizontalAlignment, +) from inscriptis.model.css import CssParse from inscriptis.model.html_element import HtmlElement def test_css_parsing(): - html_element = copy(CSS_PROFILES['strict']['div']) - CssParse.attr_style('padding_left: 8px; display: block', html_element) + html_element = copy(CSS_PROFILES["strict"]["div"]) + CssParse.attr_style("padding_left: 8px; display: block", html_element) assert html_element.padding_inline == 1 assert html_element.display == Display.block - CssParse.attr_style('margin_before: 8em; display: inline', html_element) + CssParse.attr_style("margin_before: 8em; display: inline", html_element) assert html_element.margin_before == 8 assert html_element.display == 
Display.inline def test_html_element_str(): - ''' + """ Tests the string representation of an HtmlElement. - ''' - html_element = HtmlElement('div', '', '', Display.inline, 0, 0, 0, - '', WhiteSpace.pre) - assert str(html_element) == ('
') + """ + html_element = HtmlElement( + "div", "", "", Display.inline, 0, 0, 0, "", WhiteSpace.pre + ) + assert str(html_element) == ( + "
" + ) def test_parse_vertical_align(): html_element = HtmlElement() - CssParse.attr_vertical_align('top', html_element) + CssParse.attr_vertical_align("top", html_element) assert html_element.valign == VerticalAlignment.top # invalid value - CssParse.attr_vertical_align('unknown', html_element) + CssParse.attr_vertical_align("unknown", html_element) assert html_element.valign == VerticalAlignment.top def test_parse_horizontal_align(): html_element = HtmlElement() - CssParse.attr_horizontal_align('center', html_element) + CssParse.attr_horizontal_align("center", html_element) assert html_element.align == HorizontalAlignment.center # invalid value - CssParse.attr_horizontal_align('unknown', html_element) + CssParse.attr_horizontal_align("unknown", html_element) assert html_element.align == HorizontalAlignment.center diff --git a/tests/test_strip_xml_header.py b/tests/test_strip_xml_header.py index b2e8e44..cc28f05 100644 --- a/tests/test_strip_xml_header.py +++ b/tests/test_strip_xml_header.py @@ -4,7 +4,7 @@ from inscriptis import get_text -def test_successive_a(): - html = u' Hallo?>' - assert get_text(html).strip() == 'Hallo?>' +def test_successive_a(): + html = ' Hallo?>' + assert get_text(html).strip() == "Hallo?>" diff --git a/tests/test_style_parsing.py b/tests/test_style_parsing.py index 8efce8f..d23ae48 100644 --- a/tests/test_style_parsing.py +++ b/tests/test_style_parsing.py @@ -10,7 +10,8 @@ def test_style_unit_parsing(): html_element = HtmlElement() - CssParse.attr_style("margin-top:2.666666667em;margin-bottom: 2.666666667em", - html_element) + CssParse.attr_style( + "margin-top:2.666666667em;margin-bottom: 2.666666667em", html_element + ) assert html_element.margin_before == 3 assert html_element.margin_after == 3 diff --git a/tests/test_table_cell.py b/tests/test_table_cell.py index 8c728b2..597af19 100644 --- a/tests/test_table_cell.py +++ b/tests/test_table_cell.py @@ -9,39 +9,40 @@ from inscriptis.model.table import TableCell from 
inscriptis.html_properties import HorizontalAlignment, VerticalAlignment + def test_height(): cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) - cell.blocks = ['hallo'] + cell.blocks = ["hallo"] cell.normalize_blocks() - assert cell.height == len('\n'.join(cell.blocks).split('\n')) + assert cell.height == len("\n".join(cell.blocks).split("\n")) - cell.blocks = ['hallo', 'echo'] + cell.blocks = ["hallo", "echo"] cell.normalize_blocks() assert cell.height == 2 - cell.blocks = ['hallo\necho'] + cell.blocks = ["hallo\necho"] cell.normalize_blocks() assert cell.height == 2 - cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] cell.normalize_blocks() assert cell.height == 9 - assert cell.height == len('\n'.join(cell.blocks).split('\n')) + assert cell.height == len("\n".join(cell.blocks).split("\n")) + def test_width(): cell = TableCell(HorizontalAlignment.left, VerticalAlignment.top) - cell.blocks = ['hallo'] + cell.blocks = ["hallo"] cell.normalize_blocks() assert cell.width == len(cell.blocks[0]) - cell.blocks = ['hallo\necho', 'Ehre sei Gott', 'Jump\n&\nRun!\n\n\n'] + cell.blocks = ["hallo\necho", "Ehre sei Gott", "Jump\n&\nRun!\n\n\n"] cell.normalize_blocks() - assert cell.width == len('Ehre sei Gott') + assert cell.width == len("Ehre sei Gott") # fixed set width cell.width = 95 cell.normalize_blocks() assert cell.width == 95 - diff --git a/tests/test_table_cell_formatting.py b/tests/test_table_cell_formatting.py index 7062d78..f8d6de8 100644 --- a/tests/test_table_cell_formatting.py +++ b/tests/test_table_cell_formatting.py @@ -11,50 +11,38 @@ def test_horizontal_cell_formatting(): - - cell = TableCell(align=HorizontalAlignment.left, - valign=VerticalAlignment.top) + cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) # left alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 - assert 
cell.blocks == ['Ehre sei Gott! '] + assert cell.blocks == ["Ehre sei Gott! "] # right alignment cell.align = HorizontalAlignment.right - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 - assert cell.blocks == [' Ehre sei Gott!'] + assert cell.blocks == [" Ehre sei Gott!"] def test_vertical_cell_formatting(): - cell = TableCell(align=HorizontalAlignment.left, - valign=VerticalAlignment.top) + cell = TableCell(align=HorizontalAlignment.left, valign=VerticalAlignment.top) # default top alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.width = 16 cell.height = 4 - assert cell.blocks == ['Ehre sei Gott! ', - '', - '', - ''] + assert cell.blocks == ["Ehre sei Gott! ", "", "", ""] # bottom alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.valign = VerticalAlignment.bottom cell.width = 16 cell.height = 4 - assert cell.blocks == ['', - '', - '', - 'Ehre sei Gott! '] + assert cell.blocks == ["", "", "", "Ehre sei Gott! "] # middle alignment - cell.blocks = ['Ehre sei Gott!'] + cell.blocks = ["Ehre sei Gott!"] cell.valign = VerticalAlignment.middle cell.width = 16 cell.height = 4 - assert cell.blocks == ['', - 'Ehre sei Gott! ', - '', - ''] + assert cell.blocks == ["", "Ehre sei Gott! ", "", ""] diff --git a/tests/test_table_row.py b/tests/test_table_row.py index dc2f5f6..b0ea143 100644 --- a/tests/test_table_row.py +++ b/tests/test_table_row.py @@ -11,17 +11,17 @@ def test_empty_row(): - tr = TableRow(cell_separator=' ') + tr = TableRow(cell_separator=" ") assert tr.width == 0 - assert tr.get_text() == '' + assert tr.get_text() == "" def test_table_cell_separator(): - html = '
Hallo
Eins
Echo
Zwei
' + html = "
Hallo
Eins
Echo
Zwei
" config = ParserConfig() - assert get_text(html, config) == 'Hallo Echo\nEins Zwei\n' + assert get_text(html, config) == "Hallo Echo\nEins Zwei\n" - config = ParserConfig(table_cell_separator='\t') - assert get_text(html, config) == 'Hallo\tEcho\nEins \tZwei\n' + config = ParserConfig(table_cell_separator="\t") + assert get_text(html, config) == "Hallo\tEcho\nEins \tZwei\n" diff --git a/tests/test_white_space_handling.py b/tests/test_white_space_handling.py index cf43d4d..b8b8e28 100644 --- a/tests/test_white_space_handling.py +++ b/tests/test_white_space_handling.py @@ -9,29 +9,24 @@ from inscriptis.css_profiles import CSS_PROFILES from inscriptis.model.config import ParserConfig -config = ParserConfig(css=CSS_PROFILES['strict']) +config = ParserConfig(css=CSS_PROFILES["strict"]) def test_white_space(): - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12 3' + html = '12\n3' "" + assert get_text(html, config) == "12 3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12 3' + html = '12\n3' "" + assert get_text(html, config) == "12 3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" - html = (u'12\n3' - u'') - assert get_text(html, config) == u'12\n3' + html = '12\n3' "" + assert get_text(html, config) == "12\n3" def test_borderline_cases(): @@ -41,39 +36,38 @@ def test_borderline_cases(): """ # change of whitespace handling between terms; no whitespace # between the terms - html = u'Halloecho versus' - assert get_text(html, config) == u'Halloecho versus' + html = 'Halloecho versus' + assert get_text(html, config) == "Halloecho versus" # change of whitespace handling between terms; one whitespace # between the terms; option 1 - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 
'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; one whitespace # between the terms; option 2 - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; two whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; multiple whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" # change of whitespace handling between terms; multiple whitespaces # between the terms - html = u'Hallo echo versus' - assert get_text(html, config) == u'Hallo echo versus' + html = 'Hallo echo versus' + assert get_text(html, config) == "Hallo echo versus" def test_tail(): """ ensure that the tail elements are formated based on the container element. """ - html = (u'Hi 1 3 ' - u' versus 1 3') - assert get_text(html, config) == u'Hi 1 3 versus 1 3' + html = 'Hi 1 3 ' " versus 1 3" + assert get_text(html, config) == "Hi 1 3 versus 1 3" From 8329a305ea0ef6ac629c77f08e9c56d6701071a2 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Thu, 11 Jan 2024 17:31:42 +0100 Subject: [PATCH 15/29] chg: ignore black reformatting in 'git blame'. 
--- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..e768d6d --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f From 255615337d976686c0f88ae2e1b2ea8c3de73ca1 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Thu, 11 Jan 2024 18:08:46 +0100 Subject: [PATCH 16/29] chg: improved dependencies. --- pyproject.toml | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 53f64cd..92f8ba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,36 @@ [tool.poetry] name = "inscriptis" version = "2.4.0" -description = "inscriptis - HTML to text converter." authors = ["Albert Weichselbraun ", "Fabian Odoni "] -license = "Apache 2.0" +description = "inscriptis - HTML to text converter." +keywords = ["HTML", "converter", "text"] +classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Topic :: Text Processing', + 'Topic :: Text Processing :: Markup :: HTML', + 'Topic :: Utilities', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + ] +homepage = "https://github.com/weblyzard/inscriptis" +repository = "https://github.com/weblyzard/inscriptis" +documentation = "https://inscriptis.readthedocs.io/en" +license = "Apache-2.0" readme = "README.rst" +# [tool.poetry.scripts] +# inscript = "scripts/inscript.py" + [tool.poetry.dependencies] python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12" -requests = "^2.31.0" -lxml = "^5.1.0" +requests = ">=2.23.0" +lxml = ">=4.5.0" [build-system] requires = 
["poetry-core"] From e68329bd4eac5473dd335b4874a1aceb1d56b73a Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 12 Jan 2024 16:07:53 +0100 Subject: [PATCH 17/29] wip: cli. --- pyproject.toml | 11 ++++-- src/inscriptis/cli/__init__.py | 1 + {scripts => src/inscriptis/cli}/inscript.py | 40 ++++++++++++++------- 3 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 src/inscriptis/cli/__init__.py rename {scripts => src/inscriptis/cli}/inscript.py (85%) diff --git a/pyproject.toml b/pyproject.toml index 92f8ba6..2b86e63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,14 +24,21 @@ documentation = "https://inscriptis.readthedocs.io/en" license = "Apache-2.0" readme = "README.rst" -# [tool.poetry.scripts] -# inscript = "scripts/inscript.py" +packages = [ + { include = "inscriptis" } +] + +[tool.poetry.scripts] +inscript = "inscriptis.cli.inscript:cli" [tool.poetry.dependencies] python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12" requests = ">=2.23.0" lxml = ">=4.5.0" +[tool.poetry.group.test.dependencies] +pytest = ">=7.2.0" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/src/inscriptis/cli/__init__.py b/src/inscriptis/cli/__init__.py new file mode 100644 index 0000000..e37fc78 --- /dev/null +++ b/src/inscriptis/cli/__init__.py @@ -0,0 +1 @@ +"""Inscriptis command line interface clients.""" diff --git a/scripts/inscript.py b/src/inscriptis/cli/inscript.py similarity index 85% rename from scripts/inscript.py rename to src/inscriptis/cli/inscript.py index 2e694a5..58a047f 100755 --- a/scripts/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -5,6 +5,7 @@ import argparse import sys from json import load, dumps +from typing import Optional from pathlib import Path import requests @@ -121,7 +122,32 @@ def get_parser(): return parser -if __name__ == "__main__": +def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[str]: + """ + Return the HTML content to convert. 
+ + Args: + url: URL to the HTML content, or None if the content is obtained from stdin. + encoding: used encoding. + + Returns: + The html_content or None, if no content could be extracted. + + """ + if not url: + return sys.stdin.read() + elif Path(url).is_file(): + with Path(url).open( + encoding=encoding or DEFAULT_ENCODING, errors="ignore" + ) as f: + return f.read() + elif url.startswith("http://") or url.startswith("https://"): + req = requests.get(url, timeout=timeout) + return req.content.decode(encoding or req.encoding) + + +def cli(): + """Run the inscript command line client.""" parser = get_parser() args = parser.parse_args() @@ -138,17 +164,7 @@ def get_parser(): ) sys.exit(0) - if not args.input: - html_content = sys.stdin.read() - elif Path(args.input).is_file(): - with Path(args.input).open( - encoding=args.encoding or DEFAULT_ENCODING, errors="ignore" - ) as f: - html_content = f.read() - elif args.input.startswith("http://") or args.input.startswith("https://"): - req = requests.get(args.input, timeout=args.timeout) - html_content = req.content.decode(args.encoding or req.encoding) - else: + if not (html_content := get_html_content(args.input, args.timeout, args.encoding)): print("ERROR: Cannot open input file '{0}'.\n".format(args.input)) parser.print_help() sys.exit(-1) From 7ba651326d8c26f4796421a9f71cd0d088f5b3b9 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 15 Jan 2024 08:23:00 +0100 Subject: [PATCH 18/29] fix: metadata handling. 
--- pyproject.toml | 4 ++-- src/inscriptis/cli/inscript.py | 41 +++++++++++++++++++--------------- src/inscriptis/metadata.py | 10 ++++++--- tests/test_metadata.py | 7 ++---- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2b86e63..a6aef78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ license = "Apache-2.0" readme = "README.rst" packages = [ - { include = "inscriptis" } + {include = "inscriptis", from="src"}, ] [tool.poetry.scripts] @@ -49,5 +49,5 @@ line-length = 88 target-version = ["py38", "py39", "py310", "py311", "py312"] extend-exclude = '\.html$|\.json$|\.txt$' include = ''' - ^/scripts/|^/src/|^/tests/ + ^/src/|^/tests/ ''' diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index 58a047f..a0d7fb7 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -33,8 +33,12 @@ def get_postprocessor(name): return getattr(mod, pp_class)() -def get_parser(): - """Parse the arguments if script is run via console.""" +def parse_command_line() -> argparse.Namespace: + """Parse the command line arguments. + + Returns: + The parsed command line arguments. + """ parser = argparse.ArgumentParser( description="Convert the given HTML document to text." 
) @@ -119,7 +123,22 @@ def get_parser(): default=False, help="display version information", ) - return parser + + # parse command line arguments + args = parser.parse_args() + if args.version: + print( + "Inscript HTML to text conversion (based on the inscriptis " + "library version {0})".format(__version__) + ) + print("Copyright (C)", __copyright__) + print("\nInscript comes with ABSOLUTELY NO WARRANTY.") + print( + "This is free software and you are welcome to redistribute it " + "under the terms of the {0}.".format(__license__) + ) + sys.exit(0) + return args def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[str]: @@ -149,21 +168,7 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s def cli(): """Run the inscript command line client.""" parser = get_parser() - args = parser.parse_args() - - if args.version: - print( - "Inscript HTML to text conversion (based on the inscriptis " - "library version {0})".format(__version__) - ) - print("Copyright (C)", __copyright__) - print("\nInscript comes with ABSOLUTELY NO WARRANTY.") - print( - "This is free software and you are welcome to redistribute it " - "under the terms of the {0}.".format(__license__) - ) - sys.exit(0) - + args = parse_command_line() if not (html_content := get_html_content(args.input, args.timeout, args.encoding)): print("ERROR: Cannot open input file '{0}'.\n".format(args.input)) parser.print_help() diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index f7112f0..f1c3747 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -1,7 +1,11 @@ """Inscriptis metadata information.""" +import importlib.metadata as metadata + +PACKAGE = "inscriptis" + __author__ = "Albert Weichselbraun, Fabian Odoni" __author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch" -__copyright__ = "2016-2024 Albert Weichselbraun, Fabian Odoni" -__license__ = "Apache 2.0" -__version__ = "2.3.3" +__copyright__ = 
f"{metadata.metadata(PACKAGE)['Name']} {metadata.metadata(PACKAGE)['Version']} © 2016-2023 {__author__}" +__license__ = metadata.metadata(PACKAGE)["License"] +__version__ = metadata.metadata(PACKAGE)["Version"] diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 2094695..d0f5a9a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -12,13 +12,10 @@ def test_metadata(): assert "Albert Weichselbraun" in __author__ assert "Fabian Odoni" in __author__ - assert "@" in __author_email__ - - assert "2016-" in __copyright__ assert "Albert Weichselbraun" in __copyright__ assert "Fabian Odoni" in __copyright__ - assert __license__ == "Apache 2.0" - + assert "@" in __author_email__ + assert __license__ == "Apache-2.0" assert __version__[0].isnumeric() assert "." in __version__ From dec34b567369dcd9e3be0907451d71e424fc46ea Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 15 Jan 2024 10:30:30 +0100 Subject: [PATCH 19/29] chg: improved README. --- README.rst | 2 +- benchmarking/run_benchmarking.py | 13 +++++++++++++ src/inscriptis/cli/inscript.py | 4 +--- src/inscriptis/metadata.py | 5 ++++- src/inscriptis/model/attribute.py | 2 +- src/inscriptis/model/canvas/block.py | 4 ++++ src/inscriptis/model/html_element.py | 16 ++++++++-------- src/inscriptis/model/table.py | 8 ++++---- 8 files changed, 36 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index a6fac94..116843e 100644 --- a/README.rst +++ b/README.rst @@ -315,7 +315,7 @@ The Flask Web Service translates HTML pages to the corresponding plain text. 
Run the Web Service on your host system --------------------------------------- -Provide additional requirement `python3-flask `_, then start the inscriptis Web service with the following command:: +Install the additional requirement `python3-flask `_, then start the inscriptis Web service with the following command:: $ export FLASK_APP="inscriptis.service.web" $ python3 -m flask run diff --git a/benchmarking/run_benchmarking.py b/benchmarking/run_benchmarking.py index dd09ce5..2de67f3 100755 --- a/benchmarking/run_benchmarking.py +++ b/benchmarking/run_benchmarking.py @@ -368,6 +368,7 @@ def benchmark(args, source_list): _setup_benchmarking_directories(args) output = [] + total_times = {} for source in source_list: source_name, html = _fetch_url(source, args.cache) @@ -383,10 +384,22 @@ def benchmark(args, source_list): save_to_file(converter.name, source_name, text, args.benchmarking_results) + for converter, conversion_time in times.items(): + total_times[converter] = total_times.get(converter, 0) + conversion_time speed_table = get_speed_table(times) print(speed_table) output.append(speed_table) + print('\nTotal') + output.append('\nTotal\n') + speed_table = get_speed_table(total_times) + print(speed_table) + output.append(speed_table) + + + + + with open(os.path.join(args.benchmarking_results, OUTFILE), 'w') as output_file: output_file.write('\n'.join(output) + '\n') diff --git a/src/inscriptis/cli/inscript.py b/src/inscriptis/cli/inscript.py index a0d7fb7..c2861dc 100755 --- a/src/inscriptis/cli/inscript.py +++ b/src/inscriptis/cli/inscript.py @@ -167,11 +167,9 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s def cli(): """Run the inscript command line client.""" - parser = get_parser() args = parse_command_line() if not (html_content := get_html_content(args.input, args.timeout, args.encoding)): - print("ERROR: Cannot open input file '{0}'.\n".format(args.input)) - parser.print_help() + print("ERROR: Cannot open input file 
'{0}'.".format(args.input)) sys.exit(-1) if args.annotation_rules: diff --git a/src/inscriptis/metadata.py b/src/inscriptis/metadata.py index f1c3747..c86c482 100644 --- a/src/inscriptis/metadata.py +++ b/src/inscriptis/metadata.py @@ -6,6 +6,9 @@ __author__ = "Albert Weichselbraun, Fabian Odoni" __author_email__ = "albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch" -__copyright__ = f"{metadata.metadata(PACKAGE)['Name']} {metadata.metadata(PACKAGE)['Version']} © 2016-2023 {__author__}" +__copyright__ = ( + f"{metadata.metadata(PACKAGE)['Name']} " + + f"{metadata.metadata(PACKAGE)['Version']} © 2016-2023 {__author__}" +) __license__ = metadata.metadata(PACKAGE)["License"] __version__ = metadata.metadata(PACKAGE)["Version"] diff --git a/src/inscriptis/model/attribute.py b/src/inscriptis/model/attribute.py index f4f8efc..d8cf3f6 100644 --- a/src/inscriptis/model/attribute.py +++ b/src/inscriptis/model/attribute.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # encoding: utf-8 """HTML attribute handling.""" diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 59ba05f..6dc1361 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -42,6 +42,10 @@ def merge_normal_text(self, text: str) -> None: Args: text: the text to merge + + Note: + If the previous text ended with a whitespace and text starts with one, both + will automatically collapse into a single whitespace. 
""" normalized_text = [] diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 5d16e9d..91e9585 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -158,14 +158,14 @@ def get_refined_html_element(self, new: "HtmlElement") -> "HtmlElement": def __str__(self): return ( - "<{self.tag} prefix={self.prefix}, suffix={self.suffix}, " - "display={self.display}, margin_before={self.margin_before}, " - "margin_after={self.margin_after}, " - "padding_inline={self.padding_inline}, " - "list_bullet={self.list_bullet}, " - "whitespace={self.whitespace}, align={self.align}, " - "valign={self.valign}, annotation={self.annotation}>" - ).format(self=self) + f"<{self.tag} prefix={self.prefix}, suffix={self.suffix}, " + f"display={self.display}, margin_before={self.margin_before}, " + f"margin_after={self.margin_after}, " + f"padding_inline={self.padding_inline}, " + f"list_bullet={self.list_bullet}, " + f"whitespace={self.whitespace}, align={self.align}, " + f"valign={self.valign}, annotation={self.annotation}>" + ) __repr__ = __str__ diff --git a/src/inscriptis/model/table.py b/src/inscriptis/model/table.py index 073c626..75a2cd3 100644 --- a/src/inscriptis/model/table.py +++ b/src/inscriptis/model/table.py @@ -55,7 +55,7 @@ def normalize_blocks(self) -> int: return len(self.blocks) @property - def height(self): + def height(self) -> int: """Compute the table cell's height. Returns: @@ -64,7 +64,7 @@ def height(self): return max(1, len(self.blocks)) @property - def width(self): + def width(self) -> int: """Compute the table cell's width. 
Returns: @@ -184,7 +184,7 @@ def get_text(self) -> str: return "\n".join(row_lines) @property - def width(self): + def width(self) -> int: """Compute and return the width of the current row.""" if not self.columns: return 0 @@ -255,7 +255,7 @@ def _set_column_width(self): if len(row) > cur_column_idx: row.columns[cur_column_idx].width = max_column_width - def get_text(self): + def get_text(self) -> str: """Return and render the text of the given table.""" if not self.rows: return "\n" From 161035dd0bf915861c35b5aded8eb7eb5f8d7f69 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Mon, 15 Jan 2024 20:06:06 +0100 Subject: [PATCH 20/29] chg: removed setup.py. --- setup.py | 60 -------------------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index d3ec856..0000000 --- a/setup.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -"""Inscriptis setup script.""" - -from pathlib import Path -from setuptools import setup, find_packages -from os import path - - -here = Path(path.dirname(__file__)).resolve() -# get version information -with here.joinpath('src/inscriptis/metadata.py').open() as f: - exec(f.read()) - -# Get the long description from the README.md file -with here.joinpath('README.rst').open() as f: # , encoding='utf-8' - long_description = f.read() - -setup( - # Metadata - name='inscriptis', - version=__version__, - description='inscriptis - HTML to text converter.', - long_description=long_description, - author=__author__, - author_email=__author_email__, - python_requires='>=3.8', - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: Apache Software License', - 'Topic :: Text Processing', - 'Topic :: Text Processing :: Markup :: HTML', - 'Topic :: Utilities', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming 
Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - ], - keywords='HTML,converter,text', - url='https://github.com/weblyzard/inscriptis', - license=__license__, - package_dir={'': 'src'}, - - # Package List - packages=find_packages('src'), - - # Scripts - scripts=[ - 'scripts/inscript.py' - ], - - # Requirements - install_requires=[ - 'lxml', - 'requests' - ], -) From 4dc04d5d1b194dd9774c3162b85e5079737d19f7 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 08:13:11 +0100 Subject: [PATCH 21/29] add: benchmarking.py to black config. --- benchmarking/run_benchmarking.py | 270 +++++++++++++++++-------------- pyproject.toml | 4 +- 2 files changed, 152 insertions(+), 122 deletions(-) diff --git a/benchmarking/run_benchmarking.py b/benchmarking/run_benchmarking.py index 2de67f3..f0f44af 100755 --- a/benchmarking/run_benchmarking.py +++ b/benchmarking/run_benchmarking.py @@ -21,17 +21,19 @@ # any installed module versions). # -LYNX_BIN = '/usr/bin/lynx' -LINKS_BIN = '/usr/bin/links' +LYNX_BIN = "/usr/bin/lynx" +LINKS_BIN = "/usr/bin/links" BENCHMARKING_ROOT = os.path.dirname(os.path.abspath(__file__)) -SRC_DIR = os.path.join(BENCHMARKING_ROOT, '../src') +SRC_DIR = os.path.join(BENCHMARKING_ROOT, "../src") sys.path.insert(0, os.path.abspath(SRC_DIR)) try: import inscriptis except ImportError: - print('Inscriptis is not available. Please install it in order to ' - 'compare with inscriptis.') + print( + "Inscriptis is not available. Please install it in order to " + "compare with inscriptis." + ) # # Import third-party HTML 2 text converters. @@ -39,22 +41,28 @@ try: from bs4 import BeautifulSoup except ImportError: - print('BeautifulSoup is not available. Please install it in order to ' - 'compare with BeautifulSoup.') + print( + "BeautifulSoup is not available. Please install it in order to " + "compare with BeautifulSoup." 
+ ) try: import html2text except ImportError: - print('html2text is not available. Please install it in order to ' - 'compare with html2text.') + print( + "html2text is not available. Please install it in order to " + "compare with html2text." + ) try: import justext except ImportError: - print('justext is not available. Please install it in order to compare ' - 'with justext.') + print( + "justext is not available. Please install it in order to compare " + "with justext." + ) TRIES = 7 -OUTFILE = 'speed_comparisons.txt' +OUTFILE = "speed_comparisons.txt" class AbstractHtmlConverter: @@ -86,22 +94,22 @@ class BeautifulSoupHtmlConverter(AbstractHtmlConverter): """ Converts HTML to text using BeautifulSoup. """ - name = 'BeautifulSoup' + + name = "BeautifulSoup" def __init__(self): - self.available = 'bs4' in sys.modules + self.available = "bs4" in sys.modules def get_text(self, html): - soup = BeautifulSoup(html, 'lxml') + soup = BeautifulSoup(html, "lxml") - for script in soup(['script', 'style']): + for script in soup(["script", "style"]): script.extract() text = soup.get_text() lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines - for phrase in line.split(' ')) - result = '\n'.join(chunk for chunk in chunks if chunk) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + result = "\n".join(chunk for chunk in chunks if chunk) return result @@ -109,67 +117,74 @@ class JustextConverter(AbstractHtmlConverter): """ Converts HTML to text using Justtext. 
""" - name = 'Justtext' + + name = "Justtext" def __init__(self): - self.available = 'justext' in sys.modules + self.available = "justext" in sys.modules def get_text(self, html): - paragraphs = justext.justext(html, stoplist='English') + paragraphs = justext.justext(html, stoplist="English") result = [paragraph.text for paragraph in paragraphs] - return '\n'.join(result) + return "\n".join(result) class Html2TextConverter(AbstractHtmlConverter): """ Converts HTML to text using Html2Text. """ - name = 'Html2Text' + + name = "Html2Text" def __init__(self): - self.available = 'html2text' in sys.modules + self.available = "html2text" in sys.modules def get_text(self, html): converter = html2text.HTML2Text() converter.ignore_links = True result = converter.handle(str(html)) - return ''.join(result) + return "".join(result) class LynxConverter(AbstractHtmlConverter): """ Converts HTML to text using lynx. """ - name = 'Lynx' + + name = "Lynx" def __init__(self): try: - subprocess.call([LYNX_BIN, '-dump \'www.google.com\''], - stdout=subprocess.PIPE) + subprocess.call( + [LYNX_BIN, "-dump 'www.google.com'"], stdout=subprocess.PIPE + ) self.available = True except OSError: - print('lynx can not be called. Please check in order to compare ' - 'with lynx.') + print( + "lynx can not be called. Please check in order to compare " "with lynx." 
+ ) self.available = False def get_text(self, html): - def kill_lynx(pid): os.kill(pid, signal.SIGKILL) os.waitpid(-1, os.WNOHANG) - print('lynx killed') - - lynx_args = '-stdin -width=20000 -force_html -nocolor -dump -nolist ' \ - '-nobold -display_charset=utf8' - cmd = [LYNX_BIN, ] + lynx_args.split(' ') - lynx = subprocess.Popen(cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - lynx.stdin.write(html.encode('utf8')) + print("lynx killed") + + lynx_args = ( + "-stdin -width=20000 -force_html -nocolor -dump -nolist " + "-nobold -display_charset=utf8" + ) + cmd = [ + LYNX_BIN, + ] + lynx_args.split(" ") + lynx = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + lynx.stdin.write(html.encode("utf8")) lynx.stdin.close() _t = threading.Timer(200.0, kill_lynx, args=[lynx.pid]) _t.start() - text = lynx.stdout.read().decode('utf-8', 'replace') + text = lynx.stdout.read().decode("utf-8", "replace") _t.cancel() return text @@ -178,34 +193,38 @@ class LinksConverter(AbstractHtmlConverter): """ Converts HTML to text using links. """ - name = 'Links' + + name = "Links" def __init__(self): try: - subprocess.call([LINKS_BIN, '-dump \'www.google.com\''], - stdout=subprocess.PIPE) + subprocess.call( + [LINKS_BIN, "-dump 'www.google.com'"], stdout=subprocess.PIPE + ) self.available = True except OSError: - print('links can not be called. Please check in order to compare ' - 'with links.') + print( + "links can not be called. Please check in order to compare " + "with links." 
+ ) self.available = False def get_text(self, html): - def kill_links(pid): os.kill(pid, signal.SIGKILL) os.waitpid(-1, os.WNOHANG) - print('links killed') - - links_args= '-dump ' - cmd = [LINKS_BIN, ] + links_args.split(' ') - links = subprocess.Popen(cmd, stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - links.stdin.write(html.encode('utf8')) + print("links killed") + + links_args = "-dump " + cmd = [ + LINKS_BIN, + ] + links_args.split(" ") + links = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + links.stdin.write(html.encode("utf8")) links.stdin.close() _t = threading.Timer(200.0, kill_links, args=[links.pid]) _t.start() - text = links.stdout.read().decode('utf-8', 'replace') + text = links.stdout.read().decode("utf-8", "replace") _t.cancel() return text @@ -214,32 +233,33 @@ class InscriptisHtmlConverter(AbstractHtmlConverter): """ Converts HTML to text using Inscriptis. """ - name = 'Inscriptis' + + name = "Inscriptis" def __init__(self): - self.available = 'inscriptis' in sys.modules + self.available = "inscriptis" in sys.modules if self.available: from inscriptis import get_text + self.get_text = get_text def get_text(self, html): return self.get_text(html) -timestamp = str(datetime.now()).replace(' ', '_').replace(':', '-')\ - .split('.')[0] -DEFAULT_RESULT_DIR = os.path.join(BENCHMARKING_ROOT, 'benchmarking_results', - timestamp) -DEFAULT_CACHE_DIR = os.path.join(BENCHMARKING_ROOT, 'html_cache') +timestamp = str(datetime.now()).replace(" ", "_").replace(":", "-").split(".")[0] +DEFAULT_RESULT_DIR = os.path.join(BENCHMARKING_ROOT, "benchmarking_results", timestamp) +DEFAULT_CACHE_DIR = os.path.join(BENCHMARKING_ROOT, "html_cache") def save_to_file(algorithm, url, data, benchmarking_results_dir): """ Saves a benchmarking result to the given file. 
""" - result_file = os.path.join(benchmarking_results_dir, - '{}_{}.txt'.format(algorithm, url)) - with open(result_file, 'w') as output_file: + result_file = os.path.join( + benchmarking_results_dir, "{}_{}.txt".format(algorithm, url) + ) + with open(result_file, "w") as output_file: output_file.write(data) @@ -251,19 +271,22 @@ def get_speed_table(times): longest_key = max(len(key) for key, _ in times.items()) longest_value = max(len(str(value)) for _, value in times.items()) - result = '' + result = "" for key, value in sorted(times.items(), key=operator.itemgetter(1)): difference = value - fastest if difference == 0: - difference = '--> fastest' + difference = "--> fastest" else: - difference = '{0:+f}'.format(difference) + difference = "{0:+f}".format(difference) - output = '{}{}: {}{} {}'.format(key, ' ' * (longest_key - len(key)), - value, ' ' * (longest_value - - len(str(value))), - difference) - result += output + '\n' + output = "{}{}: {}{} {}".format( + key, + " " * (longest_key - len(key)), + value, + " " * (longest_value - len(str(value))), + difference, + ) + result += output + "\n" return result @@ -272,46 +295,54 @@ def get_fname(url) -> str: """ Transforms a URL to a file name. """ - trash = (('http://', ''), - ('https://', ''), - ('/', '-'), - (':', '-'), - ('%', '')) + trash = (("http://", ""), ("https://", ""), ("/", "-"), (":", "-"), ("%", "")) for key, value in trash: url = url.replace(key, value) return url[0:100] -CONVERTER = (BeautifulSoupHtmlConverter(), - JustextConverter(), - Html2TextConverter(), - LynxConverter(), - LinksConverter(), - InscriptisHtmlConverter()) +CONVERTER = ( + BeautifulSoupHtmlConverter(), + JustextConverter(), + Html2TextConverter(), + LynxConverter(), + LinksConverter(), + InscriptisHtmlConverter(), +) def parse_args(): """ Parse optional benchmarking arguments. 
""" - parser = argparse.ArgumentParser(description='Inscriptis benchmarking ' - 'suite') - parser.add_argument('converter', type=str, nargs='*', - help='The list of converters to benchmark (options:' - 'BeautifulSoup, Justext, Html2Text, Lynx, ' - 'Inscriptis; default: all)') - parser.add_argument('-u', '--benchmarking-urls', - default=os.path.join(BENCHMARKING_ROOT, - 'url_list.txt'), - help='A list of URLs to use in the benchmark.') - parser.add_argument('-r', '--benchmarking-results', - default=DEFAULT_RESULT_DIR, - help='Optional directory for saving the benchmarking ' - 'results.') - parser.add_argument('-c', '--cache', default=DEFAULT_CACHE_DIR, - help='Optional cache directory for the retrieved Web ' - 'pages.') + parser = argparse.ArgumentParser(description="Inscriptis benchmarking " "suite") + parser.add_argument( + "converter", + type=str, + nargs="*", + help="The list of converters to benchmark (options:" + "BeautifulSoup, Justext, Html2Text, Lynx, " + "Inscriptis; default: all)", + ) + parser.add_argument( + "-u", + "--benchmarking-urls", + default=os.path.join(BENCHMARKING_ROOT, "url_list.txt"), + help="A list of URLs to use in the benchmark.", + ) + parser.add_argument( + "-r", + "--benchmarking-results", + default=DEFAULT_RESULT_DIR, + help="Optional directory for saving the benchmarking " "results.", + ) + parser.add_argument( + "-c", + "--cache", + default=DEFAULT_CACHE_DIR, + help="Optional cache directory for the retrieved Web " "pages.", + ) return parser.parse_args() @@ -348,10 +379,10 @@ def _fetch_url(url, cache_dir): else: req = urllib.request.Request(url) try: - html = urllib.request.urlopen(req).read().decode('utf-8') + html = urllib.request.urlopen(req).read().decode("utf-8") except UnicodeDecodeError: - html = urllib.request.urlopen(req).read().decode('latin1') - open(source_cache_path, 'w').write(html) + html = urllib.request.urlopen(req).read().decode("latin1") + open(source_cache_path, "w").write(html) return source_name, html @@ 
-372,17 +403,21 @@ def benchmark(args, source_list): for source in source_list: source_name, html = _fetch_url(source, args.cache) - print('\nURL: {}'.format(source_name)) - output.append('\nURL: {}\n'.format(source_name)) + print("\nURL: {}".format(source_name)) + output.append("\nURL: {}\n".format(source_name)) times = {} for converter in CONVERTER: - if converter.available and not args.converter or converter.name \ - in args.converter: + if ( + converter.available + and not args.converter + or converter.name in args.converter + ): time_required, text = converter.benchmark(html) times[converter.name] = time_required - save_to_file(converter.name, source_name, text, - args.benchmarking_results) + save_to_file( + converter.name, source_name, text, args.benchmarking_results + ) for converter, conversion_time in times.items(): total_times[converter] = total_times.get(converter, 0) + conversion_time @@ -390,22 +425,17 @@ def benchmark(args, source_list): print(speed_table) output.append(speed_table) - print('\nTotal') - output.append('\nTotal\n') + print("\nTotal") + output.append("\nTotal\n") speed_table = get_speed_table(total_times) print(speed_table) output.append(speed_table) + with open(os.path.join(args.benchmarking_results, OUTFILE), "w") as output_file: + output_file.write("\n".join(output) + "\n") - - - with open(os.path.join(args.benchmarking_results, - OUTFILE), 'w') as output_file: - output_file.write('\n'.join(output) + '\n') - - -if __name__ == '__main__': +if __name__ == "__main__": # These are a few predefined urls the script will cmdline_args = parse_args() with open(cmdline_args.benchmarking_urls) as url_list: diff --git a/pyproject.toml b/pyproject.toml index a6aef78..010dd1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 88 target-version = ["py38", "py39", "py310", "py311", "py312"] -extend-exclude = '\.html$|\.json$|\.txt$' +extend-exclude = 
'\.html$|\.json$|\.txt$|/a$|/b$' include = ''' - ^/src/|^/tests/ + ^/src/|^/tests/|^/benchmarking/ ''' From 61b93f4fc9752b19fc91bfbf5f4fc5682085833d Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:24:30 +0100 Subject: [PATCH 22/29] chg: migrate webservice to fastapi. --- .gitignore | 1 + pyproject.toml | 18 ++++++++++++++---- requirements.txt | 2 -- src/inscriptis/service/web.py | 33 ++++++++++++++++++++------------- 4 files changed, 35 insertions(+), 19 deletions(-) delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 18e246b..53ece72 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ tests/reference.txt *.c docs/paper/*.pdf htmlcov/ +poetry.lock diff --git a/pyproject.toml b/pyproject.toml index 010dd1d..e43429c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,21 +28,31 @@ packages = [ {include = "inscriptis", from="src"}, ] + [tool.poetry.scripts] inscript = "inscriptis.cli.inscript:cli" +inscriptis-api = "inscriptis.service.web:start" + + +[tool.poetry.extras] +web-service = ["fastapi", "uvicorn"] + [tool.poetry.dependencies] python = "^3.8 || ^3.9 || ^3.10 || ^3.11 || ^3.12" -requests = ">=2.23.0" -lxml = ">=4.5.0" +requests = ">=2.31.0" +lxml = ">=4.9.3" + +# optional dependencies +fastapi = { version = "^0.109.0", optional = true } +uvicorn = { version = "^0.25.0", optional = true } -[tool.poetry.group.test.dependencies] -pytest = ">=7.2.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + # code formatting with black [tool.black] line-length = 88 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a3596c0..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -lxml -requests diff --git a/src/inscriptis/service/web.py b/src/inscriptis/service/web.py index bb54665..fdf47ca 100755 --- a/src/inscriptis/service/web.py +++ b/src/inscriptis/service/web.py @@ -2,14 +2,15 @@ # coding:utf-8 """Inscriptis Web Service.""" -from 
flask import request, Response, Flask +from fastapi import FastAPI, Request +from fastapi.responses import PlainTextResponse from inscriptis import get_text from inscriptis.metadata import __version__ from inscriptis.css_profiles import RELAXED_CSS_PROFILE from inscriptis.model.config import ParserConfig -app = Flask(__name__) +app = FastAPI() CONFIG = ParserConfig( css=RELAXED_CSS_PROFILE, display_images=True, @@ -18,31 +19,37 @@ ) -@app.route("/") +@app.get("/") def index(): """Print a short status message for the Web service's base URL.""" return "Inscriptis text to HTML Web service." -@app.route("/get_text", methods=["POST"]) -def get_text_call(): +@app.post("/get_text", response_class=PlainTextResponse) +async def get_text_call(request: Request): """Return the text representation of the given HTML content.""" - content_type = request.headers["Content-type"] + content_type = request.headers.get("Content-type") if "; encoding=" in content_type: encoding = content_type.split("; encoding=")[1] else: encoding = "UTF-8" - html_content = request.data.decode(encoding, errors="ignore") - text = get_text(html_content, CONFIG) - return Response(text, mimetype="text/plain") + html_content = await request.body() + return get_text(html_content.decode(encoding, errors="ignore"), CONFIG) -@app.route("/version", methods=["GET"]) +@app.get("/version", response_class=PlainTextResponse) def get_version_call(): """Return the used inscriptis version.""" - return Response(__version__ + "\n", mimetype="text/plain") + return __version__ -if __name__ == "__main__": +def start(): + """Start the webservice.""" + import uvicorn + print("Starting Web service based on Inscriptis", __version__) - app.run(threaded=True, host="127.0.0.1", port=5000) + uvicorn.run(app, host="127.0.0.1", port=5000) + + +if __name__ == "__main__": + start() From 789e0bc58ee418b0e44cea35b157453b221ea354 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:25:07 +0100 Subject: [PATCH 23/29] 
chg: use inscriptis package rather than local build. --- Dockerfile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 70e30e5..089e929 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,10 +4,8 @@ FROM python:3.11-slim-bullseye AS builder WORKDIR /inscriptis -COPY requirements.txt . RUN python -m venv .venv && .venv/bin/python -m pip install --upgrade pip -RUN .venv/bin/pip install --no-cache-dir -r requirements.txt && \ - .venv/bin/pip install --no-cache-dir Flask waitress && \ +RUN .venv/bin/pip install --no-cache-dir inscriptis[web-service] && \ find /inscriptis/.venv \( -type d -a -name test -o -name tests \) -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' \+ # @@ -18,10 +16,9 @@ LABEL maintainer="albert@weichselbraun.net" # Note: only copy the src directory, to prevent bloating the image with # irrelevant files from the project directory. -WORKDIR /inscriptis/src +WORKDIR /inscriptis COPY --from=builder /inscriptis /inscriptis -COPY ./src /inscriptis/src ENV PATH="/inscriptis/.venv/bin:$PATH" -CMD ["waitress-serve", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"] +CMD ["uvicorn", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"] EXPOSE 5000 From 49fd08b685cddc694a897d5d8b794e96b90799ff Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:31:40 +0100 Subject: [PATCH 24/29] chg: updated documentation on the Inscriptis Web service. --- README.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 116843e..7fc6ef2 100644 --- a/README.rst +++ b/README.rst @@ -311,14 +311,18 @@ Currently, inscriptis supports the following postprocessors: Web Service =========== -The Flask Web Service translates HTML pages to the corresponding plain text. +A FastAPI-based Web Service that uses Inscriptis for translating HTML pages to plain text. 
Run the Web Service on your host system --------------------------------------- -Install the additional requirement `python3-flask `_, then start the inscriptis Web service with the following command:: +Install the optional feature `web-service` for inscriptis:: + + $ pip install inscriptis[web-service] + +Start the Inscriptis Web service with the following command:: + + $ uvicorn inscriptis.service.web:app --port 5000 --host 127.0.0.1 - $ export FLASK_APP="inscriptis.service.web" - $ python3 -m flask run Run the Web Service with Docker ------------------------------- From 44461512aa906c3151c0dbe2af50fc5fbd093643 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:42:27 +0100 Subject: [PATCH 25/29] fix: package builds. --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1498480..7613ee1 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,9 +22,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install tox setuptools pytest pytest-cov codecov + python -m pip install tox setuptools pytest pytest-cov codecov poetry if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - python setup.py install + poetry build - name: Lint with tox run: | tox From a60df16810f77cefd0f7837532a6812461594b1c Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:43:16 +0100 Subject: [PATCH 26/29] chg: adapted documentation to inscript. --- README.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 7fc6ef2..87eca1e 100644 --- a/README.rst +++ b/README.rst @@ -131,9 +131,9 @@ the corresponding text representation. 
Command line parameters ----------------------- -The inscript.py command line client supports the following parameters:: +The inscript command line client supports the following parameters:: - usage: inscript.py [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION] + usage: inscript [-h] [-o OUTPUT] [-e ENCODING] [-i] [-d] [-l] [-a] [-r ANNOTATION_RULES] [-p POSTPROCESSOR] [--indentation INDENTATION] [--table-cell-separator TABLE_CELL_SEPARATOR] [-v] [input] @@ -172,19 +172,19 @@ HTML to text conversion ----------------------- convert the given page to text and output the result to the screen:: - $ inscript.py https://www.fhgr.ch + $ inscript https://www.fhgr.ch convert the file to text and save the output to fhgr.txt:: - $ inscript.py fhgr.html -o fhgr.txt + $ inscript fhgr.html -o fhgr.txt convert the file using strict indentation (i.e., minimize indentation and extra spaces) and save the output to fhgr-layout-optimized.txt:: - $ inscript.py --indentation strict fhgr.html -o fhgr-layout-optimized.txt + $ inscript --indentation strict fhgr.html -o fhgr-layout-optimized.txt convert HTML provided via stdin and save the output to output.txt:: - $ echo "

<body><p>Make it so!</p></body>

" | inscript.py -o output.txt + $ echo "

<body><p>Make it so!</p></body>

" | inscript -o output.txt HTML to annotated text conversion @@ -193,7 +193,7 @@ convert and annotate HTML from a Web page using the provided annotation rules. Download the example `annotation-profile.json `_ and save it to your working directory:: - $ inscript.py https://www.fhgr.ch -r annotation-profile.json + $ inscript https://www.fhgr.ch -r annotation-profile.json The annotation rules are specified in `annotation-profile.json`: @@ -241,7 +241,7 @@ Annotation postprocessors enable the post processing of annotations to formats that are suitable for your particular application. Post processors can be specified with the ``-p`` or ``--postprocessor`` command line argument:: - $ inscript.py https://www.fhgr.ch \ + $ inscript https://www.fhgr.ch \ -r ./examples/annotation-profile.json \ -p surface @@ -286,7 +286,7 @@ Currently, inscriptis supports the following postprocessors: .. code-block:: bash - inscript.py --annotation-rules ./wikipedia.json \ + inscript --annotation-rules ./wikipedia.json \ --postprocessor html \ https://en.wikipedia.org/wiki/Chur.html @@ -503,7 +503,7 @@ The following options are available for fine tuning inscriptis' HTML rendering: 1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the parameter ``indentation='extended'`` to also use indentation for tags such as ``
<div>`` and ``<span>`` that do not provide indentation in their standard - definition. This strategy is the default in ``inscript.py`` and many other + definition. This strategy is the default in ``inscript`` and many other tools such as Lynx. If you do not want extended indentation you can use the parameter ``indentation='standard'`` instead. From 2b6c4494de4e2ce86c36d79f14f4b110caf26b81 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:46:39 +0100 Subject: [PATCH 27/29] fix: build process. --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7613ee1..4364f97 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ jobs: python -m pip install --upgrade pip python -m pip install tox setuptools pytest pytest-cov codecov poetry if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - poetry build + poetry install - name: Lint with tox run: | tox From 34eeb7875c2e3a508c5884b1623f80a191700bed Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:53:10 +0100 Subject: [PATCH 28/29] chg: improved build process. 
--- .github/workflows/python-package.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4364f97..bae9208 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -22,12 +22,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install tox setuptools pytest pytest-cov codecov poetry + python -m pip install tox setuptools pytest pytest-cov codecov if [ -f requirements.txt ]; then pip install -r requirements.txt; fi poetry install - name: Lint with tox run: | tox - - name: Test with pytest - run: | - py.test --cov=inscripits ./tests && codecov From c0630ffb487e0972336c4c88a4b53f6800d3a41d Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 16 Jan 2024 15:56:44 +0100 Subject: [PATCH 29/29] fix: improved build process. --- .github/workflows/python-package.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bae9208..c2cb2d5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,12 +19,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install build environment run: | python -m pip install --upgrade pip python -m pip install tox setuptools pytest pytest-cov codecov - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - poetry install - - name: Lint with tox + - name: Build and test with tox. run: | tox