From 4846792138f7263d01cade88f7aebcc87df038a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= <michal@cihar.com>
Date: Sat, 18 Jun 2022 17:35:49 +0200
Subject: [PATCH] Re-use decoded buffer for single byte character sets (#175)

* Re-use decoded buffer for short texts

This avoids issues with detecting string boundaries while improving
performance (avoids multiple decoding of the sequence).

Fixes #174

* :bookmark: Bump version to 2.1.0.dev0

* :bug: Work around a potential bug in Python's isspace character table

 Bug discovered in Python: Zero Width No-Break Space (U+FEFF), located in Arabic Presentation Forms-B (Unicode 1.1), is not acknowledged as a space.


Co-authored-by: TAHRI Ahmed R <Ousret@users.noreply.github.com>
Co-authored-by: Ahmed TAHRI <ahmed.tahri@cloudnursery.dev>
---
 CHANGELOG.md                  |   8 ++
 charset_normalizer/api.py     |  82 +++++---------
 charset_normalizer/utils.py   |  62 ++++++++++-
 charset_normalizer/version.py |   2 +-
 data/sample-polish.txt        | 204 ++++++++++++++++++++++++++++++++++
 tests/test_full_detection.py  |   1 +
 6 files changed, 304 insertions(+), 55 deletions(-)
 create mode 100644 data/sample-polish.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bfb08756..52b32f04 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [2.1.0.dev0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...master) (2022-??-??)
+
+### Changed
+- Re-use decoded buffer for single byte character sets (PR #175)
+
+### Fixed
+- Work around a potential bug in CPython where Zero Width No-Break Space (U+FEFF), located in Arabic Presentation Forms-B (Unicode 1.1), is not acknowledged as a space (PR #175)
+
 ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
 
 ### Fixed
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index bdc8ed98..2a82e5e7 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -18,6 +18,7 @@
 from .models import CharsetMatch, CharsetMatches
 from .utils import (
     any_specified_encoding,
+    cut_sequence_chunks,
     iana_name,
     identify_sig_or_bom,
     is_cp_similar,
@@ -285,63 +286,38 @@ def from_bytes(
         md_chunks = []  # type: List[str]
         md_ratios = []
 
-        for i in r_:
-            if i + chunk_size > length + 8:
-                continue
-
-            cut_sequence = sequences[i : i + chunk_size]
-
-            if bom_or_sig_available and strip_sig_or_bom is False:
-                cut_sequence = sig_payload + cut_sequence
-
-            try:
-                chunk = cut_sequence.decode(
-                    encoding_iana,
-                    errors="ignore" if is_multi_byte_decoder else "strict",
-                )  # type: str
-            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
-                logger.log(
-                    TRACE,
-                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
-                    encoding_iana,
-                    str(e),
-                )
-                early_stop_count = max_chunk_gave_up
-                lazy_str_hard_failure = True
-                break
+        try:
+            for chunk in cut_sequence_chunks(
+                sequences,
+                encoding_iana,
+                r_,
+                chunk_size,
+                bom_or_sig_available,
+                strip_sig_or_bom,
+                sig_payload,
+                is_multi_byte_decoder,
+                decoded_payload,
+            ):
+                md_chunks.append(chunk)
 
-            # multi-byte bad cutting detector and adjustment
-            # not the cleanest way to perform that fix but clever enough for now.
-            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+                md_ratios.append(mess_ratio(chunk, threshold))
 
-                chunk_partial_size_chk = min(chunk_size, 16)  # type: int
+                if md_ratios[-1] >= threshold:
+                    early_stop_count += 1
 
-                if (
-                    decoded_payload
-                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                if (early_stop_count >= max_chunk_gave_up) or (
+                    bom_or_sig_available and strip_sig_or_bom is False
                 ):
-                    for j in range(i, i - 4, -1):
-                        cut_sequence = sequences[j : i + chunk_size]
-
-                        if bom_or_sig_available and strip_sig_or_bom is False:
-                            cut_sequence = sig_payload + cut_sequence
-
-                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
-                        if chunk[:chunk_partial_size_chk] in decoded_payload:
-                            break
-
-            md_chunks.append(chunk)
-
-            md_ratios.append(mess_ratio(chunk, threshold))
-
-            if md_ratios[-1] >= threshold:
-                early_stop_count += 1
-
-            if (early_stop_count >= max_chunk_gave_up) or (
-                bom_or_sig_available and strip_sig_or_bom is False
-            ):
-                break
+                    break
+        except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+            logger.log(
+                TRACE,
+                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                encoding_iana,
+                str(e),
+            )
+            early_stop_count = max_chunk_gave_up
+            lazy_str_hard_failure = True
 
         # We might want to check the sequence again with the whole content
         # Only if initial MD tests passes
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 56ac246b..c8ecb0f4 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -9,7 +9,7 @@
 from encodings.aliases import aliases
 from functools import lru_cache
 from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union
 
 from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
 
@@ -204,6 +204,8 @@ def is_unprintable(character: str) -> bool:
         character.isspace() is False  # includes \n \t \r \v
         and character.isprintable() is False
         and character != "\x1A"  # Why? Its the ASCII substitute character.
+        and character != b"\xEF\xBB\xBF".decode("utf_8")  # bug discovered in Python,
+        # Zero Width No-Break Space (U+FEFF), located in Arabic Presentation Forms-B (Unicode 1.1), is not acknowledged as a space.
     )
 
 
@@ -350,3 +352,61 @@ def set_logging_handler(
     handler = logging.StreamHandler()
     handler.setFormatter(logging.Formatter(format_string))
     logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+    # Yield a text chunk per offset: reuse decoded_payload if single-byte, else decode.
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]  # no re-decoding needed
+            if not chunk:
+                break  # offset is past the end of the decoded text
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue  # chunk end is more than 8 past the buffer end
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+                chunk_partial_size_chk = min(chunk_size, 16)  # type: int
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):  # j = i, i-1, i-2, i-3
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break  # chunk prefix now found in decoded text
+
+            yield chunk
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index 77cfff25..51bf52e2 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.12"
+__version__ = "2.1.0.dev0"
 VERSION = __version__.split(".")
diff --git a/data/sample-polish.txt b/data/sample-polish.txt
new file mode 100644
index 00000000..9e506c26
--- /dev/null
+++ b/data/sample-polish.txt
@@ -0,0 +1,204 @@
+"source";"target"
+"﻿REF.-2";"POLISH"
+"KW-P00-01";"SYSTEM VIDEODOMOFONOWY MEET"
+"KW-P00-02";"URZĄDZENIE"
+"KW-P00-03";"OGÓLNE"
+"KW-P00-04";"SIEĆ"
+"KW-P00-05";"KD"
+"KW-P00-06";"ROZP. TWARZY."
+"KW-P00-07";"KAMERY IP"
+"KW-P00-08";"SIP"
+"KW-P00-09";"SIP TRUNK"
+"KW-P00-10";"PRZEKIEROWANIA"
+"KW-P00-11";"ZAAWANSOWANE"
+"KW-P00-12";"KOD PIN"
+"KW-P00-13";"WECHAT QR"
+"KW-P00-14";"PRZYWRACAĆ"
+"KW-P00-16";"WINDA"
+"KW-P01-01";"INFORMACJE O URZĄDZENIU"
+"KW-P01-02";"PANEL VIDEO FOOBAR KIN"
+"KW-P01-03";"FIRMWARE: V02.10"
+"KW-P01-04";"URZĄDZENIE: PANEL BLOKOWY-CYFROWY 001-02"
+"KW-P01-05";"URZĄDZENIE: PANEL BLOKOWY PRZYCISKI 020-02"
+"KW-P01-06";"URZĄDZENIE: PANEL GŁÓWNY 01"
+"KW-P01-07";"URZĄDZENIE: PANEL 1W 006-0102-01"
+"KW-P01-08";"NUMER SERYJNY:"
+"KW-P01-09";"MAC:"
+"KW-P01-10";"IP:"
+"KW-P01-11";"COPYRIGHT © FOOBAR                  "
+"KW-P01-12";"www.example.com"
+"KW-P02-01";"USTAWIENIA GŁÓWNE"
+"KW-P02-02";"TYP:"
+"KW-P02-03";"PANEL GŁÓWNY"
+"KW-P02-04";"CYFROWY P. BLOKOWY"
+"KW-P02-05";"P. BLOK. PRZYCISKI"
+"KW-P02-06";"PANEL 1NR"
+"KW-P02-07";"BLOK:"
+"KW-P02-08";"LOKAL:"
+"KW-P02-09";"MONIT WYŚWIETLACZA:"
+"KW-P02-10";"THIS INTERFACE IS NOT ENABLED"
+"KW-P02-11";"NUMER PANELU:"
+"KW-P02-12";"NAZWA URZĄDZENIA:"
+"KW-P02-13";"(≤16 ZNAKÓW)"
+"KW-P02-14";"JĘZYK:"
+"KW-P02-15";"ENGLISH"
+"KW-P02-16";"中文"
+"KW-P02-17";"ESPAÑOL"
+"KW-P02-18";"РУССКИЙ"
+"KW-P02-19";"DEUTSCH"
+"KW-P02-20";"TÜRKÇE"
+"KW-P02-21";"POLSKI"
+"KW-P02-22";"עברית"
+"KW-P02-23";"FRANÇAIS"
+"KW-P02-24";"فارسی"
+"KW-P02-25";"GŁOŚNOŚĆ PANELU:"
+"KW-P02-26";"JASNOŚĆ"
+"KW-P02-27";"ROZDZIELCZOŚĆ VIDEO:"
+"KW-P02-28";"TRYB PRZEKIEROWANIA SIP:"
+"KW-P02-29";"SEKWENCYJNE"
+"KW-P02-30";"JEDNOCZESNE"
+"KW-P02-31";"PORTIER:"
+"KW-P02-32";"PORTIERNIA 1:"
+"KW-P02-33";"PORTIERNIA 2:"
+"KW-P02-34";"USTAW. DATY I CZASU"
+"KW-P02-35";"FORMAT DATY:"
+"KW-P02-36";"DATA:"
+"KW-P02-37";"CZAS:"
+"KW-P02-38";"STREFA CZASOWA:"
+"KW-P02-39";"ZAPISZ"
+"KW-P02-40";"BŁĘDNE DANE"
+"KW-P02-41";"KLAWIATURA ALFANUM.:"
+"KW-P02-42";"KOMUNIKAT OTWARCIA DRZWI:"
+"KW-P02-43";"WYGASZACZ EKRANU:"
+"KW-P02-44";"WSPARCIE:"
+"KW-P02-45";"OCZEKIWANIE"
+"KW-P02-46";"POŁĄCZENIE"
+"KW-P02-47";"WSPARCIE"
+"KW-P02-48";"lista"
+"KW-P02-49";"DST:"
+"KW-P02-57";"TŁO:"
+"KW-P02-58";"CIEMNE"
+"KW-P02-59";"JASNE"
+"KW-P02-60";"IMPORT"
+"KW-P02-61";"EKSPORT"
+"KW-P02-62";"USUŃ"
+"KW-P02-63";"WYBIERZ PRAWIDŁOWY PLIK PNG"
+"KW-P02-64";"IMPORTUJ"
+"KW-P02-65";"WYSYŁANIE ZAKOŃCZONE"
+"KW-P02-66";"BRAK OBRAZU"
+"KW-P02-67";"USUNIĘTE"
+"KW-P02-68";"BŁĄD USUWANIA"
+"KW-P03-01";"USTAWIENIA SIECI"
+"KW-P03-02";"IP:"
+"KW-P03-03";"MASKA:"
+"KW-P03-04";"BRAMA:"
+"KW-P03-05";"DNS:"
+"KW-P03-06";"SOFTWARE IP:"
+"KW-P03-07";"SW. PIN:"
+"KW-P03-08";"ZAPISZ"
+"KW-P04-01";"USTAWIENIA KONTROLI DOSTĘPU"
+"KW-P04-02";"PRZYCISK EGRESS:"
+"KW-P04-03";"CZAS ELEKTROZACZEPU:"
+"KW-P04-04";"CZAS KONTAKTRONU:"
+"KW-P04-05";"REF.1491 4 RELAY:"
+"KW-P04-06";"CZAS ELEKTROZACZEPU:"
+"KW-P04-07";"CZAS KONTAKTRONU:"
+"KW-P04-08";"KARTA ADMINISTRATORA:"
+"KW-P04-09";"ROZBRAJANIE KARTĄ:"
+"KW-P04-10";"MONITY KART:"
+"KW-P04-11";"KOD GOŚCIA:"
+"KW-P04-12";"KOD DOSTĘPU:"
+"KW-P04-13";"#1"
+"KW-P04-14";"#2"
+"KW-P04-15";"#3"
+"KW-P04-16";"#4"
+"KW-P04-17";"ALARM DRZWI"
+"KW-P04-18";"GWAŁTOWNY ALARM OTWARCIA"
+"KW-P04-19";"WIEGAND:"
+"KW-P04-20";"BURST"
+"KW-P04-21";"26-BIT"
+"KW-P04-22";"FACILITY:"
+"KW-P04-24";"ZAPISZ"
+"KW-P04-25";"WYŁĄCZONY"
+"KW-P04-26";"REF.1490 2 RELAY:"
+"KW-P04-27";"KOD QR:"
+"KW-P04-28";"WIEGAND:"
+"KW-P04-29";"26-BIT"
+"KW-P04-30";"34-BIT"
+"KW-P04-31";"KOD MIEJSCA:"
+"KW-P04-32";"AUTO AKTYWACJA:"
+"KW-P04-33";"BŁĘDNE DANE"
+"KW-P05-01";"ROZPOZNAWANIE TWARZY"
+"KW-P05-02";"ROZPOZNAWANIE TWARZY:"
+"KW-P05-04";"MODEL:"
+"KW-P05-05";"Wykrycie obecności:"
+"KW-P05-06";"WŁĄCZONY"
+"KW-P05-07";"WYŁĄCZONY"
+"KW-P05-08";"PODOBIEŃSTWO:"
+"KW-P05-09";"NISKIE"
+"KW-P05-10";"ŚREDNIE"
+"KW-P05-11";"WYSOKIE"
+"KW-P05-12";"ZAPISZ"
+"KW-P06-01";"USTAWIENIA KAMER IP"
+"KW-P06-02";"ILOŚĆ KAMER:"
+"KW-P06-03";"KAMERA"
+"KW-P06-04";"URL:"
+"KW-P06-05";"ZAPISZ"
+"KW-P07-01";"USTAWIENIA SIP"
+"KW-P07-02";"WŁĄCZ SIP:"
+"KW-P07-03";"SPRAWDŹ STATUS SIP"
+"KW-P07-04";"SIP ZAREJESTROWANY"
+"KW-P07-05";"BŁĄD REJESTRACJI SIP"
+"KW-P07-06";"SERWER SIP:"
+"KW-P07-07";"DOMENA:"
+"KW-P07-08";"OUTBOUND:"
+"KW-P07-09";"STUN IP:"
+"KW-P07-10";"PORT STUN:"
+"KW-P07-11";"H.264:"
+"KW-P07-12";"UŻYTKOWNIK SIP:"
+"KW-P07-13";"HASŁO SIP:"
+"KW-P07-14";"CZAS ROZMOWY:"
+"KW-P07-15";"CZAS DZWONIENIA:"
+"KW-P07-16";"ZAPISZ"
+"KW-P08-01";"USTAWIENIA SIP TRUNK"
+"KW-P08-02";"WŁĄCZ SIP TRUNK:"
+"KW-P08-03";"URL:"
+"KW-P08-04";"ZAPISZ"
+"KW-P09-01";"USTAWIENIA PRZEKIEROWAŃ"
+"KW-P09-02";"IMPORT"
+"KW-P09-03";"EKSPORT"
+"KW-P09-04";"APARTAMENT"
+"KW-P09-05";"NUMER"
+"KW-P10-01";"USTAWIENIA ZAAWANSOWANE"
+"KW-P10-02";"SZYBKIE WYBIERANIE:"
+"KW-P10-03";"URL:"
+"KW-P10-04";"ONU:"
+"KW-P10-05";"MAPOWANIE POŁĄCZEŃ:"
+"KW-P10-06";"BIAŁA LISTA:"
+"KW-P10-07";"Lista telefoniczna:"
+"KW-P10-08";"IMPORT"
+"KW-P10-09";"EKSPORT"
+"KW-P10-10";"IMPORTUJ"
+"KW-P10-11";"WYSYŁANIE ZAKOŃCZONE"
+"KW-P10-12";"UŻYJ WŁAŚCIWEGO PLIKU CSV."
+"KW-P10-13";"OK"
+"KW-P10-14";"ZAPISZ"
+"KW-P11-01";"USTAWIENIA KODU PIN"
+"KW-P11-02";"OBECNY PIN:"
+"KW-P11-03";"NOWY PIN:"
+"KW-P11-04";"POTWIERDŹ PIN:"
+"KW-P11-05";"ZAPISZ"
+"KW-P12-01";"WECHAT QR"
+"KW-P12-02";"WŁĄCZ"
+"KW-P12-03";"UUID:"
+"KW-P12-04";"HASŁO:"
+"KW-P12-05";"SERWER:"
+"KW-P12-06";"WŁĄCZ CZYTNIK QR:"
+"KW-P12-07";"STATUS:"
+"KW-P12-08";"REJESTRACJA POMYŚLNIE"
+"KW-P12-09";"REJESTRACJA NIE POWIODŁA SIĘ"
+"KW-P12-10";"ZAPISZ"
+"KW-P13-01";"PRZYWRACAĆ"
+"KW-P13-02";"PRZYWRÓCIĆ USTAWIENIA FABRYCZNE"
+"KW-P13-03";"POTWIERDZAĆ PRZYWRÓĆ USTAWIENIA FABRYCZNE?"
+"KW-P13-04";"URZĄDZENIE REBOOT"
diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py
index 218080bf..96e0b797 100644
--- a/tests/test_full_detection.py
+++ b/tests/test_full_detection.py
@@ -22,6 +22,7 @@
         ('sample-turkish.txt', 'cp1254', 'Turkish'),
         ('sample-russian-2.txt', 'utf_8', 'Russian'),
         ('sample-russian.txt', 'mac_cyrillic', 'Russian'),
+        ('sample-polish.txt', 'utf_8', 'Polish'),
     ]
 )
 def test_elementary_detection(