From 4846792138f7263d01cade88f7aebcc87df038a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Sat, 18 Jun 2022 17:35:49 +0200 Subject: [PATCH] Re-use decoded buffer for single byte character sets (#175) * Re-use decoded buffer for short texts This avoids issues with detecting string boundaries while improving performance (avoids multiple decoding of the sequence). Fixes #174 * :bookmark: Bump version to 2.1.0.dev0 * :bug: Workaround a potential bug in Python isspace table character bug discovered in Python, Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space. Co-authored-by: TAHRI Ahmed R Co-authored-by: Ahmed TAHRI --- CHANGELOG.md | 8 ++ charset_normalizer/api.py | 82 +++++--------- charset_normalizer/utils.py | 62 ++++++++++- charset_normalizer/version.py | 2 +- data/sample-polish.txt | 204 ++++++++++++++++++++++++++++++++++ tests/test_full_detection.py | 1 + 6 files changed, 304 insertions(+), 55 deletions(-) create mode 100644 data/sample-polish.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index bfb08756..52b32f04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [2.1.0.dev0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...master) (2022-??-??) 
+ +### Changed +- Re-use decoded buffer for single byte character sets (PR #175) + +### Fixed +- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175) + ## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12) ### Fixed diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index bdc8ed98..2a82e5e7 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -18,6 +18,7 @@ from .models import CharsetMatch, CharsetMatches from .utils import ( any_specified_encoding, + cut_sequence_chunks, iana_name, identify_sig_or_bom, is_cp_similar, @@ -285,63 +286,38 @@ def from_bytes( md_chunks = [] # type: List[str] md_ratios = [] - for i in r_: - if i + chunk_size > length + 8: - continue - - cut_sequence = sequences[i : i + chunk_size] - - if bom_or_sig_available and strip_sig_or_bom is False: - cut_sequence = sig_payload + cut_sequence - - try: - chunk = cut_sequence.decode( - encoding_iana, - errors="ignore" if is_multi_byte_decoder else "strict", - ) # type: str - except UnicodeDecodeError as e: # Lazy str loading may have missed something there - logger.log( - TRACE, - "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s", - encoding_iana, - str(e), - ) - early_stop_count = max_chunk_gave_up - lazy_str_hard_failure = True - break + try: + for chunk in cut_sequence_chunks( + sequences, + encoding_iana, + r_, + chunk_size, + bom_or_sig_available, + strip_sig_or_bom, + sig_payload, + is_multi_byte_decoder, + decoded_payload, + ): + md_chunks.append(chunk) - # multi-byte bad cutting detector and adjustment - # not the cleanest way to perform that fix but clever enough for now. 
- if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + md_ratios.append(mess_ratio(chunk, threshold)) - chunk_partial_size_chk = min(chunk_size, 16) # type: int + if md_ratios[-1] >= threshold: + early_stop_count += 1 - if ( - decoded_payload - and chunk[:chunk_partial_size_chk] not in decoded_payload + if (early_stop_count >= max_chunk_gave_up) or ( + bom_or_sig_available and strip_sig_or_bom is False ): - for j in range(i, i - 4, -1): - cut_sequence = sequences[j : i + chunk_size] - - if bom_or_sig_available and strip_sig_or_bom is False: - cut_sequence = sig_payload + cut_sequence - - chunk = cut_sequence.decode(encoding_iana, errors="ignore") - - if chunk[:chunk_partial_size_chk] in decoded_payload: - break - - md_chunks.append(chunk) - - md_ratios.append(mess_ratio(chunk, threshold)) - - if md_ratios[-1] >= threshold: - early_stop_count += 1 - - if (early_stop_count >= max_chunk_gave_up) or ( - bom_or_sig_available and strip_sig_or_bom is False - ): - break + break + except UnicodeDecodeError as e: # Lazy str loading may have missed something there + logger.log( + TRACE, + "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. 
%s", + encoding_iana, + str(e), + ) + early_stop_count = max_chunk_gave_up + lazy_str_hard_failure = True # We might want to check the sequence again with the whole content # Only if initial MD tests passes diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index 56ac246b..c8ecb0f4 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -9,7 +9,7 @@ from encodings.aliases import aliases from functools import lru_cache from re import findall -from typing import List, Optional, Set, Tuple, Union +from typing import Generator, List, Optional, Set, Tuple, Union from _multibytecodec import MultibyteIncrementalDecoder # type: ignore @@ -204,6 +204,8 @@ def is_unprintable(character: str) -> bool: character.isspace() is False # includes \n \t \r \v and character.isprintable() is False and character != "\x1A" # Why? Its the ASCII substitute character. + and character != b"\xEF\xBB\xBF".decode("utf_8") # Workaround for a suspected Python bug: + # Zero Width No-Break Space (U+FEFF, Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space. 
) @@ -350,3 +352,61 @@ def set_logging_handler( handler = logging.StreamHandler() handler.setFormatter(logging.Formatter(format_string)) logger.addHandler(handler) + + +def cut_sequence_chunks( + sequences: bytes, + encoding_iana: str, + offsets: range, + chunk_size: int, + bom_or_sig_available: bool, + strip_sig_or_bom: bool, + sig_payload: bytes, + is_multi_byte_decoder: bool, + decoded_payload: Optional[str] = None, +) -> Generator[str, None, None]: + + if decoded_payload and is_multi_byte_decoder is False: + for i in offsets: + chunk = decoded_payload[i : i + chunk_size] + if not chunk: + break + yield chunk + else: + for i in offsets: + chunk_end = i + chunk_size + if chunk_end > len(sequences) + 8: + continue + + cut_sequence = sequences[i : i + chunk_size] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode( + encoding_iana, + errors="ignore" if is_multi_byte_decoder else "strict", + ) + + # multi-byte bad cutting detector and adjustment + # not the cleanest way to perform that fix but clever enough for now. 
+ if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + + chunk_partial_size_chk = min(chunk_size, 16) # type: int + + if ( + decoded_payload + and chunk[:chunk_partial_size_chk] not in decoded_payload + ): + for j in range(i, i - 4, -1): + cut_sequence = sequences[j:chunk_end] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode(encoding_iana, errors="ignore") + + if chunk[:chunk_partial_size_chk] in decoded_payload: + break + + yield chunk diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 77cfff25..51bf52e2 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.12" +__version__ = "2.1.0.dev0" VERSION = __version__.split(".") diff --git a/data/sample-polish.txt b/data/sample-polish.txt new file mode 100644 index 00000000..9e506c26 --- /dev/null +++ b/data/sample-polish.txt @@ -0,0 +1,204 @@ +"source";"target" +"REF.-2";"POLISH" +"KW-P00-01";"SYSTEM VIDEODOMOFONOWY MEET" +"KW-P00-02";"URZĄDZENIE" +"KW-P00-03";"OGÓLNE" +"KW-P00-04";"SIEĆ" +"KW-P00-05";"KD" +"KW-P00-06";"ROZP. TWARZY." +"KW-P00-07";"KAMERY IP" +"KW-P00-08";"SIP" +"KW-P00-09";"SIP TRUNK" +"KW-P00-10";"PRZEKIEROWANIA" +"KW-P00-11";"ZAAWANSOWANE" +"KW-P00-12";"KOD PIN" +"KW-P00-13";"WECHAT QR" +"KW-P00-14";"PRZYWRACAĆ" +"KW-P00-16";"WINDA" +"KW-P01-01";"INFORMACJE O URZĄDZENIU" +"KW-P01-02";"PANEL VIDEO FOOBAR KIN" +"KW-P01-03";"FIRMWARE: V02.10" +"KW-P01-04";"URZĄDZENIE: PANEL BLOKOWY-CYFROWY 001-02" +"KW-P01-05";"URZĄDZENIE: PANEL BLOKOWY PRZYCISKI 020-02" +"KW-P01-06";"URZĄDZENIE: PANEL GŁÓWNY 01" +"KW-P01-07";"URZĄDZENIE: PANEL 1W 006-0102-01" +"KW-P01-08";"NUMER SERYJNY:" +"KW-P01-09";"MAC:" +"KW-P01-10";"IP:" +"KW-P01-11";"COPYRIGHT © FOOBAR " +"KW-P01-12";"www.example.com" +"KW-P02-01";"USTAWIENIA GŁÓWNE" +"KW-P02-02";"TYP:" +"KW-P02-03";"PANEL GŁÓWNY" +"KW-P02-04";"CYFROWY P. 
BLOKOWY" +"KW-P02-05";"P. BLOK. PRZYCISKI" +"KW-P02-06";"PANEL 1NR" +"KW-P02-07";"BLOK:" +"KW-P02-08";"LOKAL:" +"KW-P02-09";"MONIT WYŚWIETLACZA:" +"KW-P02-10";"THIS INTERFACE IS NOT ENABLED" +"KW-P02-11";"NUMER PANELU:" +"KW-P02-12";"NAZWA URZĄDZENIA:" +"KW-P02-13";"(≤16 ZNAKÓW)" +"KW-P02-14";"JĘZYK:" +"KW-P02-15";"ENGLISH" +"KW-P02-16";"中文" +"KW-P02-17";"ESPAÑOL" +"KW-P02-18";"РУССКИЙ" +"KW-P02-19";"DEUTSCH" +"KW-P02-20";"TÜRKÇE" +"KW-P02-21";"POLSKI" +"KW-P02-22";"עברית" +"KW-P02-23";"FRANÇAIS" +"KW-P02-24";"فارسی" +"KW-P02-25";"GŁOŚNOŚĆ PANELU:" +"KW-P02-26";"JASNOŚĆ" +"KW-P02-27";"ROZDZIELCZOŚĆ VIDEO:" +"KW-P02-28";"TRYB PRZEKIEROWANIA SIP:" +"KW-P02-29";"SEKWENCYJNE" +"KW-P02-30";"JEDNOCZESNE" +"KW-P02-31";"PORTIER:" +"KW-P02-32";"PORTIERNIA 1:" +"KW-P02-33";"PORTIERNIA 2:" +"KW-P02-34";"USTAW. DATY I CZASU" +"KW-P02-35";"FORMAT DATY:" +"KW-P02-36";"DATA:" +"KW-P02-37";"CZAS:" +"KW-P02-38";"STREFA CZASOWA:" +"KW-P02-39";"ZAPISZ" +"KW-P02-40";"BŁĘDNE DANE" +"KW-P02-41";"KLAWIATURA ALFANUM.:" +"KW-P02-42";"KOMUNIKAT OTWARCIA DRZWI:" +"KW-P02-43";"WYGASZACZ EKRANU:" +"KW-P02-44";"WSPARCIE:" +"KW-P02-45";"OCZEKIWANIE" +"KW-P02-46";"POŁĄCZENIE" +"KW-P02-47";"WSPARCIE" +"KW-P02-48";"lista" +"KW-P02-49";"DST:" +"KW-P02-57";"TŁO:" +"KW-P02-58";"CIEMNE" +"KW-P02-59";"JASNE" +"KW-P02-60";"IMPORT" +"KW-P02-61";"EKSPORT" +"KW-P02-62";"USUŃ" +"KW-P02-63";"WYBIERZ PRAWIDŁOWY PLIK PNG" +"KW-P02-64";"IMPORTUJ" +"KW-P02-65";"WYSYŁANIE ZAKOŃCZONE" +"KW-P02-66";"BRAK OBRAZU" +"KW-P02-67";"USUNIĘTE" +"KW-P02-68";"BŁĄD USUWANIA" +"KW-P03-01";"USTAWIENIA SIECI" +"KW-P03-02";"IP:" +"KW-P03-03";"MASKA:" +"KW-P03-04";"BRAMA:" +"KW-P03-05";"DNS:" +"KW-P03-06";"SOFTWARE IP:" +"KW-P03-07";"SW. 
PIN:" +"KW-P03-08";"ZAPISZ" +"KW-P04-01";"USTAWIENIA KONTROLI DOSTĘPU" +"KW-P04-02";"PRZYCISK EGRESS:" +"KW-P04-03";"CZAS ELEKTROZACZEPU:" +"KW-P04-04";"CZAS KONTAKTRONU:" +"KW-P04-05";"REF.1491 4 RELAY:" +"KW-P04-06";"CZAS ELEKTROZACZEPU:" +"KW-P04-07";"CZAS KONTAKTRONU:" +"KW-P04-08";"KARTA ADMINISTRATORA:" +"KW-P04-09";"ROZBRAJANIE KARTĄ:" +"KW-P04-10";"MONITY KART:" +"KW-P04-11";"KOD GOŚCIA:" +"KW-P04-12";"KOD DOSTĘPU:" +"KW-P04-13";"#1" +"KW-P04-14";"#2" +"KW-P04-15";"#3" +"KW-P04-16";"#4" +"KW-P04-17";"ALARM DRZWI" +"KW-P04-18";"GWAŁTOWNY ALARM OTWARCIA" +"KW-P04-19";"WIEGAND:" +"KW-P04-20";"BURST" +"KW-P04-21";"26-BIT" +"KW-P04-22";"FACILITY:" +"KW-P04-24";"ZAPISZ" +"KW-P04-25";"WYŁĄCZONY" +"KW-P04-26";"REF.1490 2 RELAY:" +"KW-P04-27";"KOD QR:" +"KW-P04-28";"WIEGAND:" +"KW-P04-29";"26-BIT" +"KW-P04-30";"34-BIT" +"KW-P04-31";"KOD MIEJSCA:" +"KW-P04-32";"AUTO AKTYWACJA:" +"KW-P04-33";"BŁĘDNE DANE" +"KW-P05-01";"ROZPOZNAWANIE TWARZY" +"KW-P05-02";"ROZPOZNAWANIE TWARZY:" +"KW-P05-04";"MODEL:" +"KW-P05-05";"Wykrycie obecności:" +"KW-P05-06";"WŁĄCZONY" +"KW-P05-07";"WYŁĄCZONY" +"KW-P05-08";"PODOBIEŃSTWO:" +"KW-P05-09";"NISKIE" +"KW-P05-10";"ŚREDNIE" +"KW-P05-11";"WYSOKIE" +"KW-P05-12";"ZAPISZ" +"KW-P06-01";"USTAWIENIA KAMER IP" +"KW-P06-02";"ILOŚĆ KAMER:" +"KW-P06-03";"KAMERA" +"KW-P06-04";"URL:" +"KW-P06-05";"ZAPISZ" +"KW-P07-01";"USTAWIENIA SIP" +"KW-P07-02";"WŁĄCZ SIP:" +"KW-P07-03";"SPRAWDŹ STATUS SIP" +"KW-P07-04";"SIP ZAREJESTROWANY" +"KW-P07-05";"BŁĄD REJESTRACJI SIP" +"KW-P07-06";"SERWER SIP:" +"KW-P07-07";"DOMENA:" +"KW-P07-08";"OUTBOUND:" +"KW-P07-09";"STUN IP:" +"KW-P07-10";"PORT STUN:" +"KW-P07-11";"H.264:" +"KW-P07-12";"UŻYTKOWNIK SIP:" +"KW-P07-13";"HASŁO SIP:" +"KW-P07-14";"CZAS ROZMOWY:" +"KW-P07-15";"CZAS DZWONIENIA:" +"KW-P07-16";"ZAPISZ" +"KW-P08-01";"USTAWIENIA SIP TRUNK" +"KW-P08-02";"WŁĄCZ SIP TRUNK:" +"KW-P08-03";"URL:" +"KW-P08-04";"ZAPISZ" +"KW-P09-01";"USTAWIENIA PRZEKIEROWAŃ" +"KW-P09-02";"IMPORT" +"KW-P09-03";"EKSPORT" 
+"KW-P09-04";"APARTAMENT" +"KW-P09-05";"NUMER" +"KW-P10-01";"USTAWIENIA ZAAWANSOWANE" +"KW-P10-02";"SZYBKIE WYBIERANIE:" +"KW-P10-03";"URL:" +"KW-P10-04";"ONU:" +"KW-P10-05";"MAPOWANIE POŁĄCZEŃ:" +"KW-P10-06";"BIAŁA LISTA:" +"KW-P10-07";"Lista telefoniczna:" +"KW-P10-08";"IMPORT" +"KW-P10-09";"EKSPORT" +"KW-P10-10";"IMPORTUJ" +"KW-P10-11";"WYSYŁANIE ZAKOŃCZONE" +"KW-P10-12";"UŻYJ WŁAŚCIWEGO PLIKU CSV." +"KW-P10-13";"OK" +"KW-P10-14";"ZAPISZ" +"KW-P11-01";"USTAWIENIA KODU PIN" +"KW-P11-02";"OBECNY PIN:" +"KW-P11-03";"NOWY PIN:" +"KW-P11-04";"POTWIERDŹ PIN:" +"KW-P11-05";"ZAPISZ" +"KW-P12-01";"WECHAT QR" +"KW-P12-02";"WŁĄCZ" +"KW-P12-03";"UUID:" +"KW-P12-04";"HASŁO:" +"KW-P12-05";"SERWER:" +"KW-P12-06";"WŁĄCZ CZYTNIK QR:" +"KW-P12-07";"STATUS:" +"KW-P12-08";"REJESTRACJA POMYŚLNIE" +"KW-P12-09";"REJESTRACJA NIE POWIODŁA SIĘ" +"KW-P12-10";"ZAPISZ" +"KW-P13-01";"PRZYWRACAĆ" +"KW-P13-02";"PRZYWRÓCIĆ USTAWIENIA FABRYCZNE" +"KW-P13-03";"POTWIERDZAĆ PRZYWRÓĆ USTAWIENIA FABRYCZNE?" +"KW-P13-04";"URZĄDZENIE REBOOT" diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index 218080bf..96e0b797 100644 --- a/tests/test_full_detection.py +++ b/tests/test_full_detection.py @@ -22,6 +22,7 @@ ('sample-turkish.txt', 'cp1254', 'Turkish'), ('sample-russian-2.txt', 'utf_8', 'Russian'), ('sample-russian.txt', 'mac_cyrillic', 'Russian'), + ('sample-polish.txt', 'utf_8', 'Polish'), ] ) def test_elementary_detection(