Skip to content

Commit

Permalink
Re-use decoded buffer for single byte character sets (#175)
Browse files Browse the repository at this point in the history
* Re-use decoded buffer for short texts

This avoids issues with detecting string boundaries while improving
performance (avoids multiple decoding of the sequence).

Fixes #174

* 🔖 Bump version to 2.1.0.dev0

* 🐛 Work around a potential bug in Python's `isspace` character table

 Bug discovered in Python: the Zero Width No-Break Space (U+FEFF), located in Arabic Presentation Forms-B since Unicode 1.1, is not acknowledged as a space by `str.isspace()`.


Co-authored-by: TAHRI Ahmed R <[email protected]>
Co-authored-by: Ahmed TAHRI <[email protected]>
  • Loading branch information
3 people authored Jun 18, 2022
1 parent 7cbd7fc commit 4846792
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 55 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [2.1.0.dev0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...master) (2022-??-??)

### Changed
- Re-use decoded buffer for single byte character sets (PR #175)

### Fixed
- Work around a potential bug in CPython where the Zero Width No-Break Space (U+FEFF), located in Arabic Presentation Forms-B since Unicode 1.1, is not acknowledged as a space (PR #175)

## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)

### Fixed
Expand Down
82 changes: 29 additions & 53 deletions charset_normalizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
Expand Down Expand Up @@ -285,63 +286,38 @@ def from_bytes(
md_chunks = [] # type: List[str]
md_ratios = []

for i in r_:
if i + chunk_size > length + 8:
continue

cut_sequence = sequences[i : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

try:
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
) # type: str
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
break
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)

# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
md_ratios.append(mess_ratio(chunk, threshold))

chunk_partial_size_chk = min(chunk_size, 16) # type: int
if md_ratios[-1] >= threshold:
early_stop_count += 1

if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j : i + chunk_size]

if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence

chunk = cut_sequence.decode(encoding_iana, errors="ignore")

if chunk[:chunk_partial_size_chk] in decoded_payload:
break

md_chunks.append(chunk)

md_ratios.append(mess_ratio(chunk, threshold))

if md_ratios[-1] >= threshold:
early_stop_count += 1

if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
break
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True

# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
Expand Down
62 changes: 61 additions & 1 deletion charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import List, Optional, Set, Tuple, Union
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder # type: ignore

Expand Down Expand Up @@ -204,6 +204,8 @@ def is_unprintable(character: str) -> bool:
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
and character != b"\xEF\xBB\xBF".decode("utf_8") # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)


Expand Down Expand Up @@ -350,3 +352,61 @@ def set_logging_handler(
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """Yield decoded text chunks taken at the given offsets.

    Two strategies are used:

    * When ``decoded_payload`` is available and the decoder is not
      multi-byte, chunks are sliced straight out of ``decoded_payload``
      (no re-decoding). Iteration stops at the first empty slice.
    * Otherwise each chunk is cut from ``sequences`` and decoded with
      ``encoding_iana``. For multi-byte decoders, decoding errors are
      ignored and a cut that appears to land in the middle of a character
      is re-attempted from up to three bytes earlier.

    :param sequences: Raw bytes to cut chunks from.
    :param encoding_iana: Codec name passed to ``bytes.decode``.
    :param offsets: Start offsets of the chunks.
    :param chunk_size: Length of each chunk (bytes, or characters when
        slicing ``decoded_payload``).
    :param bom_or_sig_available: Whether a BOM/signature was detected.
    :param strip_sig_or_bom: Whether the BOM/signature is stripped; when it
        is not (and one is available), ``sig_payload`` is prepended to
        every byte chunk before decoding.
    :param sig_payload: The BOM/signature bytes.
    :param is_multi_byte_decoder: Whether ``encoding_iana`` is multi-byte.
    :param decoded_payload: Already decoded content, if any.
    :raises UnicodeDecodeError: When a chunk cannot be decoded strictly
        (non multi-byte decoder without ``decoded_payload``).
    """
    if decoded_payload and is_multi_byte_decoder is False:
        # Single-byte code page: one byte maps to one character, so the
        # already decoded buffer can be sliced directly.
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
        return

    prepend_sig = bom_or_sig_available and strip_sig_or_bom is False

    for i in offsets:
        chunk_end = i + chunk_size
        if chunk_end > len(sequences) + 8:
            continue

        cut_sequence = sequences[i:chunk_end]

        if prepend_sig:
            cut_sequence = sig_payload + cut_sequence

        chunk = cut_sequence.decode(
            encoding_iana,
            errors="ignore" if is_multi_byte_decoder else "strict",
        )

        # multi-byte bad cutting detector and adjustment
        # not the cleanest way to perform that fix but clever enough for now.
        if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

            chunk_partial_size_chk = min(chunk_size, 16)  # type: int

            if (
                decoded_payload
                and chunk[:chunk_partial_size_chk] not in decoded_payload
            ):
                # Walk the start back one byte at a time (at most three
                # bytes). The walk stops at offset 0: a negative start
                # would make the slice count from the END of the buffer,
                # typically producing an empty chunk that trivially
                # "matches" decoded_payload.
                for j in range(i, max(i - 4, -1), -1):
                    cut_sequence = sequences[j:chunk_end]

                    if prepend_sig:
                        cut_sequence = sig_payload + cut_sequence

                    chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                    if chunk[:chunk_partial_size_chk] in decoded_payload:
                        break

        yield chunk
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.12"
__version__ = "2.1.0.dev0"
VERSION = __version__.split(".")
Loading

0 comments on commit 4846792

Please sign in to comment.