🔖 Bump version to 2.0.4 (#72)
* 🔖 Bump version to 2.0.4

* 🩹 MD sensitivity adjustments (#76)

* 🩹 MD sensitivity adjustments

* 📌 Make sure the charset-normalizer pulled in through the requests dependency does not shadow the current dev version

* 📝 Do not mislead: stop saying that priority is given when a multibyte encoding is detected (logger, explain)

* 📝 🐛 Fix a tiny mistake when logging the languages detected for a specific code page (debug, explain)

* 🐛 Submatch factoring was incorrect in rare cases

* 📝 ⚡ Update performance claims

* 🐛 Multiple files given to the CLI would not produce a JSON array (every file after the first was omitted)
Ousret authored Jul 30, 2021
1 parent 8247f3a commit 558d1e2
Showing 12 changed files with 112 additions and 26 deletions.
1 change: 1 addition & 0 deletions .github/workflows/chardet-bc.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/detector-coverage.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/performance.yml
@@ -23,6 +23,7 @@ jobs:
pip install -U pip setuptools
pip install -r requirements.txt
pip install -r ./bin/requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
1 change: 1 addition & 0 deletions .github/workflows/run-tests.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip install -U pip setuptools
pip install -r requirements.txt
pip uninstall -y charset-normalizer
- name: Install the package
run: |
python setup.py install
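Note on the four workflow changes: `./bin/requirements.txt` installs requests, and recent requests releases depend on the published charset-normalizer, so the freshly built dev version could be shadowed by the PyPI copy. The new `pip uninstall -y charset-normalizer` step removes the published package before `python setup.py install` runs. A quick sanity check, as a sketch run after the install step:

```python
# After "pip uninstall -y charset-normalizer" and "python setup.py install",
# the import below should resolve to the dev build, not the PyPI release.
import charset_normalizer

print(charset_normalizer.__version__)  # expected: the dev version, e.g. 2.0.4
print(charset_normalizer.__file__)     # should point at the installed dev copy
```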
8 changes: 4 additions & 4 deletions README.md
@@ -55,13 +55,13 @@ This package offer better performance than its counterpart Chardet. Here are som

| Package | Accuracy | Mean per file (ns) | File per sec (est) |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 93.0 % | 67 ms | 15.38 file/sec |
| charset-normalizer | **95.0 %** | **37 ms** | 27.77 file/sec |
| [chardet](https://github.com/chardet/chardet) | 93.0 % | 150 ms | 7 file/sec |
| charset-normalizer | **95.0 %** | **36 ms** | 28 file/sec |

| Package | 99th percentile | 95th percentile | 50th percentile |
| ------------- | :-------------: | :------------------: | :------------------: |
| [chardet](https://github.com/chardet/chardet) | 424 ms | 234 ms | 26 ms |
| charset-normalizer | 335 ms | 186 ms | 17 ms |
| [chardet](https://github.com/chardet/chardet) | 647 ms | 250 ms | 24 ms |
| charset-normalizer | 354 ms | 202 ms | 16 ms |

Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.

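The README numbers above were re-measured for this release. As a rough guide only, per-file timings of this shape can be collected with a small harness along the following lines — a minimal sketch, not the project's actual benchmark tooling; the corpus path is a placeholder:

```python
import time
from glob import glob
from os.path import isfile

from charset_normalizer import from_bytes

# Hypothetical corpus location -- substitute any directory of sample files.
files = [p for p in glob("./char-dataset/**/*", recursive=True) if isfile(p)]

timings = []
for path in files:
    with open(path, "rb") as fp:
        payload = fp.read()
    start = time.perf_counter()
    from_bytes(payload).best()  # one full detection pass, as in the table above
    timings.append(time.perf_counter() - start)

timings.sort()
print("mean per file: %.1f ms" % (sum(timings) / len(timings) * 1000))
print("50th percentile: %.1f ms" % (timings[len(timings) // 2] * 1000))
print("95th percentile: %.1f ms" % (timings[int(len(timings) * 0.95)] * 1000))
print("99th percentile: %.1f ms" % (timings[int(len(timings) * 0.99)] * 1000))
```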
4 changes: 2 additions & 2 deletions charset_normalizer/api.py
@@ -206,7 +206,7 @@ def from_bytes(
multi_byte_bonus = is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length # type: bool

if multi_byte_bonus:
logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. Should not be a coincidence. Priority +1 given.', encoding_iana)
logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.', encoding_iana)

max_chunk_gave_up = int(len(r_) / 4) # type: int

@@ -331,7 +331,7 @@ def from_bytes(
logger.info(
"Using %s code page we detected the following languages: %s",
encoding_iana,
results[-1]._languages
results[encoding_iana]._languages
)

if len(results) == 0:
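Two logging fixes in `api.py`: the multibyte-bonus message no longer asserts "Priority +1 given", and the language log now looks the match up by its IANA name instead of taking `results[-1]`. As the `models.py` hunk further down shows, `CharsetMatches.append` may fold an item into an existing match as a submatch, and the collection is kept ordered, so the last element is not guaranteed to be the match just produced. A minimal illustration of the pitfall, with hypothetical data rather than the library's internals:

```python
# Illustrative sketch: in an ordered collection, "last element" is not
# the same as "most recently appended".
matches = []

def append_sorted(item):
    matches.append(item)
    matches.sort(key=lambda m: m["chaos"])  # kept ordered, like CharsetMatches

append_sorted({"encoding": "cp1252", "chaos": 0.2})
append_sorted({"encoding": "utf_8", "chaos": 0.0})  # sorts to the front

print(matches[-1]["encoding"])  # cp1252 -- not the encoding just added
print(next(m for m in matches if m["encoding"] == "utf_8")["encoding"])  # utf_8
```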
6 changes: 3 additions & 3 deletions charset_normalizer/cli/normalizer.py
@@ -93,6 +93,8 @@ def cli_detect(argv=None):
print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr)
return 1

x_ = []

for my_file in args.files:

matches = from_fp(
@@ -101,8 +103,6 @@
explain=args.verbose
)

x_ = []

if len(matches) == 0:
print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''), file=sys.stderr)
x_.append(
@@ -202,7 +202,7 @@ def cli_detect(argv=None):
dumps(
[
el.__dict__ for el in x_
] if args.alternatives else x_[0].__dict__,
] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4
)
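The CLI fix above: `x_` was re-initialized inside the per-file loop, so each iteration discarded the previous file's entry, and the final JSON dump only emitted an array when `--alternatives` was set. Hoisting the accumulator above the loop and switching the condition to `len(x_) > 1` fixes both. The bug pattern, reduced to a sketch with a hypothetical helper rather than the CLI code:

```python
def broken(files):
    for f in files:
        results = []            # BUG: the accumulator is reset on every iteration
        results.append(f.upper())
    return results              # only the last file survives

def fixed(files):
    results = []                # hoisted out of the loop
    for f in files:
        results.append(f.upper())
    return results

print(broken(["a.txt", "b.txt"]))  # ['B.TXT']
print(fixed(["a.txt", "b.txt"]))   # ['A.TXT', 'B.TXT']
```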
53 changes: 41 additions & 12 deletions charset_normalizer/md.py
@@ -3,7 +3,7 @@

from charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD
from charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \
remove_accent, is_separator, is_cjk
remove_accent, is_separator, is_cjk, is_case_variable, is_hangul, is_katakana, is_hiragana, is_ascii, is_thai


class MessDetectorPlugin:
@@ -140,11 +140,15 @@ def __init__(self):
self._last_latin_character = None # type: Optional[str]

def eligible(self, character: str) -> bool:
return is_latin(character)
return character.isalpha() and is_latin(character)

def feed(self, character: str) -> None:
self._character_count += 1
if self._last_latin_character is not None:
if is_accentuated(character) and is_accentuated(self._last_latin_character):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if its the same char duplicated with different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
@@ -175,14 +179,14 @@ def eligible(self, character: str) -> bool:
def feed(self, character: str) -> None:
self._character_count += 1

if self._last_printable_seen is None:
self._last_printable_seen = character
return

if character.isspace() or is_punctuation(character):
self._last_printable_seen = None
return

if self._last_printable_seen is None:
self._last_printable_seen = character
return

unicode_range_a = unicode_range(self._last_printable_seen) # type: Optional[str]
unicode_range_b = unicode_range(character) # type: Optional[str]

@@ -215,6 +219,7 @@ def __init__(self):
self._word_count = 0 # type: int
self._bad_word_count = 0 # type: int
self._is_current_word_bad = False # type: bool
self._foreign_long_watch = False # type: bool

self._character_count = 0 # type: int
self._bad_character_count = 0 # type: int
@@ -230,6 +235,8 @@ def feed(self, character: str) -> None:
self._buffer = "".join([self._buffer, character])
if is_accentuated(character):
self._buffer_accent_count += 1
if self._foreign_long_watch is False and is_latin(character) is False and is_cjk(character) is False and is_hangul(character) is False and is_katakana(character) is False and is_hiragana(character) is False and is_thai(character) is False:
self._foreign_long_watch = True
return
if not self._buffer:
return
@@ -241,12 +248,15 @@ def feed(self, character: str) -> None:

if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._is_current_word_bad = True

if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False

self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character):
@@ -256,14 +266,15 @@ def feed(self, character: str) -> None:
def reset(self) -> None:
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0

@property
def ratio(self) -> float:
if self._word_count <= 16:
if self._word_count <= 10:
return 0.

return self._bad_character_count / self._character_count
@@ -313,27 +324,43 @@ def __init__(self):
self._character_count = 0 # type: int

self._last_alpha_seen = None # type: Optional[str]
self._current_ascii_only = True # type: bool

def eligible(self, character: str) -> bool:
return character.isspace() or character.isalpha()
return True

def feed(self, character: str) -> None:
if is_separator(character):
if self._character_count_since_last_sep < 24:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False

if chunk_sep and self._character_count_since_last_sep > 0:
if self._character_count_since_last_sep <= 64 and character.isdigit() is False and self._current_ascii_only is False:
self._successive_upper_lower_count_final += self._successive_upper_lower_count

self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True

return

if self._current_ascii_only is True and is_ascii(character) is False:
self._current_ascii_only = False

if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()):
if self._buf is True:
self._successive_upper_lower_count += 1
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False

self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character

def reset(self) -> None:
Expand All @@ -342,13 +369,15 @@ def reset(self) -> None:
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True

@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.

return (self._successive_upper_lower_count_final * 2) / self._character_count
return self._successive_upper_lower_count_final / self._character_count


def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool:
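Summary of the "MD sensitivity adjustments" above: the duplicate-accent plugin now considers only alphabetic Latin characters and also penalizes successive upper-case pairs and same-letter-different-accent pairs; the long-word plugin gains a `_foreign_long_watch` flag that marks words of 24+ characters containing anything outside the Latin, CJK, Hangul, Katakana, Hiragana, and Thai scripts, and its word-count gate drops from 16 to 10; the upper/lower case plugin now tracks ASCII-only chunks and re-weighs case flips. The practical effect is visible through `mess_ratio` — a short sketch, with the mojibake sample borrowed from the updated test:

```python
from charset_normalizer.md import mess_ratio

# Ordinary text should stay close to 0...
print(mess_ratio("Hello, this is a perfectly ordinary sentence."))

# ...while the gibberish sample from tests/test_probe_chaos.py must now
# score above 0.5 (the assertion was previously above 0.7).
print(mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"))
```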
2 changes: 1 addition & 1 deletion charset_normalizer/models.py
@@ -290,7 +290,7 @@ def append(self, item: CharsetMatch) -> None:
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
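On the `models.py` fix: submatches are alternative encodings whose decoded output is identical (same fingerprint). In rare cases two candidates could share a fingerprint yet carry different chaos (mess) measurements, and folding them together hid the better match; requiring equal chaos restricts factoring to true duplicates. Conceptually, with illustrative dictionaries rather than the `CharsetMatch` API:

```python
# Group candidates by a composite key (fingerprint AND chaos), mirroring
# the stricter submatch test; the data here is hypothetical.
from collections import defaultdict

candidates = [
    {"encoding": "utf_8",  "fingerprint": "abc", "chaos": 0.0},
    {"encoding": "cp1252", "fingerprint": "abc", "chaos": 0.0},  # true duplicate
    {"encoding": "cp1251", "fingerprint": "abc", "chaos": 0.1},  # same output hash, different mess
]

groups = defaultdict(list)
for c in candidates:
    groups[(c["fingerprint"], c["chaos"])].append(c["encoding"])

print(dict(groups))
# {('abc', 0.0): ['utf_8', 'cp1252'], ('abc', 0.1): ['cp1251']}
```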
57 changes: 55 additions & 2 deletions charset_normalizer/utils.py
@@ -22,7 +22,7 @@ def is_accentuated(character: str) -> bool:
description = unicodedata.name(character) # type: str
except ValueError:
return False
return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description
return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description or "WITH DIAERESIS" in description or "WITH CIRCUMFLEX" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -64,6 +64,13 @@ def is_latin(character: str) -> bool:
return "LATIN" in description


def is_ascii(character: str) -> bool:
try:
character.encode("ascii")
except UnicodeEncodeError:
return False
return True

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category = unicodedata.category(character) # type: str
@@ -96,20 +103,26 @@ def is_symbol(character: str) -> bool:

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in ["|", "+"]:
if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
return True

character_category = unicodedata.category(character) # type: str

return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
character_category = unicodedata.category(character) # type: str

return "Co" == character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
@@ -119,6 +132,46 @@ def is_cjk(character: str) -> bool:
return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False

return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
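The new `utils.py` helpers all follow the same pattern: classify a character by its `unicodedata` name or category, memoized with `lru_cache`. Note the `is_case_variable` trick: `islower() != isupper()` is true exactly for cased letters, since digits and punctuation are neither. A quick demonstration of the additions:

```python
from charset_normalizer.utils import (is_ascii, is_case_variable, is_hangul,
                                      is_hiragana, is_katakana, is_thai)

print(is_hiragana("あ"), is_katakana("カ"), is_hangul("한"), is_thai("ไ"))
# True True True True
print(is_ascii("a"), is_ascii("é"))                  # True False
print(is_case_variable("a"), is_case_variable("4"))  # True False -- digits have no case
```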
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.3"
__version__ = "2.0.4"
VERSION = __version__.split('.')
2 changes: 1 addition & 1 deletion tests/test_probe_chaos.py
@@ -40,7 +40,7 @@ def test_subtle_gibberish(self):

self.assertGreater(
mess_ratio("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v"),
0.7
0.5
)

self.assertGreater(
