From 7cee03585246a0f03b9edf1284937753209b7154 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 26 Sep 2021 16:59:12 +0100
Subject: [PATCH 01/12] :pencil: Update claims

---
 README.md            | 5 +++--
 docs/why_migrate.rst | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 289de31c..1de3b47d 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,8 @@ This package offer better performance than its counterpart Chardet. Here are som

 | Package | Accuracy | Mean per file (ns) | File per sec (est) |
 | ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 92.0 % | 220 ms | 5 file/sec |
-| charset-normalizer | **97.0 %** | **40 ms** | 25 file/sec |
+| [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
+| charset-normalizer | **98 %** | **40 ms** | 25 file/sec |

 | Package | 99th percentile | 95th percentile | 50th percentile |
 | ------------- | :-------------: | :------------------: | :------------------: |
@@ -62,6 +62,7 @@ Chardet's performance on larger file (1MB+) are very poor. Expect huge differenc

 > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
 > And yes, these results might change at any time. The dataset can be updated to include more files.
+> The actual delays depends heavily on your CPU capability. The factors should remain the same.

 [cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) faster alternative. If speed is the most important factor, you should try it.
diff --git a/docs/why_migrate.rst b/docs/why_migrate.rst
index c888d43f..730d1745 100644
--- a/docs/why_migrate.rst
+++ b/docs/why_migrate.rst
@@ -4,7 +4,7 @@ Why should I migrate to Charset-Normalizer?
 There is so many reason to migrate your current project. Here are some of them:

 - Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly).
-- X4 faster than Chardet (average) AND support X3 more encoding.
+- X5 faster than Chardet on average and X2 faster in 99% of cases AND supports 3 times more encodings.
 - Never return a encoding if not suited for the given decoder. Eg. Never get UnicodeDecodeError!
 - Actively maintained, open to contributors.
 - Have the backward compatible function ``detect`` that come from Chardet.

From 026e10887a1c09d4b1ad6d897cb99fbb10148535 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 26 Sep 2021 17:01:08 +0100
Subject: [PATCH 02/12] :pencil: fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1de3b47d..42697ec6 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ Chardet's performance on larger file (1MB+) are very poor. Expect huge differenc

 > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
 > And yes, these results might change at any time. The dataset can be updated to include more files.
-> The actual delays depends heavily on your CPU capability. The factors should remain the same.
+> The actual delays depend heavily on your CPU capabilities. The factors should remain the same.

 [cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) faster alternative. If speed is the most important factor, you should try it.
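The two patches above adjust the published accuracy and timing claims. For readers who want to sanity-check such numbers themselves, here is a minimal, hypothetical micro-benchmark sketch — it is not the project's actual GHA benchmark harness, and the `FILES` list is a placeholder you would point at your own corpus:

```python
import time

import chardet
from charset_normalizer import from_bytes

FILES = ["sample.1.ar.srt"]  # placeholder: point this at your own corpus

for path in FILES:
    with open(path, "rb") as fp:
        payload = fp.read()

    # time chardet's detection on the raw payload
    start = time.perf_counter()
    chardet_guess = chardet.detect(payload)["encoding"]
    chardet_ms = (time.perf_counter() - start) * 1000

    # time charset-normalizer on the same payload; best() may return None
    start = time.perf_counter()
    best = from_bytes(payload).best()
    cn_guess = best.encoding if best is not None else None
    cn_ms = (time.perf_counter() - start) * 1000

    print(f"{path}: chardet={chardet_guess} ({chardet_ms:.1f} ms) "
          f"charset-normalizer={cn_guess} ({cn_ms:.1f} ms)")
```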
From a00f19362686f022ab332048bf77a87349d30f60 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 21:27:29 +0100
Subject: [PATCH 03/12] :bookmark: Bump version to 2.0.7

---
 charset_normalizer/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index bdba9fc2..98e53fb3 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "2.0.6"
+__version__ = "2.0.7"
 VERSION = __version__.split(".")

From e35fa7450c6e1e4775b9589c9b15e2806fd22259 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 21:34:49 +0100
Subject: [PATCH 04/12] :zap: update percentiles ms stats

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 42697ec6..4183ca87 100644
--- a/README.md
+++ b/README.md
@@ -55,8 +55,8 @@ This package offer better performance than its counterpart Chardet. Here are som

 | Package | 99th percentile | 95th percentile | 50th percentile |
 | ------------- | :-------------: | :------------------: | :------------------: |
-| [chardet](https://github.com/chardet/chardet) | 888 ms | 300 ms | 27 ms |
-| charset-normalizer | 430 ms | 220 ms | 18 ms |
+| [chardet](https://github.com/chardet/chardet) | 1115 ms | 300 ms | 27 ms |
+| charset-normalizer | 460 ms | 240 ms | 18 ms |

 Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
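The percentile columns updated above are plain order statistics over the per-file timings. A small illustrative sketch of how such cut points can be derived (assuming Python 3.8+ for `statistics.quantiles`; the timing values are made up):

```python
import statistics

# hypothetical per-file detection times in milliseconds
timings_ms = [12.0, 18.5, 27.3, 110.0, 460.2, 950.7]

# quantiles() with n=100 yields the 1st..99th percentile cut points
percentiles = statistics.quantiles(timings_ms, n=100)

print("50th:", percentiles[49], "ms")
print("95th:", percentiles[94], "ms")
print("99th:", percentiles[98], "ms")
```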
From a6f2412477f2924ce4b8046f6f4c0db3d99ab643 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 21:35:20 +0100
Subject: [PATCH 05/12] :pencil: fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4183ca87..2cdf389f 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector*
 This package offer better performance than its counterpart Chardet. Here are some numbers.

-| Package | Accuracy | Mean per file (ns) | File per sec (est) |
+| Package | Accuracy | Mean per file (ms) | File per sec (est) |
 | ------------- | :-------------: | :------------------: | :------------------: |
 | [chardet](https://github.com/chardet/chardet) | 92 % | 220 ms | 5 file/sec |
 | charset-normalizer | **98 %** | **40 ms** | 25 file/sec |

From 866e7a3aba03424311ac4c2ee97840f4aa2b99cf Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 21:48:00 +0100
Subject: [PATCH 06/12] :bug: Fix regression from PR #113

List instead of Set for alphabets property
---
 charset_normalizer/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index 23411dae..5faaf1db 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -231,7 +231,7 @@ def alphabets(self) -> List[str]:
             unicode_range(char) for char in str(self)
         ]  # type: List[Optional[str]]
         # filter and sort
-        self._unicode_ranges = sorted([r for r in detected_ranges if r])  # type: ignore
+        self._unicode_ranges = sorted({r for r in detected_ranges if r})  # type: ignore
         return self._unicode_ranges

     @property

From 28c3ae15ad7a544643263a3bd1c12a2f16c1401e Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 22:34:29 +0100
Subject: [PATCH 07/12] fix type output in alphabets property

---
 charset_normalizer/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index 5faaf1db..68c27b89 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -231,7 +231,7 @@ def alphabets(self) -> List[str]:
             unicode_range(char) for char in str(self)
         ]  # type: List[Optional[str]]
         # filter and sort
-        self._unicode_ranges = sorted({r for r in detected_ranges if r})  # type: ignore
+        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

     @property
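Patches 06 and 07 switch the `alphabets` comprehension from a list to a set so that each Unicode range is reported once, then materialize a plain `List[str]` to match the annotated return type. A minimal standalone sketch of the same idea — `unicode_range` is the helper the diff itself uses, imported here from `charset_normalizer.utils`, and the sample string is arbitrary input:

```python
from typing import List, Optional

from charset_normalizer.utils import unicode_range

text = "Hello, Bonjour 😀"

# one (possibly None) range name per character
detected_ranges: List[Optional[str]] = [unicode_range(char) for char in text]

# The set comprehension drops both the None entries and the duplicates
# produced by repeated characters from the same range; sorted() then
# returns a deterministic, plain List[str].
alphabets: List[str] = sorted({r for r in detected_ranges if r})

print(alphabets)  # e.g. ['Basic Latin', 'Emoticons range(Emoji)']
```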
😀".encode("utf_8") + ).best() + + assert "Basic Latin" in best_guess.alphabets + assert "Emoticons range(Emoji)" in best_guess.alphabets + assert best_guess.alphabets.count("Basic Latin") == 1 From fdb4a39fa99cc705390160439d05c843b17b3c74 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Mon, 27 Sep 2021 23:05:33 +0100 Subject: [PATCH 09/12] :heavy_check_mark: Add tests and ignore old legacy methods cover --- charset_normalizer/legacy.py | 18 +++++++++--------- tests/test_cli.py | 23 +++++++++++++++++++++++ tests/test_normalize_fp.py | 20 ++++++++++++++++++++ 3 files changed, 52 insertions(+), 9 deletions(-) create mode 100644 tests/test_normalize_fp.py diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py index 71fa3cf8..cdebe2b8 100644 --- a/charset_normalizer/legacy.py +++ b/charset_normalizer/legacy.py @@ -17,7 +17,7 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: :param byte_str: The byte sequence to examine. """ if not isinstance(byte_str, (bytearray, bytes)): - raise TypeError( + raise TypeError( # pragma: nocover "Expected object of type bytes or bytearray, got: " "{0}".format(type(byte_str)) ) @@ -52,39 +52,39 @@ class CharsetNormalizerMatch(CharsetMatch): class CharsetNormalizerMatches(CharsetMatches): @staticmethod def from_fp(*args, **kwargs): # type: ignore - warnings.warn( + warnings.warn( # pragma: nocover "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " "and scheduled to be removed in 3.0", DeprecationWarning, ) - return from_fp(*args, **kwargs) + return from_fp(*args, **kwargs) # pragma: nocover @staticmethod def from_bytes(*args, **kwargs): # type: ignore - warnings.warn( + warnings.warn( # pragma: nocover "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " "and scheduled to be removed in 3.0", DeprecationWarning, ) - return from_bytes(*args, **kwargs) + return from_bytes(*args, **kwargs) # pragma: nocover @staticmethod def from_path(*args, **kwargs): # type: ignore - warnings.warn( + warnings.warn( # pragma: nocover "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " "and scheduled to be removed in 3.0", DeprecationWarning, ) - return from_path(*args, **kwargs) + return from_path(*args, **kwargs) # pragma: nocover @staticmethod def normalize(*args, **kwargs): # type: ignore - warnings.warn( + warnings.warn( # pragma: nocover "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " "and scheduled to be removed in 3.0", DeprecationWarning, ) - return normalize(*args, **kwargs) + return normalize(*args, **kwargs) # pragma: nocover class CharsetDetector(CharsetNormalizerMatches): diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ba234f3..3afe02ff 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -114,6 +114,29 @@ def test_non_existent_file(self): self.assertEqual(cm.exception.code, 2) + def test_replace_without_normalize(self): + + self.assertEqual( + cli_detect( + [ + './data/sample.1.ar.srt', + '--replace' + ] + ), + 1 + ) + + def test_force_replace_without_replace(self): + self.assertEqual( + cli_detect( + [ + './data/sample.1.ar.srt', + '--force' + ] + ), + 1 + ) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_normalize_fp.py b/tests/test_normalize_fp.py new file mode 100644 index 00000000..4c75dd9c --- /dev/null +++ b/tests/test_normalize_fp.py @@ -0,0 +1,20 @@ +import pytest +from charset_normalizer import normalize +from os.path import exists +from os import unlink + + +def 
From d14d608824ab9dc811fe84b556fc650a99e80c70 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 23:11:49 +0100
Subject: [PATCH 10/12] :sparkle: Add autofix script for black and isort
 linters

---
 bin/run_autofix.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100755 bin/run_autofix.sh

diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh
new file mode 100755
index 00000000..64c7832c
--- /dev/null
+++ b/bin/run_autofix.sh
@@ -0,0 +1,11 @@
+#!/bin/sh -e
+
+export PREFIX=""
+if [ -d 'venv' ] ; then
+    export PREFIX="venv/bin/"
+fi
+
+set -x
+
+${PREFIX}black --target-version=py35 charset_normalizer
+${PREFIX}isort charset_normalizer

From 07d615bd33bff87de0e29b6cbed7f960deaedabe Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 27 Sep 2021 23:22:44 +0100
Subject: [PATCH 11/12] :pencil: Update contrib.md

---
 CONTRIBUTING.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d546941f..566ca4fe 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -56,7 +56,7 @@ the backward-compatibility.
 ## What PR may be doomed?

 - Dropping EOL Python 3.5
-> We are waiting upon the right moment to drop it. Hint, wait for requests to drop it first.
+> Scheduled for the 3.0 milestone.

 - Add support for a Python unsupported charset/encoding
 > If you looked carefully at the project, you would see that it aims to be generic whenever possible. So adding a specific prober is out of the question.
@@ -71,3 +71,5 @@ the backward-compatibility.
 It is essential that you run, prior to any submissions the mandatory checks.
 Run the script `./bin/run_checks.sh` to verify that your modification are not breaking anything.
+
+Also, make sure to run `./bin/run_autofix.sh` to comply with the code style and import sorting.

From 67bb7ece1953018e93d7fce77ae59bfd717caeb1 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Thu, 7 Oct 2021 22:46:32 +0200
Subject: [PATCH 12/12] :wrench: Python 3.10 (using public release) tests

---
 .github/workflows/run-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index f1c5af49..fe225038 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10.0-rc.2"]
+        python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10"]
         os: [ubuntu-latest]
     steps:
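Quoting "3.10" in the matrix above is deliberate: to a YAML 1.1 parser, a bare 3.10 is the float 3.1, which would silently select the wrong interpreter. A quick demonstration of the pitfall, assuming PyYAML is installed:

```python
import yaml  # PyYAML

parsed = yaml.safe_load('python-version: [3.9, 3.10, "3.10"]')
print(parsed)
# {'python-version': [3.9, 3.1, '3.10']}
# the bare 3.10 collapsed to the float 3.1; only the quoted form survives
```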