From 5a8dceae2829a985ed70339b62490c7af89ef871 Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:35:01 +0530 Subject: [PATCH 1/6] init: Remove __future__ statements for py2 The __future__ statements we used were: - absolute_import - Mandatory in 3.0 - with_statement - Mandatory in 2.6 - print_function - Mandatory in 3.0 As we only support python >= 3.6 - this can be removed now --- pypandoc/__init__.py | 1 - pypandoc/py3compat.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py index 3fd81b3..f453343 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function, with_statement from typing import Iterable from typing import Union from typing import Generator diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index afb5b74..771a150 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import with_statement - import locale import sys From 90169d77d2bf95d3dc72e751872f5530813640a0 Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:36:06 +0530 Subject: [PATCH 2/6] py3compat: Remove conditions for py2 We were handling compat for py2 and py3 (mainly for urllib) Now this is not required as we expect python >= 3.6 --- pypandoc/pandoc_download.py | 8 ++------ pypandoc/py3compat.py | 39 ++++++++++--------------------------- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/pypandoc/pandoc_download.py b/pypandoc/pandoc_download.py index dc09301..919a9b0 100644 --- a/pypandoc/pandoc_download.py +++ b/pypandoc/pandoc_download.py @@ -9,13 +9,9 @@ import subprocess import sys import tempfile -from typing import Union - import urllib -try: - from urllib.request import urlopen -except ImportError: - from urllib import urlopen +from typing import Union +from urllib.request import urlopen from .handler import _check_log_handler 
diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index 771a150..5adece2 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -38,38 +38,19 @@ def cast_bytes(s, encoding=None): return _encode(s, encoding) return s +PY3 = True -if sys.version_info[0] >= 3: - PY3 = True +string_types = (str,) +unicode_type = str - string_types = (str,) - unicode_type = str +# from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url +from urllib.parse import urljoin, urlparse +from urllib.request import pathname2url, url2pathname - # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url - from urllib.parse import urljoin, urlparse - from urllib.request import pathname2url, url2pathname +def path2url(path): # noqa: E303 + return urljoin('file:', pathname2url(path)) - def path2url(path): # noqa: E303 - return urljoin('file:', pathname2url(path)) - - def url2path(url): # noqa: E303 - return url2pathname(urlparse(url).path) - -else: - PY3 = False - - string_types = (str, unicode) # noqa: F821 - unicode_type = unicode # noqa: F821 - - from urlparse import urljoin, urlparse - import urllib - - - def path2url(path): # noqa: E303 - return urljoin('file:', urllib.pathname2url(path)) - - - def url2path(url): # noqa: E303 - return urllib.url2pathname(urlparse(url).path) +def url2path(url): # noqa: E303 + return url2pathname(urlparse(url).path) From 41912de3f79782d3b9a0e4f675d264e36d85558d Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:40:15 +0530 Subject: [PATCH 3/6] py3compat: Stop using string/unicode types In py3 they are both str() So, remove logic for the types as we support py3.6+ only --- pypandoc/__init__.py | 5 +++-- pypandoc/py3compat.py | 5 ----- tests.py | 11 +++++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py index f453343..125342a 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -12,10 +12,11 @@ import 
textwrap import glob from pathlib import Path +from urllib.parse import urlparse from .handler import _check_log_handler from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc -from .py3compat import cast_bytes, cast_unicode, string_types, url2path, urlparse +from .py3compat import cast_bytes, cast_unicode, url2path __author__ = u'Juho Vepsäläinen' __author_email__ = "bebraw@gmail.com" @@ -355,7 +356,7 @@ def _convert_input(source, format, input_type, to, extra_args=(), # adds the proper filter syntax for each item in the filters list if filters is not None: - if isinstance(filters, string_types): + if isinstance(filters, str): filters = filters.split() f = ['--lua-filter=' + x if x.endswith(".lua") else '--filter=' + x for x in filters] args.extend(f) diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index 5adece2..6ae2c4b 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -38,11 +38,6 @@ def cast_bytes(s, encoding=None): return _encode(s, encoding) return s -PY3 = True - -string_types = (str,) -unicode_type = str - # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url from urllib.parse import urljoin, urlparse from urllib.request import pathname2url, url2pathname diff --git a/tests.py b/tests.py index 982fe66..c2dec57 100755 --- a/tests.py +++ b/tests.py @@ -16,7 +16,6 @@ from pathlib import Path import pypandoc -from pypandoc.py3compat import path2url, string_types, unicode_type @contextlib.contextmanager @@ -53,7 +52,7 @@ def closed_tempfile(suffix, text=None, dir_name=None): # Stolen from pandas def is_list_like(arg): return (hasattr(arg, '__iter__') and - not isinstance(arg, string_types)) + not isinstance(arg, str)) @contextlib.contextmanager @@ -155,7 +154,7 @@ def test_get_pandoc_formats(self): def test_get_pandoc_version(self): assert "HOME" in os.environ, "No HOME set, this will error..." 
version = pypandoc.get_pandoc_version() - self.assertTrue(isinstance(version, pypandoc.string_types)) + self.assertTrue(isinstance(version, str)) major = int(version.split(".")[0]) # according to http://pandoc.org/releases.html there were only two versions 0.x ... self.assertTrue(major in [0, 1, 2]) @@ -490,12 +489,12 @@ def test_unicode_input(self): # make sure that pandoc always returns unicode and does not mishandle it expected = u'üäöîôû{0}'.format(os.linesep) written = pypandoc.convert_text(u'

üäöîôû

', 'md', format='html') - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) self.assertEqualExceptForNewlineEnd(expected, written) bytes = u'

üäöîôû

'.encode("utf-8") written = pypandoc.convert_text(bytes, 'md', format='html') self.assertTrue(expected == written) - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) # Only use german umlauts in the next test, as iso-8859-15 covers that expected = u'äüäö{0}'.format(os.linesep) @@ -516,7 +515,7 @@ def f(): # with the right encoding it should work... written = pypandoc.convert_text(bytes, 'md', format='html', encoding="iso-8859-15") self.assertEqualExceptForNewlineEnd(expected, written) - self.assertTrue(isinstance(written, unicode_type)) + self.assertTrue(isinstance(written, str)) def test_conversion_from_non_plain_text_file(self): with closed_tempfile('.docx') as file_name: From 9871a319beac46851645cfee27763b07ddddba37 Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:42:56 +0530 Subject: [PATCH 4/6] py3compat: Move url2path/path2url out of py3compat path2url was only used in 1 place (in tests) so remove the function And move url2path into the `__init__` file where it is used twice --- pypandoc/__init__.py | 8 +++++++- pypandoc/py3compat.py | 12 ------------ tests.py | 6 +++++- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py index 125342a..6a521d9 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -13,10 +13,11 @@ import glob from pathlib import Path from urllib.parse import urlparse +from urllib.request import url2pathname from .handler import _check_log_handler from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc -from .py3compat import cast_bytes, cast_unicode, url2path +from .py3compat import cast_bytes, cast_unicode __author__ = u'Juho Vepsäläinen' __author_email__ = "bebraw@gmail.com" @@ -53,6 +54,11 @@ # Set up the module level logger logger = logging.getLogger(__name__) +def url2path(url): # noqa: E303 + # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url + return 
url2pathname(urlparse(url).path) + + def convert_text(source:str, to:str, format:str, extra_args:Iterable=(), encoding:str='utf-8', outputfile:Union[None, str, Path]=None, filters:Union[Iterable, None]=None, verify_format:bool=True, sandbox:bool=True, cworkdir:Union[str, None]=None) -> str: diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index 6ae2c4b..15fea0a 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -37,15 +37,3 @@ def cast_bytes(s, encoding=None): if not isinstance(s, bytes): return _encode(s, encoding) return s - -# from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url -from urllib.parse import urljoin, urlparse -from urllib.request import pathname2url, url2pathname - - -def path2url(path): # noqa: E303 - return urljoin('file:', pathname2url(path)) - - -def url2path(url): # noqa: E303 - return url2pathname(urlparse(url).path) diff --git a/tests.py b/tests.py index c2dec57..5c8dbf1 100755 --- a/tests.py +++ b/tests.py @@ -14,6 +14,8 @@ import unittest import warnings from pathlib import Path +from urllib.parse import urljoin +from urllib.request import pathname2url import pypandoc @@ -220,7 +222,9 @@ def test_basic_conversion_from_file_url(self): expected = u'some title{0}=========={0}{0}'.format(os.linesep) # this keeps the : (which should be '|' on windows but pandoc # doesn't like it - file_url = path2url(file_name) + + # from http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url + file_url = urljoin('file:', pathname2url(file_name)) assert pypandoc._identify_path(file_url) received = pypandoc.convert_file(file_url, 'rst') From 06e7c098abf3406937853064d05e0cfe14582ae1 Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:46:00 +0530 Subject: [PATCH 5/6] py3compat: Remove cast_bytes() This function was only used in 1 place and always with encoding="utf-8" being hardcoded So, remove the function and just use the logic required because in py3.6+ it is a 1 liner: 
s.encode(...) --- pypandoc/__init__.py | 5 +++-- pypandoc/py3compat.py | 13 ------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py index 6a521d9..4e42ad1 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -17,7 +17,7 @@ from .handler import _check_log_handler from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc -from .py3compat import cast_bytes, cast_unicode +from .py3compat import cast_unicode __author__ = u'Juho Vepsäläinen' __author_email__ = "bebraw@gmail.com" @@ -398,7 +398,8 @@ def _convert_input(source, format, input_type, to, extra_args=(), if string_input: try: - source = cast_bytes(source, encoding='utf-8') + if not isinstance(source, bytes): + source = source.encode('utf-8') except (UnicodeDecodeError, UnicodeEncodeError): # assume that it is already a utf-8 encoded string pass diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index 15fea0a..22c7fc4 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -20,20 +20,7 @@ def _decode(s, encoding=None): encoding = encoding or _DEFAULT_ENCODING return s.decode(encoding) - -def _encode(u, encoding=None): - encoding = encoding or _DEFAULT_ENCODING - return u.encode(encoding) - - def cast_unicode(s, encoding=None): if isinstance(s, bytes): return _decode(s, encoding) return s - - -def cast_bytes(s, encoding=None): - # bytes == str on py2.7 -> always encode on py2 - if not isinstance(s, bytes): - return _encode(s, encoding) - return s From 47eea0c81930dacf1f721a0ac327dfd1874d0832 Mon Sep 17 00:00:00 2001 From: AbdealiJK Date: Fri, 28 Oct 2022 19:57:13 +0530 Subject: [PATCH 6/6] py3compat: Remove cast_unicode() This function is a simple function now that we support only py3.6+ So, remove the original function and use the logic as required --- pypandoc/__init__.py | 7 +++++-- pypandoc/py3compat.py | 10 ---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pypandoc/__init__.py 
b/pypandoc/__init__.py index 4e42ad1..4e6359f 100644 --- a/pypandoc/__init__.py +++ b/pypandoc/__init__.py @@ -17,7 +17,7 @@ from .handler import _check_log_handler from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc -from .py3compat import cast_unicode +from .py3compat import _DEFAULT_ENCODING __author__ = u'Juho Vepsäläinen' __author_email__ = "bebraw@gmail.com" @@ -244,7 +244,10 @@ def _as_unicode(source:any, encoding:str) -> any: # if a source and a different encoding is given, try to decode the the source into a # unicode string try: - source = cast_unicode(source, encoding=encoding) + if isinstance(source, bytes): + encoding = encoding or _DEFAULT_ENCODING + source = source.decode(encoding) + except (UnicodeDecodeError, UnicodeEncodeError): pass return source diff --git a/pypandoc/py3compat.py b/pypandoc/py3compat.py index 22c7fc4..d292509 100644 --- a/pypandoc/py3compat.py +++ b/pypandoc/py3compat.py @@ -14,13 +14,3 @@ pass _DEFAULT_ENCODING = _DEFAULT_ENCODING or sys.getdefaultencoding() - - -def _decode(s, encoding=None): - encoding = encoding or _DEFAULT_ENCODING - return s.decode(encoding) - -def cast_unicode(s, encoding=None): - if isinstance(s, bytes): - return _decode(s, encoding) - return s