From 2d7c46e8784612500d12b56c53eaaac0f5ddb748 Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Sat, 28 Oct 2017 16:40:35 +0500 Subject: [PATCH 1/4] Fix inner file-like objects not closed properly so ResourceWarning-s was raised --- smart_open/smart_open_lib.py | 25 +++++++++++++++++++------ smart_open/tests/test_smart_open.py | 19 ++++++++++++------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index 4b36caf1..ead8313d 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -497,19 +497,32 @@ def __exit__(self, type, value, traceback): pass -def make_closing(base, **attrs): +def make_closing(base, under_stream=None, **attrs): """ Add support for `with Base(attrs) as fout:` to the base class if it's missing. The base class' `close()` method will be called on context exit, to always close the file properly. This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise raise "AttributeError: GzipFile instance has no attribute '__exit__'". - + + Some base classes like gzip.GzipFile, bz2.BZ2File etc can wrap the other file-like objects, + but not closing it. It is not a bug, because inner stream are not owned to the base class. + Function make_closing fix that behaviour due to all under_stream-s are created by this library. """ if not hasattr(base, '__enter__'): attrs['__enter__'] = lambda self: self - if not hasattr(base, '__exit__'): - attrs['__exit__'] = lambda self, type, value, traceback: self.close() + + __exit__ = getattr(base, '__exit__', None) + + def exit_attr(self, type, value, traceback): + if __exit__ is not None: + __exit__(self, type, value, traceback) + else: + self.close() + if under_stream is not None: + under_stream.close() + + attrs['__exit__'] = exit_attr return type('Closing' + base.__name__, (base, object), attrs) @@ -530,11 +543,11 @@ def compression_wrapper(file_obj, filename, mode): from bz2file import BZ2File else: from bz2 import BZ2File - return make_closing(BZ2File)(file_obj, mode) + return make_closing(BZ2File, file_obj)(file_obj, mode) elif ext == '.gz': from gzip import GzipFile - return make_closing(GzipFile)(fileobj=file_obj, mode=mode) + return make_closing(GzipFile, file_obj)(fileobj=file_obj, mode=mode) else: return file_obj diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index b2f6488d..972b055c 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -131,7 +131,8 @@ def test_http_pass(self): def test_http_gz(self): """Can open gzip via http?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') - data = open(fpath, 'rb').read() + with open(fpath, 'rb') as file: + data = file.read() responses.add(responses.GET, "http://127.0.0.1/data.gz", body=data) @@ -146,7 +147,8 @@ def test_http_gz(self): def test_http_bz2(self): """Can open bz2 via http?""" test_string = b'Hello World Compressed.' - test_file = tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False).name + with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as file: + test_file = file.name with smart_open.smart_open(test_file, 'wb') as outfile: outfile.write(test_string) @@ -785,20 +787,23 @@ def write_read_assertion(self, test_file): def test_open_gz(self): """Can open gzip?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') - data = smart_open.smart_open(fpath).read() + with smart_open.smart_open(fpath) as file: + data = file.read() m = hashlib.md5(data) assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \ 'Failed to read gzip' def test_write_read_gz(self): """Can write and read gzip?""" - test_file = tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False).name - self.write_read_assertion(test_file) + with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as file: + test_file_name = file.name + self.write_read_assertion(test_file_name) def test_write_read_bz2(self): """Can write and read bz2?""" - test_file = tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False).name - self.write_read_assertion(test_file) + with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as file: + test_file_name = file.name + self.write_read_assertion(test_file_name) class MultistreamsBZ2Test(unittest.TestCase): From a8bfb63b118eeb0d4528b9cd820ebc13a721fe5e Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Sat, 28 Oct 2017 16:53:45 +0500 Subject: [PATCH 2/4] Fix DeprecationWarning-s in tests --- smart_open/tests/test_smart_open.py | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index 972b055c..bd804294 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -122,10 +122,10 @@ def test_http_pass(self): """Does http authentication work correctly""" responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2') _ = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"), user='me', password='pass') - self.assertEquals(len(responses.calls), 1) + self.assertEqual(len(responses.calls), 1) actual_request = responses.calls[0].request - self.assert_('Authorization' in actual_request.headers) - self.assert_(actual_request.headers['Authorization'].startswith('Basic ')) + self.assertTrue('Authorization' in actual_request.headers) + self.assertTrue(actual_request.headers['Authorization'].startswith('Basic ')) @responses.activate def test_http_gz(self): @@ -184,9 +184,9 @@ def test_read_never_returns_none(self): fout.write(test_string.encode('utf8')) r = smart_open.smart_open("s3://mybucket/mykey", "rb") - self.assertEquals(r.read(), test_string.encode("utf-8")) - self.assertEquals(r.read(), b"") - self.assertEquals(r.read(), b"") + self.assertEqual(r.read(), test_string.encode("utf-8")) + self.assertEqual(r.read(), b"") + self.assertEqual(r.read(), b"") @mock_s3 def test_readline(self): @@ -198,7 +198,7 @@ def test_readline(self): fout.write(test_string) reader = smart_open.smart_open("s3://mybucket/mykey", "rb") - self.assertEquals(reader.readline(), u"hello žluťoučký world!\n".encode("utf-8")) + self.assertEqual(reader.readline(), u"hello žluťoučký world!\n".encode("utf-8")) @mock_s3 def test_readline_iter(self): @@ -212,9 +212,9 @@ def test_readline_iter(self): reader = smart_open.smart_open("s3://mybucket/mykey", "rb") actual_lines = [l.decode("utf-8") for l in reader] - self.assertEquals(2, len(actual_lines)) - self.assertEquals(lines[0], actual_lines[0]) - self.assertEquals(lines[1], actual_lines[1]) + self.assertEqual(2, len(actual_lines)) + self.assertEqual(lines[0], actual_lines[0]) + self.assertEqual(lines[1], actual_lines[1]) @mock_s3 def test_readline_eof(self): @@ -226,9 +226,9 @@ def test_readline_eof(self): reader = smart_open.smart_open("s3://mybucket/mykey", "rb") - self.assertEquals(reader.readline(), b"") - self.assertEquals(reader.readline(), b"") - self.assertEquals(reader.readline(), b"") + self.assertEqual(reader.readline(), b"") + self.assertEqual(reader.readline(), b"") + self.assertEqual(reader.readline(), b"") @mock_s3 def test_s3_iter_lines(self): @@ -911,11 +911,11 @@ def test_r(self): key.set_contents_from_string(text.encode("utf-8")) with smart_open.s3_open_key(key, "r") as fin: - self.assertEquals(fin.read(), u"физкульт-привет!") + self.assertEqual(fin.read(), u"физкульт-привет!") parsed_uri = smart_open.ParseUri("s3://bucket/key") with smart_open.s3_open_uri(parsed_uri, "r") as fin: - self.assertEquals(fin.read(), u"физкульт-привет!") + self.assertEqual(fin.read(), u"физкульт-привет!") def test_bad_mode(self): """Bad mode should raise and exception.""" @@ -935,10 +935,10 @@ def test_rw_encoding(self): fout.write(text) with smart_open.s3_open_uri(uri, "r", encoding="koi8-r") as fin: - self.assertEquals(text, fin.read()) + self.assertEqual(text, fin.read()) with smart_open.s3_open_uri(uri, "rb") as fin: - self.assertEquals(text.encode("koi8-r"), fin.read()) + self.assertEqual(text.encode("koi8-r"), fin.read()) with smart_open.s3_open_uri(uri, "r", encoding="euc-jp") as fin: self.assertRaises(UnicodeDecodeError, fin.read) @@ -963,13 +963,13 @@ def test_rw_gzip(self): # with smart_open.s3_open_uri(uri, "rb", ignore_extension=True) as fin: gz = gzip.GzipFile(fileobj=fin) - self.assertEquals(gz.read().decode("utf-8"), text) + self.assertEqual(gz.read().decode("utf-8"), text) # # We should be able to read it back as well. # with smart_open.s3_open_uri(uri, "rb") as fin: - self.assertEquals(fin.read().decode("utf-8"), text) + self.assertEqual(fin.read().decode("utf-8"), text) @mock_s3 def test_gzip_write_mode(self): From c3c83143ebf036be3211a8d96c2ed80519c2b403 Mon Sep 17 00:00:00 2001 From: horpto <__Singleton__@hackerdom.ru> Date: Mon, 30 Oct 2017 22:26:52 +0500 Subject: [PATCH 3/4] rename name file to infile in tests --- smart_open/tests/test_smart_open.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/smart_open/tests/test_smart_open.py b/smart_open/tests/test_smart_open.py index bd804294..049af376 100644 --- a/smart_open/tests/test_smart_open.py +++ b/smart_open/tests/test_smart_open.py @@ -131,8 +131,8 @@ def test_http_pass(self): def test_http_gz(self): """Can open gzip via http?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') - with open(fpath, 'rb') as file: - data = file.read() + with open(fpath, 'rb') as infile: + data = infile.read() responses.add(responses.GET, "http://127.0.0.1/data.gz", body=data) @@ -147,8 +147,8 @@ def test_http_gz(self): def test_http_bz2(self): """Can open bz2 via http?""" test_string = b'Hello World Compressed.' - with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as file: - test_file = file.name + with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile: + test_file = infile.name with smart_open.smart_open(test_file, 'wb') as outfile: outfile.write(test_string) @@ -787,22 +787,22 @@ def write_read_assertion(self, test_file): def test_open_gz(self): """Can open gzip?""" fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz') - with smart_open.smart_open(fpath) as file: - data = file.read() + with smart_open.smart_open(fpath) as infile: + data = infile.read() m = hashlib.md5(data) assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \ 'Failed to read gzip' def test_write_read_gz(self): """Can write and read gzip?""" - with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as file: - test_file_name = file.name + with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as infile: + test_file_name = infile.name self.write_read_assertion(test_file_name) def test_write_read_bz2(self): """Can write and read bz2?""" - with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as file: - test_file_name = file.name + with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile: + test_file_name = infile.name self.write_read_assertion(test_file_name) From 51fa5c7018501e00c04d1cd0037214e813584403 Mon Sep 17 00:00:00 2001 From: horpto <__Singleton__@hackerdom.ru> Date: Tue, 31 Oct 2017 01:03:38 +0500 Subject: [PATCH 4/4] Redo slightly strange code as classes --- smart_open/smart_open_lib.py | 59 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/smart_open/smart_open_lib.py b/smart_open/smart_open_lib.py index ead8313d..f075b9d9 100644 --- a/smart_open/smart_open_lib.py +++ b/smart_open/smart_open_lib.py @@ -61,6 +61,11 @@ logger.warning("multiprocessing could not be imported and won't be used") from itertools import imap +if IS_PY2: + from bz2file import BZ2File +else: + from bz2 import BZ2File + import gzip import smart_open.s3 as smart_open_s3 @@ -497,35 +502,48 @@ def __exit__(self, type, value, traceback): pass -def make_closing(base, under_stream=None, **attrs): +def make_closing(base, **attrs): """ Add support for `with Base(attrs) as fout:` to the base class if it's missing. The base class' `close()` method will be called on context exit, to always close the file properly. This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise raise "AttributeError: GzipFile instance has no attribute '__exit__'". - - Some base classes like gzip.GzipFile, bz2.BZ2File etc can wrap the other file-like objects, - but not closing it. It is not a bug, because inner stream are not owned to the base class. - Function make_closing fix that behaviour due to all under_stream-s are created by this library. """ if not hasattr(base, '__enter__'): attrs['__enter__'] = lambda self: self - __exit__ = getattr(base, '__exit__', None) + if not hasattr(base, '__exit__'): + attrs['__exit__'] = lambda self, type, value, traceback: self.close() - def exit_attr(self, type, value, traceback): - if __exit__ is not None: - __exit__(self, type, value, traceback) - else: - self.close() - if under_stream is not None: - under_stream.close() - - attrs['__exit__'] = exit_attr return type('Closing' + base.__name__, (base, object), attrs) +class ClosingBZ2File(make_closing(BZ2File)): + """ + Implements wrapper for BZ2File that closes file object receieved as argument + + """ + def __init__(self, inner_stream, *args, **kwargs): + self.inner_stream = inner_stream + super(ClosingBZ2File, self).__init__(inner_stream, *args, **kwargs) + + def close(self): + super(ClosingBZ2File, self).close() + if not self.inner_stream.closed: + self.inner_stream.close() + +class ClosingGzipFile(make_closing(gzip.GzipFile)): + """ + Implement wrapper for GzipFile that closes file object receieved from arguments + + """ + def close(self): + fileobj = self.fileobj + super(ClosingGzipFile, self).close() + if not fileobj.closed: + fileobj.close() + def compression_wrapper(file_obj, filename, mode): """ This function will wrap the file_obj with an appropriate @@ -539,16 +557,9 @@ def compression_wrapper(file_obj, filename, mode): """ _, ext = os.path.splitext(filename) if ext == '.bz2': - if IS_PY2: - from bz2file import BZ2File - else: - from bz2 import BZ2File - return make_closing(BZ2File, file_obj)(file_obj, mode) - + return ClosingBZ2File(file_obj, mode) elif ext == '.gz': - from gzip import GzipFile - return make_closing(GzipFile, file_obj)(fileobj=file_obj, mode=mode) - + return ClosingGzipFile(fileobj=file_obj, mode=mode) else: return file_obj