From ccfd2408aa025f4721c7a85e2c701b1a8455abd1 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 17 Mar 2018 19:01:47 +0000 Subject: [PATCH 01/39] initial commit --- pandas/io/common.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e312181f08512..e1989de3abf34 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -363,18 +363,33 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(path_or_buf) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - elif len(zip_names) == 0: - raise ValueError('Zero files found in ZIP file {}' - .format(path_or_buf)) - else: - raise ValueError('Multiple files found in ZIP file.' - ' Only one file per ZIP: {}' - .format(zip_names)) + from zipfile import ZipFile + # GH 17778 + + class _ZipFile(ZipFile): + """uses writestr method as write to accept bytes.""" + def __init__(self, file, mode='r', **kwargs): + if mode in ['wb', 'rb']: + mode = mode.replace('b', '') + super(_ZipFile, self).__init__(file, mode=mode, **kwargs) + + def write(self, data): + super(_ZipFile, self).writestr(self.filename, data) + + zf = _ZipFile(path_or_buf, mode) + if zf.mode == 'w': + f = zf + elif zf.mode == 'r': + zip_names = zf.namelist() + if len(zip_names) == 1: + f = zf.open(zip_names.pop()) + elif len(zip_names) == 0: + raise ValueError('Zero files found in ZIP file {}' + .format(path_or_buf)) + else: + raise ValueError('Multiple files found in ZIP file.' + ' Only one file per ZIP: {}' + .format(zip_names)) # XZ Compression elif compression == 'xz': From fd7362c77b556c46f73159f054f4ab8c546ddf91 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 17 Mar 2018 19:57:45 +0000 Subject: [PATCH 02/39] add zip to compression --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5682ad411fd2f..578ba8e22ecd9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2085,7 +2085,8 @@ def to_pickle(self, path, compression='infer', ---------- path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, + default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. From c5700911463d3c7bb96c3901b53201663302f126 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 17 Mar 2018 19:58:09 +0000 Subject: [PATCH 03/39] add zip to compression in to_pickle --- pandas/io/pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 8c72c315c142c..da1d27bd20f69 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): Any python object. path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. From ec712b9183b69e24d858516e3d1e3c3a5130094e Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 17 Mar 2018 23:36:36 +0000 Subject: [PATCH 04/39] inherit io.BufferedIOBase --- pandas/io/common.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e1989de3abf34..be258b450421c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -363,20 +363,27 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': + import io from zipfile import ZipFile # GH 17778 - class _ZipFile(ZipFile): - """uses writestr method as write to accept bytes.""" + class BytesZipFile(ZipFile, io.BufferedIOBase): + """override write method with writestr to accept bytes.""" def __init__(self, file, mode='r', **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - super(_ZipFile, self).__init__(file, mode=mode, **kwargs) + super(BytesZipFile, self).__init__(file, mode, **kwargs) def write(self, data): - super(_ZipFile, self).writestr(self.filename, data) + super(BytesZipFile, self).writestr(self.filename, data) - zf = _ZipFile(path_or_buf, mode) + def writable(self): + return self.mode == 'w' + + def readable(self): + return self.mode == 'r' + + zf = BytesZipFile(path_or_buf, mode) if zf.mode == 'w': f = zf elif zf.mode == 'r': From bf271cebf92ebfea25540e180ffed8c75269bf38 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 17 Mar 2018 23:58:02 +0000 Subject: [PATCH 05/39] xfail test_compress_zip_value_error --- pandas/tests/io/json/test_compression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 08335293f9292..a51287e219e66 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,14 +21,15 @@ def test_compression_roundtrip(compression_no_zip): assert_frame_equal(df, pd.read_json(result)) +@pytest.mark.xfail(reason='zip compression is now supported for json.') def test_compress_zip_value_error(): df = pd.DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) with tm.ensure_clean() as path: - import zipfile - pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip") + from zipfile import BadZipfile + pytest.raises(BadZipfile, df.to_json, path, compression="zip") def test_read_zipped_json(): From 113db830f92a5937d5a6a6ad9565e7c412e85c4d Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 00:16:33 +0000 Subject: [PATCH 06/39] add zip in compression parameter description --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 6 +++--- pandas/core/series.py | 4 ++-- pandas/io/pickle.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efb002474f876..4ece844a91e8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1655,7 +1655,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'xz', + allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the first argument is a filename line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 578ba8e22ecd9..19d8d645ca923 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1814,7 +1814,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, .. versionadded:: 0.19.0 - compression : {None, 'gzip', 'bz2', 'xz'} + compression : {None, 'gzip', 'bz2', 'zip', 'xz'} A string representing the compression to use in the output file, only used when the first argument is a filename @@ -2085,8 +2085,8 @@ def to_pickle(self, path, compression='infer', ---------- path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, - default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ + default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. diff --git a/pandas/core/series.py b/pandas/core/series.py index e4801242073a2..367ffccc72799 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3633,8 +3633,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', non-ascii, for python versions prior to 3 compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'xz', only used when the first - argument is a filename + allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the + first argument is a filename date_format: string, default None Format string for datetime objects. decimal: string, default '.' diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index da1d27bd20f69..33601269db866 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): Any python object. path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. @@ -93,7 +93,7 @@ def read_pickle(path, compression='infer'): ---------- path : str File path where the pickled object will be loaded. - compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', or '.zip' respectively, and no decompression otherwise. From 9b9e5d16e0ba004d47c20ce21027c59677412a9c Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 00:48:31 +0000 Subject: [PATCH 07/39] xfail test_to_csv_compression_value_error --- pandas/tests/frame/test_to_csv.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index dda5cdea52cac..858fcc7697f51 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -919,7 +919,7 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - def test_to_csv_compression(self, compression_no_zip): + def test_to_csv_compression(self, compression): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], @@ -927,22 +927,23 @@ def test_to_csv_compression(self, compression_no_zip): with ensure_clean() as filename: - df.to_csv(filename, compression=compression_no_zip) + df.to_csv(filename, compression=compression) # test the round trip - to_csv -> read_csv - rs = read_csv(filename, compression=compression_no_zip, + rs = read_csv(filename, compression=compression, index_col=0) assert_frame_equal(df, rs) # explicitly make sure file is compressed - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: text = fh.read().decode('utf8') for col in df.columns: assert col in text - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0)) + @pytest.mark.xfail(reason='zip compression is now supported for csv.') def test_to_csv_compression_value_error(self): # GH7615 # use the compression kw in to_csv From dedb853cf63cf0bfe74e29c39eddba15f3d276c2 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 00:49:05 +0000 Subject: [PATCH 08/39] include zip in all tests --- pandas/tests/io/json/test_compression.py | 26 ++++++++++++------------ pandas/tests/io/test_pickle.py | 6 +++--- pandas/tests/series/test_io.py | 10 ++++----- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index a51287e219e66..89b4053db8846 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -5,18 +5,18 @@ from pandas.util.testing import assert_frame_equal, assert_raises_regex -def test_compression_roundtrip(compression_no_zip): +def test_compression_roundtrip(compression): df = pd.DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) with tm.ensure_clean() as path: - df.to_json(path, compression=compression_no_zip) + df.to_json(path, compression=compression) assert_frame_equal(df, pd.read_json(path, - compression=compression_no_zip)) + compression=compression)) # explicitly ensure file was compressed. - with tm.decompress_file(path, compression_no_zip) as fh: + with tm.decompress_file(path, compression) as fh: result = fh.read().decode('utf8') assert_frame_equal(df, pd.read_json(result)) @@ -42,7 +42,7 @@ def test_read_zipped_json(): assert_frame_equal(uncompressed_df, compressed_df) -def test_with_s3_url(compression_no_zip): +def test_with_s3_url(compression): boto3 = pytest.importorskip('boto3') pytest.importorskip('s3fs') moto = pytest.importorskip('moto') @@ -53,35 +53,35 @@ def test_with_s3_url(compression_no_zip): bucket = conn.create_bucket(Bucket="pandas-test") with tm.ensure_clean() as path: - df.to_json(path, compression=compression_no_zip) + df.to_json(path, compression=compression) with open(path, 'rb') as f: bucket.put_object(Key='test-1', Body=f) roundtripped_df = pd.read_json('s3://pandas-test/test-1', - compression=compression_no_zip) + compression=compression) assert_frame_equal(df, roundtripped_df) -def test_lines_with_compression(compression_no_zip): +def test_lines_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') df.to_json(path, orient='records', lines=True, - compression=compression_no_zip) + compression=compression) roundtripped_df = pd.read_json(path, lines=True, - compression=compression_no_zip) + compression=compression) assert_frame_equal(df, roundtripped_df) -def test_chunksize_with_compression(compression_no_zip): +def test_chunksize_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') df.to_json(path, orient='records', lines=True, - compression=compression_no_zip) + compression=compression) res = pd.read_json(path, lines=True, chunksize=1, - compression=compression_no_zip) + compression=compression) roundtripped_df = pd.concat(res) assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2ba3e174404c7..6bc3af2ba3fd2 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression): f.write(fh.read()) f.close() - def test_write_explicit(self, compression_no_zip, get_random_path): + def test_write_explicit(self, compression, get_random_path): base = get_random_path path1 = base + ".compressed" path2 = base + ".raw" @@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path): df = tm.makeDataFrame() # write to compressed file - df.to_pickle(p1, compression=compression_no_zip) + df.to_pickle(p1, compression=compression) # decompress - with tm.decompress_file(p1, compression=compression_no_zip) as f: + with tm.decompress_file(p1, compression=compression) as f: with open(p2, "wb") as fh: fh.write(f.read()) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 62d1372525cc8..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression(self, compression_no_zip): + def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') with ensure_clean() as filename: - s.to_csv(filename, compression=compression_no_zip, header=True) + s.to_csv(filename, compression=compression, header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression=compression_no_zip, + rs = pd.read_csv(filename, compression=compression, index_col=0, squeeze=True) assert_series_equal(s, rs) # explicitly ensure file was compressed - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: text = fh.read().decode('utf8') assert s.name in text - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: assert_series_equal(s, pd.read_csv(fh, index_col=0, squeeze=True)) From dfa99139a5255945a6de0799c94ae2ff7598a5be Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 01:03:28 +0000 Subject: [PATCH 09/39] move BytesZipFile out of _get_handle --- pandas/io/common.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index be258b450421c..ed96b597c5060 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,8 @@ import codecs import mmap from contextlib import contextmanager, closing - +from zipfile import ZipFile +from io import BufferedIOBase from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat from pandas.io.formats.printing import pprint_thing @@ -363,26 +364,6 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': - import io - from zipfile import ZipFile - # GH 17778 - - class BytesZipFile(ZipFile, io.BufferedIOBase): - """override write method with writestr to accept bytes.""" - def __init__(self, file, mode='r', **kwargs): - if mode in ['wb', 'rb']: - mode = mode.replace('b', '') - super(BytesZipFile, self).__init__(file, mode, **kwargs) - - def write(self, data): - super(BytesZipFile, self).writestr(self.filename, data) - - def writable(self): - return self.mode == 'w' - - def readable(self): - return self.mode == 'r' - zf = BytesZipFile(path_or_buf, mode) if zf.mode == 'w': f = zf @@ -447,6 +428,24 @@ def readable(self): return f, handles +class BytesZipFile(ZipFile, BufferedIOBase): + """override write method with writestr to accept bytes.""" + # GH 17778 + def __init__(self, file, mode='r', **kwargs): + if mode in ['wb', 'rb']: + mode = mode.replace('b', '') + super(BytesZipFile, self).__init__(file, mode, **kwargs) + + def write(self, data): + super(BytesZipFile, self).writestr(self.filename, data) + + def writable(self): + return self.mode == 'w' + + def readable(self): + return self.mode == 'r' + + class MMapWrapper(BaseIterator): """ Wrapper for the Python's mmap class so that it can be properly read in From 67b9727f0efb8e43976236f400d83354e518d729 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 01:23:45 +0000 Subject: [PATCH 10/39] inherit BytesIO --- pandas/io/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index ed96b597c5060..cd0fb3652c276 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,7 +6,7 @@ import mmap from contextlib import contextmanager, closing from zipfile import ZipFile -from io import BufferedIOBase + from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat from pandas.io.formats.printing import pprint_thing @@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles -class BytesZipFile(ZipFile, BufferedIOBase): +class BytesZipFile(ZipFile, BytesIO): """override write method with writestr to accept bytes.""" # GH 17778 def __init__(self, file, mode='r', **kwargs): From ecdf5a2fa525da9ed100f0c2f8372c758893ea40 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 01:26:27 +0000 Subject: [PATCH 11/39] restore import pattern --- pandas/tests/io/json/test_compression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 89b4053db8846..6e0c3e88e8ee0 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -28,8 +28,8 @@ def test_compress_zip_value_error(): index=['A', 'B'], columns=['X', 'Y', 'Z']) with tm.ensure_clean() as path: - from zipfile import BadZipfile - pytest.raises(BadZipfile, df.to_json, path, compression="zip") + import zipfile + pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip") def test_read_zipped_json(): From b9fab3c8795d693711918506f54b48e7afd4565a Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 12:19:17 +0000 Subject: [PATCH 12/39] attributes already implemented in Base class --- pandas/io/common.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index cd0fb3652c276..c43f1bdbbf94e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -439,12 +439,6 @@ def __init__(self, file, mode='r', **kwargs): def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) - def writable(self): - return self.mode == 'w' - - def readable(self): - return self.mode == 'r' - class MMapWrapper(BaseIterator): """ From 5c5c16136f3a74537302cab3b40f067092cdde6e Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 12:45:48 +0000 Subject: [PATCH 13/39] add zip in compression parameter description --- pandas/util/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a223e4d8fd23e..f79e73b8ba417 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -172,7 +172,7 @@ def decompress_file(path, compression): path : str The path where the file is read from - compression : {'gzip', 'bz2', 'xz', None} + compression : {'gzip', 'bz2', 'zip', 'xz', None} Name of the decompression to use Returns From d072ca8ccbc7a98061d83dd3daf4bdb59f1b4cb5 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 12:55:56 +0000 Subject: [PATCH 14/39] prevent writing duplicates --- pandas/io/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c43f1bdbbf94e..135fcf82f3534 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -437,7 +437,8 @@ def __init__(self, file, mode='r', **kwargs): super(BytesZipFile, self).__init__(file, mode, **kwargs) def write(self, data): - super(BytesZipFile, self).writestr(self.filename, data) + if self.filename not in self.nameslist(): + super(BytesZipFile, self).writestr(self.filename, data) class MMapWrapper(BaseIterator): From cecb0ac2c09fda878b15bb37cf6f05ae3f199d41 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 13:01:01 +0000 Subject: [PATCH 15/39] prevent writing duplicates --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 135fcf82f3534..b53c63cc5b3a7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -437,7 +437,7 @@ def __init__(self, file, mode='r', **kwargs): super(BytesZipFile, self).__init__(file, mode, **kwargs) def write(self, data): - if self.filename not in self.nameslist(): + if self.filename not in self.namelist(): super(BytesZipFile, self).writestr(self.filename, data) From ed189c47ac2c6dea8e2b6fb884168dc6f9279de0 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 13:57:00 +0000 Subject: [PATCH 16/39] add whatsnew entry in Other Enhancement --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4179277291478..37ecd7eaa5246 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -344,6 +344,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports itk rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) +- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) .. _whatsnew_0230.api_breaking: From 4ac948857a37c0955ddafbec497e794bfae7d4d4 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 14:01:42 +0000 Subject: [PATCH 17/39] revert prevent duplicate --- pandas/io/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index b53c63cc5b3a7..c43f1bdbbf94e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -437,8 +437,7 @@ def __init__(self, file, mode='r', **kwargs): super(BytesZipFile, self).__init__(file, mode, **kwargs) def write(self, data): - if self.filename not in self.namelist(): - super(BytesZipFile, self).writestr(self.filename, data) + super(BytesZipFile, self).writestr(self.filename, data) class MMapWrapper(BaseIterator): From 694c6b597c0b210a3cb9b32e562c2c01c219f347 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:05:09 +0000 Subject: [PATCH 18/39] xfail zip compression csv pickle in python 2.x --- pandas/tests/frame/test_to_csv.py | 5 ++++- pandas/tests/io/test_pickle.py | 5 ++++- pandas/tests/series/test_io.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 858fcc7697f51..0e1b523d83830 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas.compat import (lmap, range, lrange, StringIO, u) +from pandas.compat import (lmap, range, lrange, StringIO, u, PY2) import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -925,6 +925,9 @@ def test_to_csv_compression(self, compression): [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression not supported in Python 2.') + with ensure_clean() as filename: df.to_csv(filename, compression=compression) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 6bc3af2ba3fd2..903b6bf8db84e 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -20,7 +20,7 @@ from distutils.version import LooseVersion import pandas as pd from pandas import Index -from pandas.compat import is_platform_little_endian +from pandas.compat import is_platform_little_endian, PY2 import pandas import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -416,6 +416,9 @@ def test_read_explicit(self, compression, get_random_path): path1 = base + ".raw" path2 = base + ".compressed" + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression not supported in Python 2.') + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..7a5744e6f1056 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u +from pandas.compat import StringIO, u, PY2 from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -143,6 +143,9 @@ def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression not supported in Python 2.') + with ensure_clean() as filename: s.to_csv(filename, compression=compression, header=True) From 80992a3685b5f76f2eec9c2cf5b382e69bd1e881 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:08:31 +0000 Subject: [PATCH 19/39] xfail zip compression csv pickle in python 2.x --- pandas/tests/io/test_pickle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 903b6bf8db84e..3fb65fd0a3a6c 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -357,6 +357,9 @@ def test_write_explicit(self, compression, get_random_path): path1 = base + ".compressed" path2 = base + ".raw" + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression not supported in Python 2.') + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() @@ -416,9 +419,6 @@ def test_read_explicit(self, compression, get_random_path): path1 = base + ".raw" path2 = base + ".compressed" - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression not supported in Python 2.') - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() From 3288691f1319f5e5f79d16cc558e167a72c2f6ff Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:24:39 +0000 Subject: [PATCH 20/39] writing zip compression not supported in Python 2 --- pandas/io/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c43f1bdbbf94e..c0993c0ffb5da 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -366,7 +366,11 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif compression == 'zip': zf = BytesZipFile(path_or_buf, mode) if zf.mode == 'w': - f = zf + if compat.PY3: + f = zf + elif compat.PY2: + raise NotImplementedError('Writing zip compression is not' + ' supported in Python 2.') elif zf.mode == 'r': zip_names = zf.namelist() if len(zip_names) == 1: From 272c6e72da56ecd1a5b28afc408acc301cc65879 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:40:43 +0000 Subject: [PATCH 21/39] compression parameter descriptions --- pandas/core/frame.py | 5 +++-- pandas/core/generic.py | 4 ++-- pandas/io/pickle.py | 5 ++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4ece844a91e8b..2e532f2df92c1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1655,8 +1655,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'zip', 'xz', - only used when the first argument is a filename + allowed values are 'gzip', 'bz2', 'zip', 'xz', 'zip' only + supported with Python>=3.0, only used when the first argument is a + filename. line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 19d8d645ca923..14d7988257626 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1816,7 +1816,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, compression : {None, 'gzip', 'bz2', 'zip', 'xz'} A string representing the compression to use in the output file, - only used when the first argument is a filename + 'zip' only supported with Python>=3.0, only used when the first + argument is a filename. .. versionadded:: 0.21.0 @@ -2130,7 +2131,6 @@ def to_pickle(self, path, compression='infer', 2 2 7 3 3 8 4 4 9 - >>> import os >>> os.remove("./dummy.pkl") """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 33601269db866..e21cbd739b605 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -20,7 +20,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): File path where the pickled object will be stored. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. + default, infers from the file extension in specified path. 'zip' only + supported with Python>=3.0 .. versionadded:: 0.20.0 protocol : int @@ -62,7 +63,6 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): 2 2 7 3 3 8 4 4 9 - >>> import os >>> os.remove("./dummy.pkl") """ @@ -133,7 +133,6 @@ def read_pickle(path, compression='infer'): 2 2 7 3 3 8 4 4 9 - >>> import os >>> os.remove("./dummy.pkl") """ From d35b6af85972236f96cb21ceb666af26afe6cdfd Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:42:10 +0000 Subject: [PATCH 22/39] compression parameter descriptions --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 14d7988257626..4f935beec7128 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2089,7 +2089,8 @@ def to_pickle(self, path, compression='infer', compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ default 'infer' A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. + default, infers from the file extension in specified path. 'zip' + only supported with Python>=3.0 .. versionadded:: 0.20.0 protocol : int From c6034b4750bc11320ef16666ac254b942a1a0e28 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 22:58:07 +0000 Subject: [PATCH 23/39] skip zip in Python 2 --- pandas/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 7a4ef56d7d749..5c523b96da6e7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -3,6 +3,7 @@ import numpy import pandas import pandas.util._test_decorators as td +from compat import PY2 def pytest_addoption(parser): @@ -66,7 +67,11 @@ def ip(): return InteractiveShell() -@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', +@pytest.fixture(params=[None, + 'gzip', + 'bz2', + pytest.mark.skipif(PY2, reason='zip compression not' + ' supported in Python 2.')('zip'), pytest.param('xz', marks=td.skip_if_no_lzma)]) def compression(request): """ From 71d99795555a273d4694c3efe335d6f141fddbf6 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 23:00:22 +0000 Subject: [PATCH 24/39] revert tests xfail --- pandas/tests/frame/test_to_csv.py | 5 +---- pandas/tests/io/test_pickle.py | 5 +---- pandas/tests/series/test_io.py | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 0e1b523d83830..858fcc7697f51 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas.compat import (lmap, range, lrange, StringIO, u, PY2) +from pandas.compat import (lmap, range, lrange, StringIO, u) import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -925,9 +925,6 @@ def test_to_csv_compression(self, compression): [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression not supported in Python 2.') - with ensure_clean() as filename: df.to_csv(filename, compression=compression) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3fb65fd0a3a6c..6bc3af2ba3fd2 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -20,7 +20,7 @@ from distutils.version import LooseVersion import pandas as pd from pandas import Index -from pandas.compat import is_platform_little_endian, PY2 +from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -357,9 +357,6 @@ def test_write_explicit(self, compression, get_random_path): path1 = base + ".compressed" path2 = base + ".raw" - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression not supported in Python 2.') - with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 7a5744e6f1056..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u, PY2 +from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -143,9 +143,6 @@ def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression not supported in Python 2.') - with ensure_clean() as filename: s.to_csv(filename, compression=compression, header=True) From 4c87e0f379101387b25308055d34a121133372e7 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 23:11:14 +0000 Subject: [PATCH 25/39] update whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 37ecd7eaa5246..9fa6fdde52688 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -344,7 +344,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports itk rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) -- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) +- zip compression is supported via ``compression=zip`` for python >= 3 in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) .. _whatsnew_0230.api_breaking: From fd449802d8f9cf0af72cef8c64183c750fac0506 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sun, 18 Mar 2018 23:23:49 +0000 Subject: [PATCH 26/39] fix compat import --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5c523b96da6e7..ca9aa17d53184 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -3,7 +3,7 @@ import numpy import pandas import pandas.util._test_decorators as td -from compat import PY2 +from pandas.compat import PY2 def pytest_addoption(parser): From ab7a7b7b3a0a27c858446a19354cef4af40adb2b Mon Sep 17 00:00:00 2001 From: minggli Date: Mon, 19 Mar 2018 17:17:16 +0000 Subject: [PATCH 27/39] enable zip compression for Python 2 by avoid pickle.dump --- pandas/io/common.py | 6 +----- pandas/io/pickle.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c0993c0ffb5da..c43f1bdbbf94e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -366,11 +366,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif compression == 'zip': zf = BytesZipFile(path_or_buf, mode) if zf.mode == 'w': - if compat.PY3: - f = zf - elif compat.PY2: - raise NotImplementedError('Writing zip compression is not' - ' supported in Python 2.') + f = zf elif zf.mode == 'r': zip_names = zf.namelist() if len(zip_names) == 1: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index e21cbd739b605..b34ba28d844ee 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -74,7 +74,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): if protocol < 0: protocol = pkl.HIGHEST_PROTOCOL try: - pkl.dump(obj, f, protocol=protocol) + f.write(pkl.dumps(obj, protocol=protocol)) finally: for _f in fh: _f.close() From cfd071587197aff4d62b4794f3db77012855e724 Mon Sep 17 00:00:00 2001 From: minggli Date: Mon, 19 Mar 2018 17:22:17 +0000 Subject: [PATCH 28/39] remove descriptinos zip only supported by Python3 --- pandas/core/frame.py | 5 ++--- pandas/core/generic.py | 6 ++---- pandas/io/pickle.py | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2e532f2df92c1..213f59fe52df9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1655,9 +1655,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'zip', 'xz', 'zip' only - supported with Python>=3.0, only used when the first argument is a - filename. + allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the + first argument is a filename. line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4f935beec7128..6dd19a73242cd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1816,8 +1816,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, compression : {None, 'gzip', 'bz2', 'zip', 'xz'} A string representing the compression to use in the output file, - 'zip' only supported with Python>=3.0, only used when the first - argument is a filename. + only used when the first argument is a filename. .. versionadded:: 0.21.0 @@ -2089,8 +2088,7 @@ def to_pickle(self, path, compression='infer', compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ default 'infer' A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. 'zip' - only supported with Python>=3.0 + default, infers from the file extension in specified path. .. versionadded:: 0.20.0 protocol : int diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index b34ba28d844ee..f43bdb6cbfa02 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -20,8 +20,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): File path where the pickled object will be stored. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. 'zip' only - supported with Python>=3.0 + default, infers from the file extension in specified path. .. versionadded:: 0.20.0 protocol : int From dd958ac5a98aa80b3c01ea7eaad0c323dd72a8f9 Mon Sep 17 00:00:00 2001 From: minggli Date: Mon, 19 Mar 2018 17:25:08 +0000 Subject: [PATCH 29/39] revert conftest --- pandas/conftest.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index ca9aa17d53184..7a4ef56d7d749 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -3,7 +3,6 @@ import numpy import pandas import pandas.util._test_decorators as td -from pandas.compat import PY2 def pytest_addoption(parser): @@ -67,11 +66,7 @@ def ip(): return InteractiveShell() -@pytest.fixture(params=[None, - 'gzip', - 'bz2', - pytest.mark.skipif(PY2, reason='zip compression not' - ' supported in Python 2.')('zip'), +@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', pytest.param('xz', marks=td.skip_if_no_lzma)]) def compression(request): """ From 2956103a50152da9daf82ed085c146615de5af0a Mon Sep 17 00:00:00 2001 From: minggli Date: Mon, 19 Mar 2018 18:27:34 +0000 Subject: [PATCH 30/39] tests xfail on csv zip compression in Python 2 --- pandas/tests/frame/test_to_csv.py | 6 +++++- pandas/tests/series/test_io.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 858fcc7697f51..8899bfffa4a4d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas.compat import (lmap, range, lrange, StringIO, u) +from pandas.compat import (lmap, range, lrange, StringIO, u, PY2) import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -925,6 +925,10 @@ def test_to_csv_compression(self, compression): [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression for csv not suppported in' + 'Python 2') + with ensure_clean() as filename: df.to_csv(filename, compression=compression) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..0ab99a59efd16 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u +from pandas.compat import StringIO, u, PY2 from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -143,6 +143,10 @@ def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') + if PY2 and compression == 'zip': + pytest.xfail(reason='zip compression for csv not suppported in' + 'Python 2') + with ensure_clean() as filename: s.to_csv(filename, compression=compression, header=True) From 63890ec02c1eba95677c4ce6d22168107c0595e0 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 09:34:41 +0000 Subject: [PATCH 31/39] handle csv compression seperately --- pandas/io/formats/csvs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 4e2021bcba72b..87da9045170af 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -133,8 +133,8 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=encoding, - compression=self.compression) - close = True + compression=None) + close = True if self.compression is None else False try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -150,6 +150,16 @@ def save(self): self._save() + # GH 17778 handles compression for byte strings. + if not close and self.compression: + f.close() + with open(self.path_or_buf, 'r') as f: + data = f.read() + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + f.write(data) + close = True finally: if close: f.close() From e4966be240cabeda10a83f97d622769afbda5b18 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 09:35:07 +0000 Subject: [PATCH 32/39] revert xfail on tests csv --- pandas/tests/frame/test_to_csv.py | 6 +----- pandas/tests/series/test_io.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 8899bfffa4a4d..858fcc7697f51 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas.compat import (lmap, range, lrange, StringIO, u, PY2) +from pandas.compat import (lmap, range, lrange, StringIO, u) import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -925,10 +925,6 @@ def test_to_csv_compression(self, compression): [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression for csv not suppported in' - 'Python 2') - with ensure_clean() as filename: df.to_csv(filename, compression=compression) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0ab99a59efd16..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u, PY2 +from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -143,10 +143,6 @@ def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') - if PY2 and compression == 'zip': - pytest.xfail(reason='zip compression for csv not suppported in' - 'Python 2') - with ensure_clean() as filename: s.to_csv(filename, compression=compression, header=True) From 437d7167462191f735a3cb09735d421dba7b27e8 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 09:35:50 +0000 Subject: [PATCH 33/39] decommission compression_no_zip --- pandas/conftest.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 7a4ef56d7d749..81a039e484cf1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -75,16 +75,6 @@ def compression(request): return request.param -@pytest.fixture(params=[None, 'gzip', 'bz2', - pytest.param('xz', marks=td.skip_if_no_lzma)]) -def compression_no_zip(request): - """ - Fixture for trying common compression types in compression tests - except zip - """ - return request.param - - @pytest.fixture(scope='module') def datetime_tz_utc(): from datetime import timezone From 04886e9697cf7d6adf79ffaa939dec20983f6e9d Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 09:43:08 +0000 Subject: [PATCH 34/39] remove value error test cases now that zip compression is supported for csv and json --- pandas/tests/frame/test_to_csv.py | 14 -------------- pandas/tests/io/json/test_compression.py | 11 ----------- 2 files changed, 25 deletions(-) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 858fcc7697f51..e4829ebf48561 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -943,20 +943,6 @@ def test_to_csv_compression(self, compression): with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0)) - @pytest.mark.xfail(reason='zip compression is now supported for csv.') - def test_to_csv_compression_value_error(self): - # GH7615 - # use the compression kw in to_csv - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - with ensure_clean() as filename: - # zip compression is not supported and should raise ValueError - import zipfile - pytest.raises(zipfile.BadZipfile, df.to_csv, - filename, compression="zip") - def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: dt_index = self.tsframe.index diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 6e0c3e88e8ee0..c9074ca49e5be 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,17 +21,6 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -@pytest.mark.xfail(reason='zip compression is now supported for json.') -def test_compress_zip_value_error(): - df = pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - with tm.ensure_clean() as path: - import zipfile - pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip") - - def test_read_zipped_json(): uncompressed_path = tm.get_data_path("tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) From 099993cb7cb53276f5ad4d36184d134c43968669 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 09:46:57 +0000 Subject: [PATCH 35/39] update whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 9fa6fdde52688..37ecd7eaa5246 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -344,7 +344,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports itk rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) -- zip compression is supported via ``compression=zip`` for python >= 3 in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) +- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) .. _whatsnew_0230.api_breaking: From 6aa14934915ff22376b200969a98d7ef35da4011 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 10:56:54 +0000 Subject: [PATCH 36/39] docstring for BytesZipFile --- pandas/io/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c43f1bdbbf94e..4769edd157b94 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -429,7 +429,13 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, class BytesZipFile(ZipFile, BytesIO): - """override write method with writestr to accept bytes.""" + """ + Wrapper for standard library class ZipFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and ZipFile.writestr writes + bytes strings into a member of the archive. + """ # GH 17778 def __init__(self, file, mode='r', **kwargs): if mode in ['wb', 'rb']: From 129a55a6d68206ba26b8468818e3c5bdb31b31ba Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 10:59:06 +0000 Subject: [PATCH 37/39] add back blank lines --- pandas/core/generic.py | 1 + pandas/io/pickle.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6dd19a73242cd..1a090f273e68e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2130,6 +2130,7 @@ def to_pickle(self, path, compression='infer', 2 2 7 3 3 8 4 4 9 + >>> import os >>> os.remove("./dummy.pkl") """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index f43bdb6cbfa02..d27735fbca318 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -62,6 +62,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): 2 2 7 3 3 8 4 4 9 + >>> import os >>> os.remove("./dummy.pkl") """ @@ -132,6 +133,7 @@ def read_pickle(path, compression='infer'): 2 2 7 3 3 8 4 4 9 + >>> import os >>> os.remove("./dummy.pkl") """ From 4531c783d933cfd85ee43129886b3400141900b2 Mon Sep 17 00:00:00 2001 From: minggli Date: Tue, 20 Mar 2018 11:11:12 +0000 Subject: [PATCH 38/39] move csv compression seperately --- pandas/io/formats/csvs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 87da9045170af..29b8d29af0808 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -150,6 +150,7 @@ def save(self): self._save() + finally: # GH 17778 handles compression for byte strings. if not close and self.compression: f.close() @@ -160,7 +161,6 @@ def save(self): compression=self.compression) f.write(data) close = True - finally: if close: f.close() From ebd8e6f4cf63eb0b4cdd7255b9fbed9f35463b42 Mon Sep 17 00:00:00 2001 From: minggli Date: Thu, 22 Mar 2018 11:47:58 +0000 Subject: [PATCH 39/39] parameter description --- pandas/core/frame.py | 6 +++--- pandas/core/series.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 213f59fe52df9..a03a3141a3b70 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1654,9 +1654,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. compression : string, optional - a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the - first argument is a filename. + A string representing the compression to use in the output file. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/series.py b/pandas/core/series.py index 367ffccc72799..9e086b165ca3e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3632,9 +3632,9 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 compression : string, optional - a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the - first argument is a filename + A string representing the compression to use in the output file. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. date_format: string, default None Format string for datetime objects. decimal: string, default '.'