diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4179277291478..37ecd7eaa5246 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -344,6 +344,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) +- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/conftest.py b/pandas/conftest.py index 7a4ef56d7d749..81a039e484cf1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -75,16 +75,6 @@ def compression(request): return request.param -@pytest.fixture(params=[None, 'gzip', 'bz2', - pytest.param('xz', marks=td.skip_if_no_lzma)]) -def compression_no_zip(request): - """ - Fixture for trying common compression types in compression tests - except zip - """ - return request.param - - @pytest.fixture(scope='module') def datetime_tz_utc(): from datetime import timezone diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efb002474f876..a03a3141a3b70 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1654,9 +1654,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. 
compression : string, optional - a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'xz', - only used when the first argument is a filename + A string representing the compression to use in the output file. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5682ad411fd2f..1a090f273e68e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1814,9 +1814,9 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, .. versionadded:: 0.19.0 - compression : {None, 'gzip', 'bz2', 'xz'} + compression : {None, 'gzip', 'bz2', 'zip', 'xz'} A string representing the compression to use in the output file, - only used when the first argument is a filename + only used when the first argument is a filename. .. versionadded:: 0.21.0 @@ -2085,7 +2085,8 @@ def to_pickle(self, path, compression='infer', ---------- path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ + default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. 
diff --git a/pandas/core/series.py b/pandas/core/series.py index e4801242073a2..9e086b165ca3e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3632,9 +3632,9 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', a string representing the encoding to use if the contents are non-ascii, for python versions prior to 3 compression : string, optional - a string representing the compression to use in the output file, - allowed values are 'gzip', 'bz2', 'xz', only used when the first - argument is a filename + A string representing the compression to use in the output file. + Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only + used when the first argument is a filename. date_format: string, default None Format string for datetime objects. decimal: string, default '.' diff --git a/pandas/io/common.py b/pandas/io/common.py index e312181f08512..4769edd157b94 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,6 +5,7 @@ import codecs import mmap from contextlib import contextmanager, closing +from zipfile import ZipFile from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat @@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # ZIP Compression elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(path_or_buf) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - elif len(zip_names) == 0: - raise ValueError('Zero files found in ZIP file {}' - .format(path_or_buf)) - else: - raise ValueError('Multiple files found in ZIP file.' 
- ' Only one file per ZIP: {}' - .format(zip_names)) + zf = BytesZipFile(path_or_buf, mode) + if zf.mode == 'w': + f = zf + elif zf.mode == 'r': + zip_names = zf.namelist() + if len(zip_names) == 1: + f = zf.open(zip_names.pop()) + elif len(zip_names) == 0: + raise ValueError('Zero files found in ZIP file {}' + .format(path_or_buf)) + else: + raise ValueError('Multiple files found in ZIP file.' + ' Only one file per ZIP: {}' + .format(zip_names)) # XZ Compression elif compression == 'xz': @@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles +class BytesZipFile(ZipFile, BytesIO): + """ + Wrapper for standard library class ZipFile and allow the returned file-like + handle to accept byte strings via `write` method. + + BytesIO provides attributes of file-like object and ZipFile.writestr writes + bytes strings into a member of the archive. + """ + # GH 17778 + def __init__(self, file, mode='r', **kwargs): + if mode in ['wb', 'rb']: + mode = mode.replace('b', '') + super(BytesZipFile, self).__init__(file, mode, **kwargs) + + def write(self, data): + super(BytesZipFile, self).writestr(self.filename, data) + + class MMapWrapper(BaseIterator): """ Wrapper for the Python's mmap class so that it can be properly read in diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 4e2021bcba72b..29b8d29af0808 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -133,8 +133,8 @@ def save(self): else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=encoding, - compression=self.compression) - close = True + compression=None) + close = True if self.compression is None else False try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -151,6 +151,16 @@ def save(self): self._save() finally: + # GH 17778 handles compression for byte strings. 
+ if not close and self.compression: + f.close() + with open(self.path_or_buf, 'r') as f: + data = f.read() + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + f.write(data) + close = True if close: f.close() diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 8c72c315c142c..d27735fbca318 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): Any python object. path : str File path where the pickled object will be stored. - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. @@ -74,7 +74,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): if protocol < 0: protocol = pkl.HIGHEST_PROTOCOL try: - pkl.dump(obj, f, protocol=protocol) + f.write(pkl.dumps(obj, protocol=protocol)) finally: for _f in fh: _f.close() @@ -93,7 +93,7 @@ def read_pickle(path, compression='infer'): ---------- path : str File path where the pickled object will be loaded. - compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', or '.zip' respectively, and no decompression otherwise. 
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index dda5cdea52cac..e4829ebf48561 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -919,7 +919,7 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - def test_to_csv_compression(self, compression_no_zip): + def test_to_csv_compression(self, compression): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], @@ -927,35 +927,22 @@ def test_to_csv_compression(self, compression_no_zip): with ensure_clean() as filename: - df.to_csv(filename, compression=compression_no_zip) + df.to_csv(filename, compression=compression) # test the round trip - to_csv -> read_csv - rs = read_csv(filename, compression=compression_no_zip, + rs = read_csv(filename, compression=compression, index_col=0) assert_frame_equal(df, rs) # explicitly make sure file is compressed - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: text = fh.read().decode('utf8') for col in df.columns: assert col in text - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0)) - def test_to_csv_compression_value_error(self): - # GH7615 - # use the compression kw in to_csv - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - with ensure_clean() as filename: - # zip compression is not supported and should raise ValueError - import zipfile - pytest.raises(zipfile.BadZipfile, df.to_csv, - filename, compression="zip") - def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: dt_index = self.tsframe.index diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 
08335293f9292..c9074ca49e5be 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -5,32 +5,22 @@ from pandas.util.testing import assert_frame_equal, assert_raises_regex -def test_compression_roundtrip(compression_no_zip): +def test_compression_roundtrip(compression): df = pd.DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) with tm.ensure_clean() as path: - df.to_json(path, compression=compression_no_zip) + df.to_json(path, compression=compression) assert_frame_equal(df, pd.read_json(path, - compression=compression_no_zip)) + compression=compression)) # explicitly ensure file was compressed. - with tm.decompress_file(path, compression_no_zip) as fh: + with tm.decompress_file(path, compression) as fh: result = fh.read().decode('utf8') assert_frame_equal(df, pd.read_json(result)) -def test_compress_zip_value_error(): - df = pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - with tm.ensure_clean() as path: - import zipfile - pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip") - - def test_read_zipped_json(): uncompressed_path = tm.get_data_path("tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) @@ -41,7 +31,7 @@ def test_read_zipped_json(): assert_frame_equal(uncompressed_df, compressed_df) -def test_with_s3_url(compression_no_zip): +def test_with_s3_url(compression): boto3 = pytest.importorskip('boto3') pytest.importorskip('s3fs') moto = pytest.importorskip('moto') @@ -52,35 +42,35 @@ def test_with_s3_url(compression_no_zip): bucket = conn.create_bucket(Bucket="pandas-test") with tm.ensure_clean() as path: - df.to_json(path, compression=compression_no_zip) + df.to_json(path, compression=compression) with open(path, 'rb') as f: bucket.put_object(Key='test-1', Body=f) roundtripped_df = pd.read_json('s3://pandas-test/test-1', - 
compression=compression_no_zip) + compression=compression) assert_frame_equal(df, roundtripped_df) -def test_lines_with_compression(compression_no_zip): +def test_lines_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') df.to_json(path, orient='records', lines=True, - compression=compression_no_zip) + compression=compression) roundtripped_df = pd.read_json(path, lines=True, - compression=compression_no_zip) + compression=compression) assert_frame_equal(df, roundtripped_df) -def test_chunksize_with_compression(compression_no_zip): +def test_chunksize_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') df.to_json(path, orient='records', lines=True, - compression=compression_no_zip) + compression=compression) res = pd.read_json(path, lines=True, chunksize=1, - compression=compression_no_zip) + compression=compression) roundtripped_df = pd.concat(res) assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 2ba3e174404c7..6bc3af2ba3fd2 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression): f.write(fh.read()) f.close() - def test_write_explicit(self, compression_no_zip, get_random_path): + def test_write_explicit(self, compression, get_random_path): base = get_random_path path1 = base + ".compressed" path2 = base + ".raw" @@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path): df = tm.makeDataFrame() # write to compressed file - df.to_pickle(p1, compression=compression_no_zip) + df.to_pickle(p1, compression=compression) # decompress - with tm.decompress_file(p1, compression=compression_no_zip) as f: + with tm.decompress_file(p1, compression=compression) as f: with open(p2, "wb") as fh: fh.write(f.read()) diff --git 
a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 62d1372525cc8..0b0d4334c86a3 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression(self, compression_no_zip): + def test_to_csv_compression(self, compression): s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X') with ensure_clean() as filename: - s.to_csv(filename, compression=compression_no_zip, header=True) + s.to_csv(filename, compression=compression, header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression=compression_no_zip, + rs = pd.read_csv(filename, compression=compression, index_col=0, squeeze=True) assert_series_equal(s, rs) # explicitly ensure file was compressed - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: text = fh.read().decode('utf8') assert s.name in text - with tm.decompress_file(filename, compression_no_zip) as fh: + with tm.decompress_file(filename, compression) as fh: assert_series_equal(s, pd.read_csv(fh, index_col=0, squeeze=True)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a223e4d8fd23e..f79e73b8ba417 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -172,7 +172,7 @@ def decompress_file(path, compression): path : str The path where the file is read from - compression : {'gzip', 'bz2', 'xz', None} + compression : {'gzip', 'bz2', 'zip', 'xz', None} Name of the decompression to use Returns