EHN: allow zip compression in to_pickle, to_json, to_csv #20394

Merged: 39 commits, merged on Mar 22, 2018
Changes from 30 commits

Commits (39)
ccfd240
initial commit
minggli Mar 17, 2018
fd7362c
add zip to compression
minggli Mar 17, 2018
c570091
add zip to compression in to_pickle
minggli Mar 17, 2018
ec712b9
inherit io.BufferedIOBase
minggli Mar 17, 2018
bf271ce
xfail test_compress_zip_value_error
minggli Mar 17, 2018
113db83
add zip in compression parameter description
minggli Mar 18, 2018
9b9e5d1
xfail test_to_csv_compression_value_error
minggli Mar 18, 2018
dedb853
include zip in all tests
minggli Mar 18, 2018
dfa9913
move BytesZipFile out of _get_handle
minggli Mar 18, 2018
67b9727
inherit BytesIO
minggli Mar 18, 2018
ecdf5a2
restore import pattern
minggli Mar 18, 2018
b9fab3c
attributes already implemented in Base class
minggli Mar 18, 2018
5c5c161
add zip in compression parameter description
minggli Mar 18, 2018
d072ca8
prevent writing duplicates
minggli Mar 18, 2018
cecb0ac
prevent writing duplicates
minggli Mar 18, 2018
ed189c4
add whatsnew entry in Other Enhancement
minggli Mar 18, 2018
4ac9488
revert prevent duplicate
minggli Mar 18, 2018
694c6b5
xfail zip compression csv pickle in python 2.x
minggli Mar 18, 2018
80992a3
xfail zip compression csv pickle in python 2.x
minggli Mar 18, 2018
3288691
writing zip compression not supported in Python 2
minggli Mar 18, 2018
272c6e7
compression parameter descriptions
minggli Mar 18, 2018
d35b6af
compression parameter descriptions
minggli Mar 18, 2018
c6034b4
skip zip in Python 2
minggli Mar 18, 2018
71d9979
revert tests xfail
minggli Mar 18, 2018
4c87e0f
update whatsnew
minggli Mar 18, 2018
fd44980
fix compat import
minggli Mar 18, 2018
ab7a7b7
enable zip compression for Python 2 by avoid pickle.dump
minggli Mar 19, 2018
cfd0715
remove descriptinos zip only supported by Python3
minggli Mar 19, 2018
dd958ac
revert conftest
minggli Mar 19, 2018
2956103
tests xfail on csv zip compression in Python 2
minggli Mar 19, 2018
63890ec
handle csv compression seperately
minggli Mar 20, 2018
e4966be
revert xfail on tests csv
minggli Mar 20, 2018
437d716
decommission compression_no_zip
minggli Mar 20, 2018
04886e9
remove value error test cases now that zip compression is supported f…
minggli Mar 20, 2018
099993c
update whatsnew
minggli Mar 20, 2018
6aa1493
docstring for BytesZipFile
minggli Mar 20, 2018
129a55a
add back blank lines
minggli Mar 20, 2018
4531c78
move csv compression seperately
minggli Mar 20, 2018
ebd8e6f
parameter description
minggli Mar 22, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -344,6 +344,7 @@ Other Enhancements
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
- zip compression is supported via ``compression='zip'`` for Python >= 3 in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)

.. _whatsnew_0230.api_breaking:

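For reference, the capability described in this whatsnew entry can be exercised roughly like this (a sketch on a recent pandas; the file name and data are illustrative):

```python
import os
import tempfile
import zipfile

import pandas as pd

df = pd.DataFrame({'X': [0.123456, 12.32112], 'Y': [0.234567, 123123.2]})

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'frame.csv.zip')
    # explicit compression='zip'; 'infer' would also pick it from the extension
    df.to_csv(path, compression='zip')
    assert zipfile.is_zipfile(path)  # the output really is a ZIP archive
    roundtripped = pd.read_csv(path, compression='zip', index_col=0)
```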
4 changes: 2 additions & 2 deletions pandas/core/frame.py
@@ -1655,8 +1655,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
a string representing the compression to use in the output file,
allowed values are 'gzip', 'bz2', 'xz',
only used when the first argument is a filename
allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the
first argument is a filename.
Member:

Let's fix this parameter description a bit:

a string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.  This input is only used
when the first argument is a filename.

Contributor Author:

done.

line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
8 changes: 4 additions & 4 deletions pandas/core/generic.py
@@ -1814,9 +1814,9 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

.. versionadded:: 0.19.0

compression : {None, 'gzip', 'bz2', 'xz'}
compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
A string representing the compression to use in the output file,
only used when the first argument is a filename
only used when the first argument is a filename.

.. versionadded:: 0.21.0

@@ -2085,7 +2085,8 @@ def to_pickle(self, path, compression='infer',
----------
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.

@@ -2129,7 +2130,6 @@ def to_pickle(self, path, compression='infer',
2 2 7
3 3 8
4 4 9

>>> import os
>>> os.remove("./dummy.pkl")
"""
4 changes: 2 additions & 2 deletions pandas/core/series.py
@@ -3633,8 +3633,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
non-ascii, for python versions prior to 3
compression : string, optional
a string representing the compression to use in the output file,
allowed values are 'gzip', 'bz2', 'xz', only used when the first
argument is a filename
allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the
first argument is a filename
Member:

Let's fix the docstring here as I suggested for frame.py.

Contributor Author:

done.

date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
39 changes: 27 additions & 12 deletions pandas/io/common.py
@@ -5,6 +5,7 @@
import codecs
import mmap
from contextlib import contextmanager, closing
from zipfile import ZipFile

from pandas.compat import StringIO, BytesIO, string_types, text_type
from pandas import compat
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,

# ZIP Compression
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))
zf = BytesZipFile(path_or_buf, mode)
if zf.mode == 'w':
f = zf
elif zf.mode == 'r':
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))

# XZ Compression
elif compression == 'xz':
@@ -425,6 +428,18 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
return f, handles


class BytesZipFile(ZipFile, BytesIO):
Member (@gfyoung, Mar 18, 2018):

I personally like this location. I would keep it here.

Contributor:

can you add a little bit more to this class's docstring, e.g. why it's needed.

Contributor Author (@minggli, Mar 20, 2018):

added. we currently can only read zip-compressed pickle, json and csv, not write them. The standard library ZipFile isn't designed to produce a writable file handle, hence the custom class.

"""override write method with writestr to accept bytes."""
# GH 17778
def __init__(self, file, mode='r', **kwargs):
if mode in ['wb', 'rb']:
mode = mode.replace('b', '')
super(BytesZipFile, self).__init__(file, mode, **kwargs)

def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)


class MMapWrapper(BaseIterator):
"""
Wrapper for the Python's mmap class so that it can be properly read in
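A stdlib-only sketch of the idea behind `BytesZipFile` above. It is simplified: it inherits only `ZipFile` and uses a fixed, illustrative member name, whereas the PR also inherits `BytesIO` and names the member after `self.filename`:

```python
import io
import pickle
import zipfile


class BytesZipFile(zipfile.ZipFile):
    """ZipFile has no bytes-accepting write(), so route write() to writestr()."""

    def __init__(self, file, mode='r', **kwargs):
        if mode in ('wb', 'rb'):       # normalize binary modes for ZipFile
            mode = mode.replace('b', '')
        super().__init__(file, mode, **kwargs)

    def write(self, data):
        # NOTE: this shadows ZipFile.write (which copies a file from disk);
        # 'payload.pkl' is an illustrative member name, not the PR's choice.
        self.writestr('payload.pkl', data)


buf = io.BytesIO()
with BytesZipFile(buf, mode='wb') as zf:
    zf.write(pickle.dumps({'a': 1}))   # one write call -> one archive member

with zipfile.ZipFile(buf) as zf:       # read back with a plain ZipFile
    members = zf.namelist()
    restored = pickle.loads(zf.read('payload.pkl'))
```

This mirrors the read path in `_get_handle`, which insists on exactly one member per archive.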
8 changes: 3 additions & 5 deletions pandas/io/pickle.py
@@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
Any python object.
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.

@@ -62,7 +62,6 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
2 2 7
3 3 8
4 4 9

Contributor:

can you add back the blank lines you removed

Contributor Author:

added back.

>>> import os
>>> os.remove("./dummy.pkl")
"""
@@ -74,7 +73,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
pkl.dump(obj, f, protocol=protocol)
f.write(pkl.dumps(obj, protocol=protocol))
finally:
for _f in fh:
_f.close()
@@ -93,7 +92,7 @@ def read_pickle(path, compression='infer'):
----------
path : str
File path where the pickled object will be loaded.
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
@@ -133,7 +132,6 @@ def read_pickle(path, compression='infer'):
2 2 7
3 3 8
4 4 9

>>> import os
>>> os.remove("./dummy.pkl")
"""
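The `pkl.dump(obj, f)` to `f.write(pkl.dumps(obj))` change in this file matters because a write-through-`writestr` handle turns every `write()` call into a new archive member, and `pickle.dump` may issue several small writes. Serializing to one bytes object first guarantees a single member. A stdlib sketch of the hazard (member name and data are illustrative):

```python
import io
import pickle
import zipfile

# Naive path: two writestr calls under the same name -> two archive members.
buf = io.BytesIO()
with zipfile.ZipFile(buf, mode='w') as zf:
    zf.writestr('data.pkl', b'chunk-1')
    zf.writestr('data.pkl', b'chunk-2')   # emits a "Duplicate name" warning
with zipfile.ZipFile(buf) as zf:
    duplicated = zf.namelist()            # both members are listed

# Safe path: serialize once, write once -> exactly one member.
buf = io.BytesIO()
payload = pickle.dumps({'a': 1})
with zipfile.ZipFile(buf, mode='w') as zf:
    zf.writestr('data.pkl', payload)
with zipfile.ZipFile(buf) as zf:
    names = zf.namelist()
    restored = pickle.loads(zf.read('data.pkl'))
```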
17 changes: 11 additions & 6 deletions pandas/tests/frame/test_to_csv.py
@@ -8,7 +8,7 @@
from numpy import nan
import numpy as np

from pandas.compat import (lmap, range, lrange, StringIO, u)
from pandas.compat import (lmap, range, lrange, StringIO, u, PY2)
import pandas.core.common as com
from pandas.errors import ParserError
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
@@ -919,30 +919,35 @@ def test_to_csv_path_is_none(self):
recons = pd.read_csv(StringIO(csv_str), index_col=0)
assert_frame_equal(self.frame, recons)

def test_to_csv_compression(self, compression_no_zip):
def test_to_csv_compression(self, compression):

df = DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

if PY2 and compression == 'zip':
pytest.xfail(reason='zip compression for csv not supported in '
Contributor:

this should be a skip

Contributor Author:

thanks for the comment. now it should not have to skip or xfail test_to_csv.

'Python 2')

with ensure_clean() as filename:

df.to_csv(filename, compression=compression_no_zip)
df.to_csv(filename, compression=compression)

# test the round trip - to_csv -> read_csv
rs = read_csv(filename, compression=compression_no_zip,
rs = read_csv(filename, compression=compression,
index_col=0)
assert_frame_equal(df, rs)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
text = fh.read().decode('utf8')
for col in df.columns:
assert col in text

with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
assert_frame_equal(df, read_csv(fh, index_col=0))

@pytest.mark.xfail(reason='zip compression is now supported for csv.')
Contributor:

why are you xfailing this?

Contributor Author:

this is an old test case that asserted a BadZipFile exception was raised when zip compression was not supported. It now fails because that exception is no longer raised; the test case is redundant and was removed in 04886e9

def test_to_csv_compression_value_error(self):
# GH7615
# use the compression kw in to_csv
27 changes: 14 additions & 13 deletions pandas/tests/io/json/test_compression.py
@@ -5,22 +5,23 @@
from pandas.util.testing import assert_frame_equal, assert_raises_regex


def test_compression_roundtrip(compression_no_zip):
def test_compression_roundtrip(compression):
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
df.to_json(path, compression=compression_no_zip)
df.to_json(path, compression=compression)
assert_frame_equal(df, pd.read_json(path,
compression=compression_no_zip))
compression=compression))

# explicitly ensure file was compressed.
with tm.decompress_file(path, compression_no_zip) as fh:
with tm.decompress_file(path, compression) as fh:
result = fh.read().decode('utf8')
assert_frame_equal(df, pd.read_json(result))


@pytest.mark.xfail(reason='zip compression is now supported for json.')
Contributor:

why are you xfailing this?

Contributor Author:

same as above.

def test_compress_zip_value_error():
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
@@ -41,7 +42,7 @@ def test_read_zipped_json():
assert_frame_equal(uncompressed_df, compressed_df)


def test_with_s3_url(compression_no_zip):
def test_with_s3_url(compression):
boto3 = pytest.importorskip('boto3')
pytest.importorskip('s3fs')
moto = pytest.importorskip('moto')
@@ -52,35 +53,35 @@ def test_with_s3_url(compression_no_zip):
bucket = conn.create_bucket(Bucket="pandas-test")

with tm.ensure_clean() as path:
df.to_json(path, compression=compression_no_zip)
df.to_json(path, compression=compression)
with open(path, 'rb') as f:
bucket.put_object(Key='test-1', Body=f)

roundtripped_df = pd.read_json('s3://pandas-test/test-1',
compression=compression_no_zip)
compression=compression)
assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression_no_zip):
def test_lines_with_compression(compression):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)
compression=compression)
roundtripped_df = pd.read_json(path, lines=True,
compression=compression_no_zip)
compression=compression)
assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression_no_zip):
def test_chunksize_with_compression(compression):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)
compression=compression)

res = pd.read_json(path, lines=True, chunksize=1,
compression=compression_no_zip)
compression=compression)
roundtripped_df = pd.concat(res)
assert_frame_equal(df, roundtripped_df)

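The json round trips these tests exercise look roughly like this (a sketch on a recent pandas; path and data are illustrative):

```python
import os
import tempfile

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'frame.json.zip')
    df.to_json(path, compression='zip')                    # write zipped json
    roundtripped = pd.read_json(path, compression='zip')   # read it back
```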
6 changes: 3 additions & 3 deletions pandas/tests/io/test_pickle.py
@@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression):
f.write(fh.read())
f.close()

def test_write_explicit(self, compression_no_zip, get_random_path):
def test_write_explicit(self, compression, get_random_path):
base = get_random_path
path1 = base + ".compressed"
path2 = base + ".raw"
@@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path):
df = tm.makeDataFrame()

# write to compressed file
df.to_pickle(p1, compression=compression_no_zip)
df.to_pickle(p1, compression=compression)

# decompress
with tm.decompress_file(p1, compression=compression_no_zip) as f:
with tm.decompress_file(p1, compression=compression) as f:
with open(p2, "wb") as fh:
fh.write(f.read())

16 changes: 10 additions & 6 deletions pandas/tests/series/test_io.py
@@ -10,7 +10,7 @@

from pandas import Series, DataFrame

from pandas.compat import StringIO, u
from pandas.compat import StringIO, u, PY2
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, ensure_clean)
import pandas.util.testing as tm
@@ -138,26 +138,30 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
assert isinstance(csv_str, str)

def test_to_csv_compression(self, compression_no_zip):
def test_to_csv_compression(self, compression):

s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
name='X')

if PY2 and compression == 'zip':
pytest.xfail(reason='zip compression for csv not supported in '
Contributor:

skip

Contributor Author:

this skip or xfail is no longer needed to handle zip compression (write) in Python 2.

'Python 2')

with ensure_clean() as filename:

s.to_csv(filename, compression=compression_no_zip, header=True)
s.to_csv(filename, compression=compression, header=True)

# test the round trip - to_csv -> read_csv
rs = pd.read_csv(filename, compression=compression_no_zip,
rs = pd.read_csv(filename, compression=compression,
index_col=0, squeeze=True)
assert_series_equal(s, rs)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression_no_zip) as fh:
Contributor:

are there any uses of the compression_no_zip fixture left?

Contributor Author:

I don't think so, the compression_no_zip fixture is solely for excluding zip compression in tests because writing zip compression had not been implemented.

with tm.decompress_file(filename, compression) as fh:
text = fh.read().decode('utf8')
assert s.name in text

with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
assert_series_equal(s, pd.read_csv(fh,
index_col=0,
squeeze=True))
2 changes: 1 addition & 1 deletion pandas/util/testing.py
@@ -172,7 +172,7 @@ def decompress_file(path, compression):
path : str
The path where the file is read from

compression : {'gzip', 'bz2', 'xz', None}
compression : {'gzip', 'bz2', 'zip', 'xz', None}
Name of the decompression to use

Returns
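A hypothetical stdlib sketch of what a `decompress_file` helper like the one documented above might look like with a `'zip'` branch added. This is not the pandas implementation; names and error text are illustrative, and the one-member-per-archive check mirrors the rule enforced in `_get_handle`:

```python
import bz2
import gzip
import os
import tempfile
import zipfile
from contextlib import contextmanager


@contextmanager
def decompress_file(path, compression):
    """Open `path`, decompressing with the named scheme, and yield the handle."""
    if compression == 'gzip':
        f = gzip.open(path, 'rb')
    elif compression == 'bz2':
        f = bz2.BZ2File(path, 'rb')
    elif compression == 'zip':
        zf = zipfile.ZipFile(path)
        names = zf.namelist()
        if len(names) != 1:
            raise ValueError('ZIP archive must contain exactly one file')
        f = zf.open(names[0])
    else:
        f = open(path, 'rb')
    try:
        yield f
    finally:
        f.close()


with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'one.zip')
    with zipfile.ZipFile(path, 'w') as zf:
        zf.writestr('payload.txt', b'hello')
    with decompress_file(path, 'zip') as fh:
        payload = fh.read()
```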