ENH: allow zip compression in to_pickle, to_json, to_csv (#20394)
minggli authored and jreback committed Mar 22, 2018
1 parent 0b8db1b commit 76534d5
Showing 13 changed files with 86 additions and 86 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -344,6 +344,7 @@ Other Enhancements
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it rather than inserting row by row.
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
- zip compression is supported via ``compression='zip'`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
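
For illustration, a minimal sketch of the new keyword in action (file names here are illustrative, not part of the change):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})

    # Each writer now accepts compression='zip' and produces a
    # single-member zip archive at the given path.
    df.to_csv('out.csv.zip', compression='zip')
    df.to_json('out.json.zip', compression='zip')
    df.to_pickle('out.pkl.zip', compression='zip')

    # Round trip: the readers already infer zip from the '.zip' extension.
    roundtripped = pd.read_csv('out.csv.zip', index_col=0)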

.. _whatsnew_0230.api_breaking:

10 changes: 0 additions & 10 deletions pandas/conftest.py
@@ -75,16 +75,6 @@ def compression(request):
return request.param


@pytest.fixture(params=[None, 'gzip', 'bz2',
pytest.param('xz', marks=td.skip_if_no_lzma)])
def compression_no_zip(request):
"""
Fixture for trying common compression types in compression tests
except zip
"""
return request.param


@pytest.fixture(scope='module')
def datetime_tz_utc():
from datetime import timezone
6 changes: 3 additions & 3 deletions pandas/core/frame.py
@@ -1654,9 +1654,9 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
A string representing the encoding to use in the output file,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
a string representing the compression to use in the output file,
allowed values are 'gzip', 'bz2', 'xz',
only used when the first argument is a filename
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
7 changes: 4 additions & 3 deletions pandas/core/generic.py
@@ -1814,9 +1814,9 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
.. versionadded:: 0.19.0
compression : {None, 'gzip', 'bz2', 'xz'}
compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
A string representing the compression to use in the output file,
only used when the first argument is a filename
only used when the first argument is a filename.
.. versionadded:: 0.21.0
@@ -2133,7 +2133,8 @@ def to_pickle(self, path, compression='infer',
----------
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \
default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
6 changes: 3 additions & 3 deletions pandas/core/series.py
@@ -3633,9 +3633,9 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
compression : string, optional
a string representing the compression to use in the output file,
allowed values are 'gzip', 'bz2', 'xz', only used when the first
argument is a filename
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
45 changes: 33 additions & 12 deletions pandas/io/common.py
@@ -5,6 +5,7 @@
import codecs
import mmap
from contextlib import contextmanager, closing
from zipfile import ZipFile

from pandas.compat import StringIO, BytesIO, string_types, text_type
from pandas import compat
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,

# ZIP Compression
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path_or_buf)
zip_names = zip_file.namelist()
if len(zip_names) == 1:
f = zip_file.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))
zf = BytesZipFile(path_or_buf, mode)
if zf.mode == 'w':
f = zf
elif zf.mode == 'r':
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
raise ValueError('Zero files found in ZIP file {}'
.format(path_or_buf))
else:
raise ValueError('Multiple files found in ZIP file.'
' Only one file per ZIP: {}'
.format(zip_names))

# XZ Compression
elif compression == 'xz':
@@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
return f, handles


class BytesZipFile(ZipFile, BytesIO):
"""
Wrapper for the standard library class ZipFile that allows the returned
file-like handle to accept byte strings via its `write` method. BytesIO
provides the attributes of a file-like object, and ZipFile.writestr writes
byte strings into a member of the archive.
"""
# GH 17778
def __init__(self, file, mode='r', **kwargs):
if mode in ['wb', 'rb']:
mode = mode.replace('b', '')
super(BytesZipFile, self).__init__(file, mode, **kwargs)

def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)


class MMapWrapper(BaseIterator):
"""
Wrapper for Python's mmap class so that it can be properly read in
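Two aspects of this file's changes are worth illustrating. On the read side, the new branch still enforces one member per archive; a quick sketch of the failure mode (file name hypothetical):

    import zipfile

    import pandas as pd

    # Build a two-member archive by hand.
    with zipfile.ZipFile('multi.zip', 'w') as zf:
        zf.writestr('a.csv', 'x,y\n1,2\n')
        zf.writestr('b.csv', 'x,y\n3,4\n')

    # read_csv refuses archives with more than one member.
    try:
        pd.read_csv('multi.zip', compression='zip')
    except ValueError as err:
        print(err)  # Multiple files found in ZIP file. Only one file per ZIP: ...

On the write side, BytesZipFile redirects `write` to `ZipFile.writestr`. A standalone sketch of the same idea, under a hypothetical name and without the BytesIO mixin:

    from zipfile import ZipFile

    class TinyZipWriter(ZipFile):
        # A ZipFile whose file-like `write` stores the payload as a single
        # archive member named after the target path.
        def write(self, data):
            # `self.filename` is the path passed to the constructor.
            self.writestr(self.filename, data)

    with TinyZipWriter('demo.zip', 'w') as zf:
        zf.write(b'hello,world\n')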
14 changes: 12 additions & 2 deletions pandas/io/formats/csvs.py
@@ -133,8 +133,8 @@ def save(self):
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
close = True
compression=None)
close = True if self.compression is None else False

try:
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,6 +151,16 @@
self._save()

finally:
# GH 17778 handles compression for byte strings.
if not close and self.compression:
f.close()
with open(self.path_or_buf, 'r') as f:
data = f.read()
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(data)
close = True
if close:
f.close()

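The flow above is worth spelling out: when compression is requested, the CSV text is first written to the target path uncompressed, then read back and rewritten through a compression-aware handle. A minimal external check of the net effect (file name illustrative):

    import pandas as pd

    df = pd.DataFrame({'x': [1, 2]})
    df.to_csv('frame.csv.zip', compression='zip')

    # The final artifact on disk is a real zip archive.
    with open('frame.csv.zip', 'rb') as fh:
        assert fh.read(2) == b'PK'  # zip local-file-header magic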
6 changes: 3 additions & 3 deletions pandas/io/pickle.py
@@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
Any python object.
path : str
File path where the pickled object will be stored.
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
A string representing the compression to use in the output file. By
default, infers from the file extension in specified path.
@@ -74,7 +74,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
if protocol < 0:
protocol = pkl.HIGHEST_PROTOCOL
try:
pkl.dump(obj, f, protocol=protocol)
f.write(pkl.dumps(obj, protocol=protocol))
finally:
for _f in fh:
_f.close()
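
The switch from pkl.dump(obj, f, ...) to f.write(pkl.dumps(obj, ...)) matters for the zip path: pickle.dump issues many small write() calls, and a zip-backed handle whose write() creates one archive member per call would scatter the pickle across members. Serializing to a single byte string first keeps the archive to one member. A sketch of the distinction (the handle `f` is hypothetical here):

    import pickle

    obj = {'a': 1}

    # One serialization, one write() call, one archive member.
    payload = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    # f.write(payload)  # a single call against the zip-backed handle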
@@ -93,7 +93,7 @@ def read_pickle(path, compression='infer'):
----------
path : str
File path where the pickled object will be loaded.
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
or '.zip' respectively, and no decompression otherwise.
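With the default 'infer', the extension alone selects the codec; a short sketch (path illustrative):

    import pandas as pd

    s = pd.Series(range(3))
    s.to_pickle('s.pkl.zip')  # 'infer' picks zip from the '.zip' suffix
    roundtripped = pd.read_pickle('s.pkl.zip')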
23 changes: 5 additions & 18 deletions pandas/tests/frame/test_to_csv.py
@@ -919,43 +919,30 @@ def test_to_csv_path_is_none(self):
recons = pd.read_csv(StringIO(csv_str), index_col=0)
assert_frame_equal(self.frame, recons)

def test_to_csv_compression(self, compression_no_zip):
def test_to_csv_compression(self, compression):

df = DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with ensure_clean() as filename:

df.to_csv(filename, compression=compression_no_zip)
df.to_csv(filename, compression=compression)

# test the round trip - to_csv -> read_csv
rs = read_csv(filename, compression=compression_no_zip,
rs = read_csv(filename, compression=compression,
index_col=0)
assert_frame_equal(df, rs)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
text = fh.read().decode('utf8')
for col in df.columns:
assert col in text

with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
assert_frame_equal(df, read_csv(fh, index_col=0))

def test_to_csv_compression_value_error(self):
# GH7615
# use the compression kw in to_csv
df = DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with ensure_clean() as filename:
# zip compression is not supported and should raise ValueError
import zipfile
pytest.raises(zipfile.BadZipfile, df.to_csv,
filename, compression="zip")

def test_to_csv_date_format(self):
with ensure_clean('__tmp_to_csv_date_format__') as path:
dt_index = self.tsframe.index
36 changes: 13 additions & 23 deletions pandas/tests/io/json/test_compression.py
@@ -5,32 +5,22 @@
from pandas.util.testing import assert_frame_equal, assert_raises_regex


def test_compression_roundtrip(compression_no_zip):
def test_compression_roundtrip(compression):
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
df.to_json(path, compression=compression_no_zip)
df.to_json(path, compression=compression)
assert_frame_equal(df, pd.read_json(path,
compression=compression_no_zip))
compression=compression))

# explicitly ensure file was compressed.
with tm.decompress_file(path, compression_no_zip) as fh:
with tm.decompress_file(path, compression) as fh:
result = fh.read().decode('utf8')
assert_frame_equal(df, pd.read_json(result))


def test_compress_zip_value_error():
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
import zipfile
pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")


def test_read_zipped_json():
uncompressed_path = tm.get_data_path("tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)
@@ -41,7 +31,7 @@ def test_read_zipped_json():
assert_frame_equal(uncompressed_df, compressed_df)


def test_with_s3_url(compression_no_zip):
def test_with_s3_url(compression):
boto3 = pytest.importorskip('boto3')
pytest.importorskip('s3fs')
moto = pytest.importorskip('moto')
@@ -52,35 +42,35 @@ def test_with_s3_url(compression_no_zip):
bucket = conn.create_bucket(Bucket="pandas-test")

with tm.ensure_clean() as path:
df.to_json(path, compression=compression_no_zip)
df.to_json(path, compression=compression)
with open(path, 'rb') as f:
bucket.put_object(Key='test-1', Body=f)

roundtripped_df = pd.read_json('s3://pandas-test/test-1',
compression=compression_no_zip)
compression=compression)
assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression_no_zip):
def test_lines_with_compression(compression):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)
compression=compression)
roundtripped_df = pd.read_json(path, lines=True,
compression=compression_no_zip)
compression=compression)
assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression_no_zip):
def test_chunksize_with_compression(compression):

with tm.ensure_clean() as path:
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True,
compression=compression_no_zip)
compression=compression)

res = pd.read_json(path, lines=True, chunksize=1,
compression=compression_no_zip)
compression=compression)
roundtripped_df = pd.concat(res)
assert_frame_equal(df, roundtripped_df)

6 changes: 3 additions & 3 deletions pandas/tests/io/test_pickle.py
@@ -352,7 +352,7 @@ def compress_file(self, src_path, dest_path, compression):
f.write(fh.read())
f.close()

def test_write_explicit(self, compression_no_zip, get_random_path):
def test_write_explicit(self, compression, get_random_path):
base = get_random_path
path1 = base + ".compressed"
path2 = base + ".raw"
@@ -361,10 +361,10 @@ def test_write_explicit(self, compression_no_zip, get_random_path):
df = tm.makeDataFrame()

# write to compressed file
df.to_pickle(p1, compression=compression_no_zip)
df.to_pickle(p1, compression=compression)

# decompress
with tm.decompress_file(p1, compression=compression_no_zip) as f:
with tm.decompress_file(p1, compression=compression) as f:
with open(p2, "wb") as fh:
fh.write(f.read())

10 changes: 5 additions & 5 deletions pandas/tests/series/test_io.py
@@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
assert isinstance(csv_str, str)

def test_to_csv_compression(self, compression_no_zip):
def test_to_csv_compression(self, compression):

s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
name='X')

with ensure_clean() as filename:

s.to_csv(filename, compression=compression_no_zip, header=True)
s.to_csv(filename, compression=compression, header=True)

# test the round trip - to_csv -> read_csv
rs = pd.read_csv(filename, compression=compression_no_zip,
rs = pd.read_csv(filename, compression=compression,
index_col=0, squeeze=True)
assert_series_equal(s, rs)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
text = fh.read().decode('utf8')
assert s.name in text

with tm.decompress_file(filename, compression_no_zip) as fh:
with tm.decompress_file(filename, compression) as fh:
assert_series_equal(s, pd.read_csv(fh,
index_col=0,
squeeze=True))
2 changes: 1 addition & 1 deletion pandas/util/testing.py
@@ -173,7 +173,7 @@ def decompress_file(path, compression):
path : str
The path where the file is read from
compression : {'gzip', 'bz2', 'xz', None}
compression : {'gzip', 'bz2', 'zip', 'xz', None}
Name of the decompression to use
Returns
