-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
EHN: allow zip compression in to_pickle
, to_json
, to_csv
#20394
Changes from 30 commits
ccfd240
fd7362c
c570091
ec712b9
bf271ce
113db83
9b9e5d1
dedb853
dfa9913
67b9727
ecdf5a2
b9fab3c
5c5c161
d072ca8
cecb0ac
ed189c4
4ac9488
694c6b5
80992a3
3288691
272c6e7
d35b6af
c6034b4
71d9979
4c87e0f
fd44980
ab7a7b7
cfd0715
dd958ac
2956103
63890ec
e4966be
437d716
04886e9
099993c
6aa1493
129a55a
4531c78
ebd8e6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3633,8 +3633,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='', | |
non-ascii, for python versions prior to 3 | ||
compression : string, optional | ||
a string representing the compression to use in the output file, | ||
allowed values are 'gzip', 'bz2', 'xz', only used when the first | ||
argument is a filename | ||
allowed values are 'gzip', 'bz2', 'zip', 'xz', only used when the | ||
first argument is a filename | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's fix the docstring here as I suggested for There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done. |
||
date_format: string, default None | ||
Format string for datetime objects. | ||
decimal: string, default '.' | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import codecs | ||
import mmap | ||
from contextlib import contextmanager, closing | ||
from zipfile import ZipFile | ||
|
||
from pandas.compat import StringIO, BytesIO, string_types, text_type | ||
from pandas import compat | ||
|
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
|
||
# ZIP Compression | ||
elif compression == 'zip': | ||
import zipfile | ||
zip_file = zipfile.ZipFile(path_or_buf) | ||
zip_names = zip_file.namelist() | ||
if len(zip_names) == 1: | ||
f = zip_file.open(zip_names.pop()) | ||
elif len(zip_names) == 0: | ||
raise ValueError('Zero files found in ZIP file {}' | ||
.format(path_or_buf)) | ||
else: | ||
raise ValueError('Multiple files found in ZIP file.' | ||
' Only one file per ZIP: {}' | ||
.format(zip_names)) | ||
zf = BytesZipFile(path_or_buf, mode) | ||
if zf.mode == 'w': | ||
f = zf | ||
elif zf.mode == 'r': | ||
zip_names = zf.namelist() | ||
if len(zip_names) == 1: | ||
f = zf.open(zip_names.pop()) | ||
elif len(zip_names) == 0: | ||
raise ValueError('Zero files found in ZIP file {}' | ||
.format(path_or_buf)) | ||
else: | ||
raise ValueError('Multiple files found in ZIP file.' | ||
' Only one file per ZIP: {}' | ||
.format(zip_names)) | ||
|
||
# XZ Compression | ||
elif compression == 'xz': | ||
|
@@ -425,6 +428,18 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, | |
return f, handles | ||
|
||
|
||
class BytesZipFile(ZipFile, BytesIO): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I personally like this location. I would keep it here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a little bit more to this class's docstring, e.g. why it's needed. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added. We currently don't have the ability to write zip-compressed pickle, JSON, or CSV, only to read them. The standard library ZipFile isn't designed to produce a writable file handle, hence the custom class. |
||
"""override write method with writestr to accept bytes.""" | ||
# GH 17778 | ||
def __init__(self, file, mode='r', **kwargs): | ||
if mode in ['wb', 'rb']: | ||
mode = mode.replace('b', '') | ||
super(BytesZipFile, self).__init__(file, mode, **kwargs) | ||
|
||
def write(self, data): | ||
super(BytesZipFile, self).writestr(self.filename, data) | ||
|
||
|
||
class MMapWrapper(BaseIterator): | ||
""" | ||
Wrapper for the Python's mmap class so that it can be properly read in | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): | |
Any python object. | ||
path : str | ||
File path where the pickled object will be stored. | ||
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' | ||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' | ||
A string representing the compression to use in the output file. By | ||
default, infers from the file extension in specified path. | ||
|
||
|
@@ -62,7 +62,6 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): | |
2 2 7 | ||
3 3 8 | ||
4 4 9 | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add back the blank lines you removed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added back. |
||
>>> import os | ||
>>> os.remove("./dummy.pkl") | ||
""" | ||
|
@@ -74,7 +73,7 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL): | |
if protocol < 0: | ||
protocol = pkl.HIGHEST_PROTOCOL | ||
try: | ||
pkl.dump(obj, f, protocol=protocol) | ||
f.write(pkl.dumps(obj, protocol=protocol)) | ||
finally: | ||
for _f in fh: | ||
_f.close() | ||
|
@@ -93,7 +92,7 @@ def read_pickle(path, compression='infer'): | |
---------- | ||
path : str | ||
File path where the pickled object will be loaded. | ||
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' | ||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' | ||
For on-the-fly decompression of on-disk data. If 'infer', then use | ||
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', | ||
or '.zip' respectively, and no decompression otherwise. | ||
|
@@ -133,7 +132,6 @@ def read_pickle(path, compression='infer'): | |
2 2 7 | ||
3 3 8 | ||
4 4 9 | ||
|
||
>>> import os | ||
>>> os.remove("./dummy.pkl") | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
from numpy import nan | ||
import numpy as np | ||
|
||
from pandas.compat import (lmap, range, lrange, StringIO, u) | ||
from pandas.compat import (lmap, range, lrange, StringIO, u, PY2) | ||
import pandas.core.common as com | ||
from pandas.errors import ParserError | ||
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, | ||
|
@@ -919,30 +919,35 @@ def test_to_csv_path_is_none(self): | |
recons = pd.read_csv(StringIO(csv_str), index_col=0) | ||
assert_frame_equal(self.frame, recons) | ||
|
||
def test_to_csv_compression(self, compression_no_zip): | ||
def test_to_csv_compression(self, compression): | ||
|
||
df = DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
index=['A', 'B'], columns=['X', 'Y', 'Z']) | ||
|
||
if PY2 and compression == 'zip': | ||
pytest.xfail(reason='zip compression for csv not suppported in' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be a skip. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for the comment. Now it should not have to skip or xfail test_to_csv. |
||
'Python 2') | ||
|
||
with ensure_clean() as filename: | ||
|
||
df.to_csv(filename, compression=compression_no_zip) | ||
df.to_csv(filename, compression=compression) | ||
|
||
# test the round trip - to_csv -> read_csv | ||
rs = read_csv(filename, compression=compression_no_zip, | ||
rs = read_csv(filename, compression=compression, | ||
index_col=0) | ||
assert_frame_equal(df, rs) | ||
|
||
# explicitly make sure file is compressed | ||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
text = fh.read().decode('utf8') | ||
for col in df.columns: | ||
assert col in text | ||
|
||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
assert_frame_equal(df, read_csv(fh, index_col=0)) | ||
|
||
@pytest.mark.xfail(reason='zip compression is now supported for csv.') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you xfailing this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an old test case that asserted raising a BadZipFile exception when zip compression was not supported. It will now fail the test because that exception is no longer raised. This test case is now redundant and removed in 04886e9 |
||
def test_to_csv_compression_value_error(self): | ||
# GH7615 | ||
# use the compression kw in to_csv | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,22 +5,23 @@ | |
from pandas.util.testing import assert_frame_equal, assert_raises_regex | ||
|
||
|
||
def test_compression_roundtrip(compression_no_zip): | ||
def test_compression_roundtrip(compression): | ||
df = pd.DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
index=['A', 'B'], columns=['X', 'Y', 'Z']) | ||
|
||
with tm.ensure_clean() as path: | ||
df.to_json(path, compression=compression_no_zip) | ||
df.to_json(path, compression=compression) | ||
assert_frame_equal(df, pd.read_json(path, | ||
compression=compression_no_zip)) | ||
compression=compression)) | ||
|
||
# explicitly ensure file was compressed. | ||
with tm.decompress_file(path, compression_no_zip) as fh: | ||
with tm.decompress_file(path, compression) as fh: | ||
result = fh.read().decode('utf8') | ||
assert_frame_equal(df, pd.read_json(result)) | ||
|
||
|
||
@pytest.mark.xfail(reason='zip compression is now supported for json.') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you xfailing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above. |
||
def test_compress_zip_value_error(): | ||
df = pd.DataFrame([[0.123456, 0.234567, 0.567567], | ||
[12.32112, 123123.2, 321321.2]], | ||
|
@@ -41,7 +42,7 @@ def test_read_zipped_json(): | |
assert_frame_equal(uncompressed_df, compressed_df) | ||
|
||
|
||
def test_with_s3_url(compression_no_zip): | ||
def test_with_s3_url(compression): | ||
boto3 = pytest.importorskip('boto3') | ||
pytest.importorskip('s3fs') | ||
moto = pytest.importorskip('moto') | ||
|
@@ -52,35 +53,35 @@ def test_with_s3_url(compression_no_zip): | |
bucket = conn.create_bucket(Bucket="pandas-test") | ||
|
||
with tm.ensure_clean() as path: | ||
df.to_json(path, compression=compression_no_zip) | ||
df.to_json(path, compression=compression) | ||
with open(path, 'rb') as f: | ||
bucket.put_object(Key='test-1', Body=f) | ||
|
||
roundtripped_df = pd.read_json('s3://pandas-test/test-1', | ||
compression=compression_no_zip) | ||
compression=compression) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
||
def test_lines_with_compression(compression_no_zip): | ||
def test_lines_with_compression(compression): | ||
|
||
with tm.ensure_clean() as path: | ||
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') | ||
df.to_json(path, orient='records', lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
roundtripped_df = pd.read_json(path, lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
||
def test_chunksize_with_compression(compression_no_zip): | ||
def test_chunksize_with_compression(compression): | ||
|
||
with tm.ensure_clean() as path: | ||
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') | ||
df.to_json(path, orient='records', lines=True, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
|
||
res = pd.read_json(path, lines=True, chunksize=1, | ||
compression=compression_no_zip) | ||
compression=compression) | ||
roundtripped_df = pd.concat(res) | ||
assert_frame_equal(df, roundtripped_df) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
|
||
from pandas import Series, DataFrame | ||
|
||
from pandas.compat import StringIO, u | ||
from pandas.compat import StringIO, u, PY2 | ||
from pandas.util.testing import (assert_series_equal, assert_almost_equal, | ||
assert_frame_equal, ensure_clean) | ||
import pandas.util.testing as tm | ||
|
@@ -138,26 +138,30 @@ def test_to_csv_path_is_none(self): | |
csv_str = s.to_csv(path=None) | ||
assert isinstance(csv_str, str) | ||
|
||
def test_to_csv_compression(self, compression_no_zip): | ||
def test_to_csv_compression(self, compression): | ||
|
||
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], | ||
name='X') | ||
|
||
if PY2 and compression == 'zip': | ||
pytest.xfail(reason='zip compression for csv not suppported in' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. skip There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This skip or xfail is no longer needed to handle zip compression (write) in Python 2. |
||
'Python 2') | ||
|
||
with ensure_clean() as filename: | ||
|
||
s.to_csv(filename, compression=compression_no_zip, header=True) | ||
s.to_csv(filename, compression=compression, header=True) | ||
|
||
# test the round trip - to_csv -> read_csv | ||
rs = pd.read_csv(filename, compression=compression_no_zip, | ||
rs = pd.read_csv(filename, compression=compression, | ||
index_col=0, squeeze=True) | ||
assert_series_equal(s, rs) | ||
|
||
# explicitly ensure file was compressed | ||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are there any uses of the compression_no_zip fixture left? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, the compression_no_zip fixture is solely for excluding zip compression in tests because writing zip compression had not been implemented. |
||
with tm.decompress_file(filename, compression) as fh: | ||
text = fh.read().decode('utf8') | ||
assert s.name in text | ||
|
||
with tm.decompress_file(filename, compression_no_zip) as fh: | ||
with tm.decompress_file(filename, compression) as fh: | ||
assert_series_equal(s, pd.read_csv(fh, | ||
index_col=0, | ||
squeeze=True)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's fix this parameter description a bit:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.