ENH: allow zip compression in to_pickle, to_json, to_csv (#20394)
@@ -5,6 +5,7 @@
 import codecs
 import mmap
 from contextlib import contextmanager, closing
+from zipfile import ZipFile

 from pandas.compat import StringIO, BytesIO, string_types, text_type
 from pandas import compat
@@ -363,18 +364,20 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,

     # ZIP Compression
     elif compression == 'zip':
-        import zipfile
-        zip_file = zipfile.ZipFile(path_or_buf)
-        zip_names = zip_file.namelist()
-        if len(zip_names) == 1:
-            f = zip_file.open(zip_names.pop())
-        elif len(zip_names) == 0:
-            raise ValueError('Zero files found in ZIP file {}'
-                             .format(path_or_buf))
-        else:
-            raise ValueError('Multiple files found in ZIP file.'
-                             ' Only one file per ZIP: {}'
-                             .format(zip_names))
+        zf = BytesZipFile(path_or_buf, mode)
+        if zf.mode == 'w':
+            f = zf
+        elif zf.mode == 'r':
+            zip_names = zf.namelist()
+            if len(zip_names) == 1:
+                f = zf.open(zip_names.pop())
+            elif len(zip_names) == 0:
+                raise ValueError('Zero files found in ZIP file {}'
+                                 .format(path_or_buf))
+            else:
+                raise ValueError('Multiple files found in ZIP file.'
+                                 ' Only one file per ZIP: {}'
+                                 .format(zip_names))

     # XZ Compression
     elif compression == 'xz':
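For orientation, here is a minimal round-trip sketch of what the write branch above enables; the file name is illustrative and not part of the PR:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# writing now dispatches to the 'w' branch of the zip handling above;
# reading opens the single member of the archive (multiple members raise
# ValueError, as in the existing read path)
df.to_csv("df.csv.zip", compression="zip")
roundtrip = pd.read_csv("df.csv.zip", compression="zip", index_col=0)
assert roundtrip.equals(df)
```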
@@ -425,6 +428,24 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     return f, handles


+class BytesZipFile(ZipFile, BytesIO):
+    """
+    Wrapper for standard library class ZipFile and allow the returned file-like
+    handle to accept byte strings via `write` method.
+
+    BytesIO provides attributes of file-like object and ZipFile.writestr writes
+    bytes strings into a member of the archive.
+    """
+    # GH 17778
+    def __init__(self, file, mode='r', **kwargs):
+        if mode in ['wb', 'rb']:
+            mode = mode.replace('b', '')
+        super(BytesZipFile, self).__init__(file, mode, **kwargs)
+
+    def write(self, data):
+        super(BytesZipFile, self).writestr(self.filename, data)
+
+
 class MMapWrapper(BaseIterator):
     """
     Wrapper for the Python's mmap class so that it can be properly read in

Review thread on the new BytesZipFile class:
Reviewer: Can you add a little bit more to this class's docstring, e.g. why it's needed?
Author: Added. We currently don't have the ability to write zip-compressed pickle, JSON, or CSV, only to read them. The standard library ZipFile isn't designed to produce a writable file handle, hence the custom class.
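To make the author's point concrete, the sketch below shows the standard-library behaviour the wrapper builds on: ZipFile exposes no file-like `write` method, but `ZipFile.writestr` can store a bytes payload under an archive member name, which is what `BytesZipFile.write` delegates to. This is an illustration of the mechanism, not pandas code:

```python
from io import BytesIO
from zipfile import ZipFile

buf = BytesIO()
with ZipFile(buf, mode="w") as zf:
    # writestr takes a member name and a bytes payload; this is the call
    # that BytesZipFile.write forwards to
    zf.writestr("data.csv", b"a,b\n1,2\n")

# reading the archive back shows a single member holding the payload
with ZipFile(BytesIO(buf.getvalue())) as zf:
    assert zf.namelist() == ["data.csv"]
    assert zf.read("data.csv") == b"a,b\n1,2\n"
```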
@@ -138,26 +138,26 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)

-    def test_to_csv_compression(self, compression_no_zip):
+    def test_to_csv_compression(self, compression):

         s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
                    name='X')

         with ensure_clean() as filename:

-            s.to_csv(filename, compression=compression_no_zip, header=True)
+            s.to_csv(filename, compression=compression, header=True)

             # test the round trip - to_csv -> read_csv
-            rs = pd.read_csv(filename, compression=compression_no_zip,
+            rs = pd.read_csv(filename, compression=compression,
                              index_col=0, squeeze=True)
             assert_series_equal(s, rs)

             # explicitly ensure file was compressed
-            with tm.decompress_file(filename, compression_no_zip) as fh:
+            with tm.decompress_file(filename, compression) as fh:
                 text = fh.read().decode('utf8')
                 assert s.name in text

-            with tm.decompress_file(filename, compression_no_zip) as fh:
+            with tm.decompress_file(filename, compression) as fh:
                 assert_series_equal(s, pd.read_csv(fh,
                                                    index_col=0,
                                                    squeeze=True))

Review thread on the fixture change:
Reviewer: Are there any uses of the compression_no_zip fixture left?
Author: I don't think so; the compression_no_zip fixture existed solely to exclude zip compression from these tests, because writing zip compression had not been implemented.
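For readers unfamiliar with the fixtures involved, below is a hypothetical sketch of a `compression` fixture of the kind the test now takes; the actual fixture lives in pandas' test conftest, and its exact parameter list and docstring may differ:

```python
import pytest

# hypothetical sketch: parametrize a test over the supported compression
# arguments, now including 'zip' since writing it is supported
@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', 'xz'])
def compression(request):
    return request.param
```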
Review comment: I personally like this location. I would keep it here.