forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add chunksize param to read_json when lines=True (pandas-dev#17168)
closes pandas-dev#17048
1 parent
b6265d3
commit c39ad7c
Showing
6 changed files
with
383 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
# -*- coding: utf-8 -*- | ||
import pytest | ||
import pandas as pd | ||
from pandas import DataFrame, read_json | ||
from pandas.compat import StringIO | ||
from pandas.io.json.json import JsonReader | ||
import pandas.util.testing as tm | ||
from pandas.util.testing import (assert_frame_equal, assert_series_equal, | ||
ensure_clean) | ||
|
||
|
||
@pytest.fixture
def lines_json_df():
    """Return a small two-column frame serialized as line-delimited JSON."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
|
||
|
||
def test_read_jsonl():
    # GH9180: line-delimited JSON parses into a DataFrame regardless of
    # key order within each record.
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    assert_frame_equal(result, expected)
|
||
|
||
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])

    # simulate file handle
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(StringIO(payload), lines=True)
    assert_frame_equal(result, expected)

    # simulate string
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(payload, lines=True)
    assert_frame_equal(result, expected)
|
||
|
||
def test_to_jsonl(): | ||
# GH9180 | ||
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
result = df.to_json(orient="records", lines=True) | ||
expected = '{"a":1,"b":2}\n{"a":1,"b":2}' | ||
assert result == expected | ||
|
||
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) | ||
result = df.to_json(orient="records", lines=True) | ||
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' | ||
assert result == expected | ||
assert_frame_equal(read_json(result, lines=True), df) | ||
|
||
# GH15096: escaped characters in columns and data | ||
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], | ||
columns=["a\\", 'b']) | ||
result = df.to_json(orient="records", lines=True) | ||
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' | ||
'{"a\\\\":"foo\\"","b":"bar"}') | ||
assert result == expected | ||
assert_frame_equal(read_json(result, lines=True), df) | ||
|
||
|
||
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # GH17048: with lines=True, reading in chunks must give the same
    # frame as a single unchunked read.
    unchunked = read_json(StringIO(lines_json_df), lines=True)

    reader = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)
    chunked = pd.concat(reader)

    assert_frame_equal(chunked, unchunked)
|
||
|
||
def test_readjson_chunksize_requires_lines(lines_json_df):
    # chunksize is only meaningful for line-delimited input; anything
    # else must be rejected up front with a clear error.
    expected_msg = "chunksize can only be passed if lines=True"
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
|
||
|
||
def test_readjson_chunks_series():
    # Line-format JSON read into a Series honours chunksize as well.
    s = pd.Series({'A': 1, 'B': 2})

    unchunked = pd.read_json(StringIO(s.to_json(lines=True,
                                                orient="records")),
                             lines=True, typ='Series')

    reader = pd.read_json(StringIO(s.to_json(lines=True,
                                             orient="records")),
                          lines=True, typ='Series', chunksize=1)
    chunked = pd.concat(reader)

    assert_series_equal(chunked, unchunked)
|
||
|
||
def test_readjson_each_chunk(lines_json_df): | ||
# Other tests check that the final result of read_json(chunksize=True) | ||
# is correct. This checks the intermediate chunks. | ||
chunks = list( | ||
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) | ||
) | ||
assert chunks[0].shape == (2, 2) | ||
assert chunks[1].shape == (1, 2) | ||
|
||
|
||
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads from an on-disk file must agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        reader = pd.read_json(path, lines=True, chunksize=1)
        chunked = pd.concat(reader)
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)
|
||
|
||
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # The reader must close its underlying stream after read(), both for
    # a single-shot read (chunksize=None) and for chunked iteration.
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize)
        reader.read()
        # Parenthesized continuation: the original used a backslash inside
        # the string literal, which embedded the source indentation into
        # the failure message.
        assert reader.open_stream.closed, (
            "didn't close stream with chunksize = %s" % chunksize)
|
||
|
||
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    # Non-positive, fractional and non-numeric chunksizes are rejected.
    expected_msg = r"'chunksize' must be an integer >=1"

    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
|
||
|
||
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Whitespace between records must not produce phantom rows, chunked
    # or not.
    j = """
{"A":1,"B":4}
{"A":2,"B":5}
{"A":3,"B":6}
"""
    expected = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    result = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        result = pd.concat(result)
    tm.assert_frame_equal(expected, result, obj="chunksize: %s" % chunksize)