Make read_json with lines=True more memory-efficient
Instead of reading the whole file into memory and then manipulating
it, read and parse it 10k lines at a time.
This only covers some kinds of input to read_json.
This is also much slower than the previous implementation.
louispotok committed Aug 3, 2017
1 parent c55dbf0 commit 68063d8
Showing 1 changed file with 42 additions and 4 deletions.
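
With this patch applied, callers opt into chunked parsing by passing chunksize alongside lines=True. A minimal usage sketch follows; the file name and chunk size are hypothetical, and, as the docstring added in the diff notes, the result is still a single concatenated DataFrame rather than a reader object as with read_csv's chunksize:

    import pandas as pd

    # Hypothetical newline-delimited JSON file and an arbitrary chunk size.
    # With lines=True and chunksize set, read_json parses the file a batch
    # of lines at a time instead of loading it all into memory, then returns
    # the concatenated DataFrame.
    df = pd.read_json('records.jsonl', lines=True, chunksize=10000)
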
46 changes: 42 additions & 4 deletions pandas/io/json/json.py
@@ -1,4 +1,6 @@
 # pylint: disable-msg=E1101,W0613,W0603
+from itertools import islice
+from pandas import concat
 import os
 import numpy as np

@@ -174,7 +176,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False):
+              lines=False, chunksize=None):
     """
     Convert a JSON string to pandas object
@@ -263,6 +265,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

         .. versionadded:: 0.19.0

+    chunksize : integer, default None
+        If `lines=True`, how many lines to read into memory at a time.
+        If this is None, the file will be read into memory all at once.
+        Passing a chunksize helps with memory usage, but is slower.
+        Note that this is different from the `chunksize` parameter in
+        `read_csv`, which returns a TextFileReader.
+        If the JSON input is a string, this argument has no effect.
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
@@ -334,12 +344,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         if exists:
             fh, handles = _get_handle(filepath_or_buffer, 'r',
                                       encoding=encoding)
-            json = fh.read()
-            fh.close()
+            if lines and chunksize:
+                return _read_json_as_lines(fh, chunksize, typ, orient,
+                                           dtype, convert_axes, convert_dates,
+                                           keep_default_dates, numpy,
+                                           precise_float, date_unit)
+            else:
+                json = fh.read()
+                fh.close()
         else:
             json = filepath_or_buffer
     elif hasattr(filepath_or_buffer, 'read'):
-        json = filepath_or_buffer.read()
+        if lines and chunksize:
+            return _read_json_as_lines(filepath_or_buffer, chunksize, typ,
+                                       orient, dtype, convert_axes,
+                                       convert_dates, keep_default_dates,
+                                       numpy, precise_float, date_unit)
+        else:
+            json = filepath_or_buffer.read()
     else:
         json = filepath_or_buffer

@@ -349,6 +365,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         lines = list(StringIO(json.strip()))
         json = '[' + ','.join(lines) + ']'

+    return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
+                    keep_default_dates, numpy, precise_float, date_unit)
+
+def _read_json_as_lines(fh, chunksize, typ, orient, dtype, convert_axes,
+                        convert_dates, keep_default_dates, numpy,
+                        precise_float, date_unit):
+    # Parse the handle `chunksize` lines at a time and concatenate the
+    # partial results, so the whole file never has to sit in memory at once.
+    return_val = None
+    while True:
+        lines = list(islice(fh, chunksize))
+
+        if lines:
+            lines_json = '[' + ','.join(lines) + ']'
+            obj = _get_obj(typ, lines_json, orient, dtype, convert_axes,
+                           convert_dates, keep_default_dates, numpy,
+                           precise_float, date_unit)
+            if return_val is None:
+                return_val = obj
+            else:
+                return_val = concat([return_val, obj])
+        else:
+            break
+    fh.close()
+    return return_val

+def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
+             keep_default_dates, numpy, precise_float,
+             date_unit):
     obj = None
     if typ == 'frame':
         obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
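
Outside of pandas internals, the approach the diff takes — pull at most chunksize lines off the handle with islice, wrap each batch in brackets so it parses as a JSON array, and combine the partial frames with concat — can be sketched roughly as follows (the inline sample data and chunk size are made up for illustration):

    from io import StringIO
    from itertools import islice

    import pandas as pd

    # Stand-in for a large newline-delimited JSON file.
    fh = StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n{"a": 5}\n')
    chunksize = 2

    frames = []
    while True:
        # Read at most `chunksize` lines without touching the rest of the file.
        lines = list(islice(fh, chunksize))
        if not lines:
            break
        # Joining the lines with commas inside brackets turns a batch of
        # JSON-lines records into one valid JSON document.
        batch = '[' + ','.join(lines) + ']'
        frames.append(pd.read_json(StringIO(batch), orient='records'))

    result = pd.concat(frames, ignore_index=True)

This sketch collects the partial frames and concatenates once at the end; the patch instead calls concat after every chunk, which is simpler but repeatedly copies the accumulated result, consistent with the commit message's note that the chunked path is slower than reading everything at once.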
