Make read_json with lines=True more memory-efficient
Instead of reading the whole file into memory and then manipulating
it, read and parse it 10k lines at a time.
This only covers some kinds of input to read_json.
This is also much slower than the previous implementation.
louispotok committed Aug 3, 2017
1 parent c55dbf0 commit 68063d8
Showing 1 changed file with 42 additions and 4 deletions.
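
With this patch applied, callers opt into chunked parsing by passing chunksize alongside lines=True. A minimal usage sketch follows; the file name and chunk size are hypothetical, and, as the docstring added in the diff notes, the result is still a single concatenated DataFrame rather than a reader object as with read_csv's chunksize:

    import pandas as pd

    # Hypothetical newline-delimited JSON file and an arbitrary chunk size.
    # With lines=True and chunksize set, read_json parses the file a batch
    # of lines at a time instead of loading it all into memory, then returns
    # the concatenated DataFrame.
    df = pd.read_json('records.jsonl', lines=True, chunksize=10000)
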
46 changes: 42 additions & 4 deletions pandas/io/json/json.py
@@ -1,4 +1,6 @@
 # pylint: disable-msg=E1101,W0613,W0603
+from itertools import islice
+from pandas import concat
 import os
 import numpy as np

@@ -174,7 +176,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False):
+              lines=False, chunksize=None):
     """
     Convert a JSON string to pandas object
@@ -263,6 +265,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

         .. versionadded:: 0.19.0

+    chunksize : integer, default None
+        If `lines=True`, how many lines to read into memory at a time.
+        If this is None, the file will be read into memory all at once.
+        Passing a chunksize helps with memory usage, but is slower.
+        Note that this is different from the `chunksize` parameter in
+        `read_csv`, which returns a TextFileReader.
+        If the JSON input is a string, this argument has no effect.
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
@@ -334,12 +344,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         if exists:
             fh, handles = _get_handle(filepath_or_buffer, 'r',
                                       encoding=encoding)
-            json = fh.read()
-            fh.close()
+            if lines and chunksize:
+                return _read_json_as_lines(fh, chunksize, typ, orient,
+                                           dtype, convert_axes, convert_dates,
+                                           keep_default_dates, numpy,
+                                           precise_float, date_unit)
+            else:
+                json = fh.read()
+                fh.close()
         else:
             json = filepath_or_buffer
     elif hasattr(filepath_or_buffer, 'read'):
-        json = filepath_or_buffer.read()
+        if lines and chunksize:
+            return _read_json_as_lines(filepath_or_buffer, chunksize, typ,
+                                       orient, dtype, convert_axes,
+                                       convert_dates, keep_default_dates,
+                                       numpy, precise_float, date_unit)
+        else:
+            json = filepath_or_buffer.read()
     else:
         json = filepath_or_buffer

@@ -349,6 +365,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         lines = list(StringIO(json.strip()))
         json = '[' + ','.join(lines) + ']'

+    return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
+                    keep_default_dates, numpy, precise_float, date_unit)
+
+def _read_json_as_lines(fh, chunksize, typ, orient, dtype, convert_axes,
+                        convert_dates, keep_default_dates, numpy,
+                        precise_float, date_unit):
+    # Parse the handle `chunksize` lines at a time and concatenate the
+    # partial results, so the whole file never has to sit in memory at once.
+    return_val = None
+    while True:
+        lines = list(islice(fh, chunksize))
+
+        if lines:
+            lines_json = '[' + ','.join(lines) + ']'
+            obj = _get_obj(typ, lines_json, orient, dtype, convert_axes,
+                           convert_dates, keep_default_dates, numpy,
+                           precise_float, date_unit)
+            if return_val is None:
+                return_val = obj
+            else:
+                return_val = concat([return_val, obj])
+        else:
+            break
+    fh.close()
+    return return_val

+def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
+             keep_default_dates, numpy, precise_float,
+             date_unit):
     obj = None
     if typ == 'frame':
         obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
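
Outside of pandas internals, the approach the diff takes — pull at most chunksize lines off the handle with islice, wrap each batch in brackets so it parses as a JSON array, and combine the partial frames with concat — can be sketched roughly as follows (the inline sample data and chunk size are made up for illustration):

    from io import StringIO
    from itertools import islice

    import pandas as pd

    # Stand-in for a large newline-delimited JSON file.
    fh = StringIO('{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n{"a": 5}\n')
    chunksize = 2

    frames = []
    while True:
        # Read at most `chunksize` lines without touching the rest of the file.
        lines = list(islice(fh, chunksize))
        if not lines:
            break
        # Joining the lines with commas inside brackets turns a batch of
        # JSON-lines records into one valid JSON document.
        batch = '[' + ','.join(lines) + ']'
        frames.append(pd.read_json(StringIO(batch), orient='records'))

    result = pd.concat(frames, ignore_index=True)

This sketch collects the partial frames and concatenates once at the end; the patch instead calls concat after every chunk, which is simpler but repeatedly copies the accumulated result, consistent with the commit message's note that the chunked path is slower than reading everything at once.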
