Skip to content

Commit 0a47473

Browse files
committed
Make read_json with lines=True more memory-efficient
Instead of reading the whole file into memory and then manipulating it, read and parse it 10,000 lines at a time. This only covers some kinds of input to read_json. This is also much slower than the previous implementation.
1 parent c55dbf0 commit 0a47473

File tree

1 file changed

+36
-3
lines changed

1 file changed

+36
-3
lines changed

pandas/io/json/json.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# pylint: disable-msg=E1101,W0613,W0603
2+
from itertools import islice
3+
from pandas import concat
24
import os
35
import numpy as np
46

@@ -174,7 +176,7 @@ def write(self):
174176
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
175177
convert_axes=True, convert_dates=True, keep_default_dates=True,
176178
numpy=False, precise_float=False, date_unit=None, encoding=None,
177-
lines=False):
179+
lines=False, chunksize=None):
178180
"""
179181
Convert a JSON string to pandas object
180182
@@ -263,6 +265,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
263265
264266
.. versionadded:: 0.19.0
265267
268+
chunksize: integer, default None
269+
if lines is True, how many lines to read into memory at a time.
270+
This is helpful if the file is large, but also slower. Larger chunks
271+
are faster, but more likely to hit memory bounds. Also note this is
272+
different from the `chunksize` parameter in `read_csv`, which returns
273+
a TextFileReader.
274+
266275
Returns
267276
-------
268277
result : Series or DataFrame, depending on the value of `typ`.
@@ -334,8 +343,27 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
334343
if exists:
335344
fh, handles = _get_handle(filepath_or_buffer, 'r',
336345
encoding=encoding)
337-
json = fh.read()
338-
fh.close()
346+
if lines:
347+
if not chunksize:
348+
chunksize = 10000
349+
return_val = None
350+
while True:
351+
lines = list(islice(fh, chunksize))
352+
if lines:
353+
lines_json = '[' + ','.join(lines) + ']'
354+
obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
355+
if return_val is None:
356+
return_val = obj
357+
else:
358+
return_val = concat([return_val, obj])
359+
360+
else:
361+
break
362+
fh.close()
363+
return return_val
364+
else:
365+
json = fh.read()
366+
fh.close()
339367
else:
340368
json = filepath_or_buffer
341369
elif hasattr(filepath_or_buffer, 'read'):
@@ -349,6 +377,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
349377
lines = list(StringIO(json.strip()))
350378
json = '[' + ','.join(lines) + ']'
351379

380+
return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
381+
382+
def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
383+
keep_default_dates, numpy, precise_float,
384+
date_unit):
352385
obj = None
353386
if typ == 'frame':
354387
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,

0 commit comments

Comments
 (0)