Skip to content

Commit 0a47473

Browse files
committed
Make read_json with lines=True more memory-efficient
Instead of reading the whole file into memory and then manipulating it, read and parse it 10,000 lines at a time. This only covers some kinds of input to read_json. This is also much slower than the previous implementation.
1 parent c55dbf0 commit 0a47473

File tree

1 file changed

+36
-3
lines changed

1 file changed

+36
-3
lines changed

pandas/io/json/json.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# pylint: disable-msg=E1101,W0613,W0603
2+
from itertools import islice
3+
from pandas import concat
24
import os
35
import numpy as np
46

@@ -174,7 +176,7 @@ def write(self):
174176
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
175177
convert_axes=True, convert_dates=True, keep_default_dates=True,
176178
numpy=False, precise_float=False, date_unit=None, encoding=None,
177-
lines=False):
179+
lines=False, chunksize=None):
178180
"""
179181
Convert a JSON string to pandas object
180182
@@ -263,6 +265,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
263265
264266
.. versionadded:: 0.19.0
265267
268+
chunksize: integer, default None
269+
if lines is True, how many lines to read into memory at a time.
270+
This is helpful if the file is large, but also slower. Larger chunks
271+
are faster, but more likely to hit memory bounds. Also note this is
272+
different from the `chunksize` parameter in `read_csv`, which returns
273+
a TextFileReader.
274+
266275
Returns
267276
-------
268277
result : Series or DataFrame, depending on the value of `typ`.
@@ -334,8 +343,27 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
334343
if exists:
335344
fh, handles = _get_handle(filepath_or_buffer, 'r',
336345
encoding=encoding)
337-
json = fh.read()
338-
fh.close()
346+
if lines:
347+
if not chunksize:
348+
chunksize = 10000
349+
return_val = None
350+
while True:
351+
lines = list(islice(fh, chunksize))
352+
if lines:
353+
lines_json = '[' + ','.join(lines) + ']'
354+
obj = _get_obj(typ, lines_json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
355+
if return_val is None:
356+
return_val = obj
357+
else:
358+
return_val = concat([return_val, obj])
359+
360+
else:
361+
break
362+
fh.close()
363+
return return_val
364+
else:
365+
json = fh.read()
366+
fh.close()
339367
else:
340368
json = filepath_or_buffer
341369
elif hasattr(filepath_or_buffer, 'read'):
@@ -349,6 +377,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
349377
lines = list(StringIO(json.strip()))
350378
json = '[' + ','.join(lines) + ']'
351379

380+
return _get_obj(typ, json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
381+
382+
def _get_obj(typ, json, orient, dtype, convert_axes, convert_dates,
383+
keep_default_dates, numpy, precise_float,
384+
date_unit):
352385
obj = None
353386
if typ == 'frame':
354387
obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,

0 commit comments

Comments
 (0)