From 8f8b1775ee53ac39a7a57674ae16873fe3176a97 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 15:02:42 -0400 Subject: [PATCH 1/4] BUG: not processing TypeError on reading some json (so was failing rather than trying not-numpy for dtypes) --- pandas/io/json.py | 4 ++-- pandas/io/tests/test_json/test_pandas.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 17b33931bee5a..d1c81d625d98d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -246,7 +246,7 @@ def _parse(self): labelled=True)) else: self.obj = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: + except (ValueError,TypeError): numpy = False if not numpy: @@ -296,7 +296,7 @@ def _parse(self): else: self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, labelled=True)) - except ValueError: + except (ValueError,TypeError): numpy = False if not numpy: diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 4b1294b786df7..cb6e4711f9c42 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -338,6 +338,18 @@ def test_weird_nested_json(self): read_json(s) + @network + @slow + def test_round_trip_exception_(self): + # GH 3867 + + df = pd.read_csv('https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv') + s = df.to_json() + result = pd.read_json(s) + result.index = result.index.astype(int) + result = result.reindex(columns=df.columns,index=df.index) + assert_frame_equal(result,df) + @network @slow def test_url(self): From 186a4f80d30e45501557a4a8081e910e787e7dc3 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 22:48:32 -0400 Subject: [PATCH 2/4] ENH: added convert_axes argument to control whether to coerce axes ENH: changed dtype argument to accept a dict for a per-column dtype conversion, or turn off conversion (default is True) ENH: changed parse_dates to convert_dates, now defaulting to 
True BUG: not processing correctly some parsable JSON --- doc/source/io.rst | 56 ++++++- pandas/io/json.py | 203 +++++++++++++++-------- pandas/io/tests/test_json/test_pandas.py | 133 ++++++++++----- 3 files changed, 272 insertions(+), 120 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e64cbc4bc8101..aec963ca81cf0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -989,6 +989,8 @@ Writing to a file, with a date index and a date column dfj2 = dfj.copy() dfj2['date'] = Timestamp('20130101') + dfj2['ints'] = range(5) + dfj2['bools'] = True dfj2.index = date_range('20130101',periods=5) dfj2.to_json('test.json') open('test.json').read() @@ -1011,25 +1013,69 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` * records : list like [value, ... , value] * index : dict like {index -> value} -- dtype : dtype of the resulting object -- numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. -- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is False +- dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data +- convert_axes : boolean, try to convert the axes to the proper dtypes, default is True +- convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns +- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. +The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` will try to parse the axes, and all of the data +into appropriate types, including dates. 
If you need to override specific dtypes, pass a dict to ``dtype``. ``convert_axes`` should only +be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') in an axis. + +.. warning:: + + When reading JSON data, automatic coercing into dtypes has some quirks: + + * an index can be in a different order, that is the returned order is not guaranteed to be the same as before serialization + * a column that was ``float`` data can safely be converted to ``integer``, e.g. a column of ``1.`` + * bool columns will be converted to ``integer`` on reconstruction + + Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument. + Reading from a JSON string .. ipython:: python pd.read_json(json) -Reading from a file, parsing dates +Reading from a file + +.. ipython:: python + + pd.read_json('test.json') + +Don't convert any data (but still convert axes and dates) + +.. ipython:: python + + pd.read_json('test.json',dtype=object).dtypes + +Specify how I want to convert data + +.. ipython:: python + + pd.read_json('test.json',dtype={'A' : 'float32', 'bools' : 'int8'}).dtypes + +I like my string indices .. ipython:: python - pd.read_json('test.json',parse_dates=True) + si = DataFrame(np.zeros((4, 4)), + columns=range(4), + index=[str(i) for i in range(4)]) + si + si.index + si.columns + json = si.to_json() + + sij = pd.read_json(json,convert_axes=False) + sij + sij.index + sij.columns ..
ipython:: python :suppress: diff --git a/pandas/io/json.py b/pandas/io/json.py index d1c81d625d98d..537d06f094cd4 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -11,6 +11,7 @@ import numpy as np from pandas.tslib import iNaT +import pandas.lib as lib ### interface to/from ### @@ -86,6 +87,11 @@ def _format_dates(self): self.copy_if_needed() self.obj = self._format_to_date(self.obj) + def _format_bools(self): + if self._needs_to_bool(self.obj): + self.copy_if_needed() + self.obj = self._format_to_bool(self.obj) + class FrameWriter(Writer): _default_orient = 'columns' @@ -112,8 +118,8 @@ def _format_dates(self): for c in dtypes.index: self.obj[c] = self._format_to_date(self.obj[c]) -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, - parse_dates=False, keep_default_dates=True): +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, + convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True): """ Convert JSON string to pandas object @@ -130,13 +136,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True records : list like [value, ... , value] index : dict like {index -> value} typ : type of object to recover (series or frame), default 'frame' - dtype : dtype of the resulting object - numpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns - default is False + dtype : if True, infer dtypes, if a dict of column to dtype, then use those, + if False, then don't infer dtypes at all, default is True, + apply only to the data + convert_axes : boolean, try to convert the axes to the proper dtypes, default is True + convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns + default is True keep_default_dates : boolean, default True. 
If parsing dates, then parse the default datelike columns + numpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. Returns ------- @@ -157,16 +166,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True obj = None if typ == 'frame': - obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() + obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse() if typ == 'series' or obj is None: - obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() + if not isinstance(dtype,bool): + dtype = dict(data = dtype) + obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse() return obj class Parser(object): - def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): + def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True): self.json = json if orient is None: @@ -175,27 +186,92 @@ def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_d self.orient = orient self.dtype = dtype - if dtype is not None and orient == "split": + if orient == "split": numpy = False self.numpy = numpy - self.parse_dates = parse_dates + self.convert_axes = convert_axes + self.convert_dates = convert_dates self.keep_default_dates = keep_default_dates self.obj = None def parse(self): self._parse() - if self.obj is not None: + if self.obj is None: return None + if self.convert_axes: self._convert_axes() - if self.parse_dates: - self._try_parse_dates() + self._try_convert_types() return self.obj + def _convert_axes(self): + """ try to convert axes """ + for axis in self.obj._AXIS_NUMBERS.keys(): + new_axis, result = self._try_convert_data(axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True) + if result: + 
setattr(self.obj,axis,new_axis) - def _try_parse_to_date(self, data): + def _try_convert_types(self): + raise NotImplementedError + + def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): + """ try to parse a ndarray like into a column by inferring dtype """ + + # don't try to coerce, unless a force conversion + if use_dtypes: + if self.dtype is False: + return data, False + elif self.dtype is True: + pass + + else: + + # dtype to force + dtype = self.dtype.get(name) if isinstance(self.dtype,dict) else self.dtype + if dtype is not None: + try: + dtype = np.dtype(dtype) + return data.astype(dtype), True + except: + return data, False + + if convert_dates: + new_data, result = self._try_convert_to_date(data) + if result: + return new_data, True + + result = False + + if data.dtype == 'object': + + # try float + try: + data = data.astype('float64') + result = True + except: + pass + + # do't coerce 0-len data + if len(data) and (data.dtype == 'float' or data.dtype == 'object'): + + # coerce ints if we can + try: + new_data = data.astype('int64') + if (new_data == data).all(): + data = new_data + result = True + except: + pass + + return data, result + + def _try_convert_to_date(self, data): """ try to parse a ndarray like into a date column try to coerce object in epoch/iso formats and - integer/float in epcoh formats """ + integer/float in epcoh formats, return a boolean if parsing + was successful """ + + # no conversion on empty + if not len(data): return data, False new_data = data if new_data.dtype == 'object': @@ -208,7 +284,7 @@ def _try_parse_to_date(self, data): # ignore numbers that are out of range if issubclass(new_data.dtype.type,np.number): if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all(): - return data + return data, False try: new_data = to_datetime(new_data) @@ -218,11 +294,11 @@ def _try_parse_to_date(self, data): except: # return old, noting more we can do - new_data = data + return data, False - return 
new_data + return new_data, True - def _try_parse_dates(self): + def _try_convert_dates(self): raise NotImplementedError class SeriesParser(Parser): @@ -231,21 +307,20 @@ class SeriesParser(Parser): def _parse(self): json = self.json - dtype = self.dtype orient = self.orient numpy = self.numpy if numpy: try: if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) + decoded = loads(json, dtype=None, numpy=True) decoded = dict((str(k), v) for k, v in decoded.iteritems()) self.obj = Series(**decoded) elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=dtype, numpy=True, + self.obj = Series(*loads(json, dtype=None, numpy=True, labelled=True)) else: - self.obj = Series(loads(json, dtype=dtype, numpy=True)) + self.obj = Series(loads(json, dtype=None, numpy=True)) except (ValueError,TypeError): numpy = False @@ -253,22 +328,15 @@ def _parse(self): if orient == "split": decoded = dict((str(k), v) for k, v in loads(json).iteritems()) - self.obj = Series(dtype=dtype, **decoded) + self.obj = Series(dtype=None, **decoded) else: - self.obj = Series(loads(json), dtype=dtype) - - def _convert_axes(self): - """ try to axes if they are datelike """ - try: - self.obj.index = self._try_parse_to_date(self.obj.index) - except: - pass + self.obj = Series(loads(json), dtype=None) - def _try_parse_dates(self): + def _try_convert_types(self): if self.obj is None: return - - if self.parse_dates: - self.obj = self._try_parse_to_date(self.obj) + obj, result = self._try_convert_data('data', self.obj, convert_dates=self.convert_dates) + if result: + self.obj = obj class FrameParser(Parser): _default_orient = 'columns' @@ -276,64 +344,57 @@ class FrameParser(Parser): def _parse(self): json = self.json - dtype = self.dtype orient = self.orient numpy = self.numpy if numpy: try: if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) + args = loads(json, dtype=None, numpy=True, labelled=True) if args: args = (args[0].T, 
args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) + decoded = loads(json, dtype=None, numpy=True) decoded = dict((str(k), v) for k, v in decoded.iteritems()) self.obj = DataFrame(**decoded) elif orient == "values": - self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) + self.obj = DataFrame(loads(json, dtype=None, numpy=True)) else: - self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True)) except (ValueError,TypeError): numpy = False if not numpy: if orient == "columns": - self.obj = DataFrame(loads(json), dtype=dtype) + self.obj = DataFrame(loads(json), dtype=None) elif orient == "split": decoded = dict((str(k), v) for k, v in loads(json).iteritems()) - self.obj = DataFrame(dtype=dtype, **decoded) + self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame(loads(json), dtype=dtype).T + self.obj = DataFrame(loads(json), dtype=None).T else: - self.obj = DataFrame(loads(json), dtype=dtype) - - def _convert_axes(self): - """ try to axes if they are datelike """ - if self.orient == 'columns': - axis = 'index' - elif self.orient == 'index': - axis = 'columns' - else: - return - - try: - a = getattr(self.obj,axis) - setattr(self.obj,axis,self._try_parse_to_date(a)) - except: - pass + self.obj = DataFrame(loads(json), dtype=None) - def _try_parse_dates(self): + def _try_convert_types(self): + if self.obj is None: return + if self.convert_dates: + self._try_convert_dates() + for col in self.obj.columns: + new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False) + if result: + self.obj[col] = new_data + + def _try_convert_dates(self): if self.obj is None: return # our columns to parse - parse_dates = self.parse_dates - if parse_dates is True: - parse_dates = [] - parse_dates = set(parse_dates) + convert_dates = self.convert_dates + if convert_dates is True: + 
convert_dates = [] + convert_dates = set(convert_dates) def is_ok(col): """ return if this col is ok to try for a date parse """ @@ -348,6 +409,8 @@ def is_ok(col): return False - for col, c in self.obj.iteritems(): - if (self.keep_default_dates and is_ok(col)) or col in parse_dates: - self.obj[col] = self._try_parse_to_date(c) + for col in self.obj.columns: + if (self.keep_default_dates and is_ok(col)) or col in convert_dates: + new_data, result = self._try_convert_to_date(self.obj[col]) + if result: + self.obj[col] = new_data diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index cb6e4711f9c42..bcbd4d4b91e70 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -56,13 +56,14 @@ def setUp(self): def test_frame_from_json_to_json(self): - def _check_orient(df, orient, dtype=None, numpy=True): + def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_dtype=True): df = df.sort() dfjson = df.to_json(orient=orient) unser = read_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) + numpy=numpy, convert_axes=convert_axes) unser = unser.sort() - if df.index.dtype.type == np.datetime64: + + if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) if orient == "records": # index is not captured in this orientation @@ -78,20 +79,37 @@ def _check_orient(df, orient, dtype=None, numpy=True): unser = unser.sort() assert_almost_equal(df.values, unser.values) else: - assert_frame_equal(df, unser) - - def _check_all_orients(df, dtype=None): - _check_orient(df, "columns", dtype=dtype) - _check_orient(df, "records", dtype=dtype) - _check_orient(df, "split", dtype=dtype) - _check_orient(df, "index", dtype=dtype) - _check_orient(df, "values", dtype=dtype) - - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", 
dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) + if convert_axes: + assert_frame_equal(df, unser, check_dtype=check_dtype) + else: + assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) + + def _check_all_orients(df, dtype=None, convert_axes=True): + if convert_axes: + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, convert_axes=False) + _check_orient(df, "records", dtype=dtype, convert_axes=False) + _check_orient(df, "split", dtype=dtype, convert_axes=False) + _check_orient(df, "index", dtype=dtype, convert_axes=False) + _check_orient(df, "values", dtype=dtype ,convert_axes=False) + + if convert_axes: + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + _check_orient(df, "columns", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "records", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "split", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "index", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "values", dtype=dtype, numpy=False, convert_axes=False) # basic _check_all_orients(self.frame) @@ -99,6 +117,7 @@ def _check_all_orients(df, dtype=None): self.frame.to_json(orient="columns")) _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + _check_all_orients(self.intframe, dtype=False) # big one # index and columns are strings as all unserialised JSON object keys @@ -106,13 +125,13 @@ def _check_all_orients(df, dtype=None): 
biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)]) - _check_all_orients(biggie) + _check_all_orients(biggie,dtype=False,convert_axes=False) # dtypes _check_all_orients(DataFrame(biggie, dtype=np.float64), - dtype=np.float64) - _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) - _check_all_orients(DataFrame(biggie, dtype=' Date: Thu, 13 Jun 2013 09:40:56 -0400 Subject: [PATCH 3/4] TST: tests for numpy=True/False differing in parsing --- pandas/io/tests/test_json/test_pandas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index bcbd4d4b91e70..23ac4c4df15e3 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -383,6 +383,16 @@ def test_doc_example(self): result = read_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_}) assert_frame_equal(result,result) + def test_misc_example(self): + #import pdb; pdb.set_trace() + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True) + expected = DataFrame([[1,2],[1,2]],columns=['a','b']) + #assert_frame_equal(result,expected) + + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=False) + expected = DataFrame([[1,2],[1,2]],columns=['a','b']) + assert_frame_equal(result,expected) + @network @slow def test_round_trip_exception_(self): From 740b10fe1d5de2bf027a65c668cbb692d7237867 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 11:30:05 -0400 Subject: [PATCH 4/4] PERF: changed default to numpy=False to have correct parsing using unordered JSON eliminated fallback parsing with numpy=True; This will raise ValueError if it fails to parse (a known case are strings in the frame data) --- doc/source/io.rst | 41 +++++-- pandas/core/generic.py | 12 +- pandas/io/json.py | 145 ++++++++++----------- pandas/io/tests/test_json/test_pandas.py | 61 ++++++---- 4 files changed,
156 insertions(+), 103 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index aec963ca81cf0..c182d456315ec 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -954,13 +954,21 @@ with optional parameters: - path_or_buf : the pathname or buffer to write the output This can be ``None`` in which case a JSON string is returned -- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` +- orient : - * split : dict like {index -> [index], columns -> [columns], data -> [values]} - * records : list like [{column -> value}, ... , {column -> value}] - * index : dict like {index -> {column -> value}} - * columns : dict like {column -> {index -> value}} - * values : just the values array + Series : + default is 'index', allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns', allowed values are: {'split','records','index','columns','values'} + + The format of the JSON string + + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... , {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array - date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch - double_precision : The number of decimal places to use when encoding floating point values, default 10. @@ -1007,17 +1015,28 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` is expected. 
For instance, a local file could be file ://localhost/path/to/table.json - typ : type of object to recover (series or frame), default 'frame' -- orient : The format of the JSON string, one of the following +- orient : + + Series : + default is 'index', allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns', allowed values are: {'split','records','index','columns','values'} + + The format of the JSON string - * split : dict like {index -> [index], name -> name, data -> [values]} - * records : list like [value, ... , value] - * index : dict like {index -> value} + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... , {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array - dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data - convert_axes : boolean, try to convert the axes to the proper dtypes, default is True - convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns -- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. +- numpy: direct decoding to numpy arrays. default is False; + Note that the JSON ordering **MUST** be the same for each term if ``numpy=True`` The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d2612d7aed7a..55347aef078ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -507,8 +507,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', ---------- path_or_buf : the path or buffer to write the result string if this is None, return a StringIO of the converted string - orient : {'split', 'records', 'index', 'columns', 'values'}, - default is 'index' for Series, 'columns' for DataFrame + orient : + + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} The format of the JSON string split : dict like @@ -517,6 +524,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', index : dict like {index -> {column -> value}} columns : dict like {column -> {index -> value}} values : just the values array + date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch double_precision : The number of decimal places to use when encoding diff --git a/pandas/io/json.py b/pandas/io/json.py index 537d06f094cd4..fcecb31bb77a7 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -119,7 +119,7 @@ def _format_dates(self): self.obj[c] = self._format_to_date(self.obj[c]) def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, - convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True): + convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False): """ Convert JSON string to pandas object @@ -129,12 +129,22 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. 
For instance, a local file could be file ://localhost/path/to/table.json - orient : {'split', 'records', 'index'}, default 'index' + orient : + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} + The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} + split : dict like {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + typ : type of object to recover (series or frame), default 'frame' dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, @@ -144,8 +154,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, default is True keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns - numpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. + numpy: direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same + for each term if numpy=True. 
Returns ------- @@ -177,7 +187,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class Parser(object): - def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True): + def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False): self.json = json if orient is None: @@ -196,7 +206,15 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr self.obj = None def parse(self): - self._parse() + + # try numpy + numpy = self.numpy + if numpy: + self._parse_numpy() + + else: + self._parse_no_numpy() + if self.obj is None: return None if self.convert_axes: self._convert_axes() @@ -304,33 +322,30 @@ def _try_convert_dates(self): class SeriesParser(Parser): _default_orient = 'index' - def _parse(self): + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = Series(dtype=None, **decoded) + else: + self.obj = Series(loads(json), dtype=None) + + def _parse_numpy(self): json = self.json orient = self.orient - numpy = self.numpy - - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=None, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - self.obj = Series(**decoded) - elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=None, numpy=True, - labelled=True)) - else: - self.obj = Series(loads(json, dtype=None, numpy=True)) - except (ValueError,TypeError): - numpy = False - - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - self.obj = Series(dtype=None, **decoded) - else: - self.obj = Series(loads(json), dtype=None) + if orient == "split": + decoded = loads(json, dtype=None, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj 
= Series(**decoded) + elif orient == "columns" or orient == "index": + self.obj = Series(*loads(json, dtype=None, numpy=True, + labelled=True)) + else: + self.obj = Series(loads(json, dtype=None, numpy=True)) def _try_convert_types(self): if self.obj is None: return @@ -341,42 +356,40 @@ def _try_convert_types(self): class FrameParser(Parser): _default_orient = 'columns' - def _parse(self): + def _parse_numpy(self): json = self.json orient = self.orient - numpy = self.numpy - if numpy: - try: - if orient == "columns": - args = loads(json, dtype=None, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - self.obj = DataFrame(*args) - elif orient == "split": - decoded = loads(json, dtype=None, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - self.obj = DataFrame(**decoded) - elif orient == "values": - self.obj = DataFrame(loads(json, dtype=None, numpy=True)) - else: - self.obj = DataFrame(*loads(json, dtype=None, numpy=True, - labelled=True)) - except (ValueError,TypeError): - numpy = False - - if not numpy: - if orient == "columns": - self.obj = DataFrame(loads(json), dtype=None) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - self.obj = DataFrame(dtype=None, **decoded) - elif orient == "index": - self.obj = DataFrame(loads(json), dtype=None).T - else: - self.obj = DataFrame(loads(json), dtype=None) + if orient == "columns": + args = loads(json, dtype=None, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=None, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=None, numpy=True)) + else: + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True)) + + def _parse_no_numpy(self): + + json = self.json + orient = 
self.orient + + if orient == "columns": + self.obj = DataFrame(loads(json), dtype=None) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame(loads(json), dtype=None).T + else: + self.obj = DataFrame(loads(json), dtype=None) def _try_convert_types(self): if self.obj is None: return diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 23ac4c4df15e3..bdd700bdbcec3 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -56,11 +56,19 @@ def setUp(self): def test_frame_from_json_to_json(self): - def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_dtype=True): + def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None): df = df.sort() dfjson = df.to_json(orient=orient) - unser = read_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy, convert_axes=convert_axes) + + try: + unser = read_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy, convert_axes=convert_axes) + except (Exception), detail: + if raise_ok is not None: + if type(detail) == raise_ok: + return + raise + unser = unser.sort() if not convert_axes and df.index.dtype.type == np.datetime64: @@ -84,7 +92,9 @@ def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_d else: assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) - def _check_all_orients(df, dtype=None, convert_axes=True): + def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None): + + # numpy=False if convert_axes: _check_orient(df, "columns", dtype=dtype) _check_orient(df, "records", dtype=dtype) @@ -98,18 +108,19 @@ def _check_all_orients(df, dtype=None, convert_axes=True): _check_orient(df, "index", dtype=dtype, convert_axes=False) _check_orient(df, "values", dtype=dtype 
,convert_axes=False) + # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) - - _check_orient(df, "columns", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "records", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "split", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "index", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "values", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "columns", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, raise_ok=raise_ok) + + _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) # basic _check_all_orients(self.frame) @@ -131,7 +142,8 @@ def _check_all_orients(df, dtype=None, convert_axes=True): _check_all_orients(DataFrame(biggie, dtype=np.float64), dtype=np.float64, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False) - _check_all_orients(DataFrame(biggie, dtype='