From 876f1337e2c6ba5aa38573cc041dffc8b375a813 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 9 Apr 2022 10:12:44 +0100 Subject: [PATCH 1/8] raise FileNotFoundError in _get_data_from_filepath() --- pandas/io/json/_json.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 2a9ed9f15cd11..3dfbc54543dc0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -698,6 +698,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): This method turns (1) into (2) to simplify the rest of the processing. It returns input types (2) and (3) unchanged. + + It raises FileNotFoundError if the input is a string ending in + one of .json, .json.gz, .json.bz2 but no such file exists. """ # if it is a string but the file does not exist, it might be a JSON string filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -716,6 +719,12 @@ def _get_data_from_filepath(self, filepath_or_buffer): errors=self.encoding_errors, ) filepath_or_buffer = self.handles.handle + elif ( + isinstance(filepath_or_buffer, str) + and filepath_or_buffer.lower().endswith((".json", ".json.gz", ".json.bz2")) + and not file_exists(filepath_or_buffer) + ): + raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") return filepath_or_buffer From fcccaaf8b86e14b1219c347d237fd6a9108a1a0a Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 9 Apr 2022 10:13:15 +0100 Subject: [PATCH 2/8] update tests test_read_non_existent + test_read_expands_user_home_dir --- pandas/tests/io/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ca6809470b2b1..ee563c82d2695 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -187,7 +187,7 @@ def test_iterator(self): (pd.read_hdf, "tables", FileNotFoundError, "h5"), (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), - (pd.read_json, "os", ValueError, "json"), + (pd.read_json, "os", FileNotFoundError, "json"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) @@ -253,7 +253,7 @@ def test_write_missing_parent_directory(self, method, module, error_class, fn_ex (pd.read_hdf, "tables", FileNotFoundError, "h5"), (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), - (pd.read_json, "os", ValueError, "json"), + (pd.read_json, "os", FileNotFoundError, "json"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) From 2b84027c24acdf655494f7c21e717702c620fd88 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 9 Apr 2022 10:20:17 +0100 Subject: [PATCH 3/8] add changelog entry in doc/source/whatsnew/v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4920622a15f3f..f11bc718fe300 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -566,6 +566,7 @@ I/O - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) +- :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in one of ``.json``, ``.json.gz``, ``.json.bz2`` but no such file exists. (:issue:`29102`) Period ^^^^^^ From e9b1fe6885246bd56cdc1108ee7decd988c60c66 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 9 Apr 2022 12:48:29 +0100 Subject: [PATCH 4/8] use pandas.io.common._compression_to_extension instead of hard-coded extensions --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/io/json/_json.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index f11bc718fe300..46ca67c4baa5c 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -566,7 +566,7 @@ I/O - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) -- :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in one of ``.json``, ``.json.gz``, ``.json.bz2`` but no such file exists. (:issue:`29102`) +- :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in ``.json``, ``.json.gz``, ``.json.bz2``, etc. but no such file exists. (:issue:`29102`) Period ^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3dfbc54543dc0..7169fc5347729 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -52,6 +52,7 @@ from pandas.io.common import ( IOHandles, + _compression_to_extension, file_exists, get_handle, is_fsspec_url, @@ -700,7 +701,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): It returns input types (2) and (3) unchanged. It raises FileNotFoundError if the input is a string ending in - one of .json, .json.gz, .json.bz2 but no such file exists. + one of .json, .json.gz, .json.bz2, etc. but no such file exists. """ # if it is a string but the file does not exist, it might be a JSON string filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -721,7 +722,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): filepath_or_buffer = self.handles.handle elif ( isinstance(filepath_or_buffer, str) - and filepath_or_buffer.lower().endswith((".json", ".json.gz", ".json.bz2")) + and filepath_or_buffer.lower().endswith( + tuple([".json"] + [f".json.{c}" for c in _compression_to_extension]) + ) and not file_exists(filepath_or_buffer) ): raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") From 72bdb9398b11d8d408d566e7c6e14be3dd34332b Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 9 Apr 2022 20:11:21 +0100 Subject: [PATCH 5/8] move changelog entry from IO to other API changes --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 46ca67c4baa5c..b397d222bc58b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -309,6 +309,7 @@ Other API changes `_. The ``auth_local_webserver = False`` option is planned to stop working in October 2022. (:issue:`46312`) +- :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in ``.json``, ``.json.gz``, ``.json.bz2``, etc. but no such file exists. (:issue:`29102`) - .. --------------------------------------------------------------------------- @@ -566,7 +567,6 @@ I/O - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) -- :func:`read_json` now raises ``FileNotFoundError`` (previously ``ValueError``) when input is a string ending in ``.json``, ``.json.gz``, ``.json.bz2``, etc. but no such file exists. (:issue:`29102`) Period ^^^^^^ From 87c0490082c4edda6bc5f2a174433bf96e617b76 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Fri, 3 Jun 2022 19:49:23 +0100 Subject: [PATCH 6/8] fix ImportError from _compression_to_extension -> _extension_to_compression rename --- pandas/io/json/_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7169fc5347729..85f8a4c9eb278 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -52,7 +52,7 @@ from pandas.io.common import ( IOHandles, - _compression_to_extension, + _extension_to_compression, file_exists, get_handle, is_fsspec_url, @@ -723,7 +723,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): elif ( isinstance(filepath_or_buffer, str) and filepath_or_buffer.lower().endswith( - tuple([".json"] + [f".json.{c}" for c in _compression_to_extension]) + tuple([".json"] + [f".json.{c}" for c in _extension_to_compression]) ) and not file_exists(filepath_or_buffer) ): From 008cbaef2d55ef8d25f52f7b37ffe3e8a7cec07f Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Mon, 6 Jun 2022 11:07:24 +0100 Subject: [PATCH 7/8] add test read_json very long file path --- pandas/io/json/_json.py | 2 +- pandas/tests/io/json/test_pandas.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 85f8a4c9eb278..c5647f9c17304 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -723,7 +723,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): elif ( isinstance(filepath_or_buffer, str) and filepath_or_buffer.lower().endswith( - tuple([".json"] + [f".json.{c}" for c in _extension_to_compression]) + (".json",) + tuple(f".json.{c}" for c in _extension_to_compression) ) and not file_exists(filepath_or_buffer) ): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 576d99f25e25c..eaffbc60ead32 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1566,6 +1566,20 @@ def test_read_json_with_url_value(self, url): expected = DataFrame({"url": [url]}) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "compression", + ["", ".gz", ".bz2", ".tar"], + ) + def test_read_json_with_very_long_file_path(self, compression): + # GH 46718 + long_json_path = f'{"a" * 1000}.json{compression}' + with pytest.raises( + FileNotFoundError, match=f"File {long_json_path} does not exist" + ): + # path too long for Windows is handled in file_exists() but raises in + # _get_data_from_filepath() + read_json(long_json_path) + @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) From 1c38a6baaad37c1692630ccdc73046cd9be429cb Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Mon, 6 Jun 2022 12:05:57 +0100 Subject: [PATCH 8/8] remove extra period in extension checking --- pandas/io/json/_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c5647f9c17304..fbea7a71202eb 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -723,7 +723,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): elif ( isinstance(filepath_or_buffer, str) and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json.{c}" for c in _extension_to_compression) + (".json",) + tuple(f".json{c}" for c in _extension_to_compression) ) and not file_exists(filepath_or_buffer) ):