From 6a1c6b4ff0e9f4fea948956d6efbf56f2b36b9e9 Mon Sep 17 00:00:00 2001
From: Robin Raymond
Date: Mon, 4 Oct 2021 21:13:40 +0200
Subject: [PATCH] BUG: Fix dtypes for read_json (#42819)

* Fix dtypes for read_json

* Address comments

* Add whatsnew entry

* Update doc/source/whatsnew/v1.4.0.rst

Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>

* Linting

Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
---
 doc/source/whatsnew/v1.4.0.rst      |  1 +
 pandas/io/json/_json.py             |  9 +--------
 pandas/tests/io/json/test_pandas.py | 30 +++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 0c841078fe9b4..8113ac97a3a37 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -465,6 +465,7 @@ I/O
 - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
 - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
 - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
+- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 -
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index f92fc65f55df6..b9bdfb91ca154 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -876,11 +876,8 @@ def check_keys_split(self, decoded):
 
     def parse(self):
 
-        # try numpy
-        numpy = self.numpy
-        if numpy:
+        if self.numpy:
             self._parse_numpy()
-
         else:
             self._parse_no_numpy()
 
@@ -941,10 +938,6 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
                 )
                 if dtype is not None:
                     try:
-                        # error: Argument 1 to "dtype" has incompatible type
-                        # "Union[ExtensionDtype, str, dtype[Any], Type[object]]";
-                        # expected "Type[Any]"
-                        dtype = np.dtype(dtype)  # type: ignore[arg-type]
                         return data.astype(dtype), True
                     except (TypeError, ValueError):
                         return data, False
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index a856f031e20ba..747770ad78684 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1387,6 +1387,36 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(dfjson, orient="table")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
+    def test_to_json_from_json_columns_dtypes(self, orient):
+        # GH21892 GH33205
+        expected = DataFrame.from_dict(
+            {
+                "Integer": Series([1, 2, 3], dtype="int64"),
+                "Float": Series([None, 2.0, 3.0], dtype="float64"),
+                "Object": Series([None, "", "c"], dtype="object"),
+                "Bool": Series([True, False, True], dtype="bool"),
+                "Category": Series(["a", "b", None], dtype="category"),
+                "Datetime": Series(
+                    ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]"
+                ),
+            }
+        )
+        dfjson = expected.to_json(orient=orient)
+        result = read_json(
+            dfjson,
+            orient=orient,
+            dtype={
+                "Integer": "int64",
+                "Float": "float64",
+                "Object": "object",
+                "Bool": "bool",
+                "Category": "category",
+                "Datetime": "datetime64[ns]",
+            },
+        )
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}])
     def test_read_json_table_dtype_raises(self, dtype):
         # GH21345