From 6a1c6b4ff0e9f4fea948956d6efbf56f2b36b9e9 Mon Sep 17 00:00:00 2001
From: Robin Raymond <robin@robinraymond.de>
Date: Mon, 4 Oct 2021 21:13:40 +0200
Subject: [PATCH] BUG: Fix dtypes for read_json (#42819)

* Fix dtypes for read_json

* Address comments

* Add whatsnew entry

* Update doc/source/whatsnew/v1.4.0.rst

Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>

* Linting

Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com>
---
 doc/source/whatsnew/v1.4.0.rst      |  1 +
 pandas/io/json/_json.py             |  9 +--------
 pandas/tests/io/json/test_pandas.py | 30 +++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 0c841078fe9b4..8113ac97a3a37 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -465,6 +465,7 @@ I/O
 - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
 - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
 - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
+- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 -
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index f92fc65f55df6..b9bdfb91ca154 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -876,11 +876,8 @@ def check_keys_split(self, decoded):
 
     def parse(self):
 
-        # try numpy
-        numpy = self.numpy
-        if numpy:
+        if self.numpy:
             self._parse_numpy()
-
         else:
             self._parse_no_numpy()
 
@@ -941,10 +938,6 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
                 )
                 if dtype is not None:
                     try:
-                        # error: Argument 1 to "dtype" has incompatible type
-                        # "Union[ExtensionDtype, str, dtype[Any], Type[object]]";
-                        # expected "Type[Any]"
-                        dtype = np.dtype(dtype)  # type: ignore[arg-type]
                         return data.astype(dtype), True
                     except (TypeError, ValueError):
                         return data, False
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index a856f031e20ba..747770ad78684 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1387,6 +1387,36 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(dfjson, orient="table")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
+    def test_to_json_from_json_columns_dtypes(self, orient):
+        # GH21892 GH33205: read_json must honor non-numpy dtypes such as category
+        expected = DataFrame.from_dict(
+            {
+                "Integer": Series([1, 2, 3], dtype="int64"),
+                "Float": Series([None, 2.0, 3.0], dtype="float64"),
+                "Object": Series([None, "", "c"], dtype="object"),
+                "Bool": Series([True, False, True], dtype="bool"),
+                "Category": Series(["a", "b", None], dtype="category"),
+                "Datetime": Series(
+                    ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]"
+                ),
+            }
+        )
+        dfjson = expected.to_json(orient=orient)
+        result = read_json(
+            dfjson,
+            orient=orient,
+            dtype={
+                "Integer": "int64",
+                "Float": "float64",
+                "Object": "object",
+                "Bool": "bool",
+                "Category": "category",
+                "Datetime": "datetime64[ns]",
+            },
+        )
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}])
     def test_read_json_table_dtype_raises(self, dtype):
         # GH21345