diff --git a/docs/guide/serialization.rst b/docs/guide/serialization.rst index 0ed596d5..6557a3e4 100644 --- a/docs/guide/serialization.rst +++ b/docs/guide/serialization.rst @@ -153,6 +153,27 @@ Details of how the object is recognized for encoding or how the dictionary is recognized for decoding can be changed by passing functions to the ``is_my_obj`` or ``is_my_dict`` parameters of :class:`.JSONCodec`. +.. warning:: + Custom encoders & decoders only override the default JSON + encoder/decoder when the latter cannot natively handle the object. + This leads to some odd / lossy behaviour for some objects such + as ``np.float64`` which is natively converted to a ``float`` type + by the default encoder, whilst other numpy generic types are + appropriately roundtripped. + +On the use of NumPy generic types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Due to their inconsistent behaviour in how they are handled by the default +JSON encoder/decoder routines (see warning above), it is our suggestion +that Python types should be used preferentially instead of NumPy generic +types. For example, if one is looking to store a single float value, +a ``float`` would be preferred to a ``np.float32`` or ``np.float64``. + +Please note that this only applies to generic types being used **outside of +numpy arrays**. NumPy arrays are, as far as we know, always handled +in a consistent manner. 
+ Dumping arbitrary objects to JSON --------------------------------- diff --git a/gufe/custom_codecs.py b/gufe/custom_codecs.py index 39c604fb..d28e1882 100644 --- a/gufe/custom_codecs.py +++ b/gufe/custom_codecs.py @@ -34,6 +34,12 @@ def inherited_is_my_dict(dct, cls): return cls in stored.mro() +def is_npy_dtype_dict(dct): + expected = ["dtype", "bytes"] + is_custom = all(exp in dct for exp in expected) + return is_custom and ("shape" not in dct) + + def is_openff_unit_dict(dct): expected = ["pint_unit_registry", "unit_name", ":is_custom:"] is_custom = all(exp in dct for exp in expected) @@ -52,18 +58,39 @@ def is_openff_quantity_dict(dct): from_dict=lambda dct: pathlib.PosixPath(dct["path"]), ) + BYTES_CODEC = JSONCodec( cls=bytes, to_dict=lambda obj: {'latin-1': obj.decode('latin-1')}, from_dict=lambda dct: dct['latin-1'].encode('latin-1'), ) + DATETIME_CODEC = JSONCodec( cls=datetime.datetime, to_dict=lambda obj: {'isotime': obj.isoformat()}, from_dict=lambda dct: datetime.datetime.fromisoformat(dct['isotime']), ) + +# Note that this has inconsistent behaviour for some generic types +# which end up being handled by the default JSON encoder/decoder. +# The main example of this is np.float64 which will be turned into +# a float type on serialization. 
+NPY_DTYPE_CODEC = JSONCodec( + cls=np.generic, + to_dict=lambda obj: { + 'dtype': str(obj.dtype), + 'bytes': obj.tobytes(), + }, + from_dict=lambda dct: np.frombuffer( + dct['bytes'], dtype=np.dtype(dct['dtype']) + )[0], + is_my_obj=lambda obj: isinstance(obj, np.generic), + is_my_dict=is_npy_dtype_dict, +) + + NUMPY_CODEC = JSONCodec( cls=np.ndarray, to_dict=lambda obj: { @@ -76,6 +103,7 @@ def is_openff_quantity_dict(dct): ).reshape(dct['shape']) ) + SETTINGS_CODEC = JSONCodec( cls=SettingsBaseModel, to_dict=lambda obj: {field: getattr(obj, field) for field in obj.__fields__}, @@ -83,6 +111,7 @@ def is_openff_quantity_dict(dct): is_my_dict=functools.partial(inherited_is_my_dict, cls=SettingsBaseModel), ) + OPENFF_QUANTITY_CODEC = JSONCodec( cls=None, to_dict=lambda obj: { @@ -96,6 +125,7 @@ def is_openff_quantity_dict(dct): is_my_dict=is_openff_quantity_dict, ) + OPENFF_UNIT_CODEC = JSONCodec( cls=None, to_dict=lambda unit: { diff --git a/gufe/tests/test_custom_json.py b/gufe/tests/test_custom_json.py index 025d53eb..71274b98 100644 --- a/gufe/tests/test_custom_json.py +++ b/gufe/tests/test_custom_json.py @@ -15,6 +15,7 @@ from gufe.custom_codecs import ( BYTES_CODEC, NUMPY_CODEC, + NPY_DTYPE_CODEC, OPENFF_QUANTITY_CODEC, OPENFF_UNIT_CODEC, PATH_CODEC, @@ -46,6 +47,22 @@ def test_add_existing_codec(self): assert len(serialization.codecs) == 1 +@pytest.mark.parametrize('obj', [ + np.array([[1.0, 0.0], [2.0, 3.2]]), + np.float32(1.1) +]) +@pytest.mark.parametrize('codecs', [ + [BYTES_CODEC, NUMPY_CODEC, NPY_DTYPE_CODEC], + [NPY_DTYPE_CODEC, BYTES_CODEC, NUMPY_CODEC], +]) +def test_numpy_codec_order_roundtrip(obj, codecs): + serialization = JSONSerializerDeserializer(codecs) + serialized = serialization.serializer(obj) + reconstructed = serialization.deserializer(serialized) + npt.assert_equal(obj, reconstructed) + assert obj.dtype == reconstructed.dtype + + class CustomJSONCodingTest: """Base class for testing codecs. 
@@ -89,8 +106,9 @@ def test_not_mine(self): class TestNumpyCoding(CustomJSONCodingTest): def setup_method(self): self.codec = NUMPY_CODEC - self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0])] - shapes = [[2, 2], [2,]] + self.objs = [np.array([[1.0, 0.0], [2.0, 3.2]]), np.array([1, 0]), + np.array([1.0, 2.0, 3.0], dtype=np.float32)] + shapes = [[2, 2], [2,], [3,]] dtypes = [str(arr.dtype) for arr in self.objs] # may change by system? byte_reps = [arr.tobytes() for arr in self.objs] self.dcts = [ @@ -117,10 +135,38 @@ def test_round_trip(self): json_str = json.dumps(obj, cls=encoder) reconstructed = json.loads(json_str, cls=decoder) npt.assert_array_equal(reconstructed, obj) + assert reconstructed.dtype == obj.dtype json_str_2 = json.dumps(obj, cls=encoder) assert json_str == json_str_2 +class TestNumpyGenericCodec(TestNumpyCoding): + def setup_method(self): + self.codec = NPY_DTYPE_CODEC + # Note that np.float64 is treated as a float by the + # default json encoder (and so returns a float not a numpy + # object). 
+ self.objs = [np.bool_(True), np.float16(1.0), np.float32(1.0), + np.complex128(1.0), + np.clongdouble(1.0), np.uint64(1)] + dtypes = [str(a.dtype) for a in self.objs] + byte_reps = [a.tobytes() for a in self.objs] + # Overly complicated extraction of the class name + # to deal with the bool_ -> bool dtype class name problem + classes = [str(a.__class__).split("'")[1].split('.')[1] + for a in self.objs] + self.dcts = [ + { + ":is_custom:": True, + "__class__": classname, + "__module__": "numpy", + "dtype": dtype, + "bytes": byte_rep, + } + for dtype, byte_rep, classname in zip(dtypes, byte_reps, classes) + ] + + class TestPathCodec(CustomJSONCodingTest): def setup_method(self): self.codec = PATH_CODEC diff --git a/gufe/tokenization.py b/gufe/tokenization.py index 25364891..66e8accd 100644 --- a/gufe/tokenization.py +++ b/gufe/tokenization.py @@ -15,6 +15,7 @@ from gufe.custom_codecs import ( BYTES_CODEC, DATETIME_CODEC, + NPY_DTYPE_CODEC, NUMPY_CODEC, OPENFF_QUANTITY_CODEC, OPENFF_UNIT_CODEC, @@ -26,6 +27,7 @@ _default_json_codecs = [ PATH_CODEC, NUMPY_CODEC, + NPY_DTYPE_CODEC, BYTES_CODEC, DATETIME_CODEC, SETTINGS_CODEC,