Skip to content

Commit

Permalink
Fix: Always pass bytes in the RDKitMolCodec (#261)
Browse files Browse the repository at this point in the history
  • Loading branch information
cwognum authored Feb 1, 2025
1 parent fe3bff3 commit f64bcdc
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
6 changes: 4 additions & 2 deletions polaris/dataset/zarr/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ def encode(self, buf: np.ndarray):
"""
Encode a chunk of RDKit Mols to byte strings
"""
- to_encode = np.empty(shape=len(buf), dtype=object)
+ # NOTE (cwognum): I ran into a Cython issue because we could pass None to the VLenBytes codec.
+ # Using np.full() ensures all elements are initialized as empty byte strings instead.
+ to_encode = np.full(fill_value=b"", shape=len(buf), dtype=object)
for idx, mol in enumerate(buf):
if mol is None or (isinstance(mol, bytes) and len(mol) == 0):
continue
if not isinstance(mol, Chem.Mol):
- raise ValueError(f"Expected an RDKitMol, but got {type(buf)} instead.")
+ raise ValueError(f"Expected an RDKitMol, but got {type(mol)} instead.")
props = Chem.PropertyPickleOptions.AllProps
to_encode[idx] = mol.ToBinary(props)

Expand Down
9 changes: 8 additions & 1 deletion tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@

def test_rdkit_mol_codec():
mol = dm.to_mol("C1=CC=CC=C1")
- arr = zarr.array([mol, mol], chunks=(2,), dtype=object, object_codec=RDKitMolCodec())

+ arr = zarr.empty(shape=10, chunks=2, dtype=object, object_codec=RDKitMolCodec())

+ arr[0] = mol
+ arr[1] = mol
+ arr[2] = mol

assert dm.same_mol(arr[0], mol)
assert dm.same_mol(arr[1], mol)
+ assert dm.same_mol(arr[2], mol)

0 comments on commit f64bcdc

Please sign in to comment.