Skip to content

Commit

Permalink
Merge pull request #45 from skjerns/fix_nullchar
Browse files Browse the repository at this point in the history
don't replace null chars in strings
  • Loading branch information
skjerns authored Jan 31, 2023
2 parents 3eab0fa + 8fabcbe commit 040b375
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 74 deletions.
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@ A clear and concise description of what the bug is.
Please provide a sample file with the given error or code to produce such a file. Without a sample file to work with, I likely cannot help you. You can upload your file either directly to GitHub or https://wetransfer.com

If your data contains privacy-sensitive information, consider altering it to remove that information, or ask me for a temporary email address to which you can send your file confidentially.



If you get `ERROR: MATLAB type not supported: XXX, (uint32)`, there is probably very little I can do: this indicates a proprietary MATLAB data type, and nobody has yet figured out how to decode such types.
8 changes: 5 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,17 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, 3.10.0]
python-version: [3.7, 3.8, 3.9]
os: [ubuntu-latest, macos-latest, windows-latest]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'pip' # caching pip dependencies

- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
79 changes: 40 additions & 39 deletions mat73/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def print_tree(node):
class HDF5Decoder():
def __init__(self, verbose=True, use_attrdict=False,
only_include=None):

if isinstance(only_include, str):
only_include = [only_include]
if only_include is not None:
Expand All @@ -58,8 +58,6 @@ def __init__(self, verbose=True, use_attrdict=False,
if only_include is not None:
_vardict = dict(zip(only_include, [False]*len(only_include)))
self._found_include_var = _vardict



def is_included(self, hdf5):
# if only_include is not specified, we always return True
Expand All @@ -73,11 +71,10 @@ def is_included(self, hdf5):
if s in hdf5.name or hdf5.name in s:
return True
return False


def mat2dict(self, hdf5):
if '#refs#' in hdf5:

if '#refs#' in hdf5:
self.refs = hdf5['#refs#']
d = self._dict_class()
for var in hdf5:
Expand All @@ -98,13 +95,13 @@ def mat2dict(self, hdf5):
print(hdf5.filename, 'contains the following vars:')
hdf5.visit(print_tree)
return d

# @profile
def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):
"""
unpack a h5py entry: if it's a group expand,
if it's a dataset convert
for safety reasons, the depth cannot be larger than 99
"""
if depth==99:
Expand All @@ -118,9 +115,9 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):
continue
if 'MATLAB_class' in elem.attrs:
MATLAB_class = elem.attrs.get('MATLAB_class')
if MATLAB_class is not None:
if MATLAB_class is not None:
MATLAB_class = MATLAB_class.decode()
unpacked = self.unpack_mat(elem, depth=depth+1,
unpacked = self.unpack_mat(elem, depth=depth+1,
MATLAB_class=MATLAB_class)
# sparse elements need to be loaded separately
# see https://github.com/skjerns/mat7.3/issues/28
Expand All @@ -129,7 +126,7 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):
from scipy.sparse import csc_matrix
data = unpacked['data']
row_ind = unpacked['ir']
col_ind = unpacked['jc']
col_ind = unpacked['jc']
n_rows = elem.attrs['MATLAB_sparse']
n_cols = len(col_ind) - 1
unpacked = csc_matrix((data, row_ind, col_ind), shape=(n_rows, n_cols))
Expand All @@ -148,7 +145,7 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):
logging.error(f'Tried loading the sparse matrix `{elem.name}`'
' but something went wrong: {e}\n{e.__traceback__}')
raise e


elif MATLAB_class=='struct' and len(elem)>1 and \
isinstance(unpacked, dict):
Expand Down Expand Up @@ -183,41 +180,40 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):
return self.convert_mat(hdf5, depth, MATLAB_class=MATLAB_class)
else:
raise Exception(f'Unknown hdf5 type: {key}:{type(hdf5)}')

# @profile
def _has_refs(self, dataset):
if len(dataset.shape)<2: return False
# dataset[0].
dataset[0][0]
if isinstance(dataset[0][0], h5py.h5r.Reference):
if isinstance(dataset[0][0], h5py.h5r.Reference):
return True
return False

# @profile
def convert_mat(self, dataset, depth, MATLAB_class=None):
"""
Converts h5py.dataset into python native datatypes
according to the matlab class annotation
"""
"""
# all MATLAB variables have the attribute MATLAB_class
# if this is not present, it is not convertible
if MATLAB_class is None and 'MATLAB_class' in dataset.attrs:
MATLAB_class = dataset.attrs['MATLAB_class'].decode()


if not MATLAB_class and not self._has_refs(dataset):
if self.verbose:
message = 'ERROR: not a MATLAB datatype: ' + \
'{}, ({})'.format(dataset, dataset.dtype)
logging.error(message)
return None

known_cls = ['cell', 'char', 'bool', 'logical', 'double', 'single',
known_cls = ['cell', 'char', 'bool', 'logical', 'double', 'single',
'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
'uint32', 'uint64']

if 'MATLAB_empty' in dataset.attrs.keys() and \
MATLAB_class in ['cell', 'struct']:
if 'MATLAB_empty' in dataset.attrs.keys():
mtype = 'empty'
elif MATLAB_class in known_cls:
mtype = MATLAB_class
Expand All @@ -244,38 +240,43 @@ def convert_mat(self, dataset, depth, MATLAB_class=None):
return cell

elif mtype=='empty':
dims = [x for x in dataset]
return empty(*dims)
if MATLAB_class in ['cell', 'struct']:
dims = [x for x in dataset]
return empty(*dims)
elif MATLAB_class=='char':
return ''
else:

return None

elif mtype=='char':
elif mtype=='char':
string_array = np.ravel(dataset)
string_array = ''.join([chr(x) for x in string_array])
string_array = string_array.replace('\x00', '')
return string_array

elif mtype=='bool':
return bool(dataset)

elif mtype=='logical':
elif mtype=='logical':
arr = np.array(dataset, dtype=bool).T.squeeze()
if arr.size==1: arr=bool(arr)
return arr

elif mtype=='canonical empty':
elif mtype=='canonical empty':
return None

# complex numbers need to be filtered out separately
elif 'imag' in str(dataset.dtype):
if dataset.attrs['MATLAB_class']==b'single':
dtype = np.complex64
dtype = np.complex64
else:
dtype = np.complex128
arr = np.array(dataset)
arr = (arr['real'] + arr['imag']*1j).astype(dtype)
return arr.T.squeeze()

# if it is none of the above, we can convert to numpy array
elif mtype in ('double', 'single', 'int8', 'int16', 'int32', 'int64',
elif mtype in ('double', 'single', 'int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64'):
arr = np.array(dataset, dtype=dataset.dtype)
return arr.T.squeeze()
Expand All @@ -287,31 +288,31 @@ def convert_mat(self, dataset, depth, MATLAB_class=None):
'{}, ({})'.format(mtype, dataset.dtype)
logging.error(message)
return None


def loadmat(filename, use_attrdict=False, only_include=None, verbose=True):
"""
Loads a MATLAB 7.3 .mat file, which is actually a
HDF5 file with some custom matlab annotations inside
:param filename: A string pointing to the file
:param use_attrdict: make it possible to access structs like in MATLAB
using struct.varname instead of struct['varname']
WARNING: builtin dict functions cannot be overwritten,
e.g. keys(), pop(), ...
these will still be available by struct['keys']
:param verbose: print warnings
:param only_include: A list of HDF5 paths that should be loaded.
this can greatly reduce loading times. If a path
:param only_include: A list of HDF5 paths that should be loaded.
this can greatly reduce loading times. If a path
contains further sub datasets, these will be loaded
as well, e.g. 'struct/' will load all subvars of
as well, e.g. 'struct/' will load all subvars of
struct, 'struct/var' will load only ['struct']['var']
:returns: A dictionary with the matlab variables loaded
"""
assert os.path.isfile(filename), '{} does not exist'.format(filename)
decoder = HDF5Decoder(verbose=verbose, use_attrdict=use_attrdict,
only_include=only_include)

ext = os.path.splitext(filename)[1].lower()
if ext!='.mat':
logging.warning('Can only load MATLAB .mat file, this file type might '
Expand All @@ -323,15 +324,15 @@ def loadmat(filename, use_attrdict=False, only_include=None, verbose=True):
except OSError:
raise TypeError('{} is not a MATLAB 7.3 file. '\
'Load with scipy.io.loadmat() instead.'.format(filename))


def savemat(filename, verbose=True):
raise NotImplementedError


if __name__=='__main__':
# for testing / debugging
d = loadmat('../tests/testfile1.mat')
d = loadmat('../tests/testfile2.mat')


# file = '../tests/testfile8.mat'
Expand Down
2 changes: 1 addition & 1 deletion mat73/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.59'
__version__ = '0.60'
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
h5py>=3.0
numpy>=1.19
Loading

0 comments on commit 040b375

Please sign in to comment.