Skip to content

Commit

Permalink
Merge pull request #26 from skjerns/issue12_load_specific_var
Browse files Browse the repository at this point in the history
Issue12 load specific var
  • Loading branch information
skjerns authored Dec 2, 2021
2 parents dc42b9c + 290d865 commit 25a7c48
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 11 deletions.
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ data_dict = mat73.loadmat('data.mat', use_attrdict=True)
struct = data_dict['structure'] # assuming a structure was saved in the .mat
struct[0].var1 == struct[0]['var1'] # it's the same!
```
You can also specify that only a specific variable or variable tree should be loaded, which is useful to reduce loading times
```Python
data_dict = mat73.loadmat('data.mat', only_include='structure')
struct = data_dict['structure'] # now only structure is loaded and nothing else

data_dict = mat73.loadmat('data.mat', only_include=['var/subvar/subsubvar', 'tree1/'])
tree1 = data_dict['tree1'] # the entire tree has been loaded, so tree1 is a dict with all subvars of tree1
subsubvar = data_dict['var']['subvar']['subsubvar'] # this subvar has been loaded
```

## Installation

Expand Down Expand Up @@ -67,5 +76,5 @@ The following MATLAB datatypes can be loaded

- This library will __only__ load mat 7.3 files. For older versions use `scipy.io.loadmat`
- Proprietary MATLAB types (e.g. `datetime`, `duration`, etc.) are not supported. If someone tells me how to convert them, I'll implement that
- For now, you can't save anything back to the .mat. Let me know if you need this functionality, would be quick to implement.
- For now, you can't save anything back to the .mat. It's a bit more difficult than expected, so it's not on the roadmap for now
- See also [hdf5storage](https://github.com/frejanordsiek/hdf5storage), which can indeed be used for saving .mat, but has fewer features for loading
67 changes: 58 additions & 9 deletions mat73/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,25 +36,58 @@ def __getattr__(self, key):
def __setattr__(self, key, value):
return self.__setitem__(key, value)


def print_tree(node):
    """Print one HDF5 object path, hiding MATLAB-internal groups.

    Passed as the callback to ``hdf5.visit`` to list the variables
    contained in a file. Paths below ``#refs#/`` or ``#subsystem#/`` are
    not printed. Always returns None.

    :param node: path of the visited object, as a string
    """
    internal_prefixes = ('#refs#/', '#subsystem#/')
    if not node.startswith(internal_prefixes):
        print(' ', node)


class HDF5Decoder():
def __init__(self, verbose=True, use_attrdict=False):
def __init__(self, verbose=True, use_attrdict=False,
only_include=None):

if isinstance(only_include, str):
only_include = [only_include]
if only_include is not None:
only_include = [s if s[0]=='/' else f'/{s}' for s in only_include]
only_include = [s[:-1] if s[-1]=='/' else s for s in only_include]
self.verbose = verbose
self._dict_class = AttrDict if use_attrdict else dict
self.refs = {} # this is used in case of matlab matrices
self.only_include = only_include
if only_include is not None:
_vardict = dict(zip(only_include, [False]*len(only_include)))
self._found_include_var = _vardict


def mat2dict(self, hdf5, only_load=None):

def is_included(self, hdf5):
# if only_include is not specified, we always return True
# because we load everything
if self.only_include is None:
return True
# see if the current name is in any of the included variables
for s in self.only_include:
if s in hdf5.name:
self._found_include_var[s] = True
if s in hdf5.name or hdf5.name in s:
return True
return False


def mat2dict(self, hdf5):

if '#refs#' in hdf5:
self.refs = hdf5['#refs#']
d = self._dict_class()
for var in hdf5:
# this first loop is just here to catch the refs and subsystem vars
if var in ['#refs#','#subsystem#']:
continue
ext = os.path.splitext(hdf5.filename)[1].lower()
if ext.lower()=='.mat':
# if hdf5
if not self.is_included(hdf5[var]):
continue
d[var] = self.unpack_mat(hdf5[var])
elif ext=='.h5' or ext=='.hdf5':
err = 'Can only load .mat. Please use package hdfdict instead'\
Expand All @@ -63,6 +96,14 @@ def mat2dict(self, hdf5, only_load=None):
raise NotImplementedError(err)
else:
raise ValueError('can only unpack .mat')
if self.only_include is not None:
for var, found in self._found_include_var.items():
if not found:
logging.warn(f'Variable "{var}" was specified to be loaded'\
' but could not be found.')
if not any(list(self._found_include_var.values())):
print(hdf5.filename, 'contains the following vars:')
hdf5.visit(print_tree)
return d

# @profile
Expand All @@ -80,6 +121,8 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):

for key in hdf5:
elem = hdf5[key]
if not self.is_included(elem):
continue
if 'MATLAB_class' in elem.attrs:
MATLAB_class = elem.attrs.get('MATLAB_class')
if MATLAB_class is not None:
Expand Down Expand Up @@ -115,7 +158,8 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None):

return d
elif isinstance(hdf5, h5py._hl.dataset.Dataset):
return self.convert_mat(hdf5, depth, MATLAB_class=MATLAB_class)
if self.is_included(hdf5):
return self.convert_mat(hdf5, depth, MATLAB_class=MATLAB_class)
else:
raise Exception(f'Unknown hdf5 type: {key}:{type(hdf5)}')

Expand Down Expand Up @@ -225,7 +269,7 @@ def convert_mat(self, dataset, depth, MATLAB_class=None):
return None


def loadmat(filename, use_attrdict=False, only_load=None, verbose=True):
def loadmat(filename, use_attrdict=False, only_include=None, verbose=True):
"""
Loads a MATLAB 7.3 .mat file, which is actually a
HDF5 file with some custom matlab annotations inside
Expand All @@ -237,14 +281,19 @@ def loadmat(filename, use_attrdict=False, only_load=None, verbose=True):
e.g. keys(), pop(), ...
these will still be available by struct['keys']
:param verbose: print warnings
:param only_load: A list of HDF5 paths that should be loaded
:param only_include: A list of HDF5 paths that should be loaded.
this can greatly reduce loading times. If a path
contains further sub datasets, these will be loaded
as well, e.g. 'struct/' will load all subvars of
struct, 'struct/var' will load only ['struct']['var']
:returns: A dictionary with the matlab variables loaded
"""
assert os.path.isfile(filename), '{} does not exist'.format(filename)
decoder = HDF5Decoder(verbose=verbose, use_attrdict=use_attrdict)
decoder = HDF5Decoder(verbose=verbose, use_attrdict=use_attrdict,
only_include=only_include)
try:
with h5py.File(filename, 'r') as hdf5:
dictionary = decoder.mat2dict(hdf5, only_load=only_load)
dictionary = decoder.mat2dict(hdf5)
return dictionary
except OSError:
raise TypeError('{} is not a MATLAB 7.3 file. '\
Expand Down
24 changes: 23 additions & 1 deletion tests/test_mat73.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,29 @@ def test_file6_empty_cell_array(self):
def test_file7_empty_cell_array(self):
data = mat73.loadmat(self.testfile7)


def test_load_specific_vars(self):
# Exercises the `only_include` argument of mat73.loadmat.
# A single variable can be requested as a plain string or as a list.
for key in ['keys', ['keys']]:
data = mat73.loadmat(self.testfile1, only_include=key)
assert len(data)==1
assert data['keys']=='must_not_overwrite'

# requesting a variable that is not in the file must emit a warning
with self.assertWarns(Warning):
data = mat73.loadmat(self.testfile1, only_include='notpresent')

# several variables at once; the 'data' tree is loaded with its sub-entries
data = mat73.loadmat(self.testfile1, only_include=['data', 'keys'])
assert len(data)==2
assert len(data['data'])==29
assert len(data['data']['cell_'])==7

# check if loading times are faster, should be the case.
# NOTE(review): compares wall-clock time of one full load against one
# filtered load; a single-run timing comparison may be flaky - confirm
start = time.time()
data = mat73.loadmat(self.testfile4)
elapsed1 = time.time()-start
start = time.time()
data = mat73.loadmat(self.testfile4, only_include='Res/HRV/Param/use_custom_print_command')
elapsed2 = time.time()-start
assert elapsed2<elapsed1, 'loading specific var was not faster'



if __name__ == '__main__':
Expand Down

0 comments on commit 25a7c48

Please sign in to comment.