diff --git a/README.md b/README.md index 88b316a..c3173ce 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,15 @@ data_dict = mat73.loadmat('data.mat', use_attrdict=True) struct = data_dict['structure'] # assuming a structure was saved in the .mat struct[0].var1 == struct[0]['var1'] # it's the same! ``` +You can also specify that only a specific variable or variable tree should be loaded, which is useful to reduce loading times +```Python +data_dict = mat73.loadmat('data.mat', only_include='structure') +struct = data_dict['structure'] # now only structure is loaded and nothing else + +data_dict = mat73.loadmat('data.mat', only_include=['var/subvar/subsubvar', 'tree1/']) +tree1 = data_dict['tree1'] # the entire tree has been loaded, so tree1 is a dict with all subvars of tree1 +subsubvar = data_dict['var']['subvar']['subsubvar'] # this subvar has been loaded +``` ## Installation @@ -67,5 +76,5 @@ The following MATLAB datatypes can be loaded - This library will __only__ load mat 7.3 files. For older versions use `scipy.io.loadmat` - Proprietary MATLAB types (e.g `datetime`, `duriation`, etc) are not supported. If someone tells me how to convert them, I'll implement that -- For now, you can't save anything back to the .mat. Let me know if you need this functionality, would be quick to implement. +- For now, you can't save anything back to the .mat. 
It's a bit more difficult than expected, so it's not on the roadmap for now - See also [hdf5storage](https://github.com/frejanordsiek/hdf5storage), which can indeed be used for saving .mat, but has less features for loading diff --git a/mat73/__init__.py b/mat73/__init__.py index 12ec82f..d1c0cc2 100644 --- a/mat73/__init__.py +++ b/mat73/__init__.py @@ -36,25 +36,58 @@ def __getattr__(self, key): def __setattr__(self, key, value): return self.__setitem__(key, value) - +def print_tree(node): + if node.startswith('#refs#/') or node.startswith('#subsystem#/'): + return + print(' ', node) class HDF5Decoder(): - def __init__(self, verbose=True, use_attrdict=False): + def __init__(self, verbose=True, use_attrdict=False, + only_include=None): + + if isinstance(only_include, str): + only_include = [only_include] + if only_include is not None: + only_include = [s if s[0]=='/' else f'/{s}' for s in only_include] + only_include = [s[:-1] if s[-1]=='/' else s for s in only_include] self.verbose = verbose self._dict_class = AttrDict if use_attrdict else dict self.refs = {} # this is used in case of matlab matrices + self.only_include = only_include + if only_include is not None: + _vardict = dict(zip(only_include, [False]*len(only_include))) + self._found_include_var = _vardict + - def mat2dict(self, hdf5, only_load=None): + + def is_included(self, hdf5): + # if only_include is not specified, we always return True + # because we load everything + if self.only_include is None: + return True + # see if the current name is in any of the included variables + for s in self.only_include: + if s in hdf5.name: + self._found_include_var[s] = True + if s in hdf5.name or hdf5.name in s: + return True + return False + + + def mat2dict(self, hdf5): + if '#refs#' in hdf5: self.refs = hdf5['#refs#'] d = self._dict_class() for var in hdf5: + # this first loop is just here to catch the refs and subsystem vars if var in ['#refs#','#subsystem#']: continue ext = 
os.path.splitext(hdf5.filename)[1].lower() if ext.lower()=='.mat': - # if hdf5 + if not self.is_included(hdf5[var]): + continue d[var] = self.unpack_mat(hdf5[var]) elif ext=='.h5' or ext=='.hdf5': err = 'Can only load .mat. Please use package hdfdict instead'\ @@ -63,6 +96,14 @@ def mat2dict(self, hdf5, only_load=None): raise NotImplementedError(err) else: raise ValueError('can only unpack .mat') + if self.only_include is not None: + for var, found in self._found_include_var.items(): + if not found: + logging.warn(f'Variable "{var}" was specified to be loaded'\ + ' but could not be found.') + if not any(list(self._found_include_var.values())): + print(hdf5.filename, 'contains the following vars:') + hdf5.visit(print_tree) return d # @profile @@ -80,6 +121,8 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None): for key in hdf5: elem = hdf5[key] + if not self.is_included(elem): + continue if 'MATLAB_class' in elem.attrs: MATLAB_class = elem.attrs.get('MATLAB_class') if MATLAB_class is not None: @@ -115,7 +158,8 @@ def unpack_mat(self, hdf5, depth=0, MATLAB_class=None): return d elif isinstance(hdf5, h5py._hl.dataset.Dataset): - return self.convert_mat(hdf5, depth, MATLAB_class=MATLAB_class) + if self.is_included(hdf5): + return self.convert_mat(hdf5, depth, MATLAB_class=MATLAB_class) else: raise Exception(f'Unknown hdf5 type: {key}:{type(hdf5)}') @@ -225,7 +269,7 @@ def convert_mat(self, dataset, depth, MATLAB_class=None): return None -def loadmat(filename, use_attrdict=False, only_load=None, verbose=True): +def loadmat(filename, use_attrdict=False, only_include=None, verbose=True): """ Loads a MATLAB 7.3 .mat file, which is actually a HDF5 file with some custom matlab annotations inside @@ -237,14 +281,19 @@ def loadmat(filename, use_attrdict=False, only_load=None, verbose=True): e.g. keys(), pop(), ... 
these will still be available by struct['keys'] :param verbose: print warnings - :param only_load: A list of HDF5 paths that should be loaded + :param only_include: A list of HDF5 paths that should be loaded. + this can greatly reduce loading times. If a path + contains further sub datasets, these will be loaded + as well, e.g. 'struct/' will load all subvars of + struct, 'struct/var' will load only ['struct']['var'] :returns: A dictionary with the matlab variables loaded """ assert os.path.isfile(filename), '{} does not exist'.format(filename) - decoder = HDF5Decoder(verbose=verbose, use_attrdict=use_attrdict) + decoder = HDF5Decoder(verbose=verbose, use_attrdict=use_attrdict, + only_include=only_include) try: with h5py.File(filename, 'r') as hdf5: - dictionary = decoder.mat2dict(hdf5, only_load=only_load) + dictionary = decoder.mat2dict(hdf5) return dictionary except OSError: raise TypeError('{} is not a MATLAB 7.3 file. '\ diff --git a/tests/test_mat73.py b/tests/test_mat73.py index f6ee6ad..b239018 100644 --- a/tests/test_mat73.py +++ b/tests/test_mat73.py @@ -337,7 +337,29 @@ def test_file6_empty_cell_array(self): def test_file7_empty_cell_array(self): data = mat73.loadmat(self.testfile7) - + def test_load_specific_vars(self): + for key in ['keys', ['keys']]: + data = mat73.loadmat(self.testfile1, only_include=key) + assert len(data)==1 + assert data['keys']=='must_not_overwrite' + + with self.assertWarns(Warning): + data = mat73.loadmat(self.testfile1, only_include='notpresent') + + data = mat73.loadmat(self.testfile1, only_include=['data', 'keys']) + assert len(data)==2 + assert len(data['data'])==29 + assert len(data['data']['cell_'])==7 + + # check if loading times are faster, should be the case. + start = time.time() + data = mat73.loadmat(self.testfile4) + elapsed1 = time.time()-start + start = time.time() + data = mat73.loadmat(self.testfile4, only_include='Res/HRV/Param/use_custom_print_command') + elapsed2 = time.time()-start + assert elapsed2