Merge pull request #228 from texadactyl/master

Fix hdf_reader.py and create utility h5diag - issue #226
UCBerkeleySETI · Aug 10, 2021 · c310179 · c310179
2 parents 7c4373f + 7e2fb41
commit c310179
Show file tree

Hide file tree

Showing 5 changed files with 157 additions and 62 deletions.
diff --git a/VERSION-HISTORY.md b/VERSION-HISTORY.md
@@ -3,6 +3,7 @@ This file is a version history of turbo_seti amendments, beginning with version
 <br>
 |    Date    | Version | Contents |
 | :--: | :--: | :-- |
+| 2021-08-10 | 2.0.22 | Non/broken HDF5 input files need better diagnosis (Issue #226).  |
 | 2021-08-07 | 2.0.21 | New signal_processing source file, "dedoppler.py" (discussion in PR #220).  |
 | 2021-08-06 | 2.0.20 | New utility, "stix" (Issue #221).  |
 | 2021-07-28 | 2.0.19 | Update fil2h5 to handle YUGE data matrixes.  |

diff --git a/blimpy/__init__.py b/blimpy/__init__.py
@@ -7,6 +7,7 @@
     from . import utils
     from . import fil2h5
     from . import h52fil
+    from . import h5diag
     from . import bl_scrunch
     from . import calcload
     from . import rawhdr

diff --git a/blimpy/h5diag.py b/blimpy/h5diag.py
@@ -0,0 +1,78 @@
+r""" h5diag """
+
+import os
+import sys
+from argparse import ArgumentParser
+import h5py
+from astropy.coordinates import Angle
+
+
+def oops(msg):
+    print("\n*** h5diag: {}\n".format(msg))
+    sys.exit(86)
+
+
+def read_header(h5):
+    """ Read header and return a Python dictionary of key:value pairs
+    """
+
+    header = {} # Initialise as a nil dictionary.
+
+    for key, val in h5["data"].attrs.items():
+        #if six.PY3:
+        #    key = bytes(key, "ascii")
+        if isinstance(val, bytes):
+            val = val.decode("ascii")
+        if key == "src_raj":
+            header[key] = Angle(val, unit="hr")
+        elif key == "src_dej":
+            header[key] = Angle(val, unit="deg")
+        else:
+            header[key] = val
+
+    return header
+
+
+def examine(filename):
+    r""" Diagnose the given HDF5 file"""
+    h5 = h5py.File(filename, mode="r")
+    if "CLASS" in h5.attrs:
+        classstr = h5.attrs["CLASS"]
+    else:
+        oops("CLASS attribute missing")
+    if classstr != "FILTERBANK":
+        oops("Expected CLASS attribute to be 'FILTERBANK' but saw '{}'".format(classstr))
+    if "VERSION" in h5.attrs:
+        versionstr = h5.attrs["VERSION"]
+    else:
+        oops("VERSION attribute missing")
+    print("VERSION is ", versionstr)
+    header = read_header(h5)
+    print("Header:", header)
+    if not "data" in h5:
+        oops("data attribute missing")
+    if h5["data"].ndim != 3:
+        oops("Expected data.ndim to be 3 but saw '{}'".format(h5["data"].ndim))
+    print("h5diag: data shape:", h5["data"].shape)
+
+
+def cmd_tool(args=None):
+    """ Command line tool h5diag """
+    parser = ArgumentParser(description="Command line utility for diagnosing HDF5 files.")
+    parser.add_argument("filename", type=str, help="Path of file to read")
+    if args is None:
+        args = sys.argv[1:]
+    parse_args = parser.parse_args(args)
+
+    if not os.path.isfile(parse_args.filename):
+        oops("Not a file: {}".format(parse_args.filename))
+    if not h5py.is_hdf5(parse_args.filename):
+        oops("Not an HDF5 file: {}".format(parse_args.filename))
+
+    print("h5diag: Begin")
+    examine(parse_args.filename)
+    print("h5diag: No errors detected")
+
+
+if __name__ == "__main__":
+    cmd_tool()
diff --git a/blimpy/io/hdf_reader.py b/blimpy/io/hdf_reader.py
@@ -16,7 +16,7 @@ def __init__(self, filename, f_start=None, f_stop=None, t_start=None, t_stop=Non
         """ Constructor.
 
         Args:
-            filename (str): filename of blimpy file.
+            filename (str): file path of HDF5 file.
             f_start (float): start frequency, in MHz
             f_stop (float): stop frequency, in MHz
             t_start (int): start time bin
@@ -25,74 +25,88 @@ def __init__(self, filename, f_start=None, f_stop=None, t_start=None, t_stop=Non
         """
         super(H5Reader, self).__init__()
 
-        if filename and os.path.isfile(filename) and h5py.is_hdf5(filename):
-
-            #These values may be modified once code for multi_beam and multi_stokes observations are possible.
-            self.freq_axis = 2
-            self.time_axis = 0
-            self.beam_axis = 1  # Place holder
-            self.stokes_axis = 4  # Place holder
-
-            self.filename = filename
-            self.filestat = os.stat(filename)
-            self.filesize = self.filestat.st_size/(1024.0**2)
-            self.load_data = load_data
-            self.h5 = h5py.File(self.filename, mode='r')
-            self.read_header()
-            self.file_size_bytes = os.path.getsize(self.filename)  # In bytes
-            self.n_ints_in_file = self.h5["data"].shape[self.time_axis] #
-            self.n_channels_in_file = self.h5["data"].shape[self.freq_axis] #
-            self.n_beams_in_file = self.header['nifs'] #Placeholder for future development.
-            self.n_pols_in_file = 1 #Placeholder for future development.
-            self._n_bytes = int(self.header['nbits'] / 8)  #number of bytes per digit.
-            self._d_type = self._setup_dtype()
-            self.file_shape = (self.n_ints_in_file,self.n_beams_in_file,self.n_channels_in_file)
-
-            if self.header['foff'] < 0:
-                self.f_end = self.header['fch1']
-                self.f_begin = self.f_end + self.n_channels_in_file*self.header['foff']
-            else:
-                self.f_begin = self.header['fch1']
-                self.f_end = self.f_begin + self.n_channels_in_file*self.header['foff']
+        if not os.path.isfile(filename):
+            raise IOError("Not a file: {}".format(filename))
+        if not h5py.is_hdf5(filename):
+            raise IOError("Not an HDF5 file: {}".format(filename))
+
+        #These values may be modified once code for multi_beam and multi_stokes observations are possible.
+        self.freq_axis = 2
+        self.time_axis = 0
+        self.beam_axis = 1  # Place holder
+        self.stokes_axis = 4  # Place holder
+
+        self.filename = filename
+        self.filestat = os.stat(filename)
+        self.filesize = self.filestat.st_size/(1024.0**2)
+        self.load_data = load_data
+        self.h5 = h5py.File(self.filename, mode='r')
+        if "CLASS" in self.h5.attrs:
+            classstr = self.h5.attrs["CLASS"]
+        else:
+            raise ValueError("CLASS attribute missing")
+        if classstr != "FILTERBANK":
+            raise ValueError("Expected CLASS attribute to be 'FILTERBANK' but saw '{}'".format(classstr))
+        if not "VERSION" in self.h5.attrs:
+            raise ValueError("VERSION attribute missing")
+        if not "data" in self.h5:
+            raise ValueError("data attribute missing")
+        if self.h5["data"].ndim != 3:
+            raise ValueError("Expected data.ndim to be 3 but saw '{}'".format(self.h5["data"].ndim))
+        self.read_header()
+        self.file_size_bytes = os.path.getsize(self.filename)  # In bytes
+        self.n_ints_in_file = self.h5["data"].shape[self.time_axis] #
+        self.n_channels_in_file = self.h5["data"].shape[self.freq_axis] #
+        self.n_beams_in_file = self.header['nifs'] #Placeholder for future development.
+        self.n_pols_in_file = 1 #Placeholder for future development.
+        self._n_bytes = int(self.header['nbits'] / 8)  #number of bytes per digit.
+        self._d_type = self._setup_dtype()
+        self.file_shape = (self.n_ints_in_file,self.n_beams_in_file,self.n_channels_in_file)
+
+        if self.header['foff'] < 0:
+            self.f_end = self.header['fch1']
+            self.f_begin = self.f_end + self.n_channels_in_file*self.header['foff']
+        else:
+            self.f_begin = self.header['fch1']
+            self.f_end = self.f_begin + self.n_channels_in_file*self.header['foff']
 
-            self.t_begin = 0
-            self.t_end = self.n_ints_in_file
+        self.t_begin = 0
+        self.t_end = self.n_ints_in_file
 
-            #Taking care all the frequencies are assigned correctly.
-            self._setup_selection_range(f_start=f_start, f_stop=f_stop, t_start=t_start, t_stop=t_stop, init=True)
-            #Convert input frequencies into what their corresponding channel number would be.
-            self._setup_chans()
-            #Update frequencies ranges from channel number.
-            self._setup_freqs()
+        #Taking care all the frequencies are assigned correctly.
+        self._setup_selection_range(f_start=f_start, f_stop=f_stop, t_start=t_start, t_stop=t_stop, init=True)
+        #Convert input frequencies into what their corresponding channel number would be.
+        self._setup_chans()
+        #Update frequencies ranges from channel number.
+        self._setup_freqs()
 
-            #Applying data size limit to load.
-            if max_load is not None and max_load > 0:
-                self.max_data_array_size = max_load * GIGA
+        #Applying data size limit to load.
+        if max_load is not None and max_load > 0:
+            self.max_data_array_size = max_load * GIGA
 
-            if self.file_size_bytes > self.max_data_array_size:
-                self.large_file = True
-            else:
-                self.large_file = False
-
-            if self.load_data:
-                if self.large_file:
-                    # Only checking the selection, if the file is too large.
-                    if self.f_start or self.f_stop or self.t_start or self.t_stop:
-                        if self.isheavy():
-                            self.warn_memory("Selection", self._calc_selection_size())
-                            self._init_empty_selection()
-                        else:
-                            self.read_data()
-                    else:
-                        self.warn_memory("File", self.file_size_bytes)
+        if self.file_size_bytes > self.max_data_array_size:
+            self.large_file = True
+        else:
+            self.large_file = False
+
+        if self.load_data:
+            if self.large_file:
+                # Only checking the selection, if the file is too large.
+                if self.f_start or self.f_stop or self.t_start or self.t_stop:
+                    if self.isheavy():
+                        self.warn_memory("Selection", self._calc_selection_size())
                         self._init_empty_selection()
+                    else:
+                        self.read_data()
                 else:
-                    self.read_data()
+                    self.warn_memory("File", self.file_size_bytes)
+                    self._init_empty_selection()
             else:
-                logger.debug("Skipping loading data ...")
-                self._init_empty_selection()
+                self.read_data()
         else:
-            raise IOError("Need a file to open, please give me one!")
+            logger.debug("Skipping loading data ...")
+            self._init_empty_selection()
+
 
     def read_header(self):
         """ Read header and return a Python dictionary of key:value pairs

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 """
 from setuptools import setup, find_packages
 
-__version__ = '2.0.21'
+__version__ = '2.0.22'
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
@@ -16,6 +16,7 @@
         'rawutil = blimpy.guppi:cmd_tool',
         'fil2h5 = blimpy.fil2h5:cmd_tool',
         'h52fil = blimpy.h52fil:cmd_tool',
+        'h5diag = blimpy.h5diag:cmd_tool',
         'bl_scrunch = blimpy.bl_scrunch:cmd_tool',
         'matchfils = blimpy.match_fils:cmd_tool',
         'bldice = blimpy.dice:cmd_tool',