From 44842fdb6195721678f4c9428b2043a6e3c103a5 Mon Sep 17 00:00:00 2001 From: Eric Engle Date: Tue, 23 Jan 2024 09:40:03 -0500 Subject: [PATCH] Cleanup of index logic The indexing code has been cleaned up significantly. The index dictionary now contains offsets and sizes for all sections, for all messages. This will help for a future change to allow for in-place modification of GRIB2 metdata back into the input file -- similar to how you can edit NetCDF attributes with mode = 'a'. The new code logic allows for a simplified process for managing GRIB2 messages that contain submessages. Additionally with this commit, grib2io can now safely ignore GRIB1 messages. ECMWF GRIB files can contains GRIB1 and GRIB2 (...ugh...). --- grib2io/_grib2io.py | 256 +++++++++++++++++++------------------------- 1 file changed, 113 insertions(+), 143 deletions(-) diff --git a/grib2io/_grib2io.py b/grib2io/_grib2io.py index decb2bb..37e1271 100644 --- a/grib2io/_grib2io.py +++ b/grib2io/_grib2io.py @@ -29,6 +29,7 @@ import re import struct import sys +import warnings from dataclasses import dataclass, field from numpy import ma @@ -199,23 +200,23 @@ def _build_index(self, no_data=False): """ # Initialize index dictionary if not self._hasindex: - self._index['offset'] = [] - self._index['bitmap_offset'] = [] - self._index['data_offset'] = [] - self._index['size'] = [] - self._index['data_size'] = [] - self._index['submessageOffset'] = [] - self._index['submessageBeginSection'] = [] - self._index['isSubmessage'] = [] - self._index['messageNumber'] = [] + self._index['sectionOffset'] = [] + self._index['sectionSize'] = [] + self._index['msgSize'] = [] + self._index['msgNumber'] = [] self._index['msg'] = [] self._hasindex = True # Iterate while True: try: - # Read first 4 bytes and decode...looking for "GRIB" + # Initialize + bmapflag = None pos = self._filehandle.tell() + section2 = b'' + trailer = b'' + _secpos = dict.fromkeys(range(8)) + _secsize = dict.fromkeys(range(8)) # Ignore headers (usually text) that are not part of the GRIB2 # file. For example, NAVGEM files have a http header at the @@ -233,149 +234,118 @@ def _build_index(self, no_data=False): pos = pos + test_offset break - # Test header. Then get information from GRIB2 Section 0: the discipline - # number, edition number (should always be 2), and GRIB2 message size. - # Then iterate to check for submessages. - - _issubmessage = False - _submsgoffset = 0 - _submsgbegin = 0 - _bmapflag = None - # Read the rest of Section 0 using struct. - section0 = np.concatenate(([header],list(struct.unpack('>HBBQ',self._filehandle.read(12)))),dtype=np.int64) + _secpos[0] = self._filehandle.tell()-4 + _secsize[0] = 16 + secmsg = self._filehandle.read(12) + section0 = np.concatenate(([header],list(struct.unpack('>HBBQ',secmsg))),dtype=np.int64) + # Make sure message is GRIB2. if section0[3] != 2: - raise ValueError( - "This is a GRIB version 1 file. grib2io only supports GRIB version 2." - ) - - # Read and unpack Section 1 - secsize = struct.unpack('>i',self._filehandle.read(4))[0] - secnum = struct.unpack('>B',self._filehandle.read(1))[0] - assert secnum == 1 - self._filehandle.seek(self._filehandle.tell()-5) - _grbmsg = self._filehandle.read(secsize) - _grbpos = 0 - section1,_grbpos = g2clib.unpack1(_grbmsg,_grbpos,np.empty) - secrange = range(2,8) + # Check for GRIB1 and ignore. + if secmsg[3] == 1: + warnings.warn("GRIB version 1 message detected. Ignoring...") + grib1_size = int.from_bytes(secmsg[:3],"big") + self._filehandle.seek(self._filehandle.tell()+grib1_size-16) + continue + else: + raise ValueError("Bad GRIB version number.") + + # Read and unpack sections 1 through 8 which all follow a pattern of + # section size, number, and content. while 1: - section2 = b'' - for num in secrange: - secsize = struct.unpack('>i',self._filehandle.read(4))[0] - secnum = struct.unpack('>B',self._filehandle.read(1))[0] - if secnum == num: - if secnum == 2: - if secsize > 0: - section2 = self._filehandle.read(secsize-5) - elif secnum == 3: - self._filehandle.seek(self._filehandle.tell()-5) - _grbmsg = self._filehandle.read(secsize) - _grbpos = 0 - # Unpack Section 3 - _gds,_gdt,_deflist,_grbpos = g2clib.unpack3(_grbmsg,_grbpos,np.empty) - _gds = _gds.tolist() - _gdt = _gdt.tolist() - section3 = np.concatenate((_gds,_gdt)) - section3 = np.where(section3==4294967295,-1,section3) - elif secnum == 4: - self._filehandle.seek(self._filehandle.tell()-5) - _grbmsg = self._filehandle.read(secsize) - _grbpos = 0 - # Unpack Section 4 - _numcoord,_pdt,_pdtnum,_coordlist,_grbpos = g2clib.unpack4(_grbmsg,_grbpos,np.empty) - _pdt = _pdt.tolist() - section4 = np.concatenate((np.array((_numcoord,_pdtnum)),_pdt)) - elif secnum == 5: - self._filehandle.seek(self._filehandle.tell()-5) - _grbmsg = self._filehandle.read(secsize) - _grbpos = 0 - # Unpack Section 5 - _drt,_drtn,_npts,self._pos = g2clib.unpack5(_grbmsg,_grbpos,np.empty) - section5 = np.concatenate((np.array((_npts,_drtn)),_drt)) - section5 = np.where(section5==4294967295,-1,section5) - elif secnum == 6: - # Unpack Section 6. Not really...just get the flag value. - _bmapflag = struct.unpack('>B',self._filehandle.read(1))[0] - if _bmapflag == 0: - _bmappos = self._filehandle.tell()-6 - elif _bmapflag == 254: - pass # Do this to keep the previous position value - else: - _bmappos = None - self._filehandle.seek(self._filehandle.tell()+secsize-6) - elif secnum == 7: - # Unpack Section 7. No need to read it, just index the position in file. - _datapos = self._filehandle.tell()-5 - _datasize = secsize - self._filehandle.seek(self._filehandle.tell()+secsize-5) - else: - self._filehandle.seek(self._filehandle.tell()+secsize-5) + # Read first 5 bytes of the section which contains the size of the + # section (4 bytes) and the section number (1 byte). + secmsg = self._filehandle.read(5) + secsize, secnum = struct.unpack('>iB',secmsg) + + # Record the offset of the section number and "append" the rest of + # the section to secmsg. + _secpos[secnum] = self._filehandle.tell()-5 + _secsize[secnum] = secsize + if secnum in {1,3,4,5}: + secmsg += self._filehandle.read(secsize-5) + grbpos = 0 + + # Unpack section + if secnum == 1: + # Unpack Section 1 + section1, grbpos = g2clib.unpack1(secmsg,grbpos,np.empty) + elif secnum == 2: + # Unpack Section 2 + section2 = self._filehandle.read(secsize-5) + elif secnum == 3: + # Unpack Section 3 + gds, gdt, deflist, grbpos = g2clib.unpack3(secmsg,grbpos,np.empty) + gds = gds.tolist() + gdt = gdt.tolist() + section3 = np.concatenate((gds,gdt)) + section3 = np.where(section3==4294967295,-1,section3) + elif secnum == 4: + # Unpack Section 4 + numcoord, pdt, pdtnum, coordlist, grbpos = g2clib.unpack4(secmsg,grbpos,np.empty) + pdt = pdt.tolist() + section4 = np.concatenate((np.array((numcoord,pdtnum)),pdt)) + elif secnum == 5: + # Unpack Section 5 + drt, drtn, npts, self._pos = g2clib.unpack5(secmsg,grbpos,np.empty) + section5 = np.concatenate((np.array((npts,drtn)),drt)) + section5 = np.where(section5==4294967295,-1,section5) + elif secnum == 6: + # Unpack Section 6 - Just the bitmap flag + bmapflag = struct.unpack('>B',self._filehandle.read(1))[0] + if bmapflag == 0: + bmappos = self._filehandle.tell()-6 + elif bmapflag == 254: + # Do this to keep the previous position value + pass else: - if num == 2 and secnum == 3: - pass # Allow this. Just means no Local Use Section. - else: - _issubmessage = True - _submsgoffset = (self._filehandle.tell()-5)-(self._index['offset'][-1]) - _submsgbegin = secnum - self._filehandle.seek(self._filehandle.tell()-5) - continue - trailer = struct.unpack('>4s',self._filehandle.read(4))[0] - if trailer == b'7777': + bmappos = None + self._filehandle.seek(self._filehandle.tell()+secsize-6) + elif secnum == 7: + # Do not unpack section 7 here, but move the file pointer + # to after section 7. + self._filehandle.seek(self._filehandle.tell()+secsize-5) + + # Update the file index. self.messages += 1 - self._index['offset'].append(pos) - self._index['bitmap_offset'].append(_bmappos) - self._index['data_offset'].append(_datapos) - self._index['size'].append(section0[-1]) - self._index['data_size'].append(_datasize) - self._index['messageNumber'].append(self.messages) - self._index['isSubmessage'].append(_issubmessage) - if _issubmessage: - self._index['submessageOffset'].append(_submsgoffset) - self._index['submessageBeginSection'].append(_submsgbegin) - else: - self._index['submessageOffset'].append(0) - self._index['submessageBeginSection'].append(_submsgbegin) + self._index['sectionOffset'].append(_secpos) + self._index['sectionSize'].append(_secsize) + self._index['msgSize'].append(section0[-1]) + self._index['msgNumber'].append(self.messages) # Create Grib2Message with data. - msg = Grib2Message(section0,section1,section2,section3,section4,section5,_bmapflag) + msg = Grib2Message(section0,section1,section2,section3,section4,section5,bmapflag) msg._msgnum = self.messages-1 - msg._deflist = _deflist - msg._coordlist = _coordlist + msg._deflist = deflist + msg._coordlist = coordlist if not no_data: msg._data = Grib2MessageOnDiskArray((msg.ny,msg.nx), 2, TYPE_OF_VALUES_DTYPE[msg.typeOfValues], self._filehandle, - msg, pos, _bmappos, _datapos) + msg, pos, _secpos[6], _secpos[7]) self._index['msg'].append(msg) - break - else: - self._filehandle.seek(self._filehandle.tell()-4) - self.messages += 1 - self._index['offset'].append(pos) - self._index['bitmap_offset'].append(_bmappos) - self._index['data_offset'].append(_datapos) - self._index['size'].append(section0[-1]) - self._index['data_size'].append(_datasize) - self._index['messageNumber'].append(self.messages) - self._index['isSubmessage'].append(_issubmessage) - self._index['submessageOffset'].append(_submsgoffset) - self._index['submessageBeginSection'].append(_submsgbegin) - - # Create Grib2Message with data. - msg = Grib2Message(section0,section1,section2,section3,section4,section5,_bmapflag) - msg._msgnum = self.messages-1 - msg._deflist = _deflist - msg._coordlist = _coordlist - if not no_data: - msg._data = Grib2MessageOnDiskArray((msg.ny,msg.nx), 2, - TYPE_OF_VALUES_DTYPE[msg.typeOfValues], - self._filehandle, - msg, pos, _bmappos, _datapos) - self._index['msg'].append(msg) + # If here, then we have moved through GRIB2 section 1-7. Now we + # need to check the next 4 bytes after section 7. + trailer = struct.unpack('>i',self._filehandle.read(4))[0] - continue + # If we reach the GRIB2 trailer string ('7777'), then we can break + # begin processing the next GRIB2 message. If not, then we continue + # within the same iteration to process a GRIB2 submessage. + if trailer.to_bytes(4, "big") == b'7777': + break + else: + # If here, trailer should be the size of the first section + # of the next submessage, then the next byte is the section + # number. Check this value. + nextsec = struct.unpack('>B',self._filehandle.read(1))[0] + if nextsec not in {2,3,4}: + raise ValueError("Bad GRIB2 message structure.") + self._filehandle.seek(self._filehandle.tell()-5) + continue + else: + raise ValueError("Bad GRIB2 section number.") except(struct.error): if 'r' in self.mode: @@ -445,7 +415,7 @@ def seek(self, pos): The GRIB2 Message number to set the file pointer to. """ if self._hasindex: - self._filehandle.seek(self._index['offset'][pos]) + self._filehandle.seek(self._index['sectionOffset'][0][pos]) self.current_message = pos @@ -1225,14 +1195,14 @@ class Grib2MessageOnDiskArray: filehandle: open msg: Grib2Message offset: int - bmap_offset: int + bitmap_offset: int data_offset: int def __array__(self, dtype=None): - return np.asarray(_data(self.filehandle, self.msg, self.bmap_offset, self.data_offset),dtype=dtype) + return np.asarray(_data(self.filehandle, self.msg, self.bitmap_offset, self.data_offset),dtype=dtype) -def _data(filehandle: open, msg: Grib2Message, bmap_offset: int, data_offset: int)-> np.array: +def _data(filehandle: open, msg: Grib2Message, bitmap_offset: int, data_offset: int)-> np.array: """ Returns an unpacked data grid. @@ -1260,8 +1230,8 @@ def _data(filehandle: open, msg: Grib2Message, bmap_offset: int, data_offset: in fill_value = np.nan # Read bitmap data. - if bmap_offset is not None: - filehandle.seek(bmap_offset) # Position file pointer to the beginning of bitmap section. + if bitmap_offset is not None: + filehandle.seek(bitmap_offset) # Position file pointer to the beginning of bitmap section. bmap_size,num = struct.unpack('>IB',filehandle.read(5)) filehandle.seek(filehandle.tell()-5) ipos = 0