Skip to content

Commit

Permalink
Cleanup of index logic
Browse files Browse the repository at this point in the history
The indexing code has been cleaned up significantly.  The index
dictionary now contains offsets and sizes for all sections, for all
messages.  This will help for a future change to allow for in-place
modification of GRIB2 metdata back into the input file -- similar to how
you can edit NetCDF attributes with mode = 'a'.

The new code logic allows for a simplified process for managing GRIB2
messages that contain submessages.

Additionally with this commit, grib2io can now safely ignore GRIB1
messages. ECMWF GRIB files can contains GRIB1 and GRIB2 (...ugh...).
  • Loading branch information
EricEngle-NOAA committed Jan 23, 2024
1 parent 2d4d773 commit 44842fd
Showing 1 changed file with 113 additions and 143 deletions.
256 changes: 113 additions & 143 deletions grib2io/_grib2io.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import re
import struct
import sys
import warnings

from dataclasses import dataclass, field
from numpy import ma
Expand Down Expand Up @@ -199,23 +200,23 @@ def _build_index(self, no_data=False):
"""
# Initialize index dictionary
if not self._hasindex:
self._index['offset'] = []
self._index['bitmap_offset'] = []
self._index['data_offset'] = []
self._index['size'] = []
self._index['data_size'] = []
self._index['submessageOffset'] = []
self._index['submessageBeginSection'] = []
self._index['isSubmessage'] = []
self._index['messageNumber'] = []
self._index['sectionOffset'] = []
self._index['sectionSize'] = []
self._index['msgSize'] = []
self._index['msgNumber'] = []
self._index['msg'] = []
self._hasindex = True

# Iterate
while True:
try:
# Read first 4 bytes and decode...looking for "GRIB"
# Initialize
bmapflag = None
pos = self._filehandle.tell()
section2 = b''
trailer = b''
_secpos = dict.fromkeys(range(8))
_secsize = dict.fromkeys(range(8))

# Ignore headers (usually text) that are not part of the GRIB2
# file. For example, NAVGEM files have a http header at the
Expand All @@ -233,149 +234,118 @@ def _build_index(self, no_data=False):
pos = pos + test_offset
break

# Test header. Then get information from GRIB2 Section 0: the discipline
# number, edition number (should always be 2), and GRIB2 message size.
# Then iterate to check for submessages.

_issubmessage = False
_submsgoffset = 0
_submsgbegin = 0
_bmapflag = None

# Read the rest of Section 0 using struct.
section0 = np.concatenate(([header],list(struct.unpack('>HBBQ',self._filehandle.read(12)))),dtype=np.int64)
_secpos[0] = self._filehandle.tell()-4
_secsize[0] = 16
secmsg = self._filehandle.read(12)
section0 = np.concatenate(([header],list(struct.unpack('>HBBQ',secmsg))),dtype=np.int64)

# Make sure message is GRIB2.
if section0[3] != 2:
raise ValueError(
"This is a GRIB version 1 file. grib2io only supports GRIB version 2."
)

# Read and unpack Section 1
secsize = struct.unpack('>i',self._filehandle.read(4))[0]
secnum = struct.unpack('>B',self._filehandle.read(1))[0]
assert secnum == 1
self._filehandle.seek(self._filehandle.tell()-5)
_grbmsg = self._filehandle.read(secsize)
_grbpos = 0
section1,_grbpos = g2clib.unpack1(_grbmsg,_grbpos,np.empty)
secrange = range(2,8)
# Check for GRIB1 and ignore.
if secmsg[3] == 1:
warnings.warn("GRIB version 1 message detected. Ignoring...")
grib1_size = int.from_bytes(secmsg[:3],"big")
self._filehandle.seek(self._filehandle.tell()+grib1_size-16)
continue
else:
raise ValueError("Bad GRIB version number.")

# Read and unpack sections 1 through 8 which all follow a pattern of
# section size, number, and content.
while 1:
section2 = b''
for num in secrange:
secsize = struct.unpack('>i',self._filehandle.read(4))[0]
secnum = struct.unpack('>B',self._filehandle.read(1))[0]
if secnum == num:
if secnum == 2:
if secsize > 0:
section2 = self._filehandle.read(secsize-5)
elif secnum == 3:
self._filehandle.seek(self._filehandle.tell()-5)
_grbmsg = self._filehandle.read(secsize)
_grbpos = 0
# Unpack Section 3
_gds,_gdt,_deflist,_grbpos = g2clib.unpack3(_grbmsg,_grbpos,np.empty)
_gds = _gds.tolist()
_gdt = _gdt.tolist()
section3 = np.concatenate((_gds,_gdt))
section3 = np.where(section3==4294967295,-1,section3)
elif secnum == 4:
self._filehandle.seek(self._filehandle.tell()-5)
_grbmsg = self._filehandle.read(secsize)
_grbpos = 0
# Unpack Section 4
_numcoord,_pdt,_pdtnum,_coordlist,_grbpos = g2clib.unpack4(_grbmsg,_grbpos,np.empty)
_pdt = _pdt.tolist()
section4 = np.concatenate((np.array((_numcoord,_pdtnum)),_pdt))
elif secnum == 5:
self._filehandle.seek(self._filehandle.tell()-5)
_grbmsg = self._filehandle.read(secsize)
_grbpos = 0
# Unpack Section 5
_drt,_drtn,_npts,self._pos = g2clib.unpack5(_grbmsg,_grbpos,np.empty)
section5 = np.concatenate((np.array((_npts,_drtn)),_drt))
section5 = np.where(section5==4294967295,-1,section5)
elif secnum == 6:
# Unpack Section 6. Not really...just get the flag value.
_bmapflag = struct.unpack('>B',self._filehandle.read(1))[0]
if _bmapflag == 0:
_bmappos = self._filehandle.tell()-6
elif _bmapflag == 254:
pass # Do this to keep the previous position value
else:
_bmappos = None
self._filehandle.seek(self._filehandle.tell()+secsize-6)
elif secnum == 7:
# Unpack Section 7. No need to read it, just index the position in file.
_datapos = self._filehandle.tell()-5
_datasize = secsize
self._filehandle.seek(self._filehandle.tell()+secsize-5)
else:
self._filehandle.seek(self._filehandle.tell()+secsize-5)
# Read first 5 bytes of the section which contains the size of the
# section (4 bytes) and the section number (1 byte).
secmsg = self._filehandle.read(5)
secsize, secnum = struct.unpack('>iB',secmsg)

# Record the offset of the section number and "append" the rest of
# the section to secmsg.
_secpos[secnum] = self._filehandle.tell()-5
_secsize[secnum] = secsize
if secnum in {1,3,4,5}:
secmsg += self._filehandle.read(secsize-5)
grbpos = 0

# Unpack section
if secnum == 1:
# Unpack Section 1
section1, grbpos = g2clib.unpack1(secmsg,grbpos,np.empty)
elif secnum == 2:
# Unpack Section 2
section2 = self._filehandle.read(secsize-5)
elif secnum == 3:
# Unpack Section 3
gds, gdt, deflist, grbpos = g2clib.unpack3(secmsg,grbpos,np.empty)
gds = gds.tolist()
gdt = gdt.tolist()
section3 = np.concatenate((gds,gdt))
section3 = np.where(section3==4294967295,-1,section3)
elif secnum == 4:
# Unpack Section 4
numcoord, pdt, pdtnum, coordlist, grbpos = g2clib.unpack4(secmsg,grbpos,np.empty)
pdt = pdt.tolist()
section4 = np.concatenate((np.array((numcoord,pdtnum)),pdt))
elif secnum == 5:
# Unpack Section 5
drt, drtn, npts, self._pos = g2clib.unpack5(secmsg,grbpos,np.empty)
section5 = np.concatenate((np.array((npts,drtn)),drt))
section5 = np.where(section5==4294967295,-1,section5)
elif secnum == 6:
# Unpack Section 6 - Just the bitmap flag
bmapflag = struct.unpack('>B',self._filehandle.read(1))[0]
if bmapflag == 0:
bmappos = self._filehandle.tell()-6
elif bmapflag == 254:
# Do this to keep the previous position value
pass
else:
if num == 2 and secnum == 3:
pass # Allow this. Just means no Local Use Section.
else:
_issubmessage = True
_submsgoffset = (self._filehandle.tell()-5)-(self._index['offset'][-1])
_submsgbegin = secnum
self._filehandle.seek(self._filehandle.tell()-5)
continue
trailer = struct.unpack('>4s',self._filehandle.read(4))[0]
if trailer == b'7777':
bmappos = None
self._filehandle.seek(self._filehandle.tell()+secsize-6)
elif secnum == 7:
# Do not unpack section 7 here, but move the file pointer
# to after section 7.
self._filehandle.seek(self._filehandle.tell()+secsize-5)

# Update the file index.
self.messages += 1
self._index['offset'].append(pos)
self._index['bitmap_offset'].append(_bmappos)
self._index['data_offset'].append(_datapos)
self._index['size'].append(section0[-1])
self._index['data_size'].append(_datasize)
self._index['messageNumber'].append(self.messages)
self._index['isSubmessage'].append(_issubmessage)
if _issubmessage:
self._index['submessageOffset'].append(_submsgoffset)
self._index['submessageBeginSection'].append(_submsgbegin)
else:
self._index['submessageOffset'].append(0)
self._index['submessageBeginSection'].append(_submsgbegin)
self._index['sectionOffset'].append(_secpos)
self._index['sectionSize'].append(_secsize)
self._index['msgSize'].append(section0[-1])
self._index['msgNumber'].append(self.messages)

# Create Grib2Message with data.
msg = Grib2Message(section0,section1,section2,section3,section4,section5,_bmapflag)
msg = Grib2Message(section0,section1,section2,section3,section4,section5,bmapflag)
msg._msgnum = self.messages-1
msg._deflist = _deflist
msg._coordlist = _coordlist
msg._deflist = deflist
msg._coordlist = coordlist
if not no_data:
msg._data = Grib2MessageOnDiskArray((msg.ny,msg.nx), 2,
TYPE_OF_VALUES_DTYPE[msg.typeOfValues],
self._filehandle,
msg, pos, _bmappos, _datapos)
msg, pos, _secpos[6], _secpos[7])
self._index['msg'].append(msg)

break
else:
self._filehandle.seek(self._filehandle.tell()-4)
self.messages += 1
self._index['offset'].append(pos)
self._index['bitmap_offset'].append(_bmappos)
self._index['data_offset'].append(_datapos)
self._index['size'].append(section0[-1])
self._index['data_size'].append(_datasize)
self._index['messageNumber'].append(self.messages)
self._index['isSubmessage'].append(_issubmessage)
self._index['submessageOffset'].append(_submsgoffset)
self._index['submessageBeginSection'].append(_submsgbegin)

# Create Grib2Message with data.
msg = Grib2Message(section0,section1,section2,section3,section4,section5,_bmapflag)
msg._msgnum = self.messages-1
msg._deflist = _deflist
msg._coordlist = _coordlist
if not no_data:
msg._data = Grib2MessageOnDiskArray((msg.ny,msg.nx), 2,
TYPE_OF_VALUES_DTYPE[msg.typeOfValues],
self._filehandle,
msg, pos, _bmappos, _datapos)
self._index['msg'].append(msg)
# If here, then we have moved through GRIB2 section 1-7. Now we
# need to check the next 4 bytes after section 7.
trailer = struct.unpack('>i',self._filehandle.read(4))[0]

continue
# If we reach the GRIB2 trailer string ('7777'), then we can break
# begin processing the next GRIB2 message. If not, then we continue
# within the same iteration to process a GRIB2 submessage.
if trailer.to_bytes(4, "big") == b'7777':
break
else:
# If here, trailer should be the size of the first section
# of the next submessage, then the next byte is the section
# number. Check this value.
nextsec = struct.unpack('>B',self._filehandle.read(1))[0]
if nextsec not in {2,3,4}:
raise ValueError("Bad GRIB2 message structure.")
self._filehandle.seek(self._filehandle.tell()-5)
continue
else:
raise ValueError("Bad GRIB2 section number.")

except(struct.error):
if 'r' in self.mode:
Expand Down Expand Up @@ -445,7 +415,7 @@ def seek(self, pos):
The GRIB2 Message number to set the file pointer to.
"""
if self._hasindex:
self._filehandle.seek(self._index['offset'][pos])
self._filehandle.seek(self._index['sectionOffset'][0][pos])
self.current_message = pos


Expand Down Expand Up @@ -1225,14 +1195,14 @@ class Grib2MessageOnDiskArray:
filehandle: open
msg: Grib2Message
offset: int
bmap_offset: int
bitmap_offset: int
data_offset: int

def __array__(self, dtype=None):
return np.asarray(_data(self.filehandle, self.msg, self.bmap_offset, self.data_offset),dtype=dtype)
return np.asarray(_data(self.filehandle, self.msg, self.bitmap_offset, self.data_offset),dtype=dtype)


def _data(filehandle: open, msg: Grib2Message, bmap_offset: int, data_offset: int)-> np.array:
def _data(filehandle: open, msg: Grib2Message, bitmap_offset: int, data_offset: int)-> np.array:
"""
Returns an unpacked data grid.
Expand Down Expand Up @@ -1260,8 +1230,8 @@ def _data(filehandle: open, msg: Grib2Message, bmap_offset: int, data_offset: in
fill_value = np.nan

# Read bitmap data.
if bmap_offset is not None:
filehandle.seek(bmap_offset) # Position file pointer to the beginning of bitmap section.
if bitmap_offset is not None:
filehandle.seek(bitmap_offset) # Position file pointer to the beginning of bitmap section.
bmap_size,num = struct.unpack('>IB',filehandle.read(5))
filehandle.seek(filehandle.tell()-5)
ipos = 0
Expand Down

0 comments on commit 44842fd

Please sign in to comment.