Skip to content

Commit

Permalink
Read labels from new Vault CSV files. Fixes #420
Browse files (browse the repository at this point in the history)
  • Loading branch information
jay0lee committed Jan 18, 2024
1 parent 59fd57b commit cfc587d
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 17 deletions.
6 changes: 4 additions & 2 deletions fmbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@ def get_header(self, header, case_insensitive=False):
check_folded_header = False
for line in self.msg_bytes.split(b'\n'):
if case_insensitive:
line = line.lower()
search_line = line.lower()
else:
search_line = line
if check_folded_header:
if line.startswith(b' ') or line.startswith(b'\t'):
header_value += line.lstrip()
else:
return header_value.decode()
elif line.startswith(b'%s: ' % header):
elif search_line.startswith(b'%s: ' % header):
header_value = line[len(header)+2:]
check_folded_header = True
elif line == '':
Expand Down
45 changes: 30 additions & 15 deletions gyb.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
__program_name__ = 'Got Your Back: Gmail Backup'
__author__ = 'Jay Lee'
__email__ = '[email protected]'
__version__ = '1.74'
__version__ = '1.80'
__license__ = 'Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0)'
__website__ = 'jaylee.us/gyb'
__db_schema_version__ = '6'
Expand All @@ -44,6 +44,7 @@
mbox_extensions = ['mbx', 'mbox', 'eml']

import argparse
from csv import DictReader
import importlib
from io import BytesIO
import sys
Expand Down Expand Up @@ -2343,28 +2344,38 @@ def main(argv):
current_batch_bytes = 5000
gbatch = gmail.new_batch_http_request()
max_batch_bytes = 8 * 1024 * 1024
# Look for Google Vault XML metadata which contains message labels map
# Look for Google Vault XML and CSV metadata files, which contain the message labels map
vault_label_map = {}
vault_csv_label_map = {}
if not options.strip_labels:
for path, subdirs, files in os.walk(options.local_folder):
for filename in files:
if filename[-4:].lower() != '.xml':
file_suffix = filename[-4:].lower()
if file_suffix not in ['.xml', '.csv']:
continue
file_path = os.path.join(path, filename)
print("\nReading Vault labels from %s file %s" % (humansize(file_path), file_path))
print("large files may take some time to read...")
for _, elem in etree.iterparse(file_path, events=('end',)):
if elem.tag == 'Document':
labels = ''
fileid = None
for tag in elem.iter('Tag'):
if tag.attrib['TagName'] == 'Labels':
labels = tag.attrib.get('TagValue', '')
for file in elem.iter('ExternalFile'):
fileid = file.attrib.get('FileName', None)
if fileid and labels:
vault_label_map[fileid] = labels
elem.clear() # keep memory usage down on very large files
elif file_suffix == '.xml':
for _, elem in etree.iterparse(file_path, events=('end',)):
if elem.tag == 'Document':
labels = ''
fileid = None
for tag in elem.iter('Tag'):
if tag.attrib['TagName'] == 'Labels':
labels = tag.attrib.get('TagValue', '')
for file in elem.iter('ExternalFile'):
fileid = file.attrib.get('FileName', None)
if fileid and labels:
vault_label_map[fileid] = labels
elem.clear() # keep memory usage down on very large files
elif file_suffix == '.csv':
with open(file_path, 'r') as f:
csv_data = DictReader(f)
for row in csv_data:
msg_id = row.get('Rfc822MessageId')
if msg_id:
vault_csv_label_map[msg_id] = row.get('Labels', '')
# Look for and restore mbox files
for path, subdirs, files in os.walk(options.local_folder):
for filename in files:
Expand Down Expand Up @@ -2399,6 +2410,10 @@ def main(argv):
mbox_from = message.get_from()
mbox_fileid = mbox_from.split('@')[0]
labels_str = vault_label_map.get(mbox_fileid, '')
elif vault_csv_label_map:
# message id (minus < and >)
msg_id = message.get_header(b'message-id', case_insensitive=True)[1:-2]
labels_str = vault_csv_label_map.get(msg_id, '')
else:
labels_str = message.get_header(b'X-Gmail-Labels')
mybytes, encoding = email.header.decode_header(labels_str)[0]
Expand Down

0 comments on commit cfc587d

Please sign in to comment.