-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add email mentions to validphys index
Add a script to parse the emails, and find the mentions of validphys reports and associate report id with email url and title. Because there is no way to get an email URL from the email as received, we scan the HTML of the archives, by crawling over each message in each month. The script tries to remove links that are in quoted sections but that only works if these have already been parsed as a `blockquote` HTML element in the email archives. We use this information to create a link to the email, in the index page, by adding an email emoji link to each email. It could be used for other things such as displaying the email in the template. One annoying aspect is that this is an embarrassingly parallel task (we could be processing the emails while we are waiting for other emails to download), but I am hitting some bug I don't understand when trying to do this with curio and asks (theelous3/asks#118), so it will stay sequential for the moment. Because it is slow, we add a cache to remember already seen emails. At the moment index-emails needs to be run independently from index-reports (I run it once a day), but that may not be optimal.
- Loading branch information
Showing
3 changed files
with
137 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
EMAIL_BOT_PASSWORD |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import json | ||
import pickle | ||
from collections import defaultdict | ||
from urllib.parse import urljoin, urlparse | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
# Root of the private mailman archives for the ucam-nnpdf mailing list.
ARCHIVES_URL = 'https://lists.cam.ac.uk/mailman/private/ucam-nnpdf/'

# Credentials for the archive login form. The password lives in a separate
# file (PASSWORD_FILE) so it is not committed alongside the code.
USERNAME = '[email protected]'
PASSWORD_FILE = 'EMAIL_BOT_PASSWORD'
|
||
|
||
def make_login():
    """Build the POST payload for the mailman archive login form.

    The password is read from ``PASSWORD_FILE`` on every call, so updating
    that file takes effect without restarting the script.
    """
    with open(PASSWORD_FILE) as pwfile:
        secret = pwfile.read().strip()
    return {"password": secret, "username": USERNAME, "name": "submit"}
|
||
|
||
def make_soup(data):
    """Parse *data* into a BeautifulSoup tree.

    Uses the html5lib backend, which is lenient with the sometimes-messy
    HTML found in the mail archives.
    """
    soup = BeautifulSoup(data, features="html5lib")
    return soup
|
||
|
||
def get_archive_index(session):
    """Log in to the archives and return the front-page HTML.

    Posting the login form to the archive root both authenticates the
    session and returns the index page listing the monthly archives.
    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.post(ARCHIVES_URL, data=make_login())
    response.raise_for_status()
    return response.text
|
||
|
||
def get_thread_index(month_url, session):
    """Fetch and return the HTML of one month's thread index page.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(month_url)
    response.raise_for_status()
    return response.text
|
||
|
||
def parse_threads(archive_index):
    """Extract the absolute URL of each month's thread view.

    The archive front page has one ``[ Thread ]`` link per month; each is
    resolved against ``ARCHIVES_URL`` to get an absolute URL.
    """
    soup = make_soup(archive_index)
    month_urls = []
    for anchor in soup.find_all('a', string='[ Thread ]'):
        month_urls.append(urljoin(ARCHIVES_URL, anchor.attrs['href']))
    return month_urls
|
||
|
||
def parse_emails(thread_index, month_url):
    """Extract the absolute URL of every message in a month's thread index.

    Message links in the thread view carry both a ``name`` and an ``href``
    attribute, which distinguishes them from other anchors on the page.
    """
    soup = make_soup(thread_index)
    message_urls = []
    for anchor in soup.find_all('a', attrs={'name': True, 'href': True}):
        message_urls.append(urljoin(month_url, anchor.attrs['href']))
    return message_urls
|
||
|
||
def get_email(email_url, session):
    """Download and return the HTML of a single archived email.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(email_url)
    response.raise_for_status()
    return response.text
|
||
|
||
def parse_email(email, email_url):
    """Map report IDs mentioned in *email* to ``[email_url, title]``.

    Scans the email HTML for links pointing at ``https://vp.nnpdf.science/``
    and returns a dict ``{report_id: [email_url, title]}``, where the
    report id is the first path component of the link and *title* is the
    page title of the archived email. Links inside ``blockquote`` elements
    are skipped, since those are quoted text from earlier messages (this
    only works when the archive has rendered the quote as a blockquote).
    """
    res = {}

    def good_link(tag):
        if tag.name != 'a':
            return False
        # Bare named anchors (<a name=...>) have no href at all, so use
        # .get() rather than attrs['href'] to avoid a KeyError on them.
        href = tag.attrs.get('href', '')
        if not href.startswith('https://vp.nnpdf.science/'):
            return False
        # Skip links that live inside quoted sections.
        if any(parent.name == 'blockquote' for parent in tag.parents):
            return False
        return True

    soup = make_soup(email)
    # Guard against title-less documents: html5lib only produces a <title>
    # element when the source HTML has one.
    title = str(soup.title.string) if soup.title is not None else ''
    links = soup.body.find_all(good_link, recursive=True)
    for link in links:
        path = urlparse(link.attrs['href']).path
        fragments = path.split('/')
        # A path like '/<report_id>/...' splits to ['', '<report_id>', ...],
        # so the report id is the second fragment.
        if len(fragments) >= 2:
            res[fragments[1]] = [email_url, title]
    return res
|
||
|
||
if __name__ == '__main__':
    # Load previously collected mentions, if any, so runs are incremental.
    try:
        with open('email_mentions.json') as f:
            res = defaultdict(list, json.load(f))
    except FileNotFoundError:
        res = defaultdict(list)

    # Load the cache of already-processed months and emails, if present.
    try:
        with open('seen_emails_cache.pkl', 'rb') as f:
            cache = pickle.load(f)
        seen_months = cache['seen_months']
        seen_emails = cache['seen_emails']
    except FileNotFoundError:
        seen_months = set()
        seen_emails = set()

    session = requests.Session()
    front_page = get_archive_index(session=session)
    for month_index, month_url in enumerate(parse_threads(front_page)):
        if month_url in seen_months:
            continue
        # The most recent month (index 0) may still receive new emails,
        # so never mark it as fully seen.
        if month_index != 0:
            seen_months.add(month_url)
        thread_page = get_thread_index(month_url, session)
        for email_url in parse_emails(thread_page, month_url):
            if email_url in seen_emails:
                continue
            seen_emails.add(email_url)
            body = get_email(email_url, session)
            mentions = parse_email(body, email_url)
            for report_id, entry in mentions.items():
                res[report_id].append(entry)

    # Persist both the accumulated mentions and the seen-cache.
    with open('email_mentions.json', 'w') as f:
        json.dump(res, f)
    with open('seen_emails_cache.pkl', 'wb') as f:
        pickle.dump({'seen_months': seen_months, 'seen_emails': seen_emails}, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters