Skip to content

Commit

Permalink
Add email mentions to validphys index
Browse files Browse the repository at this point in the history
Add a script to parse the emails, and find the mentions of validphys
reports and associate report id with email url and title. Because there
is no way to get an email URL from the email as received, we scan the
HTML of the archives, by crawling over each message in each month.

The script tries to remove links that are in quoted sections but that
only works if these have already been parsed as a `backquote` HTML
element in the email archives.

We use this information to create links to the emails in the index
page, by adding an email emoji link for each mentioning email. It could
be used for other things such as displaying the email in the template.

One annoying aspect is that this is an embarrassingly parallel task (we
could be processing the emails while we are waiting for other emails to
download), but I am hitting some bug I don't understand when trying to
do this with curio and asks
(theelous3/asks#118), so it will stay
sequential for the moment. Because it is slow, we add a cache to
remember already seen emails. At the moment index-emails needs to be run
independently from index-reports (I run it once a day), but that may not
be optimal.
  • Loading branch information
Zaharid committed Apr 15, 2019
1 parent 566c6b2 commit c0d9851
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 4 deletions.
1 change: 1 addition & 0 deletions validphys2/serverscripts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
EMAIL_BOT_PASSWORD
117 changes: 117 additions & 0 deletions validphys2/serverscripts/index-email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import json
import pickle
from collections import defaultdict
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Base URL of the private Mailman archive that is crawled for report mentions.
ARCHIVES_URL = 'https://lists.cam.ac.uk/mailman/private/ucam-nnpdf/'

# Login credentials for the archive.  The password is read from a local
# file (see make_login) so it is never committed to the repository.
USERNAME = '[email protected]'
PASSWORD_FILE = 'EMAIL_BOT_PASSWORD'


def make_login():
    """Build the POST payload for the Mailman archive login form.

    The password is read from ``PASSWORD_FILE`` so it stays out of the
    repository; any surrounding whitespace (trailing newline) is stripped.
    """
    with open(PASSWORD_FILE) as pwfile:
        secret = pwfile.read().strip()
    return dict(password=secret, username=USERNAME, name="submit")


def make_soup(data):
    """Return a BeautifulSoup tree for *data*, parsed with html5lib."""
    # html5lib is the most browser-like parser, which matters for the
    # frequently messy HTML found in email archives.
    soup = BeautifulSoup(data, features="html5lib")
    return soup


def get_archive_index(session):
    """Log into the Mailman archive and return the index page HTML.

    Raises ``requests.HTTPError`` if the login request fails.
    """
    response = session.post(ARCHIVES_URL, data=make_login())
    response.raise_for_status()
    return response.text


def get_thread_index(month_url, session):
    """Fetch and return the HTML of one month's thread index page.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(month_url)
    response.raise_for_status()
    return response.text


def parse_threads(archive_index):
    """Extract the absolute URLs of all monthly thread pages.

    Mailman marks each month with an anchor whose text is '[ Thread ]';
    relative hrefs are resolved against ``ARCHIVES_URL``.
    """
    soup = make_soup(archive_index)
    month_urls = []
    for anchor in soup.find_all('a', string='[ Thread ]'):
        month_urls.append(urljoin(ARCHIVES_URL, anchor.attrs['href']))
    return month_urls


def parse_emails(thread_index, month_url):
    """Return the absolute URL of every message in a month's thread page.

    Message anchors carry both a ``name`` and an ``href`` attribute, which
    distinguishes them from the surrounding navigation links.
    """
    soup = make_soup(thread_index)
    anchors = soup.find_all('a', attrs={'name': True, 'href': True})
    return [urljoin(month_url, anchor.attrs['href']) for anchor in anchors]


def get_email(email_url, session):
    """Download a single archived email and return its HTML text.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(email_url)
    response.raise_for_status()
    return response.text


def parse_email(email, email_url):
    """Find mentions of validphys reports in one archived email.

    Parameters
    ----------
    email : str
        HTML of the archived message page.
    email_url : str
        URL of the message in the archive.

    Returns
    -------
    dict
        Maps each mentioned report id to ``[email_url, title]``, where
        *title* is the subject taken from the page ``<title>`` element.

    Links inside ``blockquote`` elements are skipped so that quoted text
    from earlier messages is not counted; this only works when the
    archive has rendered the quote as an actual blockquote tag.
    """
    res = {}

    def good_link(tag):
        # Only anchors pointing at the validphys report server count,
        # and only when they are outside any quoted section.
        if tag.name != 'a':
            return False
        # Use .get: <a> tags without an href (e.g. pure name anchors,
        # common in mailman pages) would raise KeyError on ['href'].
        if not tag.attrs.get('href', '').startswith('https://vp.nnpdf.science/'):
            return False
        if any(p.name == 'blockquote' for p in tag.parents):
            return False
        return True

    soup = make_soup(email)
    # html5lib does not synthesize a <title> when the page lacks one;
    # fall back to an empty string instead of crashing on None.
    title = str(soup.title.string) if soup.title is not None else ''
    links = soup.body.find_all(good_link, recursive=True)
    for link in links:
        path = urlparse(link.attrs['href']).path
        fragments = path.split('/')
        # The report id is the first path component: /<id>/index.html
        if len(fragments) >= 2:
            res[fragments[1]] = [email_url, title]
    return res


if __name__ == '__main__':
    # Load previously collected mentions so results accumulate across
    # runs; start from an empty mapping the first time.
    try:
        with open('email_mentions.json') as f:
            res = defaultdict(list, json.load(f))
    except FileNotFoundError:
        res = defaultdict(list)

    # Cache of months/emails already crawled: the download is sequential
    # and slow, so pages seen in earlier runs are skipped.
    try:
        with open('seen_emails_cache.pkl', 'rb') as f:
            seen_data = pickle.load(f)
            seen_months = seen_data['seen_months']
            seen_emails = seen_data['seen_emails']
    except FileNotFoundError:
        seen_months = set()
        seen_emails = set()

    s = requests.Session()
    idx = get_archive_index(session=s)
    # mindex 0 is presumably the most recent month (Mailman lists newest
    # first) -- TODO confirm against the archive layout.
    for mindex, month_url in enumerate(parse_threads(idx)):
        if month_url in seen_months:
            continue
        # Could still add emails to last month
        if mindex != 0:
            seen_months.add(month_url)
        thindex = get_thread_index(month_url, s)
        for email_url in parse_emails(thindex, month_url):
            if email_url in seen_emails:
                continue
            seen_emails.add(email_url)
            email = get_email(email_url, s)
            email_res = parse_email(email, email_url)
            # A report can be mentioned by several emails; append each
            # mention to the list for its report id.
            for k, v in email_res.items():
                res[k].append(v)
    # Persist both the mentions and the crawl cache for the next run.
    with open('email_mentions.json', 'w') as f:
        json.dump(res, f)
    with open('seen_emails_cache.pkl', 'wb') as f:
        pickle.dump({'seen_months': seen_months, 'seen_emails': seen_emails}, f)
23 changes: 19 additions & 4 deletions validphys2/serverscripts/index-reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
ROOT = '/home/nnpdf/WEB/validphys-reports'
ROOT_URL = 'https://vp.nnpdf.science/'
OUT = '/home/nnpdf/WEB/validphys-reports/index.json'
THUMBNAILS = '/home/nnpdf/WEB/thumbnails/'
THUMBNAILS = '/home/nnpdf/WEB/thumbnails/'
EMAIL_MENTIONS_FILE = 'email_mentions.json'

EMPTY = '-'

Expand Down Expand Up @@ -130,7 +131,7 @@ def handle_thumbnail(p):
return thumbnail_tag(name)
return None

def register(p):
def register(p, emails):
path_meta = meta_from_path(p)
title, author, tags = path_meta['title'], path_meta['author'], path_meta['keywords']
url = ROOT_URL + p.name
Expand All @@ -148,19 +149,33 @@ def register(p):
if not isinstance(author, str):
author = "<INVALID AUTHOR>"

titlelink = '<a href="%s">%s</a>' % (url, title)
emaillinks = ' '.join(
f'<a href="{url}", title="{title}">📧</a>' for (url, title) in emails
)

titlelink = f'<a href="{url}">{title}</a> {emaillinks}'

thumbnail = handle_thumbnail(p)

return (titlelink, author, [date, timestamp], tags, thumbnail)


def get_all_emails():
    """Load the report-id -> email-mentions mapping from EMAIL_MENTIONS_FILE.

    Returns an empty mapping when the mentions file has not been
    generated yet, so the report index can be built without it.
    """
    try:
        mentions_file = open(EMAIL_MENTIONS_FILE)
    except FileNotFoundError:
        return {}
    with mentions_file:
        return json.load(mentions_file)


def make_index():
root_path = pathlib.Path(ROOT)
emails = get_all_emails()
data = []
keywords = defaultdict(TagProps)
for p in root_path.iterdir():
if p.is_dir():
res = register(p)
res = register(p, emails.get(p.name, []))
data.append(res)
newkeywords = res[3]
timestamp = res[2][1]
Expand Down

0 comments on commit c0d9851

Please sign in to comment.