-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add email mentions to validphys index
Add a script to parse the emails, and find the mentions of validphys reports and associate report id with email url and title. Because there is no way to get an email URL from the email as received, we scan the HTML of the archives, by crawling over each message in each month. The script tries to remove links that are in quoted sections but that only works if these have already been parsed as a `blockquote` HTML element in the email archives. We use this information to create a link to the email, in the index page, by adding an email emoji link to each email. It could be used for other things such as displaying the email in the template. One annoying aspect is that this is an embarrassingly parallel task (we could be processing the emails while we are waiting for other emails to download), but I am hitting some bug I don't understand when trying to do this with curio and asks (theelous3/asks#118), so it will stay sequential for the moment. Because it is slow, we add a cache to remember already seen emails. At the moment index-emails needs to be run independently from index-reports (I run it once a day), but that may not be optimal.
- Loading branch information
Showing
3 changed files
with
137 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
EMAIL_BOT_PASSWORD |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import json | ||
import pickle | ||
from collections import defaultdict | ||
from urllib.parse import urljoin, urlparse | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
# Root of the private mailman archives for the ucam-nnpdf mailing list.
ARCHIVES_URL = 'https://lists.cam.ac.uk/mailman/private/ucam-nnpdf/'

# Credentials for the archive login form. The password lives in a separate
# file (PASSWORD_FILE) so it is not committed alongside the code.
USERNAME = '[email protected]'
PASSWORD_FILE = 'EMAIL_BOT_PASSWORD'
|
||
|
||
def make_login():
    """Build the POST payload for the mailman archive login form.

    The password is read from ``PASSWORD_FILE`` on every call, so updating
    that file takes effect without restarting the script.
    """
    with open(PASSWORD_FILE) as pwfile:
        secret = pwfile.read().strip()
    return {"password": secret, "username": USERNAME, "name": "submit"}
|
||
|
||
def make_soup(data):
    """Parse *data* into a BeautifulSoup tree.

    Uses the html5lib backend, which is lenient with the sometimes-messy
    HTML found in the mail archives.
    """
    soup = BeautifulSoup(data, features="html5lib")
    return soup
|
||
|
||
def get_archive_index(session):
    """Log in to the archives and return the front-page HTML.

    Posting the login form to the archive root both authenticates the
    session and returns the index page listing the monthly archives.
    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.post(ARCHIVES_URL, data=make_login())
    response.raise_for_status()
    return response.text
|
||
|
||
def get_thread_index(month_url, session):
    """Fetch and return the HTML of one month's thread index page.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(month_url)
    response.raise_for_status()
    return response.text
|
||
|
||
def parse_threads(archive_index):
    """Extract the absolute URL of each month's thread view.

    The archive front page has one ``[ Thread ]`` link per month; each is
    resolved against ``ARCHIVES_URL`` to get an absolute URL.
    """
    soup = make_soup(archive_index)
    month_urls = []
    for anchor in soup.find_all('a', string='[ Thread ]'):
        month_urls.append(urljoin(ARCHIVES_URL, anchor.attrs['href']))
    return month_urls
|
||
|
||
def parse_emails(thread_index, month_url):
    """Extract the absolute URL of every message in a month's thread index.

    Message links in the thread view carry both a ``name`` and an ``href``
    attribute, which distinguishes them from other anchors on the page.
    """
    soup = make_soup(thread_index)
    message_urls = []
    for anchor in soup.find_all('a', attrs={'name': True, 'href': True}):
        message_urls.append(urljoin(month_url, anchor.attrs['href']))
    return message_urls
|
||
|
||
def get_email(email_url, session):
    """Download and return the HTML of a single archived email.

    Raises ``requests.HTTPError`` on a bad response.
    """
    response = session.get(email_url)
    response.raise_for_status()
    return response.text
|
||
|
||
def parse_email(email, email_url):
    """Map report IDs mentioned in *email* to ``[email_url, title]``.

    Scans the email HTML for links pointing at ``https://vp.nnpdf.science/``
    and returns a dict ``{report_id: [email_url, title]}``, where the
    report id is the first path component of the link and *title* is the
    page title of the archived email. Links inside ``blockquote`` elements
    are skipped, since those are quoted text from earlier messages (this
    only works when the archive has rendered the quote as a blockquote).
    """
    res = {}

    def good_link(tag):
        if tag.name != 'a':
            return False
        # Bare named anchors (<a name=...>) have no href at all, so use
        # .get() rather than attrs['href'] to avoid a KeyError on them.
        href = tag.attrs.get('href', '')
        if not href.startswith('https://vp.nnpdf.science/'):
            return False
        # Skip links that live inside quoted sections.
        if any(parent.name == 'blockquote' for parent in tag.parents):
            return False
        return True

    soup = make_soup(email)
    # Guard against title-less documents: html5lib only produces a <title>
    # element when the source HTML has one.
    title = str(soup.title.string) if soup.title is not None else ''
    links = soup.body.find_all(good_link, recursive=True)
    for link in links:
        path = urlparse(link.attrs['href']).path
        fragments = path.split('/')
        # A path like '/<report_id>/...' splits to ['', '<report_id>', ...],
        # so the report id is the second fragment.
        if len(fragments) >= 2:
            res[fragments[1]] = [email_url, title]
    return res
|
||
|
||
if __name__ == '__main__':
    # Load previously collected mentions, if any, so runs are incremental.
    try:
        with open('email_mentions.json') as f:
            res = defaultdict(list, json.load(f))
    except FileNotFoundError:
        res = defaultdict(list)

    # Load the cache of already-processed months and emails, if present.
    try:
        with open('seen_emails_cache.pkl', 'rb') as f:
            cache = pickle.load(f)
        seen_months = cache['seen_months']
        seen_emails = cache['seen_emails']
    except FileNotFoundError:
        seen_months = set()
        seen_emails = set()

    session = requests.Session()
    front_page = get_archive_index(session=session)
    for month_index, month_url in enumerate(parse_threads(front_page)):
        if month_url in seen_months:
            continue
        # The most recent month (index 0) may still receive new emails,
        # so never mark it as fully seen.
        if month_index != 0:
            seen_months.add(month_url)
        thread_page = get_thread_index(month_url, session)
        for email_url in parse_emails(thread_page, month_url):
            if email_url in seen_emails:
                continue
            seen_emails.add(email_url)
            body = get_email(email_url, session)
            mentions = parse_email(body, email_url)
            for report_id, entry in mentions.items():
                res[report_id].append(entry)

    # Persist both the accumulated mentions and the seen-cache.
    with open('email_mentions.json', 'w') as f:
        json.dump(res, f)
    with open('seen_emails_cache.pkl', 'wb') as f:
        pickle.dump({'seen_months': seen_months, 'seen_emails': seen_emails}, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters