-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprinting_press.py
185 lines (156 loc) · 6.69 KB
/
printing_press.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 8 09:12:38 2019
Updated on Jan 2 2020
@author: James Farmer, Sadman Ahmed Shanto
Version: 2.0
ChangeLog:
Two classes are defined below, Journalist and Editor.
The Journalist pulls RSS feeds, searches their contents, and generates a list
of the articles which match our interest.
The Editor reviews the list of articles provided by the Journalist and generates
a list of those articles, less any repeated content, formatted as Slack message contents.
"""
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from re import search, sub
#imports
from feedparser import parse
class Journalist:
'''Searches the RSS feeds and passes relevant articles'''
def __init__(self,feeds='feeds.txt',keywords='keywords.txt',authors='authors.txt',nLow=1):
'''Journalist.drafts is a list of dictionaries with fields
'quality': number of keywords matched
'matches': list of matching words
'entry': dictionary of article details'''
with open(feeds) as f:
self.urls = self._get_strings(f.read())
with open(keywords)as f:
self.words = self._get_strings(f.read())
with open(authors) as f:
self.words += self._get_strings(f.read())
self.errors = []
notes = self._collect_feeds()
self.titles = []
reviews = [self._review_note(note,self.words) for note in notes if note['title'] not in self.titles]
self.drafts = list(filter(lambda x: x['quality'] >= nLow, reviews))
def _get_strings(self,text):
'''returns a list of each line in text'''
return text.strip().splitlines()
def _collect_feeds(self,):
'''Pulls the RSS feeds and returns summary as list of dictionaries'''
with ThreadPoolExecutor() as e:
raw_material = [e.submit(parse,url) for url in self.urls]
notes_list = sum([M.result().entries for M in raw_material], [])
notes = [self._clean_notes(i,note) for i,note in enumerate(notes_list)]
self.nArticles = len(notes)
return notes
def _clean_notes(self, i, entry):
""" returns a dict from entry containing article content and link """
summary = reduce(lambda x,y: sub(y,' ',x),
[entry.summary,'Author[^>]+>','<[^>]+>','\n'])
# Include abstract if available
abstract = getattr(entry, 'abstract', '')
# full_text = summary + ' ' + abstract # Combine summary and abstract
full_text = summary # Only search in summary for now
try:
data = {'text': entry.title +'\n'+ sub('\.','',entry.author) +'\n'+ full_text,
'link': entry.link,
'title': entry.title,
'authors': sub('<[^>]+>','',entry.author),
'summary': summary,
'abstract': abstract
}
except AttributeError:
data = {'text': entry.title +'\n'+ full_text,
'link': entry.link,
'title': entry.title,
'authors': 'No Author found in RSS',
'summary': summary,
'abstract': abstract
}
except:
e = sys.exc_info()[0]
data = {'text': None}
self.errors.append('Error: {}'.format(e) +'\n'+ entry.link)
finally:
return data
def _review_note(self, note, words):
"""checks the note for any keywords"""
self.titles.append(note['title'])
# Search in both summary and abstract
match_list = [self._search_note(note['text'], word) for word in words]
match_words = [x for x, y in zip(words, match_list) if y == 1]
dictionary = {'quality': sum(match_list), 'matching': match_words,'entry': note}
return dictionary
def _search_note(self,note,word):
""" checks the note for given string """
if search(r'\b'+word.lower()+r'\b', note.lower()):
return True
else:
return False
class Editor:
    '''retrieves the drafts from Journalist and formats them into
    JSON message payloads

    Attributes set by the constructor:
        channel    -- first line of the channel file
        titles     -- lines read from the history file (titles already posted)
        new_titles -- titles of the articles selected in this run
        articles   -- one message payload per selected article
    '''
    DIVIDER = {"type": "divider"}
    # Slack rejects a section block whose text is longer than this.
    MAX_SECTION_TEXT = 3000

    def __init__(self, feeds='feeds.txt', keywords='keywords.txt', authors='authors.txt',
                 channel='channel.txt', nLow=1, history='previous_titles.txt'):
        '''Collects the drafts and builds a payload for every unseen article.

        feeds, keywords, authors and nLow are passed on to Journalist.
        channel is the path of a file whose first line names the channel.
        history is the path of the file listing the titles posted so far; it
        may be missing (nothing posted yet) and is appended to when new
        articles are selected.
        '''
        journalist = Journalist(feeds, keywords, authors, nLow)
        self.timestamp = ""
        with open(channel) as f:
            self.channel = journalist._get_strings(f.read())[0]
        self.username = 'PaperBoy'
        self.icon = ":robot_face:"
        self.titles = self._read_history(history)
        self.new_titles = []
        # Compare whitespace-normalised titles: the history file holds one
        # title per line, so a title containing a line break could otherwise
        # never be recognised again.
        known = {self._history_key(title) for title in self.titles}
        self.articles = []
        for draft in journalist.drafts:
            key = self._history_key(draft['entry']['title'])
            if key in known:
                continue
            known.add(key)
            self.articles.append(self.get_payload(draft))
        if self.new_titles:
            with open(history, 'a') as f:
                f.writelines(self._history_key(title) + '\n' for title in self.new_titles)

    def _read_history(self, path):
        '''returns the lines of the history file, or [] if it does not exist'''
        try:
            with open(path) as f:
                return f.read().strip().splitlines()
        except FileNotFoundError:
            # First run: nothing has been posted yet.
            return []

    def _history_key(self, title):
        '''returns title as a single line with runs of whitespace collapsed'''
        return ' '.join(str(title).split())

    def _clip(self, text, limit=None):
        '''returns text shortened to at most limit characters

        limit defaults to MAX_SECTION_TEXT. Shortened text ends in '...'.
        Values that are not strings are returned unchanged.
        '''
        if limit is None:
            limit = self.MAX_SECTION_TEXT
        if isinstance(text, str) and len(text) > limit:
            return text[:limit - 3] + '...'
        return text

    def get_payload(self, draft):
        '''returns the message payload for draft and records its title

        draft is a dictionary as produced by Journalist: 'entry' holds the
        article details and 'matching' the search terms that matched.
        The entry's title is appended to self.new_titles.
        '''
        E = draft['entry']
        self.new_titles.append(E['title'])
        return {
            "ts": self.timestamp,
            "channel": self.channel,
            "username": self.username,
            "icon_emoji": self.icon,
            "blocks": [
                self.DIVIDER,
                self.DIVIDER,
                self._get_title(E['title']),
                self._get_authors(E['authors']),
                self._get_link(E['link']),
                self.DIVIDER,
                self._get_summary(E['summary']),
                self._get_matched(draft['matching'])
            ],
        }

    def _get_title(self, title):
        '''returns a section block showing title in bold'''
        # Clip before formatting so the closing '*' is never cut off.
        clipped = self._clip(str(title), self.MAX_SECTION_TEXT - 2)
        return {
            "type": "section",
            "text": {"type": "mrkdwn", "text": "*{}*".format(clipped)}
        }

    def _get_authors(self, authors):
        '''returns a section block showing the author string'''
        return {
            "type": "section",
            "text": {"type": "mrkdwn", "text": self._clip(authors)}
        }

    def _get_summary(self, summary):
        '''returns a section block showing the article summary'''
        return {
            "type": "section",
            "text": {"type": "mrkdwn", "text": self._clip(summary)}
        }

    def _get_matched(self, matches):
        '''returns a section block listing the matched search terms'''
        return {
            "type": "section",
            "text": {"type": "mrkdwn", "text": self._clip('keyword matches: {}'.format(matches))}
        }

    def _get_link(self, link):
        '''returns a context block linking to the article'''
        return {
            "type": "context",
            "elements": [{"type": "mrkdwn", "text": " :information_source: *<{}|Go to source>*".format(link)}]
        }