-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtweet_count.py
executable file
·35 lines (26 loc) · 946 Bytes
/
tweet_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
# NOTE(review): MRJob is used as the base class of TweetCounter below but is
# not imported anywhere in the visible file -- presumably
# `from mrjob.job import MRJob` is needed; confirm.

# Tokenizer for tweet text: a "word" is a maximal run of word characters
# (letters, digits, underscore) and apostrophes, so contractions such as
# "don't" stay in one piece.
WORD_RE = re.compile(r"[\w']+")
def zip(occurances):
    """Tally the flu / not-flu tags collected for a single word.

    Note: this function deliberately keeps its original name, which shadows
    the builtin ``zip`` inside this module.

    Args:
        occurances: iterable of tags; a value equal to ``1`` marks a flu
            tweet and a value equal to ``0`` marks a non-flu tweet. Any
            other value is ignored. The iterable is consumed in one pass,
            so a generator is fine.

    Returns:
        A tuple ``(flu_count, not_flu_count)``.
    """
    flu_count = 0
    not_flu_count = 0
    for tag in occurances:
        if tag == 1:
            flu_count = flu_count + 1
            continue
        if tag == 0:
            not_flu_count = not_flu_count + 1
    # <number of flu tweets, number of not flu>
    return (flu_count, not_flu_count)
class TweetCounter(MRJob):
    """MapReduce job counting, per word, the flu / non-flu tweets containing it.

    Each input line is split on ``"***"``: the first field is an integer tag
    (``1`` for a flu tweet, ``0`` otherwise) and the second field is the
    tweet text. Any further ``"***"``-separated fields are ignored.

    NOTE(review): ``MRJob`` is not imported in the visible part of this file;
    presumably ``from mrjob.job import MRJob`` is required -- confirm.
    """

    def mapper(self, key, line):
        """Emit one ``(word, tag)`` pair per distinct word in the tweet.

        Args:
            key: unused.
            line: one raw input record, ``<tag>***<tweet text>``.

        Yields:
            ``(word, tag)`` for every distinct word matched by ``WORD_RE``,
            in order of first appearance within the tweet.

        Raises:
            ValueError: if the first field is not an integer.
            IndexError: if the line contains no ``"***"`` separator.
        """
        # edit to match formatting
        fields = line.split("***")  # split once instead of once per field
        tag = int(fields[0])  # 1 for flu, 0 otherwise
        content = fields[1]  # tweet content

        # A word counts at most once per tweet. Track already-emitted words
        # in a set so the duplicate check is O(1) instead of a list scan.
        seen = set()
        for word in WORD_RE.findall(content):  # break up tweet based on regex
            if word in seen:  # ignore duplicates of words
                continue
            seen.add(word)
            yield word, tag  # intermediate pair = <word, flu/not flu>

    def reducer(self, word, occurrences):
        """Collapse all tags seen for ``word`` into one count tuple.

        Yields:
            ``(word, (flu_count, not_flu_count))`` as computed by the
            module-level ``zip`` helper (not the builtin).
        """
        yield word, zip(occurrences)  # produce final tuple for each word
# Launch the job only when this file is executed as a script, so importing
# the module does not start it.
if __name__ == '__main__':
    TweetCounter.run()