-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtwitter_streamer.py
111 lines (93 loc) · 3.87 KB
/
twitter_streamer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
from twitter_credentials import twit_auth as credentials
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
import json
# Build the OAuth handler from the application's consumer key/secret, then
# attach the user's access token/secret; both pairs are read from the
# project-local ``twitter_credentials`` module.
auth = tweepy.OAuthHandler(
    credentials["consumer_key"],
    credentials["consumer_secret"],
)
auth.set_access_token(
    credentials["access_token"],
    credentials["access_token_secret"],
)

# Authenticated API client (the streamer below only needs ``auth``).
api = tweepy.API(auth)
class retweet_user_name_streamer(StreamListener):
    """
    Extension on Tweepy twitter streamer that stores retweet
    information about users and their connections to one another.

    Every retweet received is written as one line of the form
    ``<retweeter name><sep><original tweeter name>`` to a numbered CSV
    file. A new file is started once ``tweets_per_file`` rows have been
    written to the current one.
    """

    def __init__(self, tweets_per_file=500, sep=";,."):
        """
        Twitter streamer that finds retweets and stores the username of both
        the retweeter and the original tweeter in a CSV file. Handles file
        management so the data is split across multiple CSV files.

        tweets_per_file: how many tweets to analyze and store per CSV
        sep: separator for CSV data, defaults to characters unlikely to be
             together in a twitter username
        """
        super().__init__()
        self.tweets_seen = 0  # rows written to the currently open file
        self.current_file = 0  # numeric suffix used for the next file name
        self.tweets_per_file = tweets_per_file
        self.sep = sep
        self.base_file_name = "data/twitter_retweet_network_data{}.csv"
        self.file_to_write = None
        self.open_next_file()

    def open_next_file(self):
        """
        On call, opens a new file to be used in output and closes any open file.
        Manages the filename by making sure never to overwrite a previously
        created file, and creates the output directory if it does not exist.
        Sets the open file as an attribute for usage across the class.

        return: None
        """
        if self.file_to_write:
            self.file_to_write.close()
            # Do not keep a closed handle around if the open below fails.
            self.file_to_write = None

        file_name = self.get_valid_file_name()

        # The base file name points into a sub-directory; make sure it exists
        # so the very first run does not die with FileNotFoundError.
        directory = os.path.dirname(file_name)
        if directory:
            os.makedirs(directory, exist_ok=True)

        print("Opening File " + str(self.current_file))
        self.current_file += 1
        # Explicit UTF-8: names routinely contain emoji / non-Latin characters
        # that the platform's default encoding may be unable to represent.
        self.file_to_write = open(file_name, 'w', encoding='utf-8')

    def get_valid_file_name(self):
        """
        File name selector helper function. Checks for existing files and
        iterates filenames until a valid, unused file name is found in the
        base filename directory. Advances ``self.current_file`` past any
        index that is already taken.

        return: string, valid filename for data storage
        """
        file_name = self.base_file_name.format(self.current_file)
        while os.path.exists(file_name):
            self.current_file += 1
            file_name = self.base_file_name.format(self.current_file)
        return file_name

    def on_data(self, data):
        """
        When receiving data from the stream, check to see if it is a retweet
        (text starts with RT); if it is, extract the name of the retweeter and,
        when the original tweet information is available, the name of the
        original tweeter, and store both so we can build a network graph.
        Also manages how many tweets are currently stored in each file.

        Messages that are not valid JSON, are not tweets, or lack the
        retweet information are skipped silently.

        data: JSON string from twitter API stream
        return: None
        """
        try:
            tweet = json.loads(data)
        except ValueError:
            # Malformed payload: skip it rather than kill the stream.
            return

        try:
            if not tweet['text'].startswith('RT'):
                return
            # NOTE(review): 'name' is presumably the display name rather than
            # the @handle ('screen_name') - confirm which one is intended.
            retweeter = tweet['user']['name']
            original_tweeter = tweet['retweeted_status']['user']['name']
        except (KeyError, TypeError):
            # Not a tweet (no 'text'), no original tweet attached, or an
            # unexpected payload shape: nothing to record.
            return

        self.file_to_write.write(retweeter + self.sep + original_tweeter + '\n')
        self.tweets_seen += 1
        # '>=' so each file holds exactly tweets_per_file rows; the counter
        # has already been incremented for the row just written.
        if self.tweets_seen >= self.tweets_per_file:
            self.open_next_file()
            self.tweets_seen = 0

    def on_error(self, status):
        """
        If error from API, print the error.

        status: Error information
        return: None
        """
        print(status)
if __name__ == "__main__":
    # Keep a reference to the listener so its output file can be closed
    # when streaming stops, however it stops.
    listener = retweet_user_name_streamer()
    twitter_stream = Stream(auth, listener)
    try:
        # Stream tweets matching the tracked keywords into the listener.
        twitter_stream.filter(track=['Trump', 'trump'])
    finally:
        # Flush and release the CSV currently being written, including on
        # Ctrl-C or an unexpected stream error.
        if listener.file_to_write:
            listener.file_to_write.close()