# scraping_tweets.py
# Import the libraries
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import re #regular expression
from textblob import TextBlob
import string
import preprocessor as p
import os
import time
# Authenticating Twitter API
# Obtain your Twitter credentials from your Twitter developer account
consumer_key = 'your_consumer_key'
consumer_secret = 'your_consumer_secret'
access_key = 'your_access_key'
access_secret = 'your_access_secret'
# Pass your Twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
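# (Added example, not part of the original script.) Before scraping, it can help to
# confirm that the credentials above actually work. tweepy's API.verify_credentials()
# returns the authenticated user on success and raises tweepy.TweepError (tweepy 3.x)
# if the keys are wrong. You could also pass wait_on_rate_limit=True to tweepy.API()
# if you would rather have tweepy pause automatically when a rate limit is hit.
try:
    api.verify_credentials()
    print('Twitter API authentication OK')
except tweepy.TweepError as e:
    print('Error during authentication:', e)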
## Automating the scraping
# Calls the API once every 15 minutes to avoid over-calling it (the search API is rate-limited)
# 1. define a for-loop
# 2. define the search parameters
# 3. define the date period
# 4. define the no. of tweets to pull per run
def scraptweets(search_words, date_since, numTweets, numRuns):
    ## Arguments:
    # search_words -> define a string of keywords for this function to extract
    # date_since -> define a date from which to start extracting the tweets
    # numTweets -> number of tweets to extract per run
    # numRuns -> number of runs to perform in this program - API calls are limited to once every 15 mins, so each run will be 15 mins apart.
    ##
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns = ['username', 'acctdesc', 'location', 'following',
                                        'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts',
                                        'retweetcount', 'text', 'hashtags']
                             )
    # Define a for-loop to generate tweets at regular intervals
    for i in range(0, numRuns):
        # We will time how long it takes to scrape tweets for each run:
        start_run = time.time()
        # Collect tweets using the Cursor object
        # .Cursor() returns an object that you can iterate or loop over to access the data collected.
        # Each item in the iterator has various attributes that you can access to get information about each tweet
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets)
        # Store these tweets into a python list
        tweet_list = [tweet for tweet in tweets]
        # Obtain the following info (attributes to read from each tweet):
        # user.screen_name - twitter handle
        # user.description - description of account
        # user.location - location listed on the user's profile
        # user.friends_count - no. of other users that user is following (following)
        # user.followers_count - no. of other users who are following this user (followers)
        # user.statuses_count - total tweets by user
        # user.created_at - when the user account was created
        # created_at - when the tweet was created
        # retweet_count - no. of retweets
        # (not used here) user.favourites_count - total no. of tweets the user has liked
        # retweeted_status.full_text - full text of the original tweet, if this is a retweet
        # tweet.entities['hashtags'] - hashtags in the tweet
        # Begin scraping the tweets individually:
        noTweets = 0
        for tweet in tweet_list:
            # Pull the values
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a retweet
                text = tweet.full_text
            # Add the 11 variables to the list - ith_tweet:
            ith_tweet = [username, acctdesc, location, following, followers, totaltweets,
                         usercreatedts, tweetcreatedts, retweetcount, text, hashtags]
            # Append to dataframe - db_tweets
            db_tweets.loc[len(db_tweets)] = ith_tweet
            # Increase counter - noTweets
            noTweets += 1
        # Run ended:
        end_run = time.time()
        duration_run = round(end_run - start_run, 2)
        print('no. of tweets scraped for run {} is {}'.format(i, noTweets))
        print('time taken for run {} to complete is {} seconds'.format(i, duration_run))
        # Sleep for 15 minutes between runs so the API is not over-called (skip after the final run)
        if i + 1 < numRuns:
            time.sleep(900)
    # Once all runs have completed, save them to a single csv file:
    # Obtain timestamp in a readable format:
    from datetime import datetime
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')
    # Define working path and filename
    path = os.getcwd()
    filename = path + '/data/' + to_csv_timestamp + '_sahkprotests_tweets.csv'
    # Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, index = False)
    print('Scraping has completed!')
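
# (Added example, not part of the original script.) A minimal sketch of how the
# function above could be driven; the keywords, start date and counts below are
# illustrative placeholders - substitute your own search. Note that scraptweets()
# writes its CSV into a ./data/ folder, so that folder is created here first.
if __name__ == '__main__':
    # Make sure the ./data/ output folder used by scraptweets() exists
    os.makedirs(os.path.join(os.getcwd(), 'data'), exist_ok=True)
    search_words = '#hongkong OR #hkprotests -filter:retweets'  # hypothetical query
    date_since = '2019-11-01'                                   # hypothetical start date
    scraptweets(search_words, date_since, numTweets=500, numRuns=3)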