helpers.py
'''
Helper functions to clean text data for analysis and to read and write csv files.
'''
import csv
import re
import string
import time
from collections import Counter

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
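
# Note: the POS tagger and lemmatizer used below rely on NLTK data packages
# ('averaged_perceptron_tagger', 'wordnet', and on newer NLTK versions also
# 'omw-1.4'); they can be fetched once with nltk.download(...) if missing.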
def import_csv(csv_filename):
    '''
    Read a csv file and return its cells as a flat list of strings.
    Arguments:
        csv_filename: Name of the csv file, with or without the '.csv' extension.
    Returns:
        data: Flattened list of every cell value in the file.
    '''
    data = []
    # add the '.csv' extension if not included in csv_filename
    if not csv_filename.endswith('.csv'):
        csv_filename = csv_filename + '.csv'
    # read the csv data into a flat list
    with open(csv_filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            data.extend(row)
    return data
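
# A minimal usage sketch (the filename below is hypothetical): because rows are
# flattened with data.extend(row), a one-column csv of tweets comes back as a
# flat list of strings.
#   tweets = import_csv('my_tweets')   # '.csv' is appended automatically
#   print(len(tweets), 'tweets loaded')
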
def write_to_csv(tweets):
    '''
    Write tweets to a new csv file named with the current timestamp.
    Arguments:
        tweets: List of dicts that share the same keys.
    '''
    # name the csv file with a timestamp
    timestr = time.strftime("%Y%m%d-%H%M%S")
    new_filename = 'tweets-' + timestr + '.csv'
    # write the dicts to the csv file, using the keys of the first dict as the header
    header = tweets[0].keys()
    with open(new_filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(tweets)
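
# A minimal usage sketch, assuming each tweet is a dict and all dicts share the
# keys of the first one (that first dict supplies the header row):
#   write_to_csv([{'id': 1, 'text': 'hello'}, {'id': 2, 'text': 'world'}])
#   # -> writes tweets-YYYYMMDD-HHMMSS.csv with an 'id,text' header
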
def remove_noise(tweet_tokens, stop_words=()):
    '''
    Utility function to get cleaned alphanumeric word tokens.
    Removes URL hyperlinks, @ mentions, punctuation and special characters.
    Removes stop words and normalizes word forms.
    Arguments:
        tweet_tokens: Tokens to be cleaned.
        stop_words: Stop words as a list of strings.
    Returns:
        cleaned_tokens: Cleaned tokens.
    '''
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()
    for token, tag in pos_tag(tweet_tokens):
        # replace URL hyperlinks with an empty string
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        # replace @ mentions with an empty string
        token = re.sub(r'(@[A-Za-z0-9_]+)', '', token)
        # map the part-of-speech tag to the lemmatizer's notation
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # normalize word forms with the lemmatizer
        token = lemmatizer.lemmatize(token, pos)
        # drop punctuation, empty strings and stop words; lowercase the rest
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
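
# A minimal usage sketch, assuming the NLTK 'averaged_perceptron_tagger',
# 'wordnet' and 'stopwords' resources are already downloaded; the sample
# tokens below are made up for illustration:
#   from nltk.corpus import stopwords
#   tokens = ['Loving', 'the', 'new', 'release', '!', 'https://nltk.org', '@nltk_org']
#   remove_noise(tokens, stop_words=stopwords.words('english'))
#   # -> lowercased, lemmatized tokens with the link, mention, punctuation
#   #    and stop words removed
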
def get_all_words(cleaned_tokens_list):
    '''
    Utility function to iterate through a list of token lists.
    Arguments:
        cleaned_tokens_list: A list of token lists.
    Yields:
        token: Each token from the given lists.
    '''
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
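
# A minimal sketch of an intended downstream use (an assumption on our part,
# e.g. word frequencies with the Counter imported above):
#   all_words = get_all_words(cleaned_tokens_list)
#   Counter(all_words).most_common(10)
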
def get_tweets_for_model(cleaned_tokens_list):
    '''
    Utility function to format tokens as feature dictionaries for a model.
    Arguments:
        cleaned_tokens_list: A list of token lists.
    Yields:
        A dict mapping each token of a tweet to True.
    '''
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}
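
# A minimal end-to-end sketch, assuming the tokens were cleaned with
# remove_noise and that an NLTK NaiveBayesClassifier is the intended model;
# the 'Positive' label is a placeholder:
#   import nltk, random
#   features = [(tweet_dict, 'Positive')
#               for tweet_dict in get_tweets_for_model(cleaned_tokens_list)]
#   random.shuffle(features)
#   classifier = nltk.NaiveBayesClassifier.train(features)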