-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess_data.py
459 lines (432 loc) · 18.7 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
"""
Description of frames from Table S2 in "Supplementary Information for Dominant Frames in Legacy and Social Media
Coverage of the IPCC Fifth Assessment Report".
Some of the description has been left out (e.g. mentions of other frames); punctuation has been removed,
everything lowercased, and entities joined by _.
ss Settled Science
us Uncertain (and contested) Science
pis Political or Ideological Struggle
d Disaster
o1 & o2 Opportunity
e1 & e2 Economic
me1 & me2 Morality and Ethics
ros Role of Science
s Security
h Health
column 1 socio-political context of frame
column 2 problem definition, moral judgement, remedy
column 3 typical sources
column 4 themes or storylines
column 5 language, metaphors, phrases
column 6 visual imagery
"""
import json
import time
from sentence_transformers import SentenceTransformer
from redditscore.tokenizer import CrazyTokenizer
from nltk.tokenize import sent_tokenize
import os
import tensorflow as tf
from tweet_parser.tweet import Tweet
import pandas as pd
from sqlalchemy.dialects.postgresql import ARRAY
from crate.client.sqlalchemy.types import Object
from sqlalchemy.types import String, DateTime, Float
import numpy as np
from tqdm import tqdm
import tensorflow_hub as hub
import pickle
import emoji
import re
# Never elide columns when a DataFrame is printed.
pd.set_option('display.max_columns', None)
# Embeddings are serialised further down with np.array_repr, whose output is
# abbreviated with "..." once an array has more elements than this threshold
# (10 * 1050 = 10500), so the limit is raised well above NumPy's default.
np.set_printoptions(threshold=10 * 1050)

# Sentence encoders used for both the frame elements and the tweets.
# Both calls fetch/load pretrained weights, so importing this module is slow.
roberta_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
use_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Abbreviation-expansion rules applied to every sentence before tokenization.
# Each rule is a compiled regex; `extra_patterns` pairs it with the text that
# replaces every match.

# "poli" / "govt" only as stand-alone words.  Without the word boundaries the
# rule also fired inside longer words ("politics" -> " politics tics",
# "policy" -> " politics cy", "govts" -> " government s").
rule_poli = re.compile(r'\b[pP][oO][lL][iI]\b')
rule_govt = re.compile(r'\b[gG][oO][vV][tT][sS]?\b')

# Temperatures such as 2c, 1.5C, 2ºC, 1.5ºc (and the real degree sign, 2°C).
# The boundaries keep the rule from firing inside tokens such as "20cm".
# NOTE(review): the replacement text drops the number itself ("2C" becomes
# " degree celsius ") -- confirm that this is intended.
rule_2c = re.compile(r"""\b\d+        # integer part, e.g. the 2 in 2c
                         (?:\.\d+)?   # optional decimals, e.g. 1.5C
                         [º°]?        # optional ordinal or degree sign, e.g. 2ºC
                         [cC]\b       # unit letter ending the token
                      """, re.X)

# Matched anywhere, including inside longer tokens, as in the original rules.
rule_mdg = re.compile(r'[mM][dD][gG][sS]|[mM][dD][gG]')
rule_ipcc = re.compile(r'[iI][pP][cC][cC]')

# "UN" (any case) and "WHO" (upper case only, so the pronoun "who" is left
# alone) as whitespace-delimited words.  The start and end of the string count
# as delimiters too, so an acronym that opens or closes a sentence is expanded.
rule_un = re.compile(r'(?:^|\s)[uU][nN](?:\s|$)')
rule_who = re.compile(r'(?:^|\s)WHO(?:\s|$)')

# (pattern, replacement) pairs, applied in this order.
extra_patterns = [
    (rule_2c, ' degree celsius '),
    (rule_mdg, ' Millennium Development Goal '),
    (rule_poli, ' politics '),
    (rule_govt, ' government '),
    (rule_ipcc, ' Intergovernmental Panel on Climate Change '),
    (rule_un, ' United Nations '),
    (rule_who, ' World Health Organization '),
]
# Tokenizer shared by every call to phrase_tokenize().
# `extra_patterns` is not passed to the tokenizer (it was left commented out);
# phrase_tokenize() applies those substitutions itself before tokenizing.
# NOTE(review): the empty strings for `twitter_handles` and `urls` presumably
# make the tokenizer replace handles/URLs with nothing -- confirm against the
# CrazyTokenizer documentation.
phrase_tokenizer = CrazyTokenizer(
    # casing and punctuation are kept as written
    lowercase=False,
    keepcaps=True,
    remove_punct=False,
    whitespaces_to_underscores=False,
    # rewriting of contractions and hashtags
    decontract=True,
    hashtags='split',
    # handles and URLs
    twitter_handles='',
    urls='',
)
def phrase_tokenize(text):
    """Split *text* into sentences and tokenize each sentence.

    Every sentence found by ``sent_tokenize`` first has the substitutions in
    ``extra_patterns`` applied (in list order) and is then tokenized with
    ``phrase_tokenizer``.

    Returns a dict keyed ``'sentence_0'``, ``'sentence_1'``, ... in sentence
    order.  Each value is ``{'tokens': [...], 'phrase': '...'}`` where
    ``phrase`` is the tokens joined by single spaces.  An input without
    sentences yields an empty dict.
    """
    all_phrases = {}
    for index, phrase in enumerate(sent_tokenize(text)):
        for pattern, replacement in extra_patterns:
            phrase = pattern.sub(replacement, phrase)
        tokens = [
            # Emoji become their textual name, e.g. ":thumbs_up:" -> "thumbs up".
            # The original called strip(':') / replace('_', ' ') on the raw
            # emoji character, which left it unchanged; demojize() produces
            # the ":name:" form those two calls were written for.
            emoji.demojize(token).strip(':').replace('_', ' ') if emoji.is_emoji(token) else token
            for token in phrase_tokenizer.tokenize(phrase)
        ]
        all_phrases['sentence_{}'.format(index)] = {'tokens': tokens, 'phrase': ' '.join(tokens)}
    return all_phrases
def check_array_size(a, max_bytes=32766):
    """Report strings whose UTF-8 encoding is *max_bytes* bytes or longer.

    Prints ``size:  <n>`` for an oversized string and nothing otherwise; the
    string itself is never modified and the function always returns ``None``,
    so it can be used with ``Series.apply`` purely for its diagnostic output.

    Args:
        a: the string to measure.
        max_bytes: size, in bytes, at which a string is reported.
            NOTE(review): the default 32766 presumably mirrors a storage
            limit of the target database -- confirm.
    """
    size = len(a.encode('utf-8'))
    # An explicit comparison instead of ``assert`` inside a bare ``except``:
    # the check keeps working under ``python -O`` and no longer swallows
    # unrelated exceptions.
    if size >= max_bytes:
        print("size: ", size)
# Number of element phrases per frame, in the order the phrases appear in
# element_list.  Both lookup lists below are derived from this one table.
_FRAME_ELEMENT_COUNTS = (
    ('settled_science', 18),
    ('uncertain_science', 18),
    ('political_or_ideological_struggle', 11),
    ('disaster', 18),
    ('opportunity', 23),
    ('economic', 19),
    ('morality_and_ethics', 13),
    ('role_of_science', 24),
    ('security', 15),
    ('health', 13),
)

# Frame name of every element: each name repeated once per element.
frame_list = [name for name, count in _FRAME_ELEMENT_COUNTS for _ in range(count)]

# Position of every element inside its own frame: 0..count-1, restarting at
# zero for each frame.
element_id_list = [position for _, count in _FRAME_ELEMENT_COUNTS for position in range(count)]
# Frame element phrases, one string per element.  The phrases are grouped by
# frame in the same order, and with the same group sizes, as the multipliers
# used to build frame_list, so position i here belongs to frame_list[i].
# Adding, removing or reordering a phrase requires the matching count in
# frame_list / element_id_list to be updated as well.
element_list = [
    # settled science (18 elements)
    "there is broad expert scientific consensus",
    "considerable evidence of the need for action",
    "science has spoken",
    "politicians must act in terms of global agreements",
    "exhaustive Intergovernmental Panel on Climate Change report produced by thousands of expert scientists",
    "unprecedented rate of change compared to paleo records",
    "carbon budget emissions allowance in order to meet 2 degrees celsius policy target",
    "severe and irreversible impacts",
    "trust climate scientists and dismiss skeptic voices",
    "settled science",
    "unequivocal nature of anthropogenic climate change",
    "landmark report by Intergovernmental Panel on Climate Change",
    "the balance of evidence",
    "what more proof do we need",
    "greatest challenge of our time",
    "skeptics wishful thinking or malpractice",
    "go read the Intergovernmental Panel on Climate Change report",
    "citing sources of information",
    # uncertain science (18 elements)
    "there is still a lack of scientific evidence to justify action",
    "uncertainty in climate science impacts or solutions",
    "question anthropogenic nature of climate change",
    "natural variability",
    "science has been wrong before and still lacks knowledge",
    "we cannot should not or will struggle to act",
    "unexplained pause in global mean temperature warming",
    "Climatic Research Unit stolen emails",
    "climategate",
    "errors in Intergovernmental Panel on Climate Change",
    "a pause in warming or slowdown",
    "we cannot be sure despite scientists best efforts",
    "scientists making errors or mistakes",
    "hysteria and silliness",
    "scientists admit or insist or are puzzled",
    "scientists attempt to prove climate change",
    "global warming believers",
    "climate change hoax",
    # political or ideological struggle (11 elements)
    "a political or ideological conflict over the way the world should work",
    "conflict over solutions or strategy to address issues",
    "a battle for power between nations groups or personalities",
    "detail of specific policies",
    "green new deal",
    "climate change act",
    "disagreement over policies and policy detail",
    "questioning the motives or funding of opponents",
    "a battle or war or fierce debate of ideas",
    "government strategy confused",
    "how can the other political side ignore these scientific truths and not act",
    # disaster (18 elements)
    "predicted impacts are dire with severe consequences",
    "impacts are numerous and threaten all aspects of life",
    "impacts will get worse and we are not well prepared",
    "unprecedented rise in global average surface temperature",
    "sea level rise",
    "snow and ice decline",
    "decline in coral reefs",
    "extreme weather including droughts heatwaves floods",
    "scale of the challenge is overwhelming",
    "positively frightening",
    "unnatural weather",
    "weather on steroids",
    "violent or extreme weather",
    "runaway climate change",
    "life is unsustainable",
    "threatened species or ecosystems",
    "disaster-stricken people",
    "entire ecosystems are collapsing",
    # opportunity (23 elements)
    "climate change poses opportunities",
    "reimagine how we live",
    "further human development",
    "invest in co-benefits",
    "climate change is rich with opportunity",
    "time for innovation or creativity",
    "improve lives now and in the future",
    "take personal action",
    "change in lifestyle choices",
    "change diet go vegan or vegetarian",
    "eco-friendly and sustainable cities and management",
    "eco-friendly and sustainable lifestyle",
    "reduce carbon footprint",
    "adapt to challenges",
    "adaptation strategies",
    "carbon dioxide fertilization for agriculture",
    "beneficial impacts of changing climate",
    "no intervention needed",
    "melting arctic will lead to opening up of shipping routes",
    "new trade opportunities",
    "increased agricultural productivity through increasing atmospheric carbon dioxide fertilization",
    "opportunity to transform trade",
    "increased resource extraction",
    # economic (19 elements)
    "economic growth prosperity investments and markets",
    "high monetary costs of inaction",
    "the economic case provides a strong argument for action now",
    "divestment from fossil fuels like oil and gas",
    "cost of mitigating climate change is high but the cost will be higher if we do not act now",
    "action now can create green jobs",
    "economic growth and prosperity",
    "costs and economic estimates",
    "billions of dollars of damage in the future if no action is taken now",
    "it will not cost the world to save the planet",
    "high monetary costs of action",
    "action is hugely expensive or simply too costly in the context of other priorities",
    "scientific uncertainty",
    "United Nations is proposing climate plans which will damage economic growth",
    "action at home now is unfair as Annex II countries will gain economic advantage",
    "action will damage economic growth",
    "it is no time for panicky rearranging of the global economy",
    "killing industry",
    "imposing costly energy efficiency requirements",
    # morality and ethics (13 elements)
    "an explicit and urgent moral religious or ethical call for action",
    "strong mitigation and protection of the most vulnerable",
    "God ethics and morality",
    "climate change linked to poverty",
    "ending world hunger",
    "Millennium Development Goal",
    "exert moral pressure",
    "degradation of nature",
    "ruining the planet or creation",
    "people or nations at the front line of climate change for the most vulnerable and already exposed",
    "responsibility to protect nature",
    "there is no planet B",
    "globalist climate change religion",
    # role of science (24 elements)
    "process or role of science in society",
    "how the Intergovernmental Panel on Climate Change works or does not",
    "transparency in funding",
    "awareness of science",
    "institutions involving scientists like the Intergovernmental Panel on Climate Change",
    "public opinion understanding and knowledge",
    "bias in media sources",
    "giving contrarians a voice",
    "not broadcasting diverse views",
    "Intergovernmental Panel on Climate Change is a leading institution",
    "politicisation of science",
    "Intergovernmental Panel on Climate Change is too conservative or alarmist",
    "detail how Intergovernmental Panel on Climate Change process works",
    "amount of time and space given to contrarians or skeptics in the media",
    "threats to free speech",
    "false balance",
    "balance as bias",
    "sexed up science",
    "belief in scientists as a new priesthood of the truth",
    "misinformation and propaganda",
    "fake news media",
    "hidden agenda and mainstream narrative",
    "suppression of information",
    "conflict of interest",
    # security (15 elements)
    "threat to human energy",
    "threat to water supply",
    "threat to food security",
    "threats to the nation state especially over migration",
    "conflict might be local but could be larger in scale and endanger many",
    "conflicts may occur between developed and developing countries",
    "conflict between nature and humans",
    "conflict between different stakeholders in developed nations",
    "climate change as a threat multiplier",
    "increase in instability volatility and tension",
    "fighting for water security",
    "a danger to world peace",
    "impacts on security usually related to food drought or migration",
    "armed forces preparing for war",
    "people are being displaced",
    # health (13 elements)
    "severe danger to human health",
    "deaths from malnutrition",
    "deaths from insect-borne diseases",
    "poor air quality",
    "urgent mitigation and adaptation required",
    "vulnerability of Annex II countries",
    "vulnerability of children and elders to health impacts",
    "details of health impacts from climate change",
    "health wellbeing livelihoods and survival are compromised",
    "financial cost of impacts to human health",
    "mental health issues",
    "worsening environmental and air pollution",
    "climate change is a global problem and affects everyone"
]
# Embed every frame element with both encoders; the RoBERTa vectors are also
# stored L2-normalised along the last axis.
element_roberta = roberta_model.encode(element_list)
element_roberta_norm = tf.keras.utils.normalize(element_roberta, axis=-1, order=2)
element_use = use_embed(element_list).numpy()

# One row per element: position inside its frame, frame name, phrase text and
# the three embedding variants as plain Python lists.
frames = dict(
    element_id=element_id_list,
    frame=frame_list,
    element_txt=element_list,
    element_use=element_use.tolist(),
    element_roberta=element_roberta.tolist(),
    element_roberta_norm=element_roberta_norm.tolist(),
)
element_df = pd.DataFrame(frames)

# Append the rows to the `frame_elements` table; the embedding columns are
# declared as float arrays.  Because of if_exists='append', running the
# script again adds the same rows a second time.
element_df.to_sql(
    'frame_elements',
    'crate://localhost:4200',
    index=False,
    if_exists='append',
    dtype={
        column: ARRAY(Float)
        for column in ('element_use', 'element_roberta', 'element_roberta_norm')
    },
)
# --- Tweet ingestion --------------------------------------------------------
# Reads tweets from a JSON-lines file, embeds the English ones sentence by
# sentence and appends them to the `climate_tweets` table in batches.

TWEETS_PATH = 'sample.jsonl'
TWEETS_TABLE = 'climate_tweets'
TWEETS_DB_URL = 'crate://localhost:4200'
TWEETS_BATCH_SIZE = 50  # rows buffered before each insert


def _build_tweet_row(tweet):
    """Return the table row (a dict) for *tweet*, or None if it has no sentences.

    Arrays are stored as their ``np.array_repr`` strings.
    NOTE(review): array_repr honours the print threshold set with
    np.set_printoptions at the top of the file; an array with more elements
    than that threshold is abbreviated with "...", so the per-sentence
    matrices of very long tweets may be stored truncated -- TODO confirm and
    consider storing ``.tolist()`` instead.
    """
    text = tweet.user_entered_text
    sentences = phrase_tokenize(text)
    phrases = [entry['phrase'] for entry in sentences.values()]
    if not phrases:
        # Nothing to embed (e.g. whitespace-only text); the encoders and the
        # median/average below are not meaningful for an empty batch.
        return None

    use_vectors = use_embed(phrases).numpy()

    roberta_vectors = roberta_model.encode(phrases)
    roberta_median = np.median(roberta_vectors, axis=0)
    roberta_average = np.average(roberta_vectors, axis=0)
    roberta_vectors_norm = tf.keras.utils.normalize(roberta_vectors, axis=-1, order=2)
    # flatten() as in the original code, so the normalised median/average are
    # stored as 1-D arrays whatever shape normalize() returns for 1-D input.
    roberta_median_norm = tf.keras.utils.normalize(roberta_median, axis=-1, order=2).flatten()
    roberta_average_norm = tf.keras.utils.normalize(roberta_average, axis=-1, order=2).flatten()

    return {
        'id': tweet.id,
        'table_name': 'climate_tweets',
        'split': 'sample',
        'created_at_datetime': tweet.created_at_datetime,
        'screen_name': tweet.screen_name,
        'bio': tweet.bio,
        'txt': text,
        'txt_clean_sentences': json.dumps(sentences),
        'txt_clean_use': np.array_repr(use_vectors),
        'use_median': np.array_repr(np.median(use_vectors, axis=0)),
        'use_average': np.array_repr(np.average(use_vectors, axis=0)),
        'txt_clean_roberta': np.array_repr(roberta_vectors),
        'roberta_median': np.array_repr(roberta_median),
        'roberta_average': np.array_repr(roberta_average),
        'txt_clean_roberta_norm': np.array_repr(roberta_vectors_norm),
        'roberta_norm_median': np.array_repr(roberta_median_norm),
        'roberta_norm_average': np.array_repr(roberta_average_norm),
    }


def _write_tweet_batch(rows):
    """Append *rows* (a list of row dicts) to the tweets table; no-op if empty."""
    if not rows:
        return
    tweets_df = pd.DataFrame(rows)
    # Diagnostic only: prints the size of any oversized serialised matrix.
    tweets_df['txt_clean_roberta'].apply(check_array_size)
    tweets_df['txt_clean_roberta_norm'].apply(check_array_size)
    tweets_df.to_sql(TWEETS_TABLE, TWEETS_DB_URL, if_exists='append', index=False,
                     dtype={'created_at_datetime': DateTime,
                            'txt_clean_sentences': Object})


def ingest_tweets(path=TWEETS_PATH, batch_size=TWEETS_BATCH_SIZE):
    """Embed the English tweets in the JSON-lines file *path* and store them.

    Tweets with empty user-entered text or a language other than 'en' are
    skipped.  Rows are written every *batch_size* accepted tweets, and the
    remaining partial batch is written at the end with exactly the same
    columns as the full batches (the original final write left out the six
    RoBERTa columns and ran even when nothing was left to write).
    """
    rows = []
    with open(path, 'r', encoding='utf-8') as infile:
        for line in tqdm(infile, desc='tweets'):
            start_time = time.time()
            tweet = Tweet(json.loads(line))
            if tweet.user_entered_text != '' and tweet.lang == 'en':
                row = _build_tweet_row(tweet)
                if row is not None:
                    rows.append(row)
            if len(rows) >= batch_size:
                _write_tweet_batch(rows)
                rows = []
            print("total time taken this loop: ", time.time() - start_time)
    _write_tweet_batch(rows)


# The file is a script: ingestion runs as soon as the module is executed,
# exactly as the original top-level loop did.
ingest_tweets()