diff --git a/README.md b/README.md index 64255f0..a6196cb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,29 @@ -# FakeNewsPropagation -Fake News propagation study +# Fake News Propagation + +Code for paper "Hierarchical Propagation Networks for Fake News Detection: Investigation and Exploitation" ICWSM 2020 https://arxiv.org/abs/1903.09196 + +###Dataset + + +###To Run: + + +###References + +If you use this dataset, please cite the following papers: + +@article{shu2019hierarchical, + title={Hierarchical propagation networks for fake news detection: Investigation and exploitation}, + author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Liu, Huan}, + journal={arXiv preprint arXiv:1903.09196}, + year={2019} +} + +@article{shu2018fakenewsnet, + title={FakeNewsNet: A Data Repository with News Content, Social Context and Dynamic Information for Studying Fake News on Social Media}, + author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Lee, Dongwon and Liu, Huan}, + journal={arXiv preprint arXiv:1809.01286}, + year={2018} +} + +(C) 2019 Arizona Board of Regents on Behalf of ASU diff --git a/analysis_util.py b/analysis_util.py index 237f075..d896cbc 100644 --- a/analysis_util.py +++ b/analysis_util.py @@ -1,14 +1,14 @@ import errno import os +import pickle +from abc import ABCMeta, abstractmethod from pathlib import Path import numpy as np -import pickle +from sklearn.utils import resample -from stat_test import get_box_plots, perform_t_test, get_box_plots_mod -from util.util import twitter_datetime_str_to_object, tweet_node - -from abc import ABCMeta, abstractmethod +from stat_test import perform_t_test, get_box_plots_mod +from util.util import twitter_datetime_str_to_object class BaseFeatureHelper(metaclass=ABCMeta): @@ -52,13 +52,13 @@ def get_dump_file_name(self, news_source, micro_features, macro_features, label, return "{}/{}.pkl".format(file_dir, "_".join(file_tags)) def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache=False): function_refs = [] - file_name = self.get_dump_file_name(news_source,micro_features, macro_features, label, file_dir) + file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: @@ -134,6 +134,37 @@ def get_feature_significance_t_tests(self, fake_feature_array, real_feature_arra print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx])) perform_t_test(fake_feature_values, real_feature_values) + def get_feature_significance_bootstrap_tests(self, fake_feature_array, real_feature_array, micro_features=None, + macro_features=None): + + [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features) + + for idx in range(len(feature_names)): + fake_feature_values = fake_feature_array[:, idx] + real_feature_values = real_feature_array[:, idx] + + perms_fake = [] + perms_real = [] + + combined = np.concatenate((fake_feature_values, real_feature_values), axis=0) + + print("combined shape : ", combined.shape) + + for i in range(10000): + np.random.seed(i) + perms_fake.append(resample(combined, n_samples=len(fake_feature_values))) + perms_real.append(resample(combined, n_samples=len(real_feature_values))) + + dif_bootstrap_means = (np.mean(perms_fake, axis=1) - np.mean(perms_real, 
axis=1)) + print("diff bootstrap means : ", dif_bootstrap_means.shape) + + obs_difs = (np.mean(fake_feature_values) - np.mean(real_feature_values)) + + p_value = dif_bootstrap_means[dif_bootstrap_means >= obs_difs].shape[0] / 10000 + + print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx])) + print("t- value : {} p-value : {}".format(obs_difs, p_value)) + def get_sample_feature_value(news_graps: list, get_feature_fun_ref): result = [] diff --git a/baseline/LIWC.py b/baseline/LIWC.py index aa74c59..997161c 100644 --- a/baseline/LIWC.py +++ b/baseline/LIWC.py @@ -1,280 +1,280 @@ - -from random import shuffle - -import numpy as np -import pandas as pd -from sklearn import linear_model -from sklearn import preprocessing -from sklearn import svm -from sklearn import tree -from sklearn.base import clone -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import BaggingClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score, f1_score -from sklearn.metrics import precision_score, recall_score -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.neighbors import KNeighborsClassifier -from xgboost import XGBClassifier - - -def LIWC_Representation(data_type): - f_out = open('./' + data_type + '/LIWCFeats.txt', 'w+') - with open('LIWC2015_'+data_type+'_fake.txt') as f_fake: - for line in f_fake: - line = line.strip() - all_data = line.split('\t') - if all_data[0]=='Filename': - continue - ID = all_data[0] - feats = all_data[2:] - f_out.write(ID+'\t') - f_out.write('\t'.join(f for f in feats)) - f_out.write('\n') - - with open('LIWC2015_'+data_type+'_real.txt') as f_fake: - for line in f_fake: - line = line.strip() - all_data = line.split('\t') - if all_data[0]=='Filename': - continue - ID = all_data[0] - feats = all_data[2:] - f_out.write(ID + '\t') - f_out.write('\t'.join(f for f in feats)) - f_out.write('\n') - f_out.close() - print - -def LIWC_Prediction(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - # clf = SVC(kernel='linear', class_weight='balanced') - # clf = RandomForestClassifier() - clf = tree.DecisionTreeClassifier() - X = preprocessing.normalize(X) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('Accuracy '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('precision '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - 
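The `get_feature_significance_bootstrap_tests` helper added to `analysis_util.py` above pools the fake and real values of each feature, resamples both groups from that pool to build a null distribution of mean differences, and reports the one-sided p-value of the observed difference. A standalone sketch of the same test on synthetic data (the variable names here are illustrative, not from the repo):

```python
# Minimal sketch of the pooled-bootstrap significance test; data is synthetic.
import numpy as np
from sklearn.utils import resample

rng = np.random.RandomState(0)
fake_values = rng.normal(loc=1.2, scale=1.0, size=200)  # one feature column, fake news
real_values = rng.normal(loc=1.0, scale=1.0, size=200)  # same feature, real news

combined = np.concatenate([fake_values, real_values])
n_iterations = 10000

# Null distribution: draw both "groups" from the pooled values, so any
# difference in means is due to sampling alone.
boot_diffs = np.empty(n_iterations)
for i in range(n_iterations):
    boot_fake = resample(combined, n_samples=len(fake_values), random_state=i)
    boot_real = resample(combined, n_samples=len(real_values), random_state=i + n_iterations)
    boot_diffs[i] = boot_fake.mean() - boot_real.mean()

obs_diff = fake_values.mean() - real_values.mean()
# One-sided p-value: fraction of bootstrap differences at least as large as
# the observed difference, mirroring the comparison in the diff above.
p_value = np.mean(boot_diffs >= obs_diff)
print("observed difference: {:.3f}, p-value: {:.4f}".format(obs_diff, p_value))
```

With 10,000 resamples this mirrors the p-value printed by the helper; a two-sided variant would compare absolute differences instead.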
print('recall '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('f1 '+res) - print - -def LIWC_Prediction2(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - MultinomialNB(), - tree.DecisionTreeClassifier(random_state=21), - RandomForestClassifier(random_state=22), - XGBClassifier(), - AdaBoostClassifier(random_state=22), - svm.SVC(kernel='linear', class_weight='balanced'), - GradientBoostingClassifier(random_state=22), - BaggingClassifier(random_state=22), - KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - 'Naive Bayes', - 'Decision Tree', - 'Random Forest', - 'XGBoost', - 'AdaBoost', - 'SVM', - 'GradientBoosting', - 'Bagging Clf', - 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf,clf_name,df,cols,X,y) - print(df) - df.to_csv('./LIWC_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) - -def test(clf, clf_name, df, cols, X, y,train_ratio): - acc = [] - prec = [] - recall = [] - f1 = [] - for i in range(5): - X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) - clf.fit(X_train, y_train) - y_pred = clf.predict(X_test) - acc.append(accuracy_score(y_test, y_pred)) - prec.append(precision_score(y_test, y_pred)) - recall.append(recall_score(y_test, y_pred)) - f1.append(f1_score(y_test, y_pred)) - tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), - np.std(recall), np.average(f1), np.std(f1)]], columns=cols) - df = df.append(tmp) - return df - -def LIWC_Prediction2_curve(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - # linear_model.LogisticRegression(random_state=22), - # MultinomialNB(), - # tree.DecisionTreeClassifier(random_state=21), - # RandomForestClassifier(random_state=22), - # 
XGBClassifier(), - AdaBoostClassifier(random_state=22), - # svm.SVC(kernel='linear', class_weight='balanced'), - # GradientBoostingClassifier(random_state=22), - # BaggingClassifier(random_state=22), - # KNeighborsClassifier() - ] - clf_names = [ - # 'Logistic Regression', - # 'Naive Bayes', - # 'Decision Tree', - # 'Random Forest', - # 'XGBoost', - 'AdaBoost', - # 'SVM', - # 'GradientBoosting', - # 'Bagging Clf', - # 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - tr = [0.2,0.4,0.6] - for t in tr: - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf, clf_name, df, cols, X, y,t) - with pd.option_context('expand_frame_repr', False): - print (df) - df.to_csv('./LIWC_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) - -if __name__ == '__main__': - data_type = 'BuzzFeed' - # LIWC_Representation(data_type) - # LIWC_Prediction2('BuzzFeed') - # LIWC_Prediction2('PolitiFact') - LIWC_Prediction2_curve('BuzzFeed') - LIWC_Prediction2_curve('PolitiFact') - print \ No newline at end of file +# +# from random import shuffle +# +# import numpy as np +# import pandas as pd +# from sklearn import linear_model +# from sklearn import preprocessing +# from sklearn import svm +# from sklearn import tree +# from sklearn.base import clone +# from sklearn.ensemble import AdaBoostClassifier +# from sklearn.ensemble import BaggingClassifier +# from sklearn.ensemble import GradientBoostingClassifier +# from sklearn.ensemble import RandomForestClassifier +# from sklearn.metrics import accuracy_score, f1_score +# from sklearn.metrics import precision_score, recall_score +# from sklearn.model_selection import cross_val_score +# from sklearn.model_selection import train_test_split +# from sklearn.naive_bayes import MultinomialNB +# from sklearn.neighbors import KNeighborsClassifier +# from xgboost import XGBClassifier +# +# +# def LIWC_Representation(data_type): +# f_out = open('./' + data_type + '/LIWCFeats.txt', 'w+') +# with open('LIWC2015_'+data_type+'_fake.txt') as f_fake: +# for line in f_fake: +# line = line.strip() +# all_data = line.split('\t') +# if all_data[0]=='Filename': +# continue +# ID = all_data[0] +# feats = all_data[2:] +# f_out.write(ID+'\t') +# f_out.write('\t'.join(f for f in feats)) +# f_out.write('\n') +# +# with open('LIWC2015_'+data_type+'_real.txt') as f_fake: +# for line in f_fake: +# line = line.strip() +# all_data = line.split('\t') +# if all_data[0]=='Filename': +# continue +# ID = all_data[0] +# feats = all_data[2:] +# f_out.write(ID + '\t') +# f_out.write('\t'.join(f for f in feats)) +# f_out.write('\n') +# f_out.close() +# print +# +# def LIWC_Prediction(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# 
shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# # clf = SVC(kernel='linear', class_weight='balanced') +# # clf = RandomForestClassifier() +# clf = tree.DecisionTreeClassifier() +# X = preprocessing.normalize(X) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('Accuracy '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('precision '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('recall '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('f1 '+res) +# print +# +# def LIWC_Prediction2(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# MultinomialNB(), +# tree.DecisionTreeClassifier(random_state=21), +# RandomForestClassifier(random_state=22), +# XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# svm.SVC(kernel='linear', class_weight='balanced'), +# GradientBoostingClassifier(random_state=22), +# BaggingClassifier(random_state=22), +# KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# 'Naive Bayes', +# 'Decision Tree', +# 'Random Forest', +# 'XGBoost', +# 'AdaBoost', +# 'SVM', +# 'GradientBoosting', +# 'Bagging Clf', +# 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf,clf_name,df,cols,X,y) +# print(df) +# df.to_csv('./LIWC_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) +# +# def test(clf, clf_name, df, cols, X, y,train_ratio): +# acc = [] +# prec = [] +# recall = [] +# f1 = [] +# for i in range(5): +# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) +# clf.fit(X_train, y_train) +# y_pred = clf.predict(X_test) +# acc.append(accuracy_score(y_test, y_pred)) +# prec.append(precision_score(y_test, y_pred)) +# recall.append(recall_score(y_test, y_pred)) +# f1.append(f1_score(y_test, y_pred)) +# tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), +# np.std(recall), np.average(f1), np.std(f1)]], columns=cols) +# df = df.append(tmp) +# return df +# +# def LIWC_Prediction2_curve(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# 
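One detail worth noting in the baseline code being commented out here: `arry = range(X.shape[0]); shuffle(arry)` only works under Python 2, because `range` no longer returns a list. An equivalent balance-and-shuffle step written for Python 3 (the arrays below are made up):

```python
# Downsample the real class to the size of the fake class, then shuffle
# features and labels together with a single permutation index.
import numpy as np

X_fake = np.random.rand(120, 5)
X_real = np.random.rand(300, 5)

num = len(X_fake)                                   # balance the two classes
X = np.concatenate([X_real[:num], X_fake], axis=0)
y = np.concatenate([np.zeros(num), np.ones(num)])   # real = 0, fake = 1

perm = np.random.permutation(X.shape[0])            # shuffle rows and labels together
X, y = X[perm], y[perm]
```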
y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# # linear_model.LogisticRegression(random_state=22), +# # MultinomialNB(), +# # tree.DecisionTreeClassifier(random_state=21), +# # RandomForestClassifier(random_state=22), +# # XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# # svm.SVC(kernel='linear', class_weight='balanced'), +# # GradientBoostingClassifier(random_state=22), +# # BaggingClassifier(random_state=22), +# # KNeighborsClassifier() +# ] +# clf_names = [ +# # 'Logistic Regression', +# # 'Naive Bayes', +# # 'Decision Tree', +# # 'Random Forest', +# # 'XGBoost', +# 'AdaBoost', +# # 'SVM', +# # 'GradientBoosting', +# # 'Bagging Clf', +# # 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# tr = [0.2,0.4,0.6] +# for t in tr: +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf, clf_name, df, cols, X, y,t) +# with pd.option_context('expand_frame_repr', False): +# print (df) +# df.to_csv('./LIWC_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) +# +# if __name__ == '__main__': +# data_type = 'BuzzFeed' +# # LIWC_Representation(data_type) +# # LIWC_Prediction2('BuzzFeed') +# # LIWC_Prediction2('PolitiFact') +# LIWC_Prediction2_curve('BuzzFeed') +# LIWC_Prediction2_curve('PolitiFact') +# print \ No newline at end of file diff --git a/baseline/RST-VSM.py b/baseline/RST-VSM.py index c85bdf4..5a87e93 100644 --- a/baseline/RST-VSM.py +++ b/baseline/RST-VSM.py @@ -1,250 +1,250 @@ -# This is an implementation of Rhetorical Structure Theory for Vector Space Model -# The basic idea is from the paper: Identification of Truth and Deception in Text: Application of Vector Space Model to Rhetorical Structure Theory -from os import listdir -from os.path import isfile, join -from random import shuffle - -import numpy as np -import pandas as pd -from sklearn import linear_model -from sklearn import preprocessing -from sklearn import svm -from sklearn import tree -from sklearn.base import clone -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import BaggingClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score, f1_score -from sklearn.metrics import precision_score, recall_score -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.neighbors import KNeighborsClassifier -from xgboost import XGBClassifier - - -def RSTRepresentation(data_type, out_file): - # dir_path = './'+data_type+'/' - dir_path = data_type - - f_out = open(out_file,'w+') - all_relations = set() - org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] 
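The RST baseline being commented out here (and re-implemented below as `dump_both_ordered_rst_representation` in `baseline_feature_extraction.py`) reduces each parsed document to counts of its RST relation labels over a vocabulary shared across all documents. A minimal sketch of that representation on made-up inputs:

```python
# Per-document RST relation counts, vectorized over the union of relations.
from collections import Counter

# Made-up relation sequences standing in for the parsed RST output files.
parsed_docs = {
    "politifact_fake_001": ["Elaboration", "Attribution", "Elaboration", "Contrast"],
    "politifact_real_042": ["Attribution", "Background"],
}

# Shared relation vocabulary: every relation seen in any document.
all_relations = sorted({rel for rels in parsed_docs.values() for rel in rels})

# One fixed-length count vector per document, written tab-separated just like
# the feature files these scripts produce.
for doc_id, relations in parsed_docs.items():
    counts = Counter(relations)
    vector = [counts.get(rel, 0) for rel in all_relations]
    print(doc_id + "\t" + "\t".join(str(v) for v in vector))
```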
- News_RSTFeats = dict() - for of in org_files: - ID = of[:of.index('.txt')] - file_name = dir_path+'/'+of - relation_num = dict() - with open(file_name) as f_rst: - for line in f_rst: - line = line.replace('\'','') - line = line.replace(' ','') - tmp_relation = line.split(',')[3] - relation = tmp_relation[:-2] - all_relations.add(relation) - if relation in relation_num: - num = relation_num[relation] - num+=1 - relation_num[relation] = num - else: - relation_num[relation]=1 - News_RSTFeats[ID] = relation_num - - all_relations = list(all_relations) - print(all_relations) - for news, rn in News_RSTFeats.items(): - f_out.write(news+'\t') - feats = [] - for al in all_relations: - if al in rn: - num = rn[al] - else: - num=0 - feats.append(num) - f_out.write('\t'.join(str(x) for x in feats)) - f_out.write('\n') - f_out.close() - - -def RSTPrediction2(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/RSTFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - MultinomialNB(), - tree.DecisionTreeClassifier(random_state=21), - RandomForestClassifier(random_state=22), - XGBClassifier(), - AdaBoostClassifier(random_state=22), - svm.SVC(kernel='linear', class_weight='balanced'), - GradientBoostingClassifier(random_state=22), - BaggingClassifier(random_state=22), - KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - 'Naive Bayes', - 'Decision Tree', - 'Random Forest', - 'XGBoost', - 'AdaBoost', - 'SVM', - 'GradientBoosting', - 'Bagging Clf', - 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf,clf_name,df,cols,X,y,0.8) - - print(df) - df.to_csv('./RST_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) - - -def test(clf, clf_name, df, cols, X, y,train_ratio): - acc = [] - prec = [] - recall = [] - f1 = [] - for i in range(5): - X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) - clf.fit(X_train, y_train) - y_pred = clf.predict(X_test) - acc.append(accuracy_score(y_test, y_pred)) - prec.append(precision_score(y_test, y_pred)) - recall.append(recall_score(y_test, y_pred)) - f1.append(f1_score(y_test, y_pred)) - tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), - np.std(recall), np.average(f1), np.std(f1)]], columns=cols) - df = df.append(tmp) - return df - -def RSTPrediction2_curve(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/RSTFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 
'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - # MultinomialNB(), - # tree.DecisionTreeClassifier(random_state=21), - # RandomForestClassifier(random_state=22), - # XGBClassifier(), - AdaBoostClassifier(random_state=22), - # svm.SVC(kernel='linear', class_weight='balanced'), - # GradientBoostingClassifier(random_state=22), - # BaggingClassifier(random_state=22), - # KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - # 'Naive Bayes', - # 'Decision Tree', - # 'Random Forest', - # 'XGBoost', - 'AdaBoost', - # 'SVM', - # 'GradientBoosting', - # 'Bagging Clf', - # 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - tr = [0.2,0.4,0.6] - for t in tr: - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf, clf_name, df, cols, X, y,t) - with pd.option_context('expand_frame_repr', False): - print (df) - df.to_csv('./RST_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) - -if __name__ == '__main__': - data_type = 'PolitiFact' - - RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_fake", - "data/baseline_features/rst/raw_parsed_data/politifact_fake_rst_features.txt") - RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_real", - "data/baseline_features/rst/raw_parsed_data/politifact_real_rst_features.txt") - - # RSTRepresentation(data_type) - # RSTPrediction2('BuzzFeed') - # RSTPrediction2('PolitiFact') - # RSTPrediction2_curve('BuzzFeed') - # RSTPrediction2_curve('PolitiFact') \ No newline at end of file +# # This is an implementation of Rhetorical Structure Theory for Vector Space Model +# # The basic idea is from the paper: Identification of Truth and Deception in Text: Application of Vector Space Model to Rhetorical Structure Theory +# from os import listdir +# from os.path import isfile, join +# from random import shuffle +# +# import numpy as np +# import pandas as pd +# from sklearn import linear_model +# from sklearn import preprocessing +# from sklearn import svm +# from sklearn import tree +# from sklearn.base import clone +# from sklearn.ensemble import AdaBoostClassifier +# from sklearn.ensemble import BaggingClassifier +# from sklearn.ensemble import GradientBoostingClassifier +# from sklearn.ensemble import RandomForestClassifier +# from sklearn.metrics import accuracy_score, f1_score +# from sklearn.metrics import precision_score, recall_score +# from sklearn.model_selection import train_test_split +# from sklearn.naive_bayes import MultinomialNB +# from sklearn.neighbors import KNeighborsClassifier +# from xgboost import XGBClassifier +# +# +# def RSTRepresentation(data_type, out_file): +# # dir_path = './'+data_type+'/' +# dir_path = data_type +# +# f_out = open(out_file,'w+') +# all_relations = set() +# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# News_RSTFeats = dict() +# for of in org_files: +# ID = 
of[:of.index('.txt')] +# file_name = dir_path+'/'+of +# relation_num = dict() +# with open(file_name) as f_rst: +# for line in f_rst: +# line = line.replace('\'','') +# line = line.replace(' ','') +# tmp_relation = line.split(',')[3] +# relation = tmp_relation[:-2] +# all_relations.add(relation) +# if relation in relation_num: +# num = relation_num[relation] +# num+=1 +# relation_num[relation] = num +# else: +# relation_num[relation]=1 +# News_RSTFeats[ID] = relation_num +# +# all_relations = list(all_relations) +# print(all_relations) +# for news, rn in News_RSTFeats.items(): +# f_out.write(news+'\t') +# feats = [] +# for al in all_relations: +# if al in rn: +# num = rn[al] +# else: +# num=0 +# feats.append(num) +# f_out.write('\t'.join(str(x) for x in feats)) +# f_out.write('\n') +# f_out.close() +# +# +# def RSTPrediction2(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/RSTFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# MultinomialNB(), +# tree.DecisionTreeClassifier(random_state=21), +# RandomForestClassifier(random_state=22), +# XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# svm.SVC(kernel='linear', class_weight='balanced'), +# GradientBoostingClassifier(random_state=22), +# BaggingClassifier(random_state=22), +# KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# 'Naive Bayes', +# 'Decision Tree', +# 'Random Forest', +# 'XGBoost', +# 'AdaBoost', +# 'SVM', +# 'GradientBoosting', +# 'Bagging Clf', +# 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf,clf_name,df,cols,X,y,0.8) +# +# print(df) +# df.to_csv('./RST_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) +# +# +# def test(clf, clf_name, df, cols, X, y,train_ratio): +# acc = [] +# prec = [] +# recall = [] +# f1 = [] +# for i in range(5): +# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) +# clf.fit(X_train, y_train) +# y_pred = clf.predict(X_test) +# acc.append(accuracy_score(y_test, y_pred)) +# prec.append(precision_score(y_test, y_pred)) +# recall.append(recall_score(y_test, y_pred)) +# f1.append(f1_score(y_test, y_pred)) +# tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), +# np.std(recall), np.average(f1), np.std(f1)]], columns=cols) +# df = df.append(tmp) +# return df +# +# def RSTPrediction2_curve(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/RSTFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = 
line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# # MultinomialNB(), +# # tree.DecisionTreeClassifier(random_state=21), +# # RandomForestClassifier(random_state=22), +# # XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# # svm.SVC(kernel='linear', class_weight='balanced'), +# # GradientBoostingClassifier(random_state=22), +# # BaggingClassifier(random_state=22), +# # KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# # 'Naive Bayes', +# # 'Decision Tree', +# # 'Random Forest', +# # 'XGBoost', +# 'AdaBoost', +# # 'SVM', +# # 'GradientBoosting', +# # 'Bagging Clf', +# # 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# tr = [0.2,0.4,0.6] +# for t in tr: +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf, clf_name, df, cols, X, y,t) +# with pd.option_context('expand_frame_repr', False): +# print (df) +# df.to_csv('./RST_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) +# +# if __name__ == '__main__': +# data_type = 'PolitiFact' +# +# RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_fake", +# "data/baseline_features/rst/raw_parsed_data/politifact_fake_rst_features.txt") +# RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_real", +# "data/baseline_features/rst/raw_parsed_data/politifact_real_rst_features.txt") +# +# # RSTRepresentation(data_type) +# # RSTPrediction2('BuzzFeed') +# # RSTPrediction2('PolitiFact') +# # RSTPrediction2_curve('BuzzFeed') +# # RSTPrediction2_curve('PolitiFact') \ No newline at end of file diff --git a/baseline_basic_model.py b/baseline_basic_model.py index 88c593a..72f5455 100644 --- a/baseline_basic_model.py +++ b/baseline_basic_model.py @@ -1,9 +1,10 @@ -import numpy as np import pickle +import numpy as np + from analysis_util import get_propagation_graphs, equal_samples -from basic_model import get_basic_model_results, dump_random_forest_feature_importance -from construct_sample_features import get_train_test_split +from basic_model import get_basic_model_results +from construct_sample_features import get_train_test_split, get_TPNF_dataset, get_dataset_feature_names from structure_temp_analysis import ScienceCascadeFeatureHelper @@ -21,12 +22,14 @@ def get_science_dataset_array(news_source): macro_features=include_macro, news_source=news_source, label="real") - return np.concatenate([fake_features, real_features]) + sample_features = np.concatenate([fake_features, real_features]) + pickle.dump(sample_features, open("data/stfn/{}_stfn_features.pkl".format(news_source), "wb")) + return sample_features def get_castillo_features(news_source, castillo_feature_folder="data/castillo/saved_features"): - features = 
pickle.load(open("{}/{}_castillo_features.pkl".format(castillo_feature_folder, news_source), "rb")) + features = pickle.load(open("{}/{}_castillo_features.pkl".format(castillo_feature_folder, news_source), "rb")) features = np.nan_to_num(features) return features @@ -35,14 +38,15 @@ def get_tpnf_features(news_source, feature_folder="data/train_test_data"): return pickle.load(open("{}/{}_micro_macro_struct_temp_linguistic.pkl".format(feature_folder, news_source), "rb")) -def get_liwc_features(news_source, feature_folder = "data/baseline_features/liwc_features"): +def get_liwc_features(news_source, feature_folder="data/baseline_features/liwc_features"): fake_features = np.loadtxt("{}/{}_fake_liwc.csv".format(feature_folder, news_source), delimiter=',') real_features = np.loadtxt("{}/{}_real_liwc.csv".format(feature_folder, news_source), delimiter=',') feature_array = np.concatenate([fake_features, real_features]) return feature_array -def get_rst_features(news_source, rst_feature_folder = "data/baseline_features/rst/raw_parsed_data"): + +def get_rst_features(news_source, rst_feature_folder="data/baseline_features/rst_both/raw_parsed_data"): fake_features = np.loadtxt("{}/{}_fake_rst_features.csv".format(rst_feature_folder, news_source), delimiter='\t') real_features = np.loadtxt("{}/{}_real_rst_features.csv".format(rst_feature_folder, news_source), delimiter='\t') feature_array = np.concatenate([fake_features, real_features]) @@ -50,7 +54,7 @@ def get_rst_features(news_source, rst_feature_folder = "data/baseline_features/r return feature_array -def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn = False): +def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn=False): feature_arrays = [] if tpnf: @@ -76,8 +80,8 @@ def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False return all_feature_array -def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn = False): - sample_feature_array = get_sample_feature_array(news_source, tpnf, castillo, liwc, rst , stfn) +def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn=False): + sample_feature_array = get_sample_feature_array(news_source, tpnf, castillo, liwc, rst, stfn) print("Sample feature array dimensions") print(sample_feature_array.shape, flush=True) @@ -89,12 +93,79 @@ def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, get_basic_model_results(X_train, X_test, y_train, y_test) +def get_baseline_modification_classificaton_result(news_source, data_dir = "data/train_test_data"): + include_micro = True + include_macro = True + + include_structural = True + include_temporal = True + include_linguistic = True + + science_features = get_science_dataset_array(news_source) + science_features = science_features[:, [3,4]] + print("stfn features :", science_features.shape) + + sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, + include_temporal, include_linguistic, use_cache=True) + + sample_feature_array = sample_feature_array[:, :-1] + feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, + include_temporal, include_linguistic) + + print("tpnf features :", sample_feature_array.shape) + + sample_feature_array = np.concatenate([sample_feature_array, science_features], axis=1) + + print("overall 
features dim : ", sample_feature_array.shape) + + print("Sample feature array dimensions") + print(sample_feature_array.shape, flush=True) + + num_samples = int(len(sample_feature_array) / 2) + target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) + + X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) + get_basic_model_results(X_train, X_test, y_train, y_test) + + +def get_domain_adaptation_classification_results(source_news_source, target_news_source, tpnf=False, castillo=False, + liwc=False, rst=False, stfn=False): + train_sample_feature_array = get_sample_feature_array(source_news_source, tpnf, castillo, liwc, rst, stfn) + test_sample_feature_array = get_sample_feature_array(target_news_source, tpnf, castillo, liwc, rst, stfn) + + print("Source Domain : {}".format(source_news_source)) + print("Target Domain : {}".format(target_news_source)) + + print("source : ", train_sample_feature_array.shape) + print("target : ", test_sample_feature_array.shape) + + train_num_samples = int(len(train_sample_feature_array) / 2) + test_num_samples = int(len(test_sample_feature_array) / 2) + + train_target_labels = np.concatenate([np.ones(train_num_samples), np.zeros(train_num_samples)], axis=0) + test_target_labels = np.concatenate([np.ones(test_num_samples), np.zeros(test_num_samples)], axis=0) + + S_X_train, S_X_test, S_y_train, S_y_test = get_train_test_split(train_sample_feature_array, train_target_labels) + T_X_train, T_X_test, T_y_train, T_y_test = get_train_test_split(test_sample_feature_array, test_target_labels) + + # get_basic_model_results(train_sample_feature_array, test_sample_feature_array, train_target_labels, + # test_target_labels) + + get_basic_model_results(S_X_train, T_X_test, S_y_train, T_y_test) + + if __name__ == "__main__": - get_baselines_classificaton_result("gossipcop", tpnf=True, castillo=False, liwc=False, rst=False, stfn=True) + get_baselines_classificaton_result("gossipcop", tpnf=True, castillo=False, liwc=False, rst=False, stfn=False) + + # get_baselines_classificaton_result("gossipcop", tpnf=False, castillo=False, liwc=False, rst=False, stfn=True) + + # get_baseline_modification_classificaton_result("gossipcop") + + # get_domain_adaptation_classification_results("gossipcop", "politifact", tpnf=True, stfn=False, liwc=False, rst=False) # feature_array = get_castillo_features("politifact") # num_samples = int(feature_array.shape[0]/2) # np.savetxt("fake_castillo_features.csv", feature_array[:num_samples], delimiter=",") - # np.savetxt("real_castillo_features.csv", feature_array[num_samples+1:], delimiter=",") + # np.savetxt("real_castillo_features.csv", feature _array[num_samples+1:], delimiter=",") # - # dump_random_forest_feature_importance(feature_array) \ No newline at end of file + # dump_random_forest_feature_importance(feature_array) diff --git a/baseline_feature_extraction.py b/baseline_feature_extraction.py index 36a1412..9939753 100644 --- a/baseline_feature_extraction.py +++ b/baseline_feature_extraction.py @@ -246,6 +246,121 @@ def dump_ordered_rst_representation(rst_folder, news_source, fake_out_file, real f_out.close() +def dump_both_ordered_rst_representation(rst_folder1, rst_folder2, news_source1, news_source2,fake_out_file1, + fake_out_file2, real_out_file1, real_out_file2): + dir_path = rst_folder1 + + all_relations = set() + org_files = [] + + org_files.extend([join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f))]) + + dir_path = rst_folder2 + + 
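The new `get_domain_adaptation_classification_results` trains on one news source's features and evaluates on the other's test split. A minimal sketch of that cross-domain protocol, with random placeholder features and a single `RandomForestClassifier` standing in for the classifier set used by `get_basic_model_results`:

```python
# Cross-domain evaluation sketch: fit on the source domain, test on the target.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)

def make_domain(n_samples, n_features, domain_shift=0.0):
    # First half of the rows plays the "fake" class (label 1), the second half
    # "real" (0), mirroring how the repo stacks fake features above real ones.
    X = rng.normal(loc=domain_shift, size=(n_samples, n_features))
    X[: n_samples // 2] += 0.5  # give the fake class a detectable signal
    y = np.concatenate([np.ones(n_samples // 2), np.zeros(n_samples // 2)])
    return X, y

X_source, y_source = make_domain(400, 20)                     # e.g. gossipcop features
X_target, y_target = make_domain(200, 20, domain_shift=0.3)   # e.g. politifact features

# Train only on the source split, test only on the target split,
# as in get_basic_model_results(S_X_train, T_X_test, S_y_train, T_y_test).
S_X_train, _, S_y_train, _ = train_test_split(X_source, y_source, test_size=0.2, random_state=0)
_, T_X_test, _, T_y_test = train_test_split(X_target, y_target, test_size=0.2, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(S_X_train, S_y_train)
y_pred = clf.predict(T_X_test)
print("cross-domain accuracy: {:.3f}".format(accuracy_score(T_y_test, y_pred)))
print("cross-domain F1      : {:.3f}".format(f1_score(T_y_test, y_pred)))
```

Because the classifier never sees target-domain training data, any drop relative to the within-domain numbers indicates how well the propagation features transfer across news sources.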
org_files.extend([join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f))]) + + News_RSTFeats = dict() + for file_name in org_files: + ID = file_name[file_name.rindex("/")+1:file_name.index('.txt')] + # file_name = dir_path + '/' + of + relation_num = dict() + with open(file_name) as f_rst: + for line in f_rst: + line = line.replace('\'', '') + line = line.replace(' ', '') + tmp_relation = line.split(',')[3] + relation = tmp_relation[:-2] + all_relations.add(relation) + if relation in relation_num: + num = relation_num[relation] + num += 1 + relation_num[relation] = num + else: + relation_num[relation] = 1 + News_RSTFeats[ID] = relation_num + + + + all_relations = list(all_relations) + print(all_relations) + + fake_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source1, "fake"), "rb")) + + f_out = open(fake_out_file1, 'w+') + for news_id in fake_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + fake_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source2, "fake"), "rb")) + + f_out = open(fake_out_file2, 'w+') + for news_id in fake_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + real_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source1, "real"), "rb")) + + f_out = open(real_out_file1, 'w+') + for news_id in real_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + real_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source2, "real"), "rb")) + + f_out = open(real_out_file2, 'w+') + for news_id in real_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + if __name__ == "__main__": # get_news_ids_used_for_propagation_network("politifact") @@ -254,10 +369,18 @@ def dump_ordered_rst_representation(rst_folder, news_source, fake_out_file, real news_source = "gossipcop" - dump_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/gossipcop",news_source, - "data/baseline_features/rst/raw_parsed_data/gossipcop_fake_rst_features.csv", - "data/baseline_features/rst/raw_parsed_data/gossipcop_real_rst_features.csv" - ) + dump_both_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/politifact", + "data/baseline_features/rst/raw_parsed_data/gossipcop", + "politifact", "gossipcop", + "data/baseline_features/rst_both/raw_parsed_data/politifact_fake_rst_features.csv", 
+ "data/baseline_features/rst_both/raw_parsed_data/gossipcop_fake_rst_features.csv", + "data/baseline_features/rst_both/raw_parsed_adata/politifact_real_rst_features.csv", + "data/baseline_features/rst_both/raw_parsed_data/gossipcop_real_rst_features.csv") + + # dump_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/gossipcop", news_source, + # "data/baseline_features/rst/raw_parsed_data/gossipcop_fake_rst_features.csv", + # "data/baseline_features/rst/raw_parsed_data/gossipcop_real_rst_features.csv" + # ) # dump_LIWC_Representation("data/baseline_features/liwc_features/LIWC2015_{}_fake_text_contents_ordered_new.txt".format(news_source), # "data/baseline_features/liwc_features/{}_fake_liwc.csv".format(news_source)) diff --git a/basic_model.py b/basic_model.py index a99ba19..f8e374c 100644 --- a/basic_model.py +++ b/basic_model.py @@ -1,15 +1,16 @@ +import matplotlib import numpy as np - -from sklearn import preprocessing, svm, clone +from sklearn import preprocessing, svm from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier -from construct_sample_features import get_TPNF_dataset, get_train_test_split, get_dataset_feature_names - -import matplotlib +from analysis_util import get_propagation_graphs, equal_samples +from construct_sample_features import get_TPNF_dataset, get_train_test_split, get_dataset_feature_names, \ + filter_propagation_graphs, get_nx_propagation_graphs +from structure_temp_analysis import ScienceCascadeFeatureHelper matplotlib.use('agg') import matplotlib.pyplot as plt @@ -23,7 +24,7 @@ def get_classifier_by_name(classifier_name): elif classifier_name == "DecisionTreeClassifier": return DecisionTreeClassifier() elif classifier_name == "RandomForestClassifier": - return RandomForestClassifier(n_estimators=100) + return RandomForestClassifier(n_estimators=50) elif classifier_name == "SVM -linear kernel": return svm.SVC(kernel='linear') @@ -89,8 +90,9 @@ def get_basic_model_results(X_train, X_test, y_train, y_test): X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) - classifiers = [GaussianNB(), LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), - svm.SVC(kernel='linear')] + classifiers = [GaussianNB(), LogisticRegression(), DecisionTreeClassifier(), + RandomForestClassifier(n_estimators=100), + svm.SVC()] classifier_names = ["GaussianNB", "LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier", "SVM -linear kernel"] @@ -99,16 +101,59 @@ def get_basic_model_results(X_train, X_test, y_train, y_test): train_model(classifier_names[idx], X_train, X_test, y_train, y_test) -def get_classificaton_results_tpnf(data_dir, news_source): - include_micro = True +def get_classificaton_results_tpnf(data_dir, news_source, time_interval, use_cache=False): + include_micro = False include_macro = True include_structural = True - include_temporal = False - include_linguistic = False + include_temporal = True + include_linguistic = True sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, time_interval, use_cache=use_cache) + + print("Sample feature array dimensions") + print(sample_feature_array.shape, flush=True) + + num_samples = 
int(len(sample_feature_array) / 2) + target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) + + X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) + get_basic_model_results(X_train, X_test, y_train, y_test) + + +def get_science_dataset_array_time_based(news_source, time_interval=None): + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/nx_network_data", news_source) + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + feature_helper = ScienceCascadeFeatureHelper() + include_micro = False + include_macro = True + + if time_interval is not None: + time_limit = time_interval * 60 * 60 + + print("Time limit in seconds : {}".format(time_limit)) + + fake_prop_graph = filter_propagation_graphs(fake_prop_graph, time_limit, reply=False) + real_prop_graph = filter_propagation_graphs(real_prop_graph, time_limit, reply=False) + + print("After time based filtering ") + print("No. of fake samples : {} No. of real samples: {}".format(len(fake_prop_graph), len(real_prop_graph))) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_features = feature_helper.get_features_array(fake_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="fake", use_cache=False) + real_features = feature_helper.get_features_array(real_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="real", use_cache=False) + + return np.concatenate([fake_features, real_features]) + + +def get_classificaton_results_stnf(news_source, time_interval=None): + sample_feature_array = get_science_dataset_array_time_based(news_source, time_interval) print("Sample feature array dimensions") print(sample_feature_array.shape, flush=True) @@ -139,9 +184,9 @@ def dump_random_forest_feature_importance(data_dir, news_source): include_linguistic = True sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, use_cache=True) - sample_feature_array = sample_feature_array[:,:-1] + sample_feature_array = sample_feature_array[:, :-1] feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, include_temporal, include_linguistic) @@ -173,103 +218,60 @@ def dump_random_forest_feature_importance(data_dir, news_source): for f in range(X_train.shape[1]): print("%d. 
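`get_classificaton_results_tpnf` and `get_classificaton_results_stnf` both end in the same evaluation loop: standardize the feature matrix, fit each baseline classifier listed in `get_basic_model_results`, and report the accuracy/precision/recall/F1 metrics imported at the top of the file. A self-contained sketch of that loop on synthetic features:

```python
# Scale the features, then benchmark the baseline classifiers used in the repo.
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(600, 25))
y = np.concatenate([np.ones(300), np.zeros(300)])
X[y == 1] += 0.4  # weak class signal so the demo is non-trivial

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

classifiers = {
    "GaussianNB": GaussianNB(),
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "SVM": svm.SVC(),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("{:<25} acc={:.3f} prec={:.3f} rec={:.3f} f1={:.3f}".format(
        name,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred)))
```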
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) + matplotlib.rcParams['figure.figsize'] = 5, 2 + # Plot the feature importances of the forest plt.figure() - plt.title("Feature importances - PolitiFact dataset") - - + # plt.title("Feature importances - PolitiFact dataset") plt.bar(range(X_train.shape[1]), importances[indices], color="b", yerr=std[indices], align="center") - plt.xticks(range(X_train.shape[1]), np.array(short_feature_names)[indices], rotation=60, fontsize=9) + plt.xticks(range(X_train.shape[1]), np.array(short_feature_names)[indices], rotation=75, fontsize=9.5) plt.xlim([-1, X_train.shape[1]]) plt.savefig('{}_feature_importance.png'.format(news_source), bbox_inches='tight') plt.show() -# def dump_random_forest_feature_importance(sample_feature_array): -# include_micro = True -# include_macro = True -# -# include_structural = True -# include_temporal = True -# include_linguistic = True -# -# feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# num_samples = int(len(sample_feature_array) / 2) -# target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) -# -# X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) -# -# # scaler = preprocessing.StandardScaler().fit(X_train) -# # -# # X_train = scaler.transform(X_train) -# # X_test = scaler.transform(X_test) -# -# # Build a forest and compute the feature importances -# -# forest = ExtraTreesClassifier(n_estimators=100, -# random_state=0) -# -# forest.fit(X_train, y_train) -# importances = forest.feature_importances_ -# std = np.std([tree.feature_importances_ for tree in forest.estimators_], -# axis=0) -# indices = np.argsort(importances)[::-1] -# -# # Print the feature ranking -# print("Feature ranking:") -# -# for f in range(X_train.shape[1]): -# print("%d. 
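The reworked `dump_random_forest_feature_importance` fits an `ExtraTreesClassifier`, averages the per-tree importances, and plots them with the across-tree standard deviation as error bars. A compact sketch using synthetic features and generic feature names:

```python
# Feature-importance bar plot in the style of dump_random_forest_feature_importance.
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 10))
y = (X[:, 0] + 0.5 * X[:, 3] + rng.normal(scale=0.5, size=500) > 0).astype(int)
feature_names = ["f{}".format(i) for i in range(X.shape[1])]

forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first

plt.figure(figsize=(5, 2))
plt.bar(range(X.shape[1]), importances[indices], color="b", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), np.array(feature_names)[indices], rotation=75, fontsize=9.5)
plt.xlim([-1, X.shape[1]])
plt.savefig("demo_feature_importance.png", bbox_inches="tight")
```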
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) -# -# # Plot the feature importances of the forest -# plt.figure() -# plt.title("Feature importances - Politifact dataset") -# plt.bar(range(X_train.shape[1]), importances[indices], -# color="b", yerr=std[indices], align="center") -# plt.xticks(range(X_train.shape[1]), indices, rotation=60) -# plt.xlim([-1, X_train.shape[1]]) -# plt.savefig('feature_importance.png', bbox_inches='tight') -# -# plt.show() - - -# def dump_feature_importance(data_dir, news_source): -# include_micro = True -# include_macro = True -# -# include_structural = True -# include_temporal = True -# include_linguistic = True -# -# sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# num_samples = int(len(sample_feature_array) / 2) -# target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) -# -# X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) -# -# scaler = preprocessing.StandardScaler().fit(X_train) -# -# X_train = scaler.transform(X_train) -# X_test = scaler.transform(X_test) -# -# classifier = svm.SVC(kernel='linear') -# classifier.fit(X_train, y_train) -# -# plot_feature_importances(classifier.coef_.ravel(), short_feature_names) +def get_science_dataset_array(news_source): + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + feature_helper = ScienceCascadeFeatureHelper() + include_micro = False + include_macro = True + + fake_features = feature_helper.get_features_array(fake_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="fake") + real_features = feature_helper.get_features_array(real_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="real") + + return np.concatenate([fake_features, real_features]) if __name__ == "__main__": - get_classificaton_results_tpnf("data/train_test_data", "gossipcop") + get_classificaton_results_tpnf("data/train_test_data", "gossipcop", time_interval=None, use_cache=False) + + # get_classificaton_results_stnf( "politifact", time_interval=None) + + # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval = None) + + # exit(1) + + time_intervals = [3, 6, 12, 24, 36, 48, 60, 72, 84, 96] + # time_intervals = [3, 6] + # time_intervals = [None] + + # for time_interval in time_intervals: + # print("=============Time Interval : {} ==========".format(time_interval)) + # start_time = time.time() + # # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # + # get_classificaton_results_stnf("politifact", time_interval) + # print("\n\n================Exectuion time - {} ==================================\n".format( + # time.time() - start_time)) # dump_feature_importance("data/train_test_data", "politifact") - # dump_random_forest_feature_importance("data/train_test_data", "politifact") + # dump_random_forest_feature_importance("data/train_test_data", "gossipcop") diff --git a/castillo_features.py 
b/castillo_features.py index 6de25a6..c8e05b7 100644 --- a/castillo_features.py +++ b/castillo_features.py @@ -1,830 +1,830 @@ -import pickle -from os import listdir -from os.path import isfile, join -from pathlib import Path - -from pymongo import MongoClient -from datetime import datetime -import networkx as nx -import numpy as np -from random import shuffle -from sklearn.svm import SVC -from sklearn.model_selection import cross_val_score -import re -from tqdm import tqdm -# from sklearn.ensemble import RandomForestClassifier -# import json -# import random -# from nltk.tokenize import RegexpTokenizer -# from stop_words import get_stop_words -# from nltk.stem.porter import PorterStemmer -# from gensim import corpora -# import gensim -# from sklearn.model_selection import cross_validate -# from sklearn.dummy import DummyClassifier -# from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score -# from sklearn.model_selection import train_test_split -# from sklearn import preprocessing - -from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, create_dir -from misc_process import get_reply_of_replies -from pre_process_util import get_news_articles, load_configuration, get_database_connection -from structure_temp_analysis import StructureFeatureHelper - -all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" - .format("data/pre_process_data/vader_sentiment"), "rb")) - - -# def content_featureAgg(tweets): -# # Current version tweets content are almost the same, not distinguishable -# -# return [] -# -# def networkFeatureAgg(users,user_followers_coll,user_followees_coll): -# -# user_index = dict() -# for i in range(len(users)): -# user_index[users[i]]=i -# -# edge_list = set() -# for au in tqdm(users): -# user_name = au -# all_follower_tmp = list(user_followers_coll.find({'user_name': user_name})) -# if len(all_follower_tmp)!=0: -# all_followers = all_follower_tmp[0]['followers'] -# for aft in all_followers: -# if aft['screen_name'] in user_index: -# edge_list.add((user_name,aft['screen_name'])) -# -# all_followee_tmp = list(user_followees_coll.find({'user_name':user_name})) -# if len(all_followee_tmp)!=0: -# all_followees = all_followee_tmp[0]['followees'] -# for aft in all_followees: -# if aft['screen_name'] in user_index: -# edge_list.add((aft['screen_name'],user_name)) -# G=nx.Graph() -# G.add_edges_from(edge_list) -# node_num = G.number_of_nodes() -# link_num = G.number_of_edges() -# if node_num==0: -# density=0 -# cc=0 -# avg_degree=0 -# else: -# density = link_num/(float(node_num)*float(node_num)) -# cc = nx.average_clustering(G) -# degrees = G.degree() -# avg_degree = sum(degrees.values())/len(degrees.values()) -# return [node_num,link_num,density,cc,avg_degree] -# -# def getSocialEngagements(db,datasource): -# f_out = open('./'+datasource+'/SocialFeats.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] -# else: -# user_profiles_coll = db['TwitterUserProfile2'] -# if datasource=='BuzzFeed': -# user_followers_coll = db['TwitterUserFollowers'] -# else: -# user_followers_coll = db['TwitterUserFollowers2'] -# if datasource=='BuzzFeed': -# user_followees_coll = db['TwitterUserFollowees'] -# else: -# user_followees_coll = db['TwitterUserFollowees2'] -# news_tweets = dict() -# news_users = dict() -# # Fake News / Real News +# import pickle +# from os import listdir +# from os.path import isfile, join +# from pathlib import Path +# +# from pymongo import MongoClient +# 
from datetime import datetime +# import networkx as nx +# import numpy as np +# from random import shuffle +# from sklearn.svm import SVC +# from sklearn.model_selection import cross_val_score +# import re +# from tqdm import tqdm +# # from sklearn.ensemble import RandomForestClassifier +# # import json +# # import random +# # from nltk.tokenize import RegexpTokenizer +# # from stop_words import get_stop_words +# # from nltk.stem.porter import PorterStemmer +# # from gensim import corpora +# # import gensim +# # from sklearn.model_selection import cross_validate +# # from sklearn.dummy import DummyClassifier +# # from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score +# # from sklearn.model_selection import train_test_split +# # from sklearn import preprocessing +# +# from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, create_dir +# from misc_process import get_reply_of_replies +# from pre_process_util import get_news_articles, load_configuration, get_database_connection +# from structure_temp_analysis import StructureFeatureHelper +# +# all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" +# .format("data/pre_process_data/vader_sentiment"), "rb")) +# +# +# # def content_featureAgg(tweets): +# # # Current version tweets content are almost the same, not distinguishable +# # +# # return [] +# # +# # def networkFeatureAgg(users,user_followers_coll,user_followees_coll): +# # +# # user_index = dict() +# # for i in range(len(users)): +# # user_index[users[i]]=i +# # +# # edge_list = set() +# # for au in tqdm(users): +# # user_name = au +# # all_follower_tmp = list(user_followers_coll.find({'user_name': user_name})) +# # if len(all_follower_tmp)!=0: +# # all_followers = all_follower_tmp[0]['followers'] +# # for aft in all_followers: +# # if aft['screen_name'] in user_index: +# # edge_list.add((user_name,aft['screen_name'])) +# # +# # all_followee_tmp = list(user_followees_coll.find({'user_name':user_name})) +# # if len(all_followee_tmp)!=0: +# # all_followees = all_followee_tmp[0]['followees'] +# # for aft in all_followees: +# # if aft['screen_name'] in user_index: +# # edge_list.add((aft['screen_name'],user_name)) +# # G=nx.Graph() +# # G.add_edges_from(edge_list) +# # node_num = G.number_of_nodes() +# # link_num = G.number_of_edges() +# # if node_num==0: +# # density=0 +# # cc=0 +# # avg_degree=0 +# # else: +# # density = link_num/(float(node_num)*float(node_num)) +# # cc = nx.average_clustering(G) +# # degrees = G.degree() +# # avg_degree = sum(degrees.values())/len(degrees.values()) +# # return [node_num,link_num,density,cc,avg_degree] +# # +# # def getSocialEngagements(db,datasource): +# # f_out = open('./'+datasource+'/SocialFeats.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile2'] +# # if datasource=='BuzzFeed': +# # user_followers_coll = db['TwitterUserFollowers'] +# # else: +# # user_followers_coll = db['TwitterUserFollowers2'] +# # if datasource=='BuzzFeed': +# # user_followees_coll = db['TwitterUserFollowees'] +# # else: +# # user_followees_coll = db['TwitterUserFollowees2'] +# # news_tweets = dict() +# # news_users = dict() +# # # Fake News / Real News +# # +# # if datasource=='BuzzFeed': +# # dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' +# # else: +# # dir_path = './Crawler/PolitiFact/PolitiFactTwitterResult' +# # org_files = [f for f in listdir(dir_path) if 
isfile(join(dir_path, f))] +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # tweets.append(tweet_json['text']) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if len(users)>150: +# # users = users[:150] +# # user_features = userFeatureAgg(users, user_profiles_coll) +# # content_features = content_featureAgg(tweets) +# # network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) +# # +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(network_features) +# # f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') +# # print k+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def getSocialEngagementsEarly(db,datasource,delta): +# # early_users = dict() +# # with open('./'+datasource+'/Early/User_'+delta+'.txt') as f_users: +# # for line in f_users: +# # line = line.strip() +# # early_users[line]=1 +# # +# # f_out = open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile2'] +# # if datasource=='BuzzFeed': +# # user_followers_coll = db['TwitterUserFollowers'] +# # else: +# # user_followers_coll = db['TwitterUserFollowers2'] +# # if datasource=='BuzzFeed': +# # user_followees_coll = db['TwitterUserFollowees'] +# # else: +# # user_followees_coll = db['TwitterUserFollowees2'] +# # news_tweets = dict() +# # news_users = dict() +# # # Fake News / Real News +# # +# # if datasource=='BuzzFeed': +# # dir_path = './Crawler/BuzzFeedCrawler/TwitterResult' +# # else: +# # dir_path = './Crawler/PolitiFact/RealTwitterResult' +# # org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # if tweet_json['username'] not in early_users: +# # continue +# # tweets.append(tweet_json['text']) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if len(users)>150: +# # users = users[:150] +# # user_features = userFeatureAgg(users, user_profiles_coll) +# # content_features = content_featureAgg(tweets) +# # network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) +# # +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(network_features) +# # f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') +# # print k+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def userFeature(user, user_profiles_coll): +# # if list(user_profiles_coll.find({'screen_name':user})) ==[]: +# # return [0,0,0,0] +# # tmp = list(user_profiles_coll.find({'screen_name':user}))[0] +# # pnum = tmp['statuses_count'] +# # fnum = tmp['friends_count'] +# # fonum = tmp['followers_count'] +# # create_time = tmp['created_at'] +# # verified = 
tmp['verified'] +# # if verified==False: +# # verified=0 +# # else: +# # verified=1 +# # date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') +# # today = datetime.now() +# # dregister =(today-date_create).days +# # return [pnum,fnum,fonum,dregister,verified] +# # +# # def content_feature(tweet): +# # topic_feature = [] +# # url_num = len([m for m in re.finditer('http', tweet)]) +# # question_flag = 0 +# # if '?' in tweet: +# # question_flag=1 +# # mention_num = len([m for m in re.finditer('@', tweet)]) +# # retweet_count=0 +# # try: +# # retweet_count = float(tweet.split(':::')[1]) +# # except: +# # pass +# # +# # return [url_num,question_flag,mention_num,retweet_count] +# # +# # def getTopicFeature(tweets, num_topic): +# # doc_set = [] +# # for entry in tweets: +# # try: +# # doc_set.append(entry.split(':::')[0]) +# # except: +# # pass +# # +# # texts = [] +# # tokenizer = RegexpTokenizer(r'\w+') +# # en_stop = get_stop_words('en') +# # p_stemmer = PorterStemmer() +# # for i in doc_set: +# # # clean and tokenize document string +# # raw = i.lower() +# # # Filter http +# # raw = raw.replace('http','') +# # tokens = tokenizer.tokenize(raw) +# # # remove stop words from tokensk +# # stopped_tokens = [i for i in tokens if not i in en_stop] +# # # stem tokens +# # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] +# # # add tokens to list +# # texts.append(stemmed_tokens) +# # +# # dictionary = corpora.Dictionary(texts) +# # # convert tokenized documents into a document-term matrix +# # corpus = [dictionary.doc2bow(text) for text in texts] +# # # generate LDA model +# # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=20) +# # +# # topic_distribution = [] +# # for c in corpus: +# # +# # dis = ldamodel[c] +# # tmp_dis = [0 for i in range(num_topic)] +# # for d in dis: +# # tmp_dis[d[0]]=d[1] +# # topic_distribution.append(tmp_dis) +# # return topic_distribution +# # +# # def TweetLevelFeaturs(db): +# # f_out = open('./'+datasource+'/TweetLevelFeatsReal.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile1'] +# # # Fake News / Real News +# # dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' +# # org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# # news_tweets = dict() +# # news_users = dict() +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # tweets.append(tweet_json['text']+':::'+str(tweet_json['retweets'])+':::'+str(tweet_json['id'])) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if 'Real' in k: +# # tw_label='1' ### Using 1 as high credibility +# # else: +# # tw_label='-1' +# # +# # Topic_feats = getTopicFeature(tweets,10) +# # +# # for i in range(len(users)): +# # user = users[i] +# # tweet = tweets[i] +# # tid = tweet.split(':::')[2] +# # user_features = userFeature(user,user_profiles_coll) +# # content_features = content_feature(tweet) +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(Topic_feats[i]) +# # f_out.write(tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in 
all_feats)+'\n') +# # print tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def Castillo11(datasource,delta): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/Early/SocialFeats'+delta+'.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # clf = SVC(kernel='linear', class_weight='balanced') +# # # clf = RandomForestClassifier() +# # scoring = ['accuracy','precision', 'recall', 'f1'] +# # print '***'+delta+'***' +# # res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring=scoring) +# # print '\t'.join([str(x) for x in res['test_accuracy']]) +# # # print '\t'.join([str(x) for x in res['test_precision']]) +# # # print '\t'.join([str(x) for x in res['test_recall']]) +# # print '\t'.join([str(x) for x in res['test_f1']]) +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='accuracy') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='f1') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # def Castillo11_2(datasource): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # +# # # X = preprocessing.normalize(X) +# # # clf = RandomForestClassifier() +# # +# # train_sizes = [0.2,0.4,0.6,0.8] +# # for ts in train_sizes: +# # acc = [] +# # prec = [] +# # recall = [] +# # f1 = [] +# # for i in range(3): +# # clf = SVC(kernel='linear', class_weight='balanced') +# # X_train, X_test, y_train, y_test = 
train_test_split(X,y,train_size = ts) +# # clf.fit(X_train,y_train) +# # y_pred = clf.predict(X_test) +# # acc.append(accuracy_score(y_test, y_pred)) +# # prec.append(precision_score(y_test, y_pred)) +# # recall.append(recall_score(y_test, y_pred)) +# # f1.append(f1_score(y_test, y_pred)) +# # +# # print "", sum(acc)/len(acc) +# # print "", sum(prec)/len(prec) +# # print "", sum(recall)/len(recall) +# # print "", sum(f1)/len(f1) +# # print "" +# # +# # def balanced_subsample(x,y,id,subsample_size=1.0): +# # +# # class_xs = [] +# # min_elems = None +# # +# # for yi in np.unique(y): +# # elems = x[(y == yi)] +# # class_xs.append((yi, elems)) +# # if min_elems == None or elems.shape[0] < min_elems: +# # min_elems = elems.shape[0] +# # +# # use_elems = min_elems +# # if subsample_size < 1: +# # use_elems = int(min_elems*subsample_size) +# # +# # xs = [] +# # ys = [] +# # +# # for ci,this_xs in class_xs: +# # if len(this_xs) > use_elems: +# # np.random.shuffle(this_xs) +# # +# # x_ = this_xs[:use_elems] +# # y_ = np.empty(use_elems) +# # y_.fill(ci) +# # +# # xs.append(x_) +# # ys.append(y_) +# # +# # xs = np.concatenate(xs) +# # ys = np.concatenate(ys) +# # +# # return xs,ys +# # +# # def TweetLevelPredict(): +# # all_X = [] +# # all_y = [] +# # all_tid = [] +# # with open('./BuzzFeed/TweetLevelFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # tid = line.split('\t')[0] +# # label = line.split('\t')[1] +# # feats = [float(x) for x in line.split('\t')[2:]] +# # all_X.append(feats) +# # all_y.append(label) +# # all_tid.append(tid) +# # with open('./BuzzFeed/TweetLevelFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # label = line.split('\t')[1] +# # tid = line.split('\t')[0] +# # feats = [float(x) for x in line.split('\t')[2:]] +# # all_X.append(feats) +# # all_y.append(label) +# # all_tid.append(tid) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # tid = np.array(all_tid) +# # Xs,ys = balanced_subsample(X,y,0.01) +# # arry = range(Xs.shape[0]) +# # shuffle(arry) +# # Xs = Xs[arry, :] +# # ys= ys[arry] +# # +# # # clf = RandomForestClassifier(max_depth=2,random_state=0) +# # clf = SVC(kernel='linear', class_weight='balanced',probability=True) +# # # res = cross_val_score(estimator=clf, X=Xs, y=ys, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # clf.fit(Xs,ys) +# # y_predict = clf.predict(X) +# # print 'Accuracy ' +# # +# # def Dummy(datasource): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # clf = DummyClassifier(constant=1) +# # scoring = ['accuracy','precision', 'recall', 'f1'] +# # res = 
cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring=scoring) +# # +# # +# # print '\t'.join([str(x) for x in res['test_accuracy']]) +# # print '\t'.join([str(x) for x in res['test_precision']]) +# # print '\t'.join([str(x) for x in res['test_recall']]) +# # print '\t'.join([str(x) for x in res['test_f1']]) +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# +# +# def get_message_based_features(reply_id_content_dict): +# num_words = [] +# num_urls = [] +# question_mark_nums = [] +# num_mentions = [] +# +# for reply_id, content in reply_id_content_dict.items(): +# url_num = len([m for m in re.finditer('http', content)]) +# question_flag = 0 +# if '?' in content: +# question_flag = 1 +# mention_num = len([m for m in re.finditer('@', content)]) +# num_word = len(content.split()) +# +# num_words.append(num_word) +# num_urls.append(url_num) +# question_mark_nums.append(question_flag) +# num_mentions.append(mention_num) # -# if datasource=='BuzzFeed': -# dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' -# else: -# dir_path = './Crawler/PolitiFact/PolitiFactTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# tweets.append(tweet_json['text']) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if len(users)>150: -# users = users[:150] -# user_features = userFeatureAgg(users, user_profiles_coll) -# content_features = content_featureAgg(tweets) -# network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) -# -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(network_features) -# f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print k+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def getSocialEngagementsEarly(db,datasource,delta): -# early_users = dict() -# with open('./'+datasource+'/Early/User_'+delta+'.txt') as f_users: -# for line in f_users: -# line = line.strip() -# early_users[line]=1 -# -# f_out = open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] -# else: -# user_profiles_coll = db['TwitterUserProfile2'] -# if datasource=='BuzzFeed': -# user_followers_coll = db['TwitterUserFollowers'] -# else: -# user_followers_coll = db['TwitterUserFollowers2'] -# if datasource=='BuzzFeed': -# user_followees_coll = db['TwitterUserFollowees'] -# else: -# user_followees_coll = db['TwitterUserFollowees2'] -# news_tweets = dict() -# 
news_users = dict() -# # Fake News / Real News +# try: +# mean_num_words = np.mean(num_words) +# except: +# mean_num_words = 0 +# +# try: +# mean_num_urls = np.mean(num_urls) +# except: +# mean_num_urls = 0 # -# if datasource=='BuzzFeed': -# dir_path = './Crawler/BuzzFeedCrawler/TwitterResult' -# else: -# dir_path = './Crawler/PolitiFact/RealTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# if tweet_json['username'] not in early_users: -# continue -# tweets.append(tweet_json['text']) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if len(users)>150: -# users = users[:150] -# user_features = userFeatureAgg(users, user_profiles_coll) -# content_features = content_featureAgg(tweets) -# network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) -# -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(network_features) -# f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print k+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def userFeature(user, user_profiles_coll): -# if list(user_profiles_coll.find({'screen_name':user})) ==[]: -# return [0,0,0,0] -# tmp = list(user_profiles_coll.find({'screen_name':user}))[0] -# pnum = tmp['statuses_count'] -# fnum = tmp['friends_count'] -# fonum = tmp['followers_count'] -# create_time = tmp['created_at'] -# verified = tmp['verified'] -# if verified==False: -# verified=0 -# else: -# verified=1 -# date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') -# today = datetime.now() -# dregister =(today-date_create).days -# return [pnum,fnum,fonum,dregister,verified] -# -# def content_feature(tweet): -# topic_feature = [] -# url_num = len([m for m in re.finditer('http', tweet)]) -# question_flag = 0 -# if '?' 
in tweet: -# question_flag=1 -# mention_num = len([m for m in re.finditer('@', tweet)]) -# retweet_count=0 # try: -# retweet_count = float(tweet.split(':::')[1]) +# mean_question_mark_nums = np.mean(question_mark_nums) # except: -# pass -# -# return [url_num,question_flag,mention_num,retweet_count] -# -# def getTopicFeature(tweets, num_topic): -# doc_set = [] -# for entry in tweets: -# try: -# doc_set.append(entry.split(':::')[0]) -# except: -# pass -# -# texts = [] -# tokenizer = RegexpTokenizer(r'\w+') -# en_stop = get_stop_words('en') -# p_stemmer = PorterStemmer() -# for i in doc_set: -# # clean and tokenize document string -# raw = i.lower() -# # Filter http -# raw = raw.replace('http','') -# tokens = tokenizer.tokenize(raw) -# # remove stop words from tokensk -# stopped_tokens = [i for i in tokens if not i in en_stop] -# # stem tokens -# stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] -# # add tokens to list -# texts.append(stemmed_tokens) -# -# dictionary = corpora.Dictionary(texts) -# # convert tokenized documents into a document-term matrix -# corpus = [dictionary.doc2bow(text) for text in texts] -# # generate LDA model -# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=20) -# -# topic_distribution = [] -# for c in corpus: -# -# dis = ldamodel[c] -# tmp_dis = [0 for i in range(num_topic)] -# for d in dis: -# tmp_dis[d[0]]=d[1] -# topic_distribution.append(tmp_dis) -# return topic_distribution -# -# def TweetLevelFeaturs(db): -# f_out = open('./'+datasource+'/TweetLevelFeatsReal.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] +# mean_question_mark_nums = 0 +# +# try: +# mean_num_mentions = np.mean(num_mentions) +# except: +# mean_num_mentions = 0 +# +# return [mean_num_words, mean_num_urls, mean_question_mark_nums, mean_num_mentions] +# +# +# def get_content_topic_based_features(reply_id_content_dict): +# positive_words = [] +# negative_words = [] +# neutral_words = [] +# sentiment_scores = [] +# reply_lengths = [] +# +# for reply_id, content in reply_id_content_dict.items(): +# if reply_id in reply_id_content_dict: +# sentiment_info = all_reply_id_sentiment_score_dict[reply_id] +# positive_words.append(sentiment_info["pos"]) +# negative_words.append(sentiment_info["neg"]) +# neutral_words.append(sentiment_info["neu"]) +# sentiment_scores.append(sentiment_info["compound"]) +# reply_lengths.append(len(content)) +# +# try: +# mean_positive_words = np.mean(positive_words) +# except: +# mean_positive_words = 0 +# +# try: +# mean_negative_words = np.mean(negative_words) +# except: +# mean_negative_words = 0 +# +# try: +# mean_neutral_words = np.mean(neutral_words) +# except: +# mean_neutral_words = 0 +# +# try: +# mean_sentiment_score = np.mean(sentiment_scores) +# except: +# mean_sentiment_score = 0 +# +# try: +# mean_reply_length = np.mean(reply_lengths) +# except: +# mean_reply_length = 0 +# +# return [len(reply_id_content_dict), mean_positive_words, mean_negative_words, mean_neutral_words, +# mean_sentiment_score, mean_reply_length] +# +# +# def get_user_aggregate_features(db, is_fake, user_ids): +# posts_num = [] +# friends_num = [] +# followers_num = [] +# days_register = [] +# +# if is_fake: +# label_user_collection = db.fake_twitter_user_profile # else: -# user_profiles_coll = db['TwitterUserProfile1'] -# # Fake News / Real News -# dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# 
news_tweets = dict() -# news_users = dict() -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# tweets.append(tweet_json['text']+':::'+str(tweet_json['retweets'])+':::'+str(tweet_json['id'])) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if 'Real' in k: -# tw_label='1' ### Using 1 as high credibility +# label_user_collection = db.real_twitter_user_profile +# +# user_profile_collection = db.twitter_user_profile +# +# # np.random.shuffle(user_ids) +# +# for user_id in tqdm(user_ids): +# +# user_object = label_user_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object is None: +# user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# +# if user_object is None: +# print('user {} not found'.format(user_id)) # else: -# tw_label='-1' -# -# Topic_feats = getTopicFeature(tweets,10) -# -# for i in range(len(users)): -# user = users[i] -# tweet = tweets[i] -# tid = tweet.split(':::')[2] -# user_features = userFeature(user,user_profiles_coll) -# content_features = content_feature(tweet) -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(Topic_feats[i]) -# f_out.write(tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def Castillo11(datasource,delta): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/Early/SocialFeats'+delta+'.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# clf = SVC(kernel='linear', class_weight='balanced') -# # clf = RandomForestClassifier() -# scoring = ['accuracy','precision', 'recall', 'f1'] -# print '***'+delta+'***' -# res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring=scoring) -# print '\t'.join([str(x) for x in res['test_accuracy']]) -# # print '\t'.join([str(x) for x in res['test_precision']]) -# # print '\t'.join([str(x) for x in res['test_recall']]) -# print '\t'.join([str(x) for x in res['test_f1']]) -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='accuracy') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, 
scoring='precision') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# # -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='f1') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# def Castillo11_2(datasource): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# -# # X = preprocessing.normalize(X) -# # clf = RandomForestClassifier() -# -# train_sizes = [0.2,0.4,0.6,0.8] -# for ts in train_sizes: -# acc = [] -# prec = [] -# recall = [] -# f1 = [] -# for i in range(3): -# clf = SVC(kernel='linear', class_weight='balanced') -# X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = ts) -# clf.fit(X_train,y_train) -# y_pred = clf.predict(X_test) -# acc.append(accuracy_score(y_test, y_pred)) -# prec.append(precision_score(y_test, y_pred)) -# recall.append(recall_score(y_test, y_pred)) -# f1.append(f1_score(y_test, y_pred)) -# -# print "", sum(acc)/len(acc) -# print "", sum(prec)/len(prec) -# print "", sum(recall)/len(recall) -# print "", sum(f1)/len(f1) -# print "" -# -# def balanced_subsample(x,y,id,subsample_size=1.0): -# -# class_xs = [] -# min_elems = None -# -# for yi in np.unique(y): -# elems = x[(y == yi)] -# class_xs.append((yi, elems)) -# if min_elems == None or elems.shape[0] < min_elems: -# min_elems = elems.shape[0] -# -# use_elems = min_elems -# if subsample_size < 1: -# use_elems = int(min_elems*subsample_size) -# -# xs = [] -# ys = [] -# -# for ci,this_xs in class_xs: -# if len(this_xs) > use_elems: -# np.random.shuffle(this_xs) -# -# x_ = this_xs[:use_elems] -# y_ = np.empty(use_elems) -# y_.fill(ci) -# -# xs.append(x_) -# ys.append(y_) -# -# xs = np.concatenate(xs) -# ys = np.concatenate(ys) -# -# return xs,ys -# -# def TweetLevelPredict(): -# all_X = [] -# all_y = [] -# all_tid = [] -# with open('./BuzzFeed/TweetLevelFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# tid = line.split('\t')[0] -# label = line.split('\t')[1] -# feats = [float(x) for x in line.split('\t')[2:]] -# all_X.append(feats) -# all_y.append(label) -# all_tid.append(tid) -# with open('./BuzzFeed/TweetLevelFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# label = line.split('\t')[1] -# tid = line.split('\t')[0] -# feats = [float(x) for x in line.split('\t')[2:]] -# all_X.append(feats) -# all_y.append(label) -# all_tid.append(tid) -# X = np.array(all_X) -# y = np.array(all_y) -# tid = np.array(all_tid) -# Xs,ys = balanced_subsample(X,y,0.01) -# arry = range(Xs.shape[0]) -# shuffle(arry) -# Xs = Xs[arry, :] 
-# ys= ys[arry] -# -# # clf = RandomForestClassifier(max_depth=2,random_state=0) -# clf = SVC(kernel='linear', class_weight='balanced',probability=True) -# # res = cross_val_score(estimator=clf, X=Xs, y=ys, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# clf.fit(Xs,ys) -# y_predict = clf.predict(X) -# print 'Accuracy ' -# -# def Dummy(datasource): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# clf = DummyClassifier(constant=1) -# scoring = ['accuracy','precision', 'recall', 'f1'] -# res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring=scoring) -# -# -# print '\t'.join([str(x) for x in res['test_accuracy']]) -# print '\t'.join([str(x) for x in res['test_precision']]) -# print '\t'.join([str(x) for x in res['test_recall']]) -# print '\t'.join([str(x) for x in res['test_f1']]) -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res - - -def get_message_based_features(reply_id_content_dict): - num_words = [] - num_urls = [] - question_mark_nums = [] - num_mentions = [] - - for reply_id, content in reply_id_content_dict.items(): - url_num = len([m for m in re.finditer('http', content)]) - question_flag = 0 - if '?' 
in content: - question_flag = 1 - mention_num = len([m for m in re.finditer('@', content)]) - num_word = len(content.split()) - - num_words.append(num_word) - num_urls.append(url_num) - question_mark_nums.append(question_flag) - num_mentions.append(mention_num) - - try: - mean_num_words = np.mean(num_words) - except: - mean_num_words = 0 - - try: - mean_num_urls = np.mean(num_urls) - except: - mean_num_urls = 0 - - try: - mean_question_mark_nums = np.mean(question_mark_nums) - except: - mean_question_mark_nums = 0 - - try: - mean_num_mentions = np.mean(num_mentions) - except: - mean_num_mentions = 0 - - return [mean_num_words, mean_num_urls, mean_question_mark_nums, mean_num_mentions] - - -def get_content_topic_based_features(reply_id_content_dict): - positive_words = [] - negative_words = [] - neutral_words = [] - sentiment_scores = [] - reply_lengths = [] - - for reply_id, content in reply_id_content_dict.items(): - if reply_id in reply_id_content_dict: - sentiment_info = all_reply_id_sentiment_score_dict[reply_id] - positive_words.append(sentiment_info["pos"]) - negative_words.append(sentiment_info["neg"]) - neutral_words.append(sentiment_info["neu"]) - sentiment_scores.append(sentiment_info["compound"]) - reply_lengths.append(len(content)) - - try: - mean_positive_words = np.mean(positive_words) - except: - mean_positive_words = 0 - - try: - mean_negative_words = np.mean(negative_words) - except: - mean_negative_words = 0 - - try: - mean_neutral_words = np.mean(neutral_words) - except: - mean_neutral_words = 0 - - try: - mean_sentiment_score = np.mean(sentiment_scores) - except: - mean_sentiment_score = 0 - - try: - mean_reply_length = np.mean(reply_lengths) - except: - mean_reply_length = 0 - - return [len(reply_id_content_dict), mean_positive_words, mean_negative_words, mean_neutral_words, - mean_sentiment_score, mean_reply_length] - - -def get_user_aggregate_features(db, is_fake, user_ids): - posts_num = [] - friends_num = [] - followers_num = [] - days_register = [] - - if is_fake: - label_user_collection = db.fake_twitter_user_profile - else: - label_user_collection = db.real_twitter_user_profile - - user_profile_collection = db.twitter_user_profile - - # np.random.shuffle(user_ids) - - for user_id in tqdm(user_ids): - - user_object = label_user_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object is None: - user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - - if user_object is None: - print('user {} not found'.format(user_id)) - else: - if "profile_info" in user_object: - pnum = user_object["profile_info"]['statuses_count'] - fnum = user_object["profile_info"]['friends_count'] - fonum = user_object["profile_info"]['followers_count'] - create_time = user_object["profile_info"]['created_at'] - date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') - today = datetime.now() - dregister = (today - date_create).days - posts_num.append(pnum) - friends_num.append(fnum) - followers_num.append(fonum) - days_register.append(dregister) - - try: - avg_posts_num = sum(posts_num) / len(posts_num) - except: - avg_posts_num = 0 - try: - avg_friends_num = sum(friends_num) / len(friends_num) - except: - avg_friends_num = 0 - try: - avg_followers_num = sum(followers_num) / 
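The message-level helpers in this hunk reduce every reply of a news item to a handful of averaged statistics (word count, URL count, question marks, mentions), guarding against empty reply sets. A compact sketch of that aggregation, assuming reply_id_content_dict maps reply ids to reply text; the sentiment lookups are left out:

# Sketch: Castillo-style message features averaged over the replies of one news item.
# reply_id_content_dict is assumed to map reply id -> reply text.
import re

import numpy as np


def message_based_features(reply_id_content_dict):
    num_words, num_urls, question_flags, num_mentions = [], [], [], []

    for content in reply_id_content_dict.values():
        num_urls.append(len(re.findall(r"http", content)))    # crude URL count
        question_flags.append(1 if "?" in content else 0)     # contains a question
        num_mentions.append(len(re.findall(r"@", content)))   # @-mentions
        num_words.append(len(content.split()))

    def safe_mean(values):
        # Empty reply sets yield 0 instead of NaN, mirroring the try/except guards above.
        return float(np.mean(values)) if values else 0.0

    return [safe_mean(num_words), safe_mean(num_urls),
            safe_mean(question_flags), safe_mean(num_mentions)]


print(message_based_features({"1": "why? @user http://x.co", "2": "looks true"}))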
len(followers_num) - except: - avg_followers_num = 0 - try: - avg_days_register = sum(days_register) / len(days_register) - except: - avg_days_register = 0 - - return [avg_posts_num, avg_friends_num, avg_followers_num, avg_days_register] - - -def get_castillo_features(db, news_source, raw_data_dir, label, prop_graphs): - raw_data = pickle.load(open("{}/{}_{}_castillo_raw_data.pkl".format(raw_data_dir, news_source, label), "rb")) - - all_features = [] - - for news in raw_data: - sample_feature = [] - sample_feature.extend(get_user_aggregate_features(db, label == "fake", news["user_ids"])) - sample_feature.extend(get_message_based_features(news["reply_id_content_dict"])) - sample_feature.extend(get_content_topic_based_features(news["reply_id_content_dict"])) - - all_features.append(sample_feature) - - structure_feature_helper = StructureFeatureHelper() - structure_features = structure_feature_helper.get_features_array(prop_graphs, micro_features=False, - macro_features=True) - - other_features = get_numpy_array(all_features) - structure_features = get_numpy_array(structure_features)[:, [0, 1, 2]] - print("Other features shape") - print(other_features.shape) - - print("Structure features shape") - print(structure_features.shape) - return np.concatenate([other_features, structure_features], axis=1) - - -def dump_castillo_features(db, news_source, raw_data_dir, feature_out_dir, prop_graphs_dir): - fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) - fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) - - create_dir(feature_out_dir) - - fake_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "fake", fake_prop_graph) - real_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "real", real_prop_graph) - - all_castillo_features = np.concatenate([fake_castillo_features, real_castillo_features]) - - print("All castillo features") - print(all_castillo_features.shape, flush=True) - - pickle.dump(all_castillo_features, open("{}/{}_castillo_features.pkl".format(feature_out_dir, news_source), "wb")) - - -def get_raw_feature_for_news(news): - data = {} - - user_ids = set() - - reply_id_content_dict = dict() - - for tweet in news["tweets"]: - user_ids.add(tweet["user_id"]) - get_reply_of_replies(tweet["reply"], reply_id_content_dict) - - data["id"] = news["id"] - data["user_ids"] = list(user_ids) - data["reply_id_content_dict"] = reply_id_content_dict - - return data - - -def get_castillo_data(data_dir, prop_graphs, news_source, label): - prop_graphs_ids = [] - for news_graph in prop_graphs: - prop_graphs_ids.append(news_graph.tweet_id) - - castillo_raw_data = [None] * len(prop_graphs_ids) - - prop_graphs_ids_set = set(prop_graphs_ids) - - file_path = "{}/{}_{}_news_complete_dataset.json".format(data_dir, news_source, label) - - for news in get_news_articles(file_path): - news_id = news["id"] - if news_id in prop_graphs_ids_set: - news_id_index = prop_graphs_ids.index(news_id) - castillo_raw_data[news_id_index] = get_raw_feature_for_news(news) - - return castillo_raw_data - - -def get_castillo_raw_data(data_dir, prop_graphs_dir, out_dir, news_source): - fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) - - fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) - - create_dir(out_dir) - - fake_castillo_raw_data = get_castillo_data(data_dir, fake_prop_graph, news_source, "fake") - real_castillo_raw_data = get_castillo_data(data_dir, 
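The user-level aggregation above averages profile statistics (posts, friends, followers, account age) over the users who spread a story, parsing Twitter's classic created_at format. A sketch of that computation over plain profile dicts; the list below stands in for the documents normally fetched from the MongoDB user-profile collections:

# Sketch: average user-profile statistics for the spreaders of one news item.
# `profiles` is an in-memory stand-in for the twitter_user_profile collections.
from datetime import datetime

import numpy as np

profiles = [
    {"statuses_count": 1200, "friends_count": 310, "followers_count": 95,
     "created_at": "Mon Jan 01 10:00:00 +0000 2018"},
    {"statuses_count": 40, "friends_count": 12, "followers_count": 3,
     "created_at": "Wed Jun 05 08:30:00 +0000 2019"},
]

posts, friends, followers, account_age_days = [], [], [], []

for profile in profiles:
    created = datetime.strptime(profile["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
    account_age_days.append((datetime.now() - created).days)
    posts.append(profile["statuses_count"])
    friends.append(profile["friends_count"])
    followers.append(profile["followers_count"])

# One aggregate vector per news item: [avg posts, avg friends, avg followers, avg account age].
print([float(np.mean(values)) if values else 0.0
       for values in (posts, friends, followers, account_age_days)])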
real_prop_graph, news_source, "real") - - pickle.dump(fake_castillo_raw_data, open("{}/{}_fake_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) - pickle.dump(real_castillo_raw_data, open("{}/{}_real_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) - - -def get_castillo_feature_array(news_source, castillo_feature_dir): - file_path = "{}/{}_real_castillo_raw_data.pkl".format(castillo_feature_dir, news_source) - file_obj = Path(file_path) - - if file_obj.exists(): - return pickle.load(open(file_path, "wb")) - - return None - - - -if __name__ == '__main__': - config = load_configuration("project.config") - db = get_database_connection(config) - news_source = "politifact" - - for news_source in ["politifact", "gossipcop"]: - # get_castillo_raw_data("data/engagement_data_latest", "data/saved_new_no_filter", "data/castillo/raw_data", - # news_source) - # - # print("Raw data dumped", flush=True) - - dump_castillo_features(db, news_source, "data/castillo/raw_data", "data/castillo/saved_features", - "data/saved_new_no_filter") - - print("Castillo features for {} dumped".format(news_source), flush=True) +# if "profile_info" in user_object: +# pnum = user_object["profile_info"]['statuses_count'] +# fnum = user_object["profile_info"]['friends_count'] +# fonum = user_object["profile_info"]['followers_count'] +# create_time = user_object["profile_info"]['created_at'] +# date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') +# today = datetime.now() +# dregister = (today - date_create).days +# posts_num.append(pnum) +# friends_num.append(fnum) +# followers_num.append(fonum) +# days_register.append(dregister) +# +# try: +# avg_posts_num = sum(posts_num) / len(posts_num) +# except: +# avg_posts_num = 0 +# try: +# avg_friends_num = sum(friends_num) / len(friends_num) +# except: +# avg_friends_num = 0 +# try: +# avg_followers_num = sum(followers_num) / len(followers_num) +# except: +# avg_followers_num = 0 +# try: +# avg_days_register = sum(days_register) / len(days_register) +# except: +# avg_days_register = 0 +# +# return [avg_posts_num, avg_friends_num, avg_followers_num, avg_days_register] +# +# +# def get_castillo_features(db, news_source, raw_data_dir, label, prop_graphs): +# raw_data = pickle.load(open("{}/{}_{}_castillo_raw_data.pkl".format(raw_data_dir, news_source, label), "rb")) +# +# all_features = [] +# +# for news in raw_data: +# sample_feature = [] +# sample_feature.extend(get_user_aggregate_features(db, label == "fake", news["user_ids"])) +# sample_feature.extend(get_message_based_features(news["reply_id_content_dict"])) +# sample_feature.extend(get_content_topic_based_features(news["reply_id_content_dict"])) +# +# all_features.append(sample_feature) +# +# structure_feature_helper = StructureFeatureHelper() +# structure_features = structure_feature_helper.get_features_array(prop_graphs, micro_features=False, +# macro_features=True) +# +# other_features = get_numpy_array(all_features) +# structure_features = get_numpy_array(structure_features)[:, [0, 1, 2]] +# print("Other features shape") +# print(other_features.shape) +# +# print("Structure features shape") +# print(structure_features.shape) +# return np.concatenate([other_features, structure_features], axis=1) +# +# +# def dump_castillo_features(db, news_source, raw_data_dir, feature_out_dir, prop_graphs_dir): +# fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) +# fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) +# +# 
create_dir(feature_out_dir) +# +# fake_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "fake", fake_prop_graph) +# real_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "real", real_prop_graph) +# +# all_castillo_features = np.concatenate([fake_castillo_features, real_castillo_features]) +# +# print("All castillo features") +# print(all_castillo_features.shape, flush=True) +# +# pickle.dump(all_castillo_features, open("{}/{}_castillo_features.pkl".format(feature_out_dir, news_source), "wb")) +# +# +# def get_raw_feature_for_news(news): +# data = {} +# +# user_ids = set() +# +# reply_id_content_dict = dict() +# +# for tweet in news["tweets"]: +# user_ids.add(tweet["user_id"]) +# get_reply_of_replies(tweet["reply"], reply_id_content_dict) +# +# data["id"] = news["id"] +# data["user_ids"] = list(user_ids) +# data["reply_id_content_dict"] = reply_id_content_dict +# +# return data +# +# +# def get_castillo_data(data_dir, prop_graphs, news_source, label): +# prop_graphs_ids = [] +# for news_graph in prop_graphs: +# prop_graphs_ids.append(news_graph.tweet_id) +# +# castillo_raw_data = [None] * len(prop_graphs_ids) +# +# prop_graphs_ids_set = set(prop_graphs_ids) +# +# file_path = "{}/{}_{}_news_complete_dataset.json".format(data_dir, news_source, label) +# +# for news in get_news_articles(file_path): +# news_id = news["id"] +# if news_id in prop_graphs_ids_set: +# news_id_index = prop_graphs_ids.index(news_id) +# castillo_raw_data[news_id_index] = get_raw_feature_for_news(news) +# +# return castillo_raw_data +# +# +# def get_castillo_raw_data(data_dir, prop_graphs_dir, out_dir, news_source): +# fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) +# +# fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) +# +# create_dir(out_dir) +# +# fake_castillo_raw_data = get_castillo_data(data_dir, fake_prop_graph, news_source, "fake") +# real_castillo_raw_data = get_castillo_data(data_dir, real_prop_graph, news_source, "real") +# +# pickle.dump(fake_castillo_raw_data, open("{}/{}_fake_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) +# pickle.dump(real_castillo_raw_data, open("{}/{}_real_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) +# +# +# def get_castillo_feature_array(news_source, castillo_feature_dir): +# file_path = "{}/{}_real_castillo_raw_data.pkl".format(castillo_feature_dir, news_source) +# file_obj = Path(file_path) +# +# if file_obj.exists(): +# return pickle.load(open(file_path, "wb")) +# +# return None +# +# +# +# if __name__ == '__main__': +# config = load_configuration("project.config") +# db = get_database_connection(config) +# news_source = "politifact" +# +# for news_source in ["politifact", "gossipcop"]: +# # get_castillo_raw_data("data/engagement_data_latest", "data/saved_new_no_filter", "data/castillo/raw_data", +# # news_source) +# # +# # print("Raw data dumped", flush=True) +# +# dump_castillo_features(db, news_source, "data/castillo/raw_data", "data/castillo/saved_features", +# "data/saved_new_no_filter") +# +# print("Castillo features for {} dumped".format(news_source), flush=True) diff --git a/construct_sample_features.py b/construct_sample_features.py index 2fd4981..027940e 100644 --- a/construct_sample_features.py +++ b/construct_sample_features.py @@ -9,7 +9,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from analysis_util import get_propagation_graphs, equal_samples +from 
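One detail worth flagging in the retired get_castillo_feature_array helper above: it calls pickle.load on a file opened with mode "wb", which truncates the file instead of reading it. A minimal sketch of the intended read/write pairing, with an illustrative path:

# Sketch: pickles are written with "wb" and read with "rb";
# opening with "wb" before pickle.load truncates the file instead of reading it.
import pickle
from pathlib import Path

cache_path = Path("politifact_real_castillo_raw_data.pkl")  # illustrative path

with cache_path.open("wb") as f:          # write mode for dumping
    pickle.dump({"example": [1, 2, 3]}, f)

if cache_path.exists():
    with cache_path.open("rb") as f:      # read mode for loading
        print(pickle.load(f))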
analysis_util import get_propagation_graphs, equal_samples, remove_prop_graph_noise, get_noise_news_ids +from data_processing.save_dataset import load_from_nx_graphs from linguistic_analysis import get_all_linguistic_features, LinguisticFeatureHelper from structure_temp_analysis import get_all_structural_features, StructureFeatureHelper, ScienceCascadeFeatureHelper, \ get_first_post_time @@ -33,7 +34,7 @@ def get_dataset(news_source, load_dataset=False, micro_features=True, macro_feat target_labels = pickle.load(open("{}_target_labels.pkl".format(news_source), "rb")) else: - fake_prop_graph, real_prop_graph = get_propagation_graphs(news_source) + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs(news_source) fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) print("fake samples len : {} real samples len : {}".format(len(fake_prop_graph), len(real_prop_graph))) @@ -92,19 +93,20 @@ def get_dataset_file_name(file_dir, news_source, include_micro=True, include_mac def get_TPNF_dataset(out_dir, news_source, include_micro=True, include_macro=True, include_structural=None, include_temporal=None, - include_linguistic=None): + include_linguistic=None, time_interval=None, use_cache=False): file_name = get_dataset_file_name(out_dir, news_source, include_micro, include_macro, include_structural, include_temporal, include_linguistic) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) else: fake_sample_features, real_sample_features = get_dataset_feature_array(news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, + time_interval) sample_features = np.concatenate([fake_sample_features, real_sample_features], axis=0) pickle.dump(sample_features, open(file_name, "wb")) @@ -138,11 +140,11 @@ def get_dataset_feature_names(include_micro=True, include_macro=True, include_st return feature_names_all, short_feature_names_all -def is_valid_graph(prop_graph: tweet_node): +def is_valid_graph(prop_graph: tweet_node, retweet = True, reply = True): """ Check if the prop graph has alteast one retweet or reply""" for post_node in prop_graph.children: - if len(post_node.reply_children) > 0 or len(post_node.retweet_children) > 0: + if (retweet and len(post_node.reply_children) > 0) or (reply and len(post_node.retweet_children) > 0): return True return False @@ -190,10 +192,29 @@ def filter_propagation_graphs(graphs, limit_time): return result_graphs +def get_nx_propagation_graphs(data_folder, news_source): + fake_propagation_graphs = load_from_nx_graphs(data_folder, news_source, "fake") + real_propagation_graphs = load_from_nx_graphs(data_folder, news_source, "real") + + print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + + # fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids()) + # real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids()) + + # fake_news_ids = [graph.news_id for graph in fake_propagation_graphs] + # real_news_ids = [graph.news_id for graph in real_propagation_graphs] + + print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("After filtering no. 
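In the is_valid_graph change above, the new retweet flag gates reply_children while the reply flag gates retweet_children, so the two lists appear to be crossed. Below is a sketch of the check with each flag paired to its matching child list; the tweet_node class here is a minimal stand-in for util.util.tweet_node:

# Sketch: a cascade counts as valid if it has at least one retweet (when retweet=True)
# or at least one reply (when reply=True) under any of its immediate post nodes.
class tweet_node:
    def __init__(self):
        self.children = []
        self.retweet_children = []
        self.reply_children = []


def is_valid_graph(prop_graph, retweet=True, reply=True):
    """Check if the propagation graph has at least one retweet or reply."""
    for post_node in prop_graph.children:
        if (retweet and len(post_node.retweet_children) > 0) or \
                (reply and len(post_node.reply_children) > 0):
            return True
    return False


root, post = tweet_node(), tweet_node()
root.children.append(post)
post.reply_children.append(tweet_node())
print(is_valid_graph(root, retweet=False, reply=True))   # True: it has a reply
print(is_valid_graph(root, retweet=True, reply=False))   # False: no retweets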
of REAL prop graphs: {}".format(len(real_propagation_graphs))) + print(flush=True) + + return fake_propagation_graphs, real_propagation_graphs + def get_dataset_feature_array(news_source, include_micro=True, include_macro=True, include_structural=None, include_temporal=None, - include_linguistic=None): - fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + include_linguistic=None, time_inteval = None): + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/nx_network_data", news_source) fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) @@ -222,6 +243,8 @@ def get_dataset_feature_array(news_source, include_micro=True, include_macro=Tru macro_features=include_macro, news_source=news_source, label="real") + feature_names = feature_helper.get_feature_names(micro_features= include_micro, macro_features= include_macro) + print(feature_names) if fake_features is not None and real_features is not None: fake_feature_all.append(fake_features) real_feature_all.append(real_features) diff --git a/data_processing/data_process.py b/data_processing/data_process.py new file mode 100644 index 0000000..b843a86 --- /dev/null +++ b/data_processing/data_process.py @@ -0,0 +1,912 @@ +import errno +import os +import pickle +import queue +import time +from math import ceil +from pathlib import Path + +import networkx as nx +import numpy as np +import scipy.sparse as sp +from gensim.models import KeyedVectors +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from tqdm import tqdm + +from twitter_tokenize import twitter_tokenize +from util.util import tweet_node + + +def construct_networkx_graph(graph: tweet_node, network_type): + G = nx.DiGraph() + + tweet_id_node_id_dict = dict() + + G.add_node(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict)) + + if network_type == "retweet": + for node in graph.retweet_children: + add_networkxx_retweet_data(G, node, tweet_id_node_id_dict) + G.add_edge(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + else: + for node in graph.reply_children: + add_network_reply_data(G, node, tweet_id_node_id_dict) + G.add_edge(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + + return G, tweet_id_node_id_dict + + +def get_tweet_id_node_id_mapping(tweet_id, tweet_id_node_id_dict): + if tweet_id not in tweet_id_node_id_dict: + tweet_id_node_id_dict[tweet_id] = len(tweet_id_node_id_dict) + + return tweet_id_node_id_dict[tweet_id] + + +def add_networkxx_retweet_data(nx_graph: nx.DiGraph, node: tweet_node, tweet_id_node_id_dict: dict): + nx_graph.add_node(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + + for child in node.retweet_children: + add_networkxx_retweet_data(nx_graph, child, tweet_id_node_id_dict) + nx_graph.add_edge(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(child.tweet_id, tweet_id_node_id_dict)) + + +def add_network_reply_data(nx_graph: nx.DiGraph, node: tweet_node, tweet_id_node_id_dict: dict): + nx_graph.add_node(node.tweet_id) + + for child in node.reply_children: + add_network_reply_data(nx_graph, child, tweet_id_node_id_dict) + nx_graph.add_edge(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict), + 
get_tweet_id_node_id_mapping(child.tweet_id, tweet_id_node_id_dict)) + + +def get_noise_news_ids(): + with open("data/news_id_ignore_list") as file: + lines = file.readlines() + return [line.strip() for line in lines] + + +def get_propagation_graphs(data_folder, news_source): + fake_propagation_graphs = load_prop_graph(data_folder, news_source, "fake") + # fake_propagation_graphs = [] + real_propagation_graphs = load_prop_graph(data_folder, news_source, "real") + + print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + + fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids()) + real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids()) + + print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("After filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + print(flush=True) + + return fake_propagation_graphs, real_propagation_graphs + + +def load_prop_graph(data_folder, news_source, news_label): + news_graphs = pickle.load(open("{}/{}_{}_news_prop_graphs.pkl".format(data_folder, news_source, news_label), "rb")) + return news_graphs + + +def remove_prop_graph_noise(news_graphs, noise_ids): + noise_ids = set(noise_ids) + return [graph for graph in news_graphs if graph.tweet_id not in noise_ids] + + +def sort_tweet_node_object_by_created_time(tweet_nodes: list): + tweet_nodes.sort(key=lambda x: x.created_time) + + return tweet_nodes + + +def equal_samples(sample1, sample2): + target_len = min(len(sample1), len(sample2)) + + np.random.seed(0) + + np.random.shuffle(sample1) + np.random.shuffle(sample2) + + return sample1[:target_len], sample2[:target_len] + + +def filter_propagation_graphs(graphs, limit_time, retweet=True, reply=True): + result_graphs = [] + + for prop_graph in graphs: + filtered_prop_graph = remove_node_by_time(prop_graph, limit_time) + if is_valid_graph(filtered_prop_graph, retweet, reply): + result_graphs.append(filtered_prop_graph) + + return result_graphs + + +def is_valid_graph(prop_graph: tweet_node, retweet=True, reply=True): + """ Check if the prop graph has at least one retweet or reply""" + + for post_node in prop_graph.children: + if (retweet and len(post_node.retweet_children) > 0) or (reply and len(post_node.reply_children) > 0): + return True + + return False + + +def get_first_post_time(node: tweet_node): + first_post_time = time.time() + + for child in node.children: + first_post_time = min(first_post_time, child.created_time) + + return first_post_time + + +def remove_node_by_time(graph: tweet_node, limit_time): + start_time = get_first_post_time(graph) + end_time = start_time + limit_time + + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.children + + retweet_children = set(node.retweet_children) + reply_children = set(node.reply_children) + + for child in children.copy(): + + if child.created_time <= end_time: + q.put(child) + else: + node.children.remove(child) + try: + retweet_children.remove(child) + except KeyError: # Element not found in the set + pass + try: + reply_children.remove(child) + except KeyError: # Element not found in the set + pass + + node.retweet_children = list(retweet_children) + node.reply_children = list(reply_children) + + return graph + + +def get_all_propagation_graphs(news_source="politifact", 
time_interval=None, args=None): + if Path.is_file(Path("data/{}_graphs_data.pkl".format(news_source))): + graph_data = pickle.load(open("data/{}_graphs_data.pkl".format(news_source), "rb")) + return graph_data + + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/prop_graph_save", news_source) + + # fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + # fake_prop_graph = fake_prop_graph[:100] + # real_prop_graph = real_prop_graph[:100] + + if time_interval is not None: + time_limit = time_interval * 60 * 60 + + print("Time limit in seconds : {}".format(time_limit)) + + fake_prop_graph = filter_propagation_graphs(fake_prop_graph, time_limit, reply=False) + real_prop_graph = filter_propagation_graphs(real_prop_graph, time_limit, reply=False) + + print("After time based filtering ") + print("No. of fake samples : {} No. of real samples: {}".format(len(fake_prop_graph), len(real_prop_graph))) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + all_network_xx_graphs = [] + all_tweet_id_node_ids_dicts = [] + all_tweet_id_text_dict = dict() + one_hot_labels = [] + + labels = [] + + max_num_nodes = 0 + + graph_hidden_states = [] + + news_article_text_contents = [] + + for graph in fake_prop_graph: + get_textual_features(graph, all_tweet_id_text_dict) + news_article_text_contents.append(graph.text) + # TODO: Uncomment after dumping time series data - prune graphs for network generation + # graph = prune_graph_by_max_nodes_time(graph, args.max_num_node) + graph, sample_tweet_id_node_id_dict = construct_networkx_graph(graph, "retweet") + all_network_xx_graphs.append(graph) + max_num_nodes = max(max_num_nodes, nx.number_of_nodes(graph)) + all_tweet_id_node_ids_dicts.append(sample_tweet_id_node_id_dict) + one_hot_labels.append([1, 0]) + labels.append(1) + + for graph in real_prop_graph: + get_textual_features(graph, all_tweet_id_text_dict) + news_article_text_contents.append(graph.text) + # TODO: Uncomment after dumping time series data - prune graphs for network generation + # graph = prune_graph_by_max_nodes_time(graph, args.max_num_node) + graph, sample_tweet_id_node_id_dict = construct_networkx_graph(graph, "retweet") + all_network_xx_graphs.append(graph) + max_num_nodes = max(max_num_nodes, nx.number_of_nodes(graph)) + all_tweet_id_node_ids_dicts.append(sample_tweet_id_node_id_dict) + one_hot_labels.append([0, 1]) + labels.append(0) + + print("max number of nodes : {}".format(max_num_nodes)) + + # TODO: Construct hidden state of the network using Glove embedding + + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.200d.w2vformat.txt" + + model_path = "data/glove.twitter.27B.200d.w2vformat.txt" + + glove_model = get_gensim_model(model_path) + + for news_article in news_article_text_contents: + news_feature = get_tweet_latent_embeddings(news_article, glove_model) + news_feature = np.expand_dims(np.array(news_feature), axis=1).transpose() + graph_hidden_states.append(news_feature) + + # return all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts + # return all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, \ + # np.concatenate(graph_hidden_states) + + graph_data = [all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, 
np.concatenate(graph_hidden_states)] + + pickle.dump(graph_data, open("data/{}_graphs_data.pkl".format(news_source), "wb")) + + return graph_data + + +def get_textual_features(graph: tweet_node, tweet_id_text_dict): + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + tweet_id_text_dict[node.tweet_id] = node.text + for child in node.retweet_children: + q.put(child) + + +def nodes_stats(all_network_xx_graphs): + node_sizes = [] + + for graph in all_network_xx_graphs: + node_sizes.append(nx.number_of_nodes(graph)) + + print("Min : {}".format(min(node_sizes))) + print("Max : {}".format(max(node_sizes))) + print("Mean : {}".format(np.mean(node_sizes))) + print("STD: {} ".format(np.std(node_sizes))) + print("Total nodes : {}".format(np.sum(node_sizes))) + + +def filter_graphs(all_network_xx_graphs, max_nodes): + graphs = [] + for graph in all_network_xx_graphs: + nodes_count = nx.number_of_nodes(graph) + if nodes_count <= max_nodes: + graphs.append(graph) + + return graphs + + +def get_nodes_count(node: tweet_node, edge_type="retweet"): + if node is None: + return 0 + + node_count = 0 + + if edge_type == "retweet": + children = node.retweet_children + elif edge_type == "reply": + children = node.reply_children + else: + children = node.children + + for child in children: + node_count += get_nodes_count(child, edge_type) + + return node_count + 1 + + +def get_K_node_time(graph, max_nodes): + node_creation_times = [] + + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.retweet_children + + for child in children: + q.put(child) + node_creation_times.append(child.created_time) + + node_creation_times.sort() + + return node_creation_times[max_nodes - 1] + + +def prune_graph_by_max_nodes_time(graph, max_nodes): + if get_nodes_count(graph) < max_nodes: + return graph + + node_k_time = get_K_node_time(graph, max_nodes) + + return remove_node_by_end_time(graph, node_k_time) + + +def remove_node_by_end_time(graph: tweet_node, end_time): + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.children + + for child in list(children): + + if child.created_time <= end_time: + q.put(child) + else: + node.children.remove(child) + try: + node.retweet_children.remove(child) + except ValueError: # Element not found in the list + pass + try: + node.reply_children.remove(child) + except ValueError: # Element not found in the list + pass + + return graph + + +def reverse_dict(tweet_id_node_id_dict): + node_id_tweet_id_dict = dict() + + for key, value in tweet_id_node_id_dict.items(): + node_id_tweet_id_dict[value] = key + + return node_id_tweet_id_dict + + +def get_batch_pooling_matrix(graphs): + node_sizes = [] + + for graph in graphs: + nx.nodes(graph) + node_sizes.append(nx.number_of_nodes(graph)) + + num_graphs = len(graphs) + num_nodes = np.sum(node_sizes) + + pooling_matrix = np.zeros((num_graphs, num_nodes)) + + start = 0 + + indexes = [] + + for idx, graph in enumerate(graphs): + indexes.append(start) + + start += len(nx.nodes(graph)) + + indexes.append(start) + + for idx in range(num_graphs): + pooling_matrix[idx, range(indexes[idx], indexes[idx + 1])] = (1 / (indexes[idx + 1] - indexes[idx])) + + return pooling_matrix + + +def get_overall_adjoint_matrix(graphs): + node_sizes = [] + + for graph in graphs: + nx.nodes(graph) + node_sizes.append(nx.number_of_nodes(graph)) + + num_graphs = len(graphs) + num_nodes = np.sum(node_sizes) + + print("num of nodes : {}".format(num_nodes)) + + 
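+ # Note: the nested Python list built just below materialises a dense num_nodes x num_nodes matrix, which is quadratic in memory.
+ # A sparse alternative (only a sketch, assuming each graph's node ids are the consecutive integers 0..n-1 produced by get_tweet_id_node_id_mapping) could be:
+ #     adj_matrix = sp.block_diag([nx.to_scipy_sparse_matrix(g) for g in graphs]).tocoo()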
adj_matrix = [[0 for i in range(num_nodes)] for k in range(num_nodes)] + + start = 0 + + indexes = [] + + for idx, graph in tqdm(enumerate(graphs)): + edges = nx.to_edgelist(graph) + indexes.append(start) + for edge in tqdm(edges): + u = edge[0] + v = edge[1] + + u += start + v += start + + adj_matrix[u][v] = 1 + + start += len(nx.nodes(graph)) + + adj_matrix = np.matrix(adj_matrix) + adj_matrix = sp.coo_matrix(adj_matrix) + + # sp.save_npz("politifact_adj_matrix_basic", adj_matrix) + return adj_matrix + + +def get_all_documents(news_source, all_tweet_id_text_dict): + tweet_ids = [] + documents = [] + + for tweet_id, text in all_tweet_id_text_dict.items(): + tweet_ids.append(tweet_id) + + if str(news_source) in str(tweet_id): + documents.append(" ") + print("Root node tweet id : {}".format(tweet_id)) + else: + documents.append(text) + + vectorizer = TfidfVectorizer(max_features=2000, stop_words="english") + vectorizer.fit(documents) + + pickle.dump(vectorizer, open("{}_doc_vectorizer.pkl".format(news_source), "wb")) + + transformed_docs = vectorizer.transform(documents).todense() + + from sklearn.decomposition import PCA + + pca = PCA(n_components=10) + transformed_docs = pca.fit_transform(transformed_docs) + + print("tranformed docs ", transformed_docs.shape) + + single_node_embeddings = transformed_docs[0, :].transpose() + + print("single doc transofmred doc", single_node_embeddings.shape, flush=True) + + all_tweet_id_text_dict = dict() + + for idx in range(transformed_docs.shape[0]): + all_tweet_id_text_dict[tweet_ids[idx]] = transformed_docs[idx, :] + + return all_tweet_id_text_dict + + +def get_all_documents_glove_embeddings(news_source, all_tweet_id_text_dict): + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.100d.w2vformat.txt" + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.25d.w2vformat.txt" + + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.200d.w2vformat.txt" + model_path = "data/glove.twitter.27B.200d.w2vformat.txt" + + glove_model = get_gensim_model(model_path) + + tweet_id_embddings_dict = dict() + + for tweet_id, text in tqdm(all_tweet_id_text_dict.items()): + if str(news_source) in str(tweet_id): + tweet_id_embddings_dict[tweet_id] = np.zeros((200,)) + print("root tweet id : {}".format(tweet_id), flush=True) + + else: + tweet_id_embddings_dict[tweet_id] = get_tweet_latent_embeddings(text, glove_model) + + pickle.dump(tweet_id_embddings_dict, open("{}_tweet_id_glove_embeddings_dict.pkl".format(news_source), "wb")) + + # vectorizer = TfidfVectorizer(max_features=5000, stop_words="english") + # vectorizer.fit(documents) + + # pickle.dump(vectorizer, open("{}_doc_vectorizer.pkl".format(news_source), "wb")) + + # transformed_docs = vectorizer.transform(documents).todense() + + # print("tranformed docs ", transformed_docs.shape) + # + # single_node_embeddings = transformed_docs[0, :].transpose() + # + # print("single doc transofmred doc", single_node_embeddings.shape, flush=True) + # + # all_tweet_id_text_dict = dict() + + # for idx in range(transformed_docs.shape[0]): + # all_tweet_id_text_dict[tweet_ids[idx]] = transformed_docs[idx, :] + + return tweet_id_embddings_dict + + +def get_feature_matrix(graphs, tweet_id_feature_dict, graph_tweet_id_node_id_dicts): + node_features = [] + + for idx in range(len(graphs)): + graph = graphs[idx] + tweet_id_node_id_dict = graph_tweet_id_node_id_dicts[idx] + node_id_tweet_id_dict = 
reverse_dict(tweet_id_node_id_dict) + for node_id in nx.nodes(graph): + # print("node id", node_id) + tweet_id = node_id_tweet_id_dict[node_id] + tweet_feature = np.array(tweet_id_feature_dict[tweet_id]).transpose() + # print("tweet feature ", tweet_feature.shape) + # node_features.append(tweet_feature) + node_features.append(np.expand_dims(np.array(tweet_id_feature_dict[tweet_id]), axis=1).transpose()) + # print("no. of nodes : {}".format(len(node_features))) + # return sp.csr_matrix(np.concatenate(node_features, axis=1).transpose()) + + return sp.csr_matrix(np.concatenate(node_features)) + + +def get_glove_feature_matrix(graphs, tweet_id_feature_dict, graph_tweet_id_node_id_dicts): + node_features = [] + + for idx in range(len(graphs)): + graph = graphs[idx] + tweet_id_node_id_dict = graph_tweet_id_node_id_dicts[idx] + node_id_tweet_id_dict = reverse_dict(tweet_id_node_id_dict) + for node_id in nx.nodes(graph): + # print("node id", node_id) + tweet_id = node_id_tweet_id_dict[node_id] + tweet_feature = np.expand_dims(np.array(tweet_id_feature_dict[tweet_id]), axis=1).transpose() + if len(tweet_feature.shape) > 1: + if tweet_feature.shape[0] != 1 or tweet_feature.shape[1] != 200: + print("tweet feature : ", tweet_feature.shape) + else: + tweet_feature = np.zeros((1, 200)) + print(tweet_feature.shape) + + node_features.append(tweet_feature) + + # print("no. of nodes : {}".format(len(node_features))) + + print("batch_embedding_size before concat ", len(node_features), node_features[0].shape) + + return sp.csr_matrix(np.concatenate(node_features)) + + +def create_dir(dir_name): + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name) + except OSError as exc: # Guard against race condition + if exc.errno != errno.EEXIST: + raise + + +# converts matrices into tuples +def to_tuple(mat): + if not sp.isspmatrix_coo(mat): + mat = mat.tocoo() + idxs = np.vstack((mat.row, mat.col)).transpose() + values = mat.data + shape = mat.shape + return idxs, values, shape + + +# converts sparse matrices into tuples +def sparse_to_tuple(sparse_mat): + if isinstance(sparse_mat, list): + for i in range(len(sparse_mat)): + sparse_mat[i] = to_tuple(sparse_mat[i]) + else: + sparse_mat = to_tuple(sparse_mat) + return sparse_mat + + +# row-normalizes the feature matrix and converts it into a tuple +def process_features(features: object) -> object: + features /= features.sum(1).reshape(-1, 1) + features[np.isnan(features) | np.isinf(features)] = 0 # needed for the global node features, which are all zeros. 
+ return sparse_to_tuple(sp.csr_matrix(features)) + + +# renormalization trick for the adjacency matrix +def normalize_adj(adj, symmetric=True): + if symmetric: + d = sp.diags(np.power(np.array(adj.sum(1)), -0.5).flatten(), 0) + a_norm = adj.dot(d).transpose().dot(d).tocsr() + else: + d = sp.diags(np.power(np.array(adj.sum(1)), -1.0).flatten(), 0) + a_norm = d.dot(adj).tocsr() + return sp.csr_matrix(a_norm) + + +# normalizes the adjacency matrix and converts it into a tuple +def preprocess_adj(adj, is_gcn, symmetric=True): + if is_gcn: + adj = adj + sp.eye(adj.shape[0]) # every node gets itself as a neighbour (self-loop), as required by GCN + adj = normalize_adj(adj, symmetric) + return sparse_to_tuple(adj) + + +def get_input_for_batches(news_source, batch_size, time_interval, input_dim): + all_network_x_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts = get_all_propagation_graphs( + news_source, time_interval) + + # all_tweet_id_text_dict = get_all_documents(news_source, all_tweet_id_text_dict) + + # tweet_id_embeddings_dict = all_tweet_id_text_dict + + tweet_id_embeddings_dict = get_all_documents_glove_embeddings(news_source, all_tweet_id_text_dict) + + # tweet_id_embeddings_dict = pickle.load(open("{}_tweet_id_glove_embeddings_dict.pkl".format(news_source), "rb")) + + print("all_network_x_graphs count : {}".format(len(all_network_x_graphs))) + print("all_tweet_id_text_dict count : {}".format(len(one_hot_labels))) + print("all_tweet_id_node_ids_dicts count: {}".format(len(all_tweet_id_node_ids_dicts))) + + train_network_x_graphs, test_network_x_graphs, train_one_hot_labels, test_one_hot_labels, train_tweet_id_node_ids_dicts, test_tweet_id_node_ids_dicts = train_test_split( + all_network_x_graphs, one_hot_labels, all_tweet_id_node_ids_dicts, stratify=labels, + test_size=0.2, random_state=42) + + # all_network_x_graphs = train_network_x_graphs + # labels = train_one_hot_labels + # all_tweet_id_node_ids_dicts = train_tweet_id_node_ids_dicts + + dump_batch_inputs(batch_size, news_source, "train", train_network_x_graphs, train_tweet_id_node_ids_dicts, + train_one_hot_labels, tweet_id_embeddings_dict, time_interval, input_dim) + + dump_batch_inputs(batch_size, news_source, "test", test_network_x_graphs, test_tweet_id_node_ids_dicts, + test_one_hot_labels, tweet_id_embeddings_dict, time_interval, input_dim) + + # dump_glove_feature_batch_embeddings(batch_size, news_source, "train", train_network_x_graphs, + # train_tweet_id_node_ids_dicts, train_one_hot_labels, tweet_id_embeddings_dict, time_interval) + # dump_glove_feature_batch_embeddings(batch_size, news_source, "test", test_network_x_graphs, + # test_tweet_id_node_ids_dicts, + # test_one_hot_labels, tweet_id_embeddings_dict, time_interval) + + +def dump_glove_feature_batch_embeddings(batch_size, news_source, split_label, all_network_x_graphs, + all_tweet_id_node_ids_dicts, labels, + all_tweet_id_text_dict, time_interval): + data_dir = "data/time_batch_data" + create_dir(data_dir) + + data_dir = "{}/batch_{}".format(data_dir, time_interval) + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, news_source) + + create_dir(data_dir) + + data_dir = "{}/glove_feat".format(data_dir) + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, split_label) + create_dir(data_dir) + + num_samples = len(labels) + + num_batches = int(ceil(num_samples / batch_size)) + + for idx in tqdm(range(num_batches)): + start_idx = idx * batch_size + end_idx = start_idx + batch_size + + batch_graphs = 
all_network_x_graphs[start_idx: end_idx] + batch_labels = labels[start_idx: end_idx] + batch_mapping_dicts = all_tweet_id_node_ids_dicts[start_idx: end_idx] + + batch_node_features = get_glove_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + # print("node feature matrix shape ", batch_node_features.shape) + + batch_node_features = process_features(batch_node_features) + + batch_input = [batch_node_features] + # batch_inputs.append(batch_input) + + pickle.dump(batch_input, open("{}/batch_{}.pkl".format(data_dir, idx), "wb")) + + +def dump_batch_inputs(batch_size, news_source, split_label, all_network_x_graphs, all_tweet_id_node_ids_dicts, labels, + all_tweet_id_text_dict, time_interval, input_dim): + data_dir = "data/time_batch_data" + create_dir(data_dir) + + data_dir = "{}/batch_{}".format(data_dir, time_interval) + create_dir(data_dir) + + data_dir = "{}/{}_{}".format(data_dir, news_source, input_dim) + + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, split_label) + create_dir(data_dir) + + batch_inputs = [] + + num_samples = len(labels) + + num_batches = int(ceil(num_samples / batch_size)) + + for idx in tqdm(range(num_batches)): + start_idx = idx * batch_size + end_idx = start_idx + batch_size + + batch_graphs = all_network_x_graphs[start_idx: end_idx] + batch_labels = labels[start_idx: end_idx] + batch_mapping_dicts = all_tweet_id_node_ids_dicts[start_idx: end_idx] + + batch_adj_matrix = get_overall_adjoint_matrix(batch_graphs) + batch_pooling_matrix = get_batch_pooling_matrix(batch_graphs) + + batch_adj_matrix = preprocess_adj(batch_adj_matrix, True, False) + + # batch_node_features = get_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + batch_node_features = get_glove_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + print("node feature matrix shape ", batch_node_features.shape) + + batch_node_features = process_features(batch_node_features) + + batch_input = [batch_adj_matrix, batch_node_features, batch_labels, batch_pooling_matrix] + # batch_inputs.append(batch_input) + + pickle.dump(batch_input, open("{}/batch_{}.pkl".format(data_dir, idx), "wb")) + + # pickle.dump(batch_inputs, open("{}_batched_inputs.pkl".format(news_source), "wb")) + # return batch_inputs + + +def get_gensim_model(model_path): + model = KeyedVectors.load_word2vec_format(model_path, binary=False) + + return model + + +def get_tweet_latent_embeddings(text_contents, model): + word_embeddings = [] + + tokens = twitter_tokenize(text_contents) + + for token in tokens.split(): + try: + word_embeddings.append(model[token]) + except: + pass + + if len(word_embeddings) > 0: + try: + return np.mean(word_embeddings, axis=0) + except: + return np.zeros((200,)) + + return np.zeros((200,)) + + +def analyze_dataset(news_source): + graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts = get_all_propagation_graphs( + news_source) + + # graphs = filter_graphs(graphs, 1500) + + graph_sizes = [] + + for graph in graphs: + graph_sizes.append(nx.number_of_nodes(graph)) + + import matplotlib + matplotlib.use('agg') + import matplotlib.pyplot as plt + + plt.hist(graph_sizes, normed=True, bins=30) + + plt.savefig("figures/{}_graph_distribution.png".format(news_source)) + + +def get_random_bfs_sequence(G): + start_id = 0 + dictionary = dict(nx.bfs_successors(G, start_id)) + start = [start_id] + + max_prev_nodes = 0 + + while len(start) > 0: + next = [] + + while len(start) > 0: + current = start.pop(0) + 
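+ # 'dictionary' maps each node to its BFS successors; 'start' is the current frontier and 'next' collects the following level, so max_prev_nodes ends up as the maximum BFS breadth seen.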
neighbor = dictionary.get(current) + + if neighbor is not None: + next = next + neighbor + + max_prev_nodes = max(max_prev_nodes, len(next)) + + start = next + + # print("max previous nodes : {}".format(max_prev_nodes)) + return max_prev_nodes + + +if __name__ == "__main__": + news_source = "gossipcop" + + news_source = "politifact" + + # analyze_dataset(news_source) + + # time_intervals = [12, 24, 36, 48, 60, 72, 84, 96] + + # time_intervals = [12, 24, 36, 48, 60, 72, 84, 96] + + # time_intervals = [12, 24, 36,48, 60, 72, 84, 96] + # + input_dim = 200 + # + time_intervals = [3, 6] + # + # # time_intervals = [None] + # + for time_interval in time_intervals: + print("=============Time Interval : {} ==========".format(time_interval)) + start_time = time.time() + # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # get_classificaton_results_tpnf("data/train_test_data", "gossipcop", time_interval) + get_input_for_batches(news_source, 8, time_interval, input_dim) + + print("\n\n================Exectuion time - {} ==================================\n".format( + time.time() - start_time)) + + # graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, hidden_state = get_all_propagation_graphs( + # news_source="gossipcop", args = Args()) + + # max_nodes = 5000 + + # graphs = filter_graphs(graphs, max_nodes) + + # max_breadths = [] + # + # for graph in graphs: + # max_breadths.append(get_random_bfs_sequence(graph)) + # + # print("Mean : {}".format(np.mean(max_breadths))) + # print("Max : {}".format(max(max_breadths))) + # print("Min : {}".format(min(max_breadths))) + # print(np.histogram(max_breadths)) + + exit(1) + + all_network_x_graphs, all_tweet_id_text_dict, labels = get_all_propagation_graphs(news_source) + + all_tweet_id_text_dict = get_all_documents(news_source, all_tweet_id_text_dict) + + pickle.dump(all_network_x_graphs, open("{}_all_networkx_graphs.pkl".format(news_source), "wb")) + pickle.dump(all_tweet_id_text_dict, open("{}_graphs_text_dict.pkl".format(news_source), "wb")) + pickle.dump(labels, open("{}_labels.pkl".format(news_source), "wb")) + + # all_network_xx_graphs = filter_graphs(all_network_x_graphs, 2000) + + # adj_matrix = get_overall_adjoint_matrix(all_network_x_graphs) + + # nodes_stats(all_network_xx_graphs) + # + # print(len(all_network_xx_graphs)) diff --git a/elmo_feature_extraction.py b/elmo_feature_extraction.py deleted file mode 100644 index 391fa38..0000000 --- a/elmo_feature_extraction.py +++ /dev/null @@ -1,117 +0,0 @@ -import pickle - -import numpy as np - -from allennlp.modules.elmo import Elmo, batch_to_ids -import torch -from nltk import TweetTokenizer -from torch.autograd import Variable - - -def get_batches(batch_size, params): - total_len = len(params) - for batch_i in range(int(np.ceil(total_len / batch_size))): - start_i = batch_i * batch_size - - yield params[start_i:start_i + batch_size] - - -def get_elmo_sentence_embeddings(documents): - options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" - weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" - - elmo = Elmo(options_file, weight_file, 1, dropout=0) - - # use batch_to_ids to convert sentences to character ids - # sentences = [['First', 'sentence', '.'], ['Another', '.']] - - batches_lat_embeddings = [] - - device = torch.device('cuda') - - batch_size = 
128 - - elmo = elmo.to(device) - - for batch_idx, doc_batch in enumerate(get_batches(batch_size, documents)): - character_ids = batch_to_ids(doc_batch) - character_ids = character_ids.to(device) - # - - embeddings = elmo(character_ids) - - # sentence_embeddings = torch.sum(embeddings['elmo_representations'][0], dim=1) - layer_1_rep = get_weights_from_layers(masks=embeddings["mask"], elmo_rep=embeddings['elmo_representations'][0]) - - batches_lat_embeddings.append(layer_1_rep) - - print("batch idx : {} completed...".format(batch_idx), flush=True) - - return np.concatenate(batches_lat_embeddings, axis=0) - - # layer_2_rep = get_weights_from_layers(masks = embeddings["mask"],elmo_rep = embeddings['elmo_representations'][1]) - # - # return np.concatenate([layer_1_rep, layer_2_rep], axis=1) - - -def get_weights_from_layers(masks, elmo_rep): - batch_size = masks.shape[0] - max_seq_len = masks.shape[1] - - # masks = masks.unsqueeze(1) - - elmo_rep = elmo_rep.view(elmo_rep.shape[0] * elmo_rep.shape[1], 1024) - - # mask_weighted_rep = torch.matmul(masks, elmo_rep) - masks = masks.view(masks.shape[0] * masks.shape[1]) - - masks = masks.view(-1, 1).repeat(1, 1024) - # mask_weighted_rep = torch.matmul(masks.float(), elmo_rep) - - mask_weighted_rep = masks.float() * elmo_rep - mask_weighted_rep = mask_weighted_rep.view(batch_size, max_seq_len, 1024) - - sentence_embeddings = torch.sum(mask_weighted_rep, dim=1) - - return Variable(sentence_embeddings).data.cpu().numpy() - - -def dump_elmo_features(data_dir, news_source, label, out_dir): - reply_id_content_dict = pickle.load( - open("{}/{}_{}_reply_id_content_dict.pkl".format(data_dir, news_source, label), "rb")) - - reply_contents = [] - - reply_arr_idx_dict = dict() - - idx = 0 - - tokenizer = TweetTokenizer(strip_handles=True) - for reply_id, content in reply_id_content_dict.items(): - reply_arr_idx_dict[reply_id] = idx - reply_contents.append(tokenizer.tokenize(content)) - idx += 1 - - sentence_lat_embeddings = get_elmo_sentence_embeddings(reply_contents) - - pickle.dump(sentence_lat_embeddings, - open("{}/{}_{}_elmo_lat_embeddings.pkl".format(out_dir, news_source, label), "wb")) - pickle.dump(reply_arr_idx_dict, - open("{}/{}_{}_reply_id_latent_mat_index.pkl".format(out_dir, news_source, label), "wb")) - - -if __name__ == "__main__": - # sentences = [['First', 'sentence', '.'], []] - # sentence_lat_embeddings = get_elmo_sentence_embeddings(sentences) - - # print("============ Dumping fake data ============") - # dump_elmo_features("data/pre_process_data", "politifact", "fake", "data/pre_process_data/elmo_features") - # - # print("============ Dumping real data ============") - # dump_elmo_features("data/pre_process_data", "politifact", "real", "data/pre_process_data/elmo_features") - - print("============ Dumping fake data ============") - dump_elmo_features("data/pre_process_data", "gossipcop", "fake", "data/pre_process_data/elmo_features") - - print("============ Dumping real data ============") - dump_elmo_features("data/pre_process_data", "gossipcop", "real", "data/pre_process_data/elmo_features") diff --git a/linguistic_analysis.py b/linguistic_analysis.py index b51843b..79bb401 100644 --- a/linguistic_analysis.py +++ b/linguistic_analysis.py @@ -15,15 +15,19 @@ from util.constants import REPLY_NODE, POST_NODE from util.util import tweet_node -all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" - .format("data/pre_process_data/vader_sentiment"), "rb")) +all_reply_id_sentiment_score_dict = dict() -def 
tweet_text_sentiment(reply_id): - if reply_id in all_reply_id_sentiment_score_dict: - return all_reply_id_sentiment_score_dict[reply_id]["compound"] - else: - return 0 +# +# all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" +# .format("data/pre_process_data/vader_sentiment"), "rb")) + + +# def tweet_text_sentiment(reply_id): +# if reply_id in all_reply_id_sentiment_score_dict: +# return all_reply_id_sentiment_score_dict[reply_id]["compound"] +# else: +# return 0 # def tweet_text_sentiment(text): @@ -55,8 +59,8 @@ def get_first_reply_nodes_average_sentiment(prop_graph: tweet_node): q.put(child) if child.node_type == REPLY_NODE and node.node_type == POST_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(child.tweet_id)) + if child.sentiment: + reply_diff_values.append(child.sentiment) if len(reply_diff_values) == 0: return 0 @@ -76,8 +80,8 @@ def get_reply_nodes_average_sentiment(prop_graph: tweet_node): q.put(child) if node.node_type == REPLY_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(node.tweet_id)) + if node.sentiment: + reply_diff_values.append(node.sentiment) if len(reply_diff_values) == 0: return 0 @@ -147,8 +151,7 @@ def get_reply_nodes_sentiment_ratio(prop_graph: tweet_node): q.put(child) if node.node_type == REPLY_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(node.tweet_id)) + reply_diff_values.append(node.sentiment) if len(reply_diff_values) == 0: return 0 @@ -192,7 +195,7 @@ def get_all_linguistic_features(news_graphs, micro_features, macro_features): get_deepest_cascade_first_level_reply_sentiment] for function_reference in reply_function_references: - features_set = get_stats_for_features(news_graphs, function_reference, print=False, feature_name=None) + features_set = get_stats_for_features(news_graphs, function_reference, print=True, feature_name=None) all_features.append(features_set) return np.transpose(get_numpy_array(all_features)) @@ -236,8 +239,7 @@ def get_micro_feature_method_references(self): get_reply_nodes_average_sentiment, get_first_reply_nodes_average_sentiment, get_deepest_cascade_reply_nodes_avg_sentiment, - get_deepest_cascade_first_level_reply_sentiment, - get_supporting_opposing_replies_ratio] + get_deepest_cascade_first_level_reply_sentiment] return method_refs @@ -246,13 +248,12 @@ def get_micro_feature_method_names(self): "Average sentiment of all replies", "Average sentiment of first level replies", "Average sentiment of replies in deepest cascade", - "Average setiment of first level replies in deepest cascade", - "Supporting or opposing ratio"] + "Average setiment of first level replies in deepest cascade"] return feature_names def get_micro_feature_short_names(self): - feature_names = ["L1", "L2", "L3", "L4", "L5","L6"] + feature_names = ["L1", "L2", "L3", "L4", "L5", "L6"] return feature_names def get_macro_feature_method_references(self): @@ -271,13 +272,14 @@ def get_macro_feature_short_names(self): feature_names = [] return feature_names - def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, file_dir="data/train_test_data"): + def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, + file_dir="data/train_test_data", use_cache=False): function_refs = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): 
return pickle.load(open(file_name, "rb")) if micro_features: @@ -288,19 +290,21 @@ def get_features_array(self, prop_graphs, micro_features, macro_features, news_s all_features = [] - for idx in range(len(function_refs) - 1): + for idx in range(len(function_refs)): features_set = get_sample_feature_value(prop_graphs, function_refs[idx]) all_features.append(features_set) - all_features.append(get_feature_involving_additional_args(prop_graphs, function_refs[-1],news_source, label)) + # all_features.append(get_feature_involving_additional_args(prop_graphs, function_refs[-1], news_source, label)) feature_array = np.transpose(get_numpy_array(all_features)) + + # feature_array = feature_array[:, :-1] + pickle.dump(feature_array, open(file_name, "wb")) return feature_array - def get_feature_involving_additional_args(prop_graphs, function_reference, news_source, label): feature_values = [] for prop_graph in prop_graphs: diff --git a/misc_process.py b/misc_process.py index 6a619c0..e070f75 100644 --- a/misc_process.py +++ b/misc_process.py @@ -1,764 +1,764 @@ -import csv -import json -import mmap -import os -import pickle -import queue -import re -import shutil -import string -import sys -import traceback -from datetime import datetime -from pathlib import Path - -import datefinder -import requests -from bs4 import BeautifulSoup -from newspaper import Article -from pymongo import UpdateOne -from tqdm import tqdm -import newspaper - -from analysis_util import get_propagation_graphs -from baseline_feature_extraction import dump_LIWC_Representation -from pre_process_util import load_configuration, get_database_connection, get_news_articles -from util.constants import RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE -from util.util import tweet_node - - -def get_reply_of_replies(replies: list, result_dict: dict): - for reply in replies: - if reply: - if "engagement" in reply: - get_reply_of_replies(reply["engagement"]["tweet_replies"], result_dict) - - result_dict[reply["id"]] = reply["text"] - - -def get_web_archieve_results(search_url): - try: - archieve_url = "http://web.archive.org/cdx/search/cdx?url={}&output=json".format(search_url) - - response = requests.get(archieve_url) - response_json = json.loads(response.content) - - response_json = response_json[1:] - - return response_json - - except: - return None - - -def get_website_url_from_arhieve(url): - archieve_results = get_web_archieve_results(url) - if archieve_results: - modified_url = "https://web.archive.org/web/{}/{}".format(archieve_results[0][1], archieve_results[0][2]) - return modified_url - else: - return url - - -def dump_friends_file_as_lines(dataset_file, out_file): - pattern = re.compile(rb'{([^{}]+)}', - re.DOTALL | re.IGNORECASE | re.MULTILINE) - - with open(out_file, "w", 100) as out_file: - with open(dataset_file, 'r') as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: - for match in pattern.findall(m): - data = "{" + str(match.decode('utf-8')) + "}\n" - out_file.write(data) - - -def dump_social_network_to_db(db, folder): - friends_coll = db.twitter_user_friends_collection - - batch_update_list = [] - - files = os.listdir(folder) - user_names = set([file[:file.find(".csv")] for file in files]) - - print("actual files : {}".format(len(user_names)), flush=True) - - saved_user_names = set(friends_coll.distinct("user_name")) - print("saved user names : {}".format(len(saved_user_names)), flush=True) - - user_names = user_names.difference(saved_user_names) - - print("user names to be saved : 
{}".format(len(user_names)), flush=True) - - for idx, user_name in enumerate(user_names): - try: - friends_user_names = get_friends_names("{}/{}.csv".format(folder, user_name)) - - batch_update_list.append(UpdateOne({"user_name": user_name}, - {"$set": {"user_name": user_name, "friends_name": friends_user_names}}, - upsert=True)) - - if idx % 10000 == 0: - try: - friends_coll.bulk_write(batch_update_list, ordered=False) - except: - print("Exception") - traceback.print_exc(file=sys.stdout) - - batch_update_list = [] - print("bulk update {}".format(idx), flush=True) - - except Exception as ex: - print("Exception in file : {}/{} : {}".format(folder, user_name, str(ex))) - traceback.print_exc(file=sys.stdout) - - if len(batch_update_list) > 0: - friends_coll.bulk_write(batch_update_list, ordered=False) - print("bulk update", flush=True) - - print("completed dumping for folder {}".format(folder)) - - -def get_user_to_fetch(all_user_file, user_ids_user_name_dict, db): - user_names = set(json.load(open(all_user_file))["user_names"]) - - friends_coll = db.twitter_user_friends_collection - - fake_friends_collection = db.fake_twitter_user_followees - real_friends_collection = db.real_twitter_user_followees - - fake_users_ids = set(fake_friends_collection.distinct("user_id")) - real_users_ids = set(real_friends_collection.distinct("user_id")) - - all_user_ids = set() - all_user_ids.update(fake_users_ids) - all_user_ids.update(real_users_ids) - - id_fetched_user_names = set() - - user_ids_user_name_dict = json.load(open(user_ids_user_name_dict)) - - for user_id, user_name in user_ids_user_name_dict.items(): - if int(user_id) in all_user_ids: - id_fetched_user_names.add(user_name) - - print("actual files : {}".format(len(user_names)), flush=True) - - saved_user_names = set(friends_coll.distinct("user_name")) - print("saved user names : {}".format(len(saved_user_names)), flush=True) - - user_names = user_names.difference(saved_user_names) - - print("user names to be collected : {}".format(len(user_names)), flush=True) - - print("ID fetched users : {}".format(len(id_fetched_user_names))) - - user_names = user_names.difference(id_fetched_user_names) - - print("Final set of user names to be fetched : {}".format(len(user_names))) - - json.dump({"user_names": list(user_names)}, open("politifact_user_names_to_collect.json", "w")) - - -def chunks(l, n): - """Yield successive n-sized chunks from l.""" - for i in range(0, len(l), n): - yield l[i:i + n] - - -def dump_user_friends_data(db, user_names_file, dump_out_file): - user_names = json.load(open(user_names_file))["user_names"] - friends_collection = db.twitter_user_friends_collection - with open(dump_out_file, "w", 1000) as file: - for user_name_chunk in chunks(list(user_names), 10000): - for user_info in friends_collection.find({"user_name": {"$in": user_name_chunk}}, {"_id": 0}): - file.write(json.dumps(user_info)) - file.write("\n") - - print("Compeleted dumping {}".format(dump_out_file)) - - -def dump_user_id_friends_data(db, user_id_dict_file, dump_out_file): - user_id_name_dict = json.load(open(user_id_dict_file)) - - user_ids = user_id_name_dict.keys() - - user_ids = [int(user_id) for user_id in user_ids] - - user_ids = set(user_ids) - - fake_friends_collection = db.fake_twitter_user_followees - real_friends_collection = db.real_twitter_user_followees - - with open(dump_out_file, "w", 1000) as file: - - for user_ids_chunk in chunks(list(user_ids), 10000): - for user_info in fake_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 
0}): - user_ids.remove(user_info["user_id"]) - file.write(json.dumps(user_info) + "\n") - - for user_ids_chunk in chunks(list(user_ids), 10000): - for user_info in real_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): - user_ids.remove(user_info["user_id"]) - file.write(json.dumps(user_info) + "\n") - - print("Compeleted dumping {}".format(dump_out_file)) - - -def get_friends_names(friends_file): - try: - with open(friends_file, encoding="UTF-8") as file: - lines = file.readlines() - lines = [line.strip() for line in lines] - return lines[1:] - - except: - return [] - - -def write_file_if_not_exist(output_folder, user_id_followee_json_data): - file_path = "{}/{}.json".format(output_folder, user_id_followee_json_data["user_id"]) - if not os.path.exists(file_path): - json.dump(user_id_followee_json_data, open(file_path, "w")) - - -def write_file_user_name_if_not_exist(output_folder, user_name_followee_json_data): - file_path = "{}/{}.json".format(output_folder, user_name_followee_json_data["user_name"]) - if not os.path.exists(file_path): - json.dump(user_name_followee_json_data, open(file_path, "w")) - - -def dump_social_network_user_id_single_file(input_ids_file, output_folder): - with open(input_ids_file) as file: - for line in tqdm(file): - write_file_if_not_exist(output_folder, json.loads(line)) - - -def dump_social_network_user_name_single_file(input_names_file, output_folder): - with open(input_names_file) as file: - for line in tqdm(file): - write_file_user_name_if_not_exist(output_folder, json.loads(line)) - - -def download_news_article(url): - news_article = Article(url) - news_article.download() - news_article.parse() - return news_article - - -def get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, news_id_source_date_dict): - """ - Check the different dates and choose the right date for filtering noise - :param news_id_publish_time: - :param news_id_fact_statement_date_dict: - :param news_id_source_date_dict: - :return: - """ - all_news_ids = news_id_fact_statement_date_dict.keys() - - news_id_selected_filter_date = dict() - - for news_id in all_news_ids: - if news_id in news_id_publish_time_dict: - news_id_selected_filter_date[news_id] = news_id_publish_time_dict[news_id].timestamp() - elif news_id in news_id_source_date_dict: - news_id_selected_filter_date[news_id] = news_id_source_date_dict[news_id].timestamp() - elif news_id in news_id_fact_statement_date_dict: - news_id_selected_filter_date[news_id] = datetime.strptime(news_id_fact_statement_date_dict[news_id], - "%Y-%m-%d").timestamp() - - return news_id_selected_filter_date - - -def get_news_articles_published_time(db, is_fake): - news_id_publish_time_dict = dict() - - if is_fake: - news_source_article_collection = db.fake_news_source_article - else: - news_source_article_collection = db.real_news_source_article - - for news_source in news_source_article_collection.find({"news_source": "politifact"}): - news_id = news_source["id"] - if news_source and news_source["publish_date"]: - news_id_publish_time_dict[news_id] = news_source["publish_date"] - - return news_id_publish_time_dict - - -# def get_news_articles_published_time(dataset_file): +# import csv +# import json +# import mmap +# import os +# import pickle +# import queue +# import re +# import shutil +# import string +# import sys +# import traceback +# from datetime import datetime +# from pathlib import Path +# +# import datefinder +# import requests +# from bs4 import BeautifulSoup +# from newspaper 
import Article +# from pymongo import UpdateOne +# from tqdm import tqdm +# import newspaper +# +# from analysis_util import get_propagation_graphs +# from baseline_feature_extraction import dump_LIWC_Representation +# from pre_process_util import load_configuration, get_database_connection, get_news_articles +# from util.constants import RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE +# from util.util import tweet_node +# +# +# def get_reply_of_replies(replies: list, result_dict: dict): +# for reply in replies: +# if reply: +# if "engagement" in reply: +# get_reply_of_replies(reply["engagement"]["tweet_replies"], result_dict) +# +# result_dict[reply["id"]] = reply["text"] +# +# +# def get_web_archieve_results(search_url): +# try: +# archieve_url = "http://web.archive.org/cdx/search/cdx?url={}&output=json".format(search_url) +# +# response = requests.get(archieve_url) +# response_json = json.loads(response.content) +# +# response_json = response_json[1:] +# +# return response_json +# +# except: +# return None +# +# +# def get_website_url_from_arhieve(url): +# archieve_results = get_web_archieve_results(url) +# if archieve_results: +# modified_url = "https://web.archive.org/web/{}/{}".format(archieve_results[0][1], archieve_results[0][2]) +# return modified_url +# else: +# return url +# +# +# def dump_friends_file_as_lines(dataset_file, out_file): +# pattern = re.compile(rb'{([^{}]+)}', +# re.DOTALL | re.IGNORECASE | re.MULTILINE) +# +# with open(out_file, "w", 100) as out_file: +# with open(dataset_file, 'r') as f: +# with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: +# for match in pattern.findall(m): +# data = "{" + str(match.decode('utf-8')) + "}\n" +# out_file.write(data) +# +# +# def dump_social_network_to_db(db, folder): +# friends_coll = db.twitter_user_friends_collection +# +# batch_update_list = [] +# +# files = os.listdir(folder) +# user_names = set([file[:file.find(".csv")] for file in files]) +# +# print("actual files : {}".format(len(user_names)), flush=True) +# +# saved_user_names = set(friends_coll.distinct("user_name")) +# print("saved user names : {}".format(len(saved_user_names)), flush=True) +# +# user_names = user_names.difference(saved_user_names) +# +# print("user names to be saved : {}".format(len(user_names)), flush=True) +# +# for idx, user_name in enumerate(user_names): +# try: +# friends_user_names = get_friends_names("{}/{}.csv".format(folder, user_name)) +# +# batch_update_list.append(UpdateOne({"user_name": user_name}, +# {"$set": {"user_name": user_name, "friends_name": friends_user_names}}, +# upsert=True)) +# +# if idx % 10000 == 0: +# try: +# friends_coll.bulk_write(batch_update_list, ordered=False) +# except: +# print("Exception") +# traceback.print_exc(file=sys.stdout) +# +# batch_update_list = [] +# print("bulk update {}".format(idx), flush=True) +# +# except Exception as ex: +# print("Exception in file : {}/{} : {}".format(folder, user_name, str(ex))) +# traceback.print_exc(file=sys.stdout) +# +# if len(batch_update_list) > 0: +# friends_coll.bulk_write(batch_update_list, ordered=False) +# print("bulk update", flush=True) +# +# print("completed dumping for folder {}".format(folder)) +# +# +# def get_user_to_fetch(all_user_file, user_ids_user_name_dict, db): +# user_names = set(json.load(open(all_user_file))["user_names"]) +# +# friends_coll = db.twitter_user_friends_collection +# +# fake_friends_collection = db.fake_twitter_user_followees +# real_friends_collection = db.real_twitter_user_followees +# +# fake_users_ids = 
set(fake_friends_collection.distinct("user_id")) +# real_users_ids = set(real_friends_collection.distinct("user_id")) +# +# all_user_ids = set() +# all_user_ids.update(fake_users_ids) +# all_user_ids.update(real_users_ids) +# +# id_fetched_user_names = set() +# +# user_ids_user_name_dict = json.load(open(user_ids_user_name_dict)) +# +# for user_id, user_name in user_ids_user_name_dict.items(): +# if int(user_id) in all_user_ids: +# id_fetched_user_names.add(user_name) +# +# print("actual files : {}".format(len(user_names)), flush=True) +# +# saved_user_names = set(friends_coll.distinct("user_name")) +# print("saved user names : {}".format(len(saved_user_names)), flush=True) +# +# user_names = user_names.difference(saved_user_names) +# +# print("user names to be collected : {}".format(len(user_names)), flush=True) +# +# print("ID fetched users : {}".format(len(id_fetched_user_names))) +# +# user_names = user_names.difference(id_fetched_user_names) +# +# print("Final set of user names to be fetched : {}".format(len(user_names))) +# +# json.dump({"user_names": list(user_names)}, open("politifact_user_names_to_collect.json", "w")) +# +# +# def chunks(l, n): +# """Yield successive n-sized chunks from l.""" +# for i in range(0, len(l), n): +# yield l[i:i + n] +# +# +# def dump_user_friends_data(db, user_names_file, dump_out_file): +# user_names = json.load(open(user_names_file))["user_names"] +# friends_collection = db.twitter_user_friends_collection +# with open(dump_out_file, "w", 1000) as file: +# for user_name_chunk in chunks(list(user_names), 10000): +# for user_info in friends_collection.find({"user_name": {"$in": user_name_chunk}}, {"_id": 0}): +# file.write(json.dumps(user_info)) +# file.write("\n") +# +# print("Compeleted dumping {}".format(dump_out_file)) +# +# +# def dump_user_id_friends_data(db, user_id_dict_file, dump_out_file): +# user_id_name_dict = json.load(open(user_id_dict_file)) +# +# user_ids = user_id_name_dict.keys() +# +# user_ids = [int(user_id) for user_id in user_ids] +# +# user_ids = set(user_ids) +# +# fake_friends_collection = db.fake_twitter_user_followees +# real_friends_collection = db.real_twitter_user_followees +# +# with open(dump_out_file, "w", 1000) as file: +# +# for user_ids_chunk in chunks(list(user_ids), 10000): +# for user_info in fake_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): +# user_ids.remove(user_info["user_id"]) +# file.write(json.dumps(user_info) + "\n") +# +# for user_ids_chunk in chunks(list(user_ids), 10000): +# for user_info in real_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): +# user_ids.remove(user_info["user_id"]) +# file.write(json.dumps(user_info) + "\n") +# +# print("Compeleted dumping {}".format(dump_out_file)) +# +# +# def get_friends_names(friends_file): +# try: +# with open(friends_file, encoding="UTF-8") as file: +# lines = file.readlines() +# lines = [line.strip() for line in lines] +# return lines[1:] +# +# except: +# return [] +# +# +# def write_file_if_not_exist(output_folder, user_id_followee_json_data): +# file_path = "{}/{}.json".format(output_folder, user_id_followee_json_data["user_id"]) +# if not os.path.exists(file_path): +# json.dump(user_id_followee_json_data, open(file_path, "w")) +# +# +# def write_file_user_name_if_not_exist(output_folder, user_name_followee_json_data): +# file_path = "{}/{}.json".format(output_folder, user_name_followee_json_data["user_name"]) +# if not os.path.exists(file_path): +# json.dump(user_name_followee_json_data, open(file_path, 
"w")) +# +# +# def dump_social_network_user_id_single_file(input_ids_file, output_folder): +# with open(input_ids_file) as file: +# for line in tqdm(file): +# write_file_if_not_exist(output_folder, json.loads(line)) +# +# +# def dump_social_network_user_name_single_file(input_names_file, output_folder): +# with open(input_names_file) as file: +# for line in tqdm(file): +# write_file_user_name_if_not_exist(output_folder, json.loads(line)) +# +# +# def download_news_article(url): +# news_article = Article(url) +# news_article.download() +# news_article.parse() +# return news_article +# +# +# def get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, news_id_source_date_dict): +# """ +# Check the different dates and choose the right date for filtering noise +# :param news_id_publish_time: +# :param news_id_fact_statement_date_dict: +# :param news_id_source_date_dict: +# :return: +# """ +# all_news_ids = news_id_fact_statement_date_dict.keys() +# +# news_id_selected_filter_date = dict() +# +# for news_id in all_news_ids: +# if news_id in news_id_publish_time_dict: +# news_id_selected_filter_date[news_id] = news_id_publish_time_dict[news_id].timestamp() +# elif news_id in news_id_source_date_dict: +# news_id_selected_filter_date[news_id] = news_id_source_date_dict[news_id].timestamp() +# elif news_id in news_id_fact_statement_date_dict: +# news_id_selected_filter_date[news_id] = datetime.strptime(news_id_fact_statement_date_dict[news_id], +# "%Y-%m-%d").timestamp() +# +# return news_id_selected_filter_date +# +# +# def get_news_articles_published_time(db, is_fake): +# news_id_publish_time_dict = dict() +# +# if is_fake: +# news_source_article_collection = db.fake_news_source_article +# else: +# news_source_article_collection = db.real_news_source_article +# +# for news_source in news_source_article_collection.find({"news_source": "politifact"}): +# news_id = news_source["id"] +# if news_source and news_source["publish_date"]: +# news_id_publish_time_dict[news_id] = news_source["publish_date"] +# +# return news_id_publish_time_dict +# +# +# # def get_news_articles_published_time(dataset_file): +# # dataset = get_news_articles(dataset_file) +# # news_id_publish_time = dict() +# # count = 0 +# # print("total no. 
of articles : {}".format(len(dataset))) +# # for news in dataset: +# # if "publish_date" in news["text_content"] and news["text_content"]["publish_date"]: +# # count += 1 +# # print(news["text_content"]["publish_date"]) +# # +# # # if "url" in news["text_content"]: +# # # try: +# # # formatted_url = news["text_content"]["url"].lstrip("'").rstrip("'").lstrip("/") +# # # +# # # print("Formatted url : {}".format(formatted_url)) +# # # +# # # news_article = download_news_article(formatted_url) +# # # print("News id : {} publish data : {}".format(news["id"], news_article.publish_date), flush=True) +# # # news_id_publish_time[news["id"]] = news_article.publish_date.timestamp() +# # # except Exception as ex: +# # # print(ex) +# # print("old wrong present publish date count : {}".format(count)) +# # return news_id_publish_time +# +# +# def get_publish_date_from_sources_politifact(db, is_fake): +# if is_fake: +# news_collection = db.fake_news_collection +# news_format_collection = db.fake_news_format +# else: +# news_collection = db.real_news_collection +# news_format_collection = db.real_news_format +# +# news_id_fact_statement_date_dict = dict() +# +# news_id_source_date_dict = dict() +# +# for news_format in news_format_collection.find({"news_source": "politifact"}): +# news_id = news_format["id"] +# +# news_id_int = int(news_id.replace("politifact", "")) +# +# news = news_collection.find_one({"id": news_id_int}) +# +# publish_date = get_formatted_news_publish_date(news) +# +# try: +# if publish_date: +# publish_date = next(publish_date) +# +# if publish_date: +# news_id_source_date_dict[news_id] = publish_date +# except StopIteration: +# pass +# +# news_id_fact_statement_date_dict[news_id] = news["statement_date"] +# +# return news_id_fact_statement_date_dict, news_id_source_date_dict +# +# +# def get_formatted_news_publish_date(fake_news): +# try: +# id = fake_news['id'] +# source_html = fake_news['sources'] +# sources_soup = BeautifulSoup(source_html) +# sources = sources_soup.find_all('p') +# if not sources: +# sources = sources_soup.find_all('div') +# statement = '' +# url = '' +# +# ## Using the first source that contains href as the fake news source if source is not removed +# ## This is not always true +# +# date_matches = None +# for i in range(len(sources)): +# if sources[i].find('a') is not None: +# statement_tmp = sources[i].text +# +# date_matches = datefinder.find_dates(statement_tmp) +# statements = re.findall(r'\"(.+?){\,,.}\"', statement_tmp) +# if len(statements) == 0: +# statement = sources[i].a.text +# +# # TODO: Verify this logic is proper +# splits = statement_tmp.split(',') +# for split in splits: +# if len(statement) < len(split): +# statement = split +# +# # TODO: Why encoding is required - encoding considers quotes of string also - understand why? +# # statement = statement.encode('utf-8') +# else: +# # TODO: Why encoding is required - encoding considers quotes of string also - understand why? 
+# # statement = statements[0].encode('utf-8') +# statement = statements[0] +# pass +# +# # TODO: Check if it is proper +# statement = str(statement).translate(str.maketrans('', '', string.punctuation)) +# +# # statement_new = statement.translate(str.maketrans('', '', string.punctuation)) # move punctuations +# +# url = sources[i].a['href'] +# break +# +# # TODO: Check if the condition is proper +# if statement == '' or len(statement.split(' ')) <= 3: +# return None +# +# return date_matches +# +# except: +# return None +# +# +# def get_politifact_tweet_filter_dates(db, is_fake): +# news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact( +# db, is_fake=is_fake) +# news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=is_fake) +# +# news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, +# news_id_source_date_dict) +# +# return news_id_filter_date_dict +# +# +# def get_replies_from_dataset(dataset_dir, news_source, label, out_dir): +# dataset_file = "{}/{}_{}_news_complete_dataset.json".format(dataset_dir, news_source, label) # dataset = get_news_articles(dataset_file) -# news_id_publish_time = dict() -# count = 0 -# print("total no. of articles : {}".format(len(dataset))) +# +# reply_id_content_dict = dict() +# # for news in dataset: -# if "publish_date" in news["text_content"] and news["text_content"]["publish_date"]: -# count += 1 -# print(news["text_content"]["publish_date"]) -# -# # if "url" in news["text_content"]: -# # try: -# # formatted_url = news["text_content"]["url"].lstrip("'").rstrip("'").lstrip("/") -# # -# # print("Formatted url : {}".format(formatted_url)) -# # -# # news_article = download_news_article(formatted_url) -# # print("News id : {} publish data : {}".format(news["id"], news_article.publish_date), flush=True) -# # news_id_publish_time[news["id"]] = news_article.publish_date.timestamp() -# # except Exception as ex: -# # print(ex) -# print("old wrong present publish date count : {}".format(count)) -# return news_id_publish_time - - -def get_publish_date_from_sources_politifact(db, is_fake): - if is_fake: - news_collection = db.fake_news_collection - news_format_collection = db.fake_news_format - else: - news_collection = db.real_news_collection - news_format_collection = db.real_news_format - - news_id_fact_statement_date_dict = dict() - - news_id_source_date_dict = dict() - - for news_format in news_format_collection.find({"news_source": "politifact"}): - news_id = news_format["id"] - - news_id_int = int(news_id.replace("politifact", "")) - - news = news_collection.find_one({"id": news_id_int}) - - publish_date = get_formatted_news_publish_date(news) - - try: - if publish_date: - publish_date = next(publish_date) - - if publish_date: - news_id_source_date_dict[news_id] = publish_date - except StopIteration: - pass - - news_id_fact_statement_date_dict[news_id] = news["statement_date"] - - return news_id_fact_statement_date_dict, news_id_source_date_dict - - -def get_formatted_news_publish_date(fake_news): - try: - id = fake_news['id'] - source_html = fake_news['sources'] - sources_soup = BeautifulSoup(source_html) - sources = sources_soup.find_all('p') - if not sources: - sources = sources_soup.find_all('div') - statement = '' - url = '' - - ## Using the first source that contains href as the fake news source if source is not removed - ## This is not always true - - date_matches = None - for i in range(len(sources)): - if 
sources[i].find('a') is not None: - statement_tmp = sources[i].text - - date_matches = datefinder.find_dates(statement_tmp) - statements = re.findall(r'\"(.+?){\,,.}\"', statement_tmp) - if len(statements) == 0: - statement = sources[i].a.text - - # TODO: Verify this logic is proper - splits = statement_tmp.split(',') - for split in splits: - if len(statement) < len(split): - statement = split - - # TODO: Why encoding is required - encoding considers quotes of string also - understand why? - # statement = statement.encode('utf-8') - else: - # TODO: Why encoding is required - encoding considers quotes of string also - understand why? - # statement = statements[0].encode('utf-8') - statement = statements[0] - pass - - # TODO: Check if it is proper - statement = str(statement).translate(str.maketrans('', '', string.punctuation)) - - # statement_new = statement.translate(str.maketrans('', '', string.punctuation)) # move punctuations - - url = sources[i].a['href'] - break - - # TODO: Check if the condition is proper - if statement == '' or len(statement.split(' ')) <= 3: - return None - - return date_matches - - except: - return None - - -def get_politifact_tweet_filter_dates(db, is_fake): - news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact( - db, is_fake=is_fake) - news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=is_fake) - - news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, - news_id_source_date_dict) - - return news_id_filter_date_dict - - -def get_replies_from_dataset(dataset_dir, news_source, label, out_dir): - dataset_file = "{}/{}_{}_news_complete_dataset.json".format(dataset_dir, news_source, label) - dataset = get_news_articles(dataset_file) - - reply_id_content_dict = dict() - - for news in dataset: - for tweet in news["tweets"]: - get_reply_of_replies(tweet["reply"], reply_id_content_dict) - - pickle.dump(reply_id_content_dict, - open("{}/{}_{}_reply_id_content_dict.pkl".format(out_dir, news_source, label), "wb")) - - -def dump_all_botometer_results(db): - screen_name_botometer_score_dict = dict() - - for user_score in db.twitter_user_botometer_results.find(): - screen_name_botometer_score_dict[user_score["screen_name"]] = user_score["result"] - - pickle.dump(screen_name_botometer_score_dict, open("all_user_botometer_scores.pkl", "wb")) - - -def dump_all_user_profile_info(db, is_fake, label): - user_id_profile_info = dict() - - all_users_ids = pickle.load(open("all_prop_graph_{}_user.pkl".format(label), "rb")) - - if is_fake: - user_profile_collection = db.fake_twitter_user_profile - else: - user_profile_collection = db.real_twitter_user_profile - - for user_id in tqdm(all_users_ids): - user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object is None: - user_object = db.twitter_user_profile.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object and "profile_info" in user_object: - user_id_profile_info[user_id] = user_object["profile_info"] - - print("No. 
of users found : {}".format(len(user_id_profile_info))) - - pickle.dump(user_id_profile_info, open("all_{}_user_profile_info.pkl".format(label), "wb")) - - -def get_user_aggregate_features(db, is_fake, user_names): - dump_folder = "/home/dmahudes/fake_user_profiles" - - if is_fake: - label_user_collection = db.fake_twitter_user_profile - else: - label_user_collection = db.real_twitter_user_profile - - user_profile_collection = db.twitter_user_profile - - # np.random.shuffle(user_ids) - - for user_name in tqdm(user_names): - - user_object = label_user_collection.find_one({"screen_name": user_name}, {"screen_name": 1, "user_id": 1, - "profile_info": 1, "_id": 0}) - if user_object is None: - user_object = user_profile_collection.find_one({"user_id": user_name}, {"screen_name": 1, "user_id": 1, - "profile_info": 1, "_id": 0}) - - if user_object is None: - print('user {} not found'.format(user_name)) - else: - json.dump(user_object, open("{}/{}.json".format(dump_folder, user_name), "w")) - - -def remove_escape_characters(text_content): - text_content = text_content.replace(',', ' ') - text_content = text_content.replace('\n', ' ') - text_content = text_content.replace('\t', ' ') - words = text_content.split(" ") - return " ".join(words[:1000]) - - -def get_missing_rst_news_content(): - news_source = "gossipcop" - - file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) - # rst_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_features/rst/raw_parsed_data/{}".format( - # news_source) - # - # out_folder = "data/baseline_features/rst/raw_parsed_data/{}_kai".format(news_source) - - fake_news_ids = list() - - real_news_ids = list() - - all_news_folder = "data/baseline_data_kai/all_{}".format(news_source) - - kai_data_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_data_kai/kai_{}".format( - news_source) - - missing_files = set() - with open(file, encoding="UTF-8") as file: - reader = csv.reader(file, delimiter='\t', ) - next(reader) - - for news in reader: - - if news[1] == '1': - fake_news_ids.append(news[0]) - else: - real_news_ids.append(news[0]) - - expected_file = "{}/{}.txt.brackets".format(all_news_folder, news[0]) - out_file = "{}/{}.txt.brackets".format(kai_data_folder, news[0]) - - file = Path(expected_file) - todofile = Path("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0])) - if file.is_file(): - shutil.copy(expected_file, out_file) - elif todofile.is_file(): - pass - else: - missing_files.add(expected_file) - with open("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0]), "w", - encoding="UTF-8") as out_file: - out_file.write(remove_escape_characters(news[2])) - # file = Path(expected_file) - # if file.is_file(): - # with open("{}/{}.txt".format(out_folder, news[0]), "w", encoding="UTF-8") as out_file: - # out_file.write(remove_escape_characters(news[2])) - # else: - # missing_files.add(news[0]) - - pickle.dump(fake_news_ids, - open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "fake"), "wb")) - pickle.dump(real_news_ids, - open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "real"), "wb")) - - print("No. 
of missing files : {}".format(len(missing_files))) - - -def get_files_for_liwc_parsing(): - news_source = "gossipcop" - - file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) - - fake_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_fake_liwc_data.csv".format(news_source), "w", - encoding="UTF-8") - - real_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_real_liwc_data.csv".format(news_source), "w", - encoding="UTF-8") - - fake_csv_writer = csv.writer(fake_data_file) - real_csv_writer = csv.writer(real_data_file) - - with open(file, encoding="UTF-8") as file: - reader = csv.reader(file, delimiter='\t', ) - next(reader) - - for news in reader: - csv_row = [news[0], remove_escape_characters(news[2])] - - if news[1] == '1': - fake_csv_writer.writerow(csv_row) - else: - real_csv_writer.writerow(csv_row) - - fake_data_file.close() - real_data_file.close() - - -def get_users_in_network(prop_graph: tweet_node, edge_type=None): - q = queue.Queue() - - q.put(prop_graph) - - users_list = set() - - while q.qsize() != 0: - node = q.get() - - if node.user_id is not None: - users_list.add(node.user_id) - - if edge_type == RETWEET_EDGE: - children = node.retweet_children - elif edge_type == REPLY_EDGE: - children = node.reply_children - else: - children = node.children - - for child in children: - q.put(child) - - return users_list - - -def get_node_ids_in_network_by_type(prop_graph: tweet_node, edge_type=None, node_type=None): - q = queue.Queue() - - q.put(prop_graph) - - node_ids_set = set() - - while q.qsize() != 0: - node = q.get() - - if node.tweet_id is not None and node.node_type == node_type: - node_ids_set.add(node.tweet_id) - - if edge_type == RETWEET_EDGE: - children = node.retweet_children - elif edge_type == REPLY_EDGE: - children = node.reply_children - else: - children = node.children - - for child in children: - q.put(child) - - return node_ids_set - - -def get_tweets_ids_in_prop_network(prop_graph: tweet_node): - tweet_ids = set() - - for child in prop_graph.children: - tweet_ids.add(child.tweet_id) - - return tweet_ids - - -def prop_network_stats(news_source): - fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) - - tweet_ids = set() - retweet_ids = set() - reply_ids = set() - user_ids = set() - - for prop_graph in fake_prop_graph: - tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) - retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) - reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) - user_ids.update(get_users_in_network(prop_graph)) - - for prop_graph in real_prop_graph: - tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) - retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) - reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) - user_ids.update(get_users_in_network(prop_graph)) - - print("News source : {}".format(news_source)) - print("No. of tweets : {}".format(len(tweet_ids))) - print("No. of retweet ids : {}".format(len(retweet_ids))) - print("No. of reply ids : {}".format(len(reply_ids))) - print("Nol. 
of user : {}".format(len(user_ids))) - - -if __name__ == "__main__": - config = load_configuration("project.config") - db = get_database_connection(config) - - # prop_network_stats("politifact") - # prop_network_stats("gossipcop") - - # get_files_for_liwc_parsing() - - news_source = "politifact" - dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_fake_liwc_data.txt".format(news_source), - "data/baseline_data_kai/liwc/extracted_featuers/{}_fake_liwc_features.csv".format(news_source)) - - dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_real_liwc_data.txt".format(news_source), - "data/baseline_data_kai/liwc/extracted_featuers/{}_real_liwc_features.csv".format(news_source)) - - # get_missing_rst_news_content() - # get_user_aggregate_features(db, is_fake=True, - # user_names=["News1Lightning", "OfeliasHeaven", "jimbradyispapa", "CraigRozniecki", - # "yojudenz", - # "GinaLawriw", "GossipCop", "GossipCopIntern", "findsugarmummy", - # "DJDavidNewsroom"]) - # dump_all_user_profile_info(db, is_fake=True, label="fake") - # dump_all_user_profile_info(db, is_fake=False, label="real") - - exit(1) - - # get_replies_from_dataset("data/engagement_data_latest","politifact","fake","data/pre_process_data") - # get_replies_from_dataset("data/engagement_data_latest", "politifact", "real", "data/pre_process_data") - - get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "fake", "data/pre_process_data") - get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "real", "data/pre_process_data") - - # news_id_filter_date_dict = get_politifact_tweet_filter_dates(db, is_fake=True) - # - # print(len(news_id_filter_date_dict)) - # - # news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact(db, - # is_fake=False) - # news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=False) - # - # # news_id_publish_time = get_news_articles_published_time( - # # "data/engagement_data/politifact_fake_news_dataset_format.json") - # - # news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, - # news_id_source_date_dict) - # - # print("Source news id len : {}".format(len(news_id_source_date_dict))) - # print("Statement news id len : {}".format(len(news_id_fact_statement_date_dict))) - # print("publish news ids len : {}".format(len(news_id_publish_time_dict))) - # print("News id propagation network filter date len : {}".format(len(news_id_filter_date_dict))) - # - # exit(1) - - # dump_social_network_user_id_single_file("data/social_network_data/gossipcop_user_ids_friends_network.txt", - # "/Users/deepak/Desktop/social_network_single_files/user_ids_files" ) - # - # dump_social_network_user_name_single_file("data/social_network_data/gossipcop_user_names_friends_network.txt", - # "/Users/deepak/Desktop/social_network_single_files/user_names_files") - - # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", - # "data/social_network_data/politifact_user_names_friends_network.txt") - # - # dump_user_friends_data(db, "data/format/gossipcop_prop_user_names.json", - # "data/social_network_data/gossipcop_user_names_friends_network.txt") - - # dump_user_id_friends_data(db, "data/format/politifact_user_id_user_name_dict.json", - # "data/social_network_data/politifact_user_ids_friends_network.txt") - # - # dump_user_id_friends_data(db, "data/format/gossipcop_user_id_user_name_dict.json", - # 
"data/social_network_data/gossipcop_user_ids_friends_network.txt") - - # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", - # "data/social_network_data/politifact_user_names_friends_network.txt") - - # get_user_to_fetch("data/format/politifact_prop_user_names.json", - # "data/format/politifact_user_id_user_name_dict.json", - # db) - - # dump_friends_file_as_lines("/home/dmahudes/FakeNewsPropagation/data/politifact_real_user_friends_ids_complete.txt", - # "/home/dmahudes/FakeNewsPropagation/data/format/politifact_real_user_friends_ids_complete_format.txt") - - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data 2") - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data") - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/home/ubuntu/social_network_crawl/data") - # dump_social_network_to_db(db, - # "/home/dmahudes/FakeNewsPropagation/data/network_data/home/ubuntu/social_network_crawl/data") +# for tweet in news["tweets"]: +# get_reply_of_replies(tweet["reply"], reply_id_content_dict) +# +# pickle.dump(reply_id_content_dict, +# open("{}/{}_{}_reply_id_content_dict.pkl".format(out_dir, news_source, label), "wb")) +# +# +# def dump_all_botometer_results(db): +# screen_name_botometer_score_dict = dict() +# +# for user_score in db.twitter_user_botometer_results.find(): +# screen_name_botometer_score_dict[user_score["screen_name"]] = user_score["result"] +# +# pickle.dump(screen_name_botometer_score_dict, open("all_user_botometer_scores.pkl", "wb")) +# +# +# def dump_all_user_profile_info(db, is_fake, label): +# user_id_profile_info = dict() +# +# all_users_ids = pickle.load(open("all_prop_graph_{}_user.pkl".format(label), "rb")) +# +# if is_fake: +# user_profile_collection = db.fake_twitter_user_profile +# else: +# user_profile_collection = db.real_twitter_user_profile +# +# for user_id in tqdm(all_users_ids): +# user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object is None: +# user_object = db.twitter_user_profile.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object and "profile_info" in user_object: +# user_id_profile_info[user_id] = user_object["profile_info"] +# +# print("No. 
of users found : {}".format(len(user_id_profile_info))) +# +# pickle.dump(user_id_profile_info, open("all_{}_user_profile_info.pkl".format(label), "wb")) +# +# +# def get_user_aggregate_features(db, is_fake, user_names): +# dump_folder = "/home/dmahudes/fake_user_profiles" +# +# if is_fake: +# label_user_collection = db.fake_twitter_user_profile +# else: +# label_user_collection = db.real_twitter_user_profile +# +# user_profile_collection = db.twitter_user_profile +# +# # np.random.shuffle(user_ids) +# +# for user_name in tqdm(user_names): +# +# user_object = label_user_collection.find_one({"screen_name": user_name}, {"screen_name": 1, "user_id": 1, +# "profile_info": 1, "_id": 0}) +# if user_object is None: +# user_object = user_profile_collection.find_one({"user_id": user_name}, {"screen_name": 1, "user_id": 1, +# "profile_info": 1, "_id": 0}) +# +# if user_object is None: +# print('user {} not found'.format(user_name)) +# else: +# json.dump(user_object, open("{}/{}.json".format(dump_folder, user_name), "w")) +# +# +# def remove_escape_characters(text_content): +# text_content = text_content.replace(',', ' ') +# text_content = text_content.replace('\n', ' ') +# text_content = text_content.replace('\t', ' ') +# words = text_content.split(" ") +# return " ".join(words[:1000]) +# +# +# def get_missing_rst_news_content(): +# news_source = "gossipcop" +# +# file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) +# # rst_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_features/rst/raw_parsed_data/{}".format( +# # news_source) +# # +# # out_folder = "data/baseline_features/rst/raw_parsed_data/{}_kai".format(news_source) +# +# fake_news_ids = list() +# +# real_news_ids = list() +# +# all_news_folder = "data/baseline_data_kai/all_{}".format(news_source) +# +# kai_data_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_data_kai/kai_{}".format( +# news_source) +# +# missing_files = set() +# with open(file, encoding="UTF-8") as file: +# reader = csv.reader(file, delimiter='\t', ) +# next(reader) +# +# for news in reader: +# +# if news[1] == '1': +# fake_news_ids.append(news[0]) +# else: +# real_news_ids.append(news[0]) +# +# expected_file = "{}/{}.txt.brackets".format(all_news_folder, news[0]) +# out_file = "{}/{}.txt.brackets".format(kai_data_folder, news[0]) +# +# file = Path(expected_file) +# todofile = Path("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0])) +# if file.is_file(): +# shutil.copy(expected_file, out_file) +# elif todofile.is_file(): +# pass +# else: +# missing_files.add(expected_file) +# with open("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0]), "w", +# encoding="UTF-8") as out_file: +# out_file.write(remove_escape_characters(news[2])) +# # file = Path(expected_file) +# # if file.is_file(): +# # with open("{}/{}.txt".format(out_folder, news[0]), "w", encoding="UTF-8") as out_file: +# # out_file.write(remove_escape_characters(news[2])) +# # else: +# # missing_files.add(news[0]) +# +# pickle.dump(fake_news_ids, +# open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "fake"), "wb")) +# pickle.dump(real_news_ids, +# open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "real"), "wb")) +# +# print("No. 
of missing files : {}".format(len(missing_files))) +# +# +# def get_files_for_liwc_parsing(): +# news_source = "gossipcop" +# +# file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) +# +# fake_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_fake_liwc_data.csv".format(news_source), "w", +# encoding="UTF-8") +# +# real_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_real_liwc_data.csv".format(news_source), "w", +# encoding="UTF-8") +# +# fake_csv_writer = csv.writer(fake_data_file) +# real_csv_writer = csv.writer(real_data_file) +# +# with open(file, encoding="UTF-8") as file: +# reader = csv.reader(file, delimiter='\t', ) +# next(reader) +# +# for news in reader: +# csv_row = [news[0], remove_escape_characters(news[2])] +# +# if news[1] == '1': +# fake_csv_writer.writerow(csv_row) +# else: +# real_csv_writer.writerow(csv_row) +# +# fake_data_file.close() +# real_data_file.close() +# +# +# def get_users_in_network(prop_graph: tweet_node, edge_type=None): +# q = queue.Queue() +# +# q.put(prop_graph) +# +# users_list = set() +# +# while q.qsize() != 0: +# node = q.get() +# +# if node.user_id is not None: +# users_list.add(node.user_id) +# +# if edge_type == RETWEET_EDGE: +# children = node.retweet_children +# elif edge_type == REPLY_EDGE: +# children = node.reply_children +# else: +# children = node.children +# +# for child in children: +# q.put(child) +# +# return users_list +# +# +# def get_node_ids_in_network_by_type(prop_graph: tweet_node, edge_type=None, node_type=None): +# q = queue.Queue() +# +# q.put(prop_graph) +# +# node_ids_set = set() +# +# while q.qsize() != 0: +# node = q.get() +# +# if node.tweet_id is not None and node.node_type == node_type: +# node_ids_set.add(node.tweet_id) +# +# if edge_type == RETWEET_EDGE: +# children = node.retweet_children +# elif edge_type == REPLY_EDGE: +# children = node.reply_children +# else: +# children = node.children +# +# for child in children: +# q.put(child) +# +# return node_ids_set +# +# +# def get_tweets_ids_in_prop_network(prop_graph: tweet_node): +# tweet_ids = set() +# +# for child in prop_graph.children: +# tweet_ids.add(child.tweet_id) +# +# return tweet_ids +# +# +# def prop_network_stats(news_source): +# fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) +# +# tweet_ids = set() +# retweet_ids = set() +# reply_ids = set() +# user_ids = set() +# +# for prop_graph in fake_prop_graph: +# tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) +# retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) +# reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) +# user_ids.update(get_users_in_network(prop_graph)) +# +# for prop_graph in real_prop_graph: +# tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) +# retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) +# reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) +# user_ids.update(get_users_in_network(prop_graph)) +# +# print("News source : {}".format(news_source)) +# print("No. of tweets : {}".format(len(tweet_ids))) +# print("No. of retweet ids : {}".format(len(retweet_ids))) +# print("No. of reply ids : {}".format(len(reply_ids))) +# print("Nol. 
of user : {}".format(len(user_ids))) +# +# +# if __name__ == "__main__": +# config = load_configuration("project.config") +# db = get_database_connection(config) +# +# # prop_network_stats("politifact") +# # prop_network_stats("gossipcop") +# +# # get_files_for_liwc_parsing() +# +# news_source = "politifact" +# dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_fake_liwc_data.txt".format(news_source), +# "data/baseline_data_kai/liwc/extracted_featuers/{}_fake_liwc_features.csv".format(news_source)) +# +# dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_real_liwc_data.txt".format(news_source), +# "data/baseline_data_kai/liwc/extracted_featuers/{}_real_liwc_features.csv".format(news_source)) +# +# # get_missing_rst_news_content() +# # get_user_aggregate_features(db, is_fake=True, +# # user_names=["News1Lightning", "OfeliasHeaven", "jimbradyispapa", "CraigRozniecki", +# # "yojudenz", +# # "GinaLawriw", "GossipCop", "GossipCopIntern", "findsugarmummy", +# # "DJDavidNewsroom"]) +# # dump_all_user_profile_info(db, is_fake=True, label="fake") +# # dump_all_user_profile_info(db, is_fake=False, label="real") +# +# exit(1) +# +# # get_replies_from_dataset("data/engagement_data_latest","politifact","fake","data/pre_process_data") +# # get_replies_from_dataset("data/engagement_data_latest", "politifact", "real", "data/pre_process_data") +# +# get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "fake", "data/pre_process_data") +# get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "real", "data/pre_process_data") +# +# # news_id_filter_date_dict = get_politifact_tweet_filter_dates(db, is_fake=True) +# # +# # print(len(news_id_filter_date_dict)) +# # +# # news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact(db, +# # is_fake=False) +# # news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=False) +# # +# # # news_id_publish_time = get_news_articles_published_time( +# # # "data/engagement_data/politifact_fake_news_dataset_format.json") +# # +# # news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, +# # news_id_source_date_dict) +# # +# # print("Source news id len : {}".format(len(news_id_source_date_dict))) +# # print("Statement news id len : {}".format(len(news_id_fact_statement_date_dict))) +# # print("publish news ids len : {}".format(len(news_id_publish_time_dict))) +# # print("News id propagation network filter date len : {}".format(len(news_id_filter_date_dict))) +# # +# # exit(1) +# +# # dump_social_network_user_id_single_file("data/social_network_data/gossipcop_user_ids_friends_network.txt", +# # "/Users/deepak/Desktop/social_network_single_files/user_ids_files" ) +# # +# # dump_social_network_user_name_single_file("data/social_network_data/gossipcop_user_names_friends_network.txt", +# # "/Users/deepak/Desktop/social_network_single_files/user_names_files") +# +# # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", +# # "data/social_network_data/politifact_user_names_friends_network.txt") +# # +# # dump_user_friends_data(db, "data/format/gossipcop_prop_user_names.json", +# # "data/social_network_data/gossipcop_user_names_friends_network.txt") +# +# # dump_user_id_friends_data(db, "data/format/politifact_user_id_user_name_dict.json", +# # "data/social_network_data/politifact_user_ids_friends_network.txt") +# # +# # dump_user_id_friends_data(db, 
"data/format/gossipcop_user_id_user_name_dict.json", +# # "data/social_network_data/gossipcop_user_ids_friends_network.txt") +# +# # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", +# # "data/social_network_data/politifact_user_names_friends_network.txt") +# +# # get_user_to_fetch("data/format/politifact_prop_user_names.json", +# # "data/format/politifact_user_id_user_name_dict.json", +# # db) +# +# # dump_friends_file_as_lines("/home/dmahudes/FakeNewsPropagation/data/politifact_real_user_friends_ids_complete.txt", +# # "/home/dmahudes/FakeNewsPropagation/data/format/politifact_real_user_friends_ids_complete_format.txt") +# +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data 2") +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data") +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/home/ubuntu/social_network_crawl/data") +# # dump_social_network_to_db(db, +# # "/home/dmahudes/FakeNewsPropagation/data/network_data/home/ubuntu/social_network_crawl/data") diff --git a/stat_test.py b/stat_test.py index aafabdf..1dadc05 100644 --- a/stat_test.py +++ b/stat_test.py @@ -197,6 +197,24 @@ def get_box_plots_mod(samples1, samples2, save_folder, title=None, file_name=Non if __name__ == "__main__": - get_box_plots_mod(np.random.rand(200, ), np.random.rand(200, ), + import seaborn as sns + + all_data = np.transpose(np.array([np.random.rand(2000, ), np.random.rand(2000, )])) + labels = ['Fake', 'Real'] + df = pd.DataFrame(all_data, columns=labels) + my_pal = {"Fake": "pink", "Real": "lightblue", } + + plt.xticks(fontsize=12) + plt.yticks(fontsize=12) + + + # sns.set(style="whitegrid") + tips = sns.load_dataset("tips") + ax = sns.violinplot(data=df, palette=my_pal, width=0.3, showfliers=False) + + plt.show() + exit(1) + + get_box_plots_mod(np.random.rand(2000, ), np.random.rand(2000, ), "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation", "T10", "T10") diff --git a/structure_temp_analysis.py b/structure_temp_analysis.py index 49ad4c7..77a7323 100644 --- a/structure_temp_analysis.py +++ b/structure_temp_analysis.py @@ -8,15 +8,15 @@ from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, BaseFeatureHelper, \ get_sample_feature_value -from stat_test import perform_t_test, plot_normal_distributions, get_box_plots +from stat_test import perform_t_test, get_box_plots from util.constants import NEWS_ROOT_NODE, RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE from util.util import tweet_node -user_id_profile_info_dict = dict() -user_id_profile_info_dict.update( - pickle.load(open("data/pre_process_data/user_features/all_fake_user_profile_info.pkl", "rb"))) -user_id_profile_info_dict.update( - pickle.load(open("data/pre_process_data/user_features/all_real_user_profile_info.pkl", "rb"))) +# user_id_profile_info_dict = dict() +# user_id_profile_info_dict.update( +# pickle.load(open("data/pre_process_data/user_features/all_fake_user_profile_info.pkl", "rb"))) +# user_id_profile_info_dict.update( +# pickle.load(open("data/pre_process_data/user_features/all_real_user_profile_info.pkl", "rb"))) def get_post_tweet_deepest_cascade(prop_graph: tweet_node, edge_type=RETWEET_EDGE): @@ -560,51 +560,51 @@ def get_fraction_of_unique_users(prop_graph: tweet_node, edge_type=None): def get_num_bot_users(prop_graph: tweet_node): - global user_id_bot_score_dict - retweeting_users = set(get_user_names_retweeting_in_prop_graph(prop_graph)) + q = queue.Queue() + q.put(prop_graph) num_bot_users = 0 - for 
user_name in retweeting_users: - if user_name in user_id_bot_score_dict: - botometer_score = user_id_bot_score_dict[user_name] - if "scores" in botometer_score: - if botometer_score['scores']['universal'] > 0.5: + + while q.qsize() != 0: + node = q.get() + + for child in node.retweet_children: + q.put(child) + if child.node_type == RETWEET_NODE and child.user_id is not None: + if child.botometer_score and child.botometer_score > 0.5: num_bot_users += 1 - else: - print("user {} not found ".format(user_name)) return num_bot_users def get_fraction_of_bot_users_retweeting(prop_graph: tweet_node): - global user_id_bot_score_dict - retweeting_users = set(get_user_names_retweeting_in_prop_graph(prop_graph)) + q = queue.Queue() + q.put(prop_graph) num_bot_users = 1 - human_users = 1 - for user_name in retweeting_users: - if user_name in user_id_bot_score_dict: - botometer_score = user_id_bot_score_dict[user_name] - if "scores" in botometer_score: - if botometer_score['scores']['universal'] > 0.5: - num_bot_users += 1 - else: - human_users += 1 + num_human_users = 1 - return num_bot_users / (human_users+ num_bot_users) + while q.qsize() != 0: + node = q.get() + + for child in node.retweet_children: + q.put(child) + if child.node_type == RETWEET_NODE and child.user_id is not None: + if child.botometer_score: + if child.botometer_score > 0.5: + num_bot_users += 1 + else: + num_human_users += 1 + + return num_bot_users / (num_human_users+ num_bot_users) def get_prop_graphs_num_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None): global user_id_bot_score_dict - user_id_bot_score_dict = pickle.load( - open("data/pre_process_data/botometer_scores/all_user_botometer_scores.pkl", "rb")) return get_sample_feature_value(prop_graphs, get_num_bot_users) def get_prop_graphs_fraction_of_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None): - global user_id_bot_score_dict - user_id_bot_score_dict = pickle.load( - open("data/pre_process_data/botometer_scores/all_user_botometer_scores.pkl", "rb")) return get_sample_feature_value(prop_graphs, get_fraction_of_bot_users_retweeting) @@ -1036,13 +1036,13 @@ def get_macro_feature_short_names(self): return feature_names def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache = False): all_features = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: @@ -1115,13 +1115,13 @@ def get_macro_feature_short_names(self): return feature_names def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache = False): all_features = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: diff --git a/temporal_analysis.py b/temporal_analysis.py index bcb4d79..e91c058 100644 --- a/temporal_analysis.py +++ b/temporal_analysis.py @@ -331,6 +331,27 @@ def get_macro_feature_short_names(self): if __name__ == "__main__": + temporal_feature_helper = TemporalFeatureHelper() + + news_source = "gossipcop" + + fake_prop_graph, 
real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_features = temporal_feature_helper.get_features_array(fake_prop_graph, micro_features=True, + macro_features=True, news_source=news_source, + label="fake", use_cache=True) + real_features = temporal_feature_helper.get_features_array(real_prop_graph, micro_features=True, + macro_features=True, news_source=news_source, + label="real", use_cache=True) + + temporal_feature_helper.save_blox_plots_for_features(fake_feature_array=fake_features, + real_feature_array=real_features, micro_features=True, + macro_features=True, save_folder="data/feature_images/gossipcop_violin") + + exit(1) + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", "politifact") fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) diff --git a/util/graph_dumper.py b/util/graph_dumper.py index ba059fc..ad4b50d 100644 --- a/util/graph_dumper.py +++ b/util/graph_dumper.py @@ -1,62 +1,62 @@ -from util.util import tweet_node - - -def dumps_graph(root_node: tweet_node, params): - tweet_info_object_dict = dict() - edges_list = [] - nodes_list = [] - - tweet_id_node_id_dict = dict() - - add_tweet_node_if_not_exists(tweet_id_node_id_dict, root_node, nodes_list, tweet_info_object_dict, params) - - root_node_id = tweet_id_node_id_dict[root_node.tweet_id] - - for child in root_node.children: - child_node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, child, nodes_list, tweet_info_object_dict, - params) - - edges_list.append(get_edge(root_node_id, child_node_id)) - - dump_children_network(child, nodes_list, edges_list, tweet_id_node_id_dict, tweet_info_object_dict, params) - - legend_node_id = len(tweet_id_node_id_dict)+1 - return [tweet_info_object_dict, nodes_list, edges_list] - - -def get_edge(parent_node_id, child_node_id): - return {"from": parent_node_id, "to": child_node_id} - - -def add_tweet_node_if_not_exists(tweet_id_node_id_dict, node: tweet_node, nodes_list, tweet_info_object_dict: dict, - params): - if node.tweet_id not in tweet_id_node_id_dict: - tweet_id_node_id_dict[node.tweet_id] = len(tweet_id_node_id_dict) + 1 - - nodes_list.append({"id": tweet_id_node_id_dict[node.tweet_id], "tweet_id": str(node.tweet_id), - "label": tweet_id_node_id_dict[node.tweet_id], - "color": params["node_color"][node.node_type]}) - - tweet_info_object_dict[str(node.tweet_id)] = node.get_contents() - - return tweet_id_node_id_dict[node.tweet_id] - - -def dump_children_network(node, nodes_list: list, edge_list: list, tweet_id_node_id_dict: dict, - tweet_info_object_dict: dict, params): - node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, tweet_info_object_dict, params) - - for child in node.children: - dump_children_network(child, nodes_list, edge_list, tweet_id_node_id_dict, tweet_info_object_dict, params) - child_id = tweet_id_node_id_dict[child.tweet_id] - - edge_list.append(get_edge(node_id, child_id)) - -# def dump_reply_network(node: tweet_node, nodes_list: list, edge_list: list, tweet_info_object_dict: dict): -# node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, params) -# -# for child in node.reply_children: -# dump_retweet_network(child, nodes_list, edge_list, tweet_id_node_id_dict, params) +# from util.util import tweet_node +# +# +# def dumps_graph(root_node: tweet_node, params): +# tweet_info_object_dict = dict() +# edges_list 
= [] +# nodes_list = [] +# +# tweet_id_node_id_dict = dict() +# +# add_tweet_node_if_not_exists(tweet_id_node_id_dict, root_node, nodes_list, tweet_info_object_dict, params) +# +# root_node_id = tweet_id_node_id_dict[root_node.tweet_id] +# +# for child in root_node.children: +# child_node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, child, nodes_list, tweet_info_object_dict, +# params) +# +# edges_list.append(get_edge(root_node_id, child_node_id)) +# +# dump_children_network(child, nodes_list, edges_list, tweet_id_node_id_dict, tweet_info_object_dict, params) +# +# legend_node_id = len(tweet_id_node_id_dict)+1 +# return [tweet_info_object_dict, nodes_list, edges_list] +# +# +# def get_edge(parent_node_id, child_node_id): +# return {"from": parent_node_id, "to": child_node_id} +# +# +# def add_tweet_node_if_not_exists(tweet_id_node_id_dict, node: tweet_node, nodes_list, tweet_info_object_dict: dict, +# params): +# if node.tweet_id not in tweet_id_node_id_dict: +# tweet_id_node_id_dict[node.tweet_id] = len(tweet_id_node_id_dict) + 1 +# +# nodes_list.append({"id": tweet_id_node_id_dict[node.tweet_id], "tweet_id": str(node.tweet_id), +# "label": tweet_id_node_id_dict[node.tweet_id], +# "color": params["node_color"][node.node_type]}) +# +# tweet_info_object_dict[str(node.tweet_id)] = node.get_contents() +# +# return tweet_id_node_id_dict[node.tweet_id] +# +# +# def dump_children_network(node, nodes_list: list, edge_list: list, tweet_id_node_id_dict: dict, +# tweet_info_object_dict: dict, params): +# node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, tweet_info_object_dict, params) +# +# for child in node.children: +# dump_children_network(child, nodes_list, edge_list, tweet_id_node_id_dict, tweet_info_object_dict, params) # child_id = tweet_id_node_id_dict[child.tweet_id] # # edge_list.append(get_edge(node_id, child_id)) +# +# # def dump_reply_network(node: tweet_node, nodes_list: list, edge_list: list, tweet_info_object_dict: dict): +# # node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, params) +# # +# # for child in node.reply_children: +# # dump_retweet_network(child, nodes_list, edge_list, tweet_id_node_id_dict, params) +# # child_id = tweet_id_node_id_dict[child.tweet_id] +# # +# # edge_list.append(get_edge(node_id, child_id)) diff --git a/util/util.py b/util/util.py index 41ebc97..87cb236 100644 --- a/util/util.py +++ b/util/util.py @@ -4,7 +4,7 @@ class tweet_node: - def __init__(self, tweet_id, text, created_time, user_name, user_id, news_id, node_type): + def __init__(self, tweet_id, text = None, created_time = None, user_name = None, user_id = None, news_id = None, node_type = None, botometer_score = None, sentiment= None): self.tweet_id = tweet_id self.text = text self.created_time = created_time @@ -22,16 +22,18 @@ def __init__(self, tweet_id, text, created_time, user_name, user_id, news_id, no self.children = set() - self.sentiment = None - self.stance = None - self.topic_vector = None + self.sentiment = sentiment + # self.stance = None + # self.topic_vector = None - self.original_object = None + # self.original_object = None self.parent_node = None self.node_type = node_type + self.botometer_score = botometer_score + def __eq__(self, other): return self.tweet_id == other.tweet_id