diff --git a/README.md b/README.md index 64255f0..a6196cb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,29 @@ -# FakeNewsPropagation -Fake News propagation study +# Fake News Propagation + +Code for paper "Hierarchical Propagation Networks for Fake News Detection: Investigation and Exploitation" ICWSM 2020 https://arxiv.org/abs/1903.09196 + +###Dataset + + +###To Run: + + +###References + +If you use this dataset, please cite the following papers: + +@article{shu2019hierarchical, + title={Hierarchical propagation networks for fake news detection: Investigation and exploitation}, + author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Liu, Huan}, + journal={arXiv preprint arXiv:1903.09196}, + year={2019} +} + +@article{shu2018fakenewsnet, + title={FakeNewsNet: A Data Repository with News Content, Social Context and Dynamic Information for Studying Fake News on Social Media}, + author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Lee, Dongwon and Liu, Huan}, + journal={arXiv preprint arXiv:1809.01286}, + year={2018} +} + +(C) 2019 Arizona Board of Regents on Behalf of ASU diff --git a/analysis_util.py b/analysis_util.py index 237f075..d896cbc 100644 --- a/analysis_util.py +++ b/analysis_util.py @@ -1,14 +1,14 @@ import errno import os +import pickle +from abc import ABCMeta, abstractmethod from pathlib import Path import numpy as np -import pickle +from sklearn.utils import resample -from stat_test import get_box_plots, perform_t_test, get_box_plots_mod -from util.util import twitter_datetime_str_to_object, tweet_node - -from abc import ABCMeta, abstractmethod +from stat_test import perform_t_test, get_box_plots_mod +from util.util import twitter_datetime_str_to_object class BaseFeatureHelper(metaclass=ABCMeta): @@ -52,13 +52,13 @@ def get_dump_file_name(self, news_source, micro_features, macro_features, label, return "{}/{}.pkl".format(file_dir, "_".join(file_tags)) def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache=False): function_refs = [] - file_name = self.get_dump_file_name(news_source,micro_features, macro_features, label, file_dir) + file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: @@ -134,6 +134,37 @@ def get_feature_significance_t_tests(self, fake_feature_array, real_feature_arra print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx])) perform_t_test(fake_feature_values, real_feature_values) + def get_feature_significance_bootstrap_tests(self, fake_feature_array, real_feature_array, micro_features=None, + macro_features=None): + + [feature_names, short_feature_names] = self.get_feature_names(micro_features, macro_features) + + for idx in range(len(feature_names)): + fake_feature_values = fake_feature_array[:, idx] + real_feature_values = real_feature_array[:, idx] + + perms_fake = [] + perms_real = [] + + combined = np.concatenate((fake_feature_values, real_feature_values), axis=0) + + print("combined shape : ", combined.shape) + + for i in range(10000): + np.random.seed(i) + perms_fake.append(resample(combined, n_samples=len(fake_feature_values))) + perms_real.append(resample(combined, n_samples=len(real_feature_values))) + + dif_bootstrap_means = (np.mean(perms_fake, axis=1) - np.mean(perms_real, 
axis=1)) + print("diff bootstrap means : ", dif_bootstrap_means.shape) + + obs_difs = (np.mean(fake_feature_values) - np.mean(real_feature_values)) + + p_value = dif_bootstrap_means[dif_bootstrap_means >= obs_difs].shape[0] / 10000 + + print("Feature {} : {}".format(short_feature_names[idx], feature_names[idx])) + print("t- value : {} p-value : {}".format(obs_difs, p_value)) + def get_sample_feature_value(news_graps: list, get_feature_fun_ref): result = [] diff --git a/baseline/LIWC.py b/baseline/LIWC.py index aa74c59..997161c 100644 --- a/baseline/LIWC.py +++ b/baseline/LIWC.py @@ -1,280 +1,280 @@ - -from random import shuffle - -import numpy as np -import pandas as pd -from sklearn import linear_model -from sklearn import preprocessing -from sklearn import svm -from sklearn import tree -from sklearn.base import clone -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import BaggingClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score, f1_score -from sklearn.metrics import precision_score, recall_score -from sklearn.model_selection import cross_val_score -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.neighbors import KNeighborsClassifier -from xgboost import XGBClassifier - - -def LIWC_Representation(data_type): - f_out = open('./' + data_type + '/LIWCFeats.txt', 'w+') - with open('LIWC2015_'+data_type+'_fake.txt') as f_fake: - for line in f_fake: - line = line.strip() - all_data = line.split('\t') - if all_data[0]=='Filename': - continue - ID = all_data[0] - feats = all_data[2:] - f_out.write(ID+'\t') - f_out.write('\t'.join(f for f in feats)) - f_out.write('\n') - - with open('LIWC2015_'+data_type+'_real.txt') as f_fake: - for line in f_fake: - line = line.strip() - all_data = line.split('\t') - if all_data[0]=='Filename': - continue - ID = all_data[0] - feats = all_data[2:] - f_out.write(ID + '\t') - f_out.write('\t'.join(f for f in feats)) - f_out.write('\n') - f_out.close() - print - -def LIWC_Prediction(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - # clf = SVC(kernel='linear', class_weight='balanced') - # clf = RandomForestClassifier() - clf = tree.DecisionTreeClassifier() - X = preprocessing.normalize(X) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('Accuracy '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('precision '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - 
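The `get_feature_significance_bootstrap_tests` helper added to `analysis_util.py` above pools the fake and real values of each feature, resamples both groups from that pool to build a null distribution of mean differences, and reports the one-sided p-value of the observed difference. A standalone sketch of the same test on synthetic data (the variable names here are illustrative, not from the repo):

```python
# Minimal sketch of the pooled-bootstrap significance test; data is synthetic.
import numpy as np
from sklearn.utils import resample

rng = np.random.RandomState(0)
fake_values = rng.normal(loc=1.2, scale=1.0, size=200)  # one feature column, fake news
real_values = rng.normal(loc=1.0, scale=1.0, size=200)  # same feature, real news

combined = np.concatenate([fake_values, real_values])
n_iterations = 10000

# Null distribution: draw both "groups" from the pooled values, so any
# difference in means is due to sampling alone.
boot_diffs = np.empty(n_iterations)
for i in range(n_iterations):
    boot_fake = resample(combined, n_samples=len(fake_values), random_state=i)
    boot_real = resample(combined, n_samples=len(real_values), random_state=i + n_iterations)
    boot_diffs[i] = boot_fake.mean() - boot_real.mean()

obs_diff = fake_values.mean() - real_values.mean()
# One-sided p-value: fraction of bootstrap differences at least as large as
# the observed difference, mirroring the comparison in the diff above.
p_value = np.mean(boot_diffs >= obs_diff)
print("observed difference: {:.3f}, p-value: {:.4f}".format(obs_diff, p_value))
```

With 10,000 resamples this mirrors the p-value printed by the helper; a two-sided variant would compare absolute differences instead.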
print('recall '+res) - res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') - res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) - print('f1 '+res) - print - -def LIWC_Prediction2(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - MultinomialNB(), - tree.DecisionTreeClassifier(random_state=21), - RandomForestClassifier(random_state=22), - XGBClassifier(), - AdaBoostClassifier(random_state=22), - svm.SVC(kernel='linear', class_weight='balanced'), - GradientBoostingClassifier(random_state=22), - BaggingClassifier(random_state=22), - KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - 'Naive Bayes', - 'Decision Tree', - 'Random Forest', - 'XGBoost', - 'AdaBoost', - 'SVM', - 'GradientBoosting', - 'Bagging Clf', - 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf,clf_name,df,cols,X,y) - print(df) - df.to_csv('./LIWC_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) - -def test(clf, clf_name, df, cols, X, y,train_ratio): - acc = [] - prec = [] - recall = [] - f1 = [] - for i in range(5): - X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) - clf.fit(X_train, y_train) - y_pred = clf.predict(X_test) - acc.append(accuracy_score(y_test, y_pred)) - prec.append(precision_score(y_test, y_pred)) - recall.append(recall_score(y_test, y_pred)) - f1.append(f1_score(y_test, y_pred)) - tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), - np.std(recall), np.average(f1), np.std(f1)]], columns=cols) - df = df.append(tmp) - return df - -def LIWC_Prediction2_curve(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/LIWCFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - # linear_model.LogisticRegression(random_state=22), - # MultinomialNB(), - # tree.DecisionTreeClassifier(random_state=21), - # RandomForestClassifier(random_state=22), - # 
XGBClassifier(), - AdaBoostClassifier(random_state=22), - # svm.SVC(kernel='linear', class_weight='balanced'), - # GradientBoostingClassifier(random_state=22), - # BaggingClassifier(random_state=22), - # KNeighborsClassifier() - ] - clf_names = [ - # 'Logistic Regression', - # 'Naive Bayes', - # 'Decision Tree', - # 'Random Forest', - # 'XGBoost', - 'AdaBoost', - # 'SVM', - # 'GradientBoosting', - # 'Bagging Clf', - # 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - tr = [0.2,0.4,0.6] - for t in tr: - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf, clf_name, df, cols, X, y,t) - with pd.option_context('expand_frame_repr', False): - print (df) - df.to_csv('./LIWC_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) - -if __name__ == '__main__': - data_type = 'BuzzFeed' - # LIWC_Representation(data_type) - # LIWC_Prediction2('BuzzFeed') - # LIWC_Prediction2('PolitiFact') - LIWC_Prediction2_curve('BuzzFeed') - LIWC_Prediction2_curve('PolitiFact') - print \ No newline at end of file +# +# from random import shuffle +# +# import numpy as np +# import pandas as pd +# from sklearn import linear_model +# from sklearn import preprocessing +# from sklearn import svm +# from sklearn import tree +# from sklearn.base import clone +# from sklearn.ensemble import AdaBoostClassifier +# from sklearn.ensemble import BaggingClassifier +# from sklearn.ensemble import GradientBoostingClassifier +# from sklearn.ensemble import RandomForestClassifier +# from sklearn.metrics import accuracy_score, f1_score +# from sklearn.metrics import precision_score, recall_score +# from sklearn.model_selection import cross_val_score +# from sklearn.model_selection import train_test_split +# from sklearn.naive_bayes import MultinomialNB +# from sklearn.neighbors import KNeighborsClassifier +# from xgboost import XGBClassifier +# +# +# def LIWC_Representation(data_type): +# f_out = open('./' + data_type + '/LIWCFeats.txt', 'w+') +# with open('LIWC2015_'+data_type+'_fake.txt') as f_fake: +# for line in f_fake: +# line = line.strip() +# all_data = line.split('\t') +# if all_data[0]=='Filename': +# continue +# ID = all_data[0] +# feats = all_data[2:] +# f_out.write(ID+'\t') +# f_out.write('\t'.join(f for f in feats)) +# f_out.write('\n') +# +# with open('LIWC2015_'+data_type+'_real.txt') as f_fake: +# for line in f_fake: +# line = line.strip() +# all_data = line.split('\t') +# if all_data[0]=='Filename': +# continue +# ID = all_data[0] +# feats = all_data[2:] +# f_out.write(ID + '\t') +# f_out.write('\t'.join(f for f in feats)) +# f_out.write('\n') +# f_out.close() +# print +# +# def LIWC_Prediction(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# 
shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# # clf = SVC(kernel='linear', class_weight='balanced') +# # clf = RandomForestClassifier() +# clf = tree.DecisionTreeClassifier() +# X = preprocessing.normalize(X) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('Accuracy '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('precision '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('recall '+res) +# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') +# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# print('f1 '+res) +# print +# +# def LIWC_Prediction2(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# MultinomialNB(), +# tree.DecisionTreeClassifier(random_state=21), +# RandomForestClassifier(random_state=22), +# XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# svm.SVC(kernel='linear', class_weight='balanced'), +# GradientBoostingClassifier(random_state=22), +# BaggingClassifier(random_state=22), +# KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# 'Naive Bayes', +# 'Decision Tree', +# 'Random Forest', +# 'XGBoost', +# 'AdaBoost', +# 'SVM', +# 'GradientBoosting', +# 'Bagging Clf', +# 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf,clf_name,df,cols,X,y) +# print(df) +# df.to_csv('./LIWC_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) +# +# def test(clf, clf_name, df, cols, X, y,train_ratio): +# acc = [] +# prec = [] +# recall = [] +# f1 = [] +# for i in range(5): +# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) +# clf.fit(X_train, y_train) +# y_pred = clf.predict(X_test) +# acc.append(accuracy_score(y_test, y_pred)) +# prec.append(precision_score(y_test, y_pred)) +# recall.append(recall_score(y_test, y_pred)) +# f1.append(f1_score(y_test, y_pred)) +# tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), +# np.std(recall), np.average(f1), np.std(f1)]], columns=cols) +# df = df.append(tmp) +# return df +# +# def LIWC_Prediction2_curve(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# 
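One detail worth noting in the baseline code being commented out here: `arry = range(X.shape[0]); shuffle(arry)` only works under Python 2, because `range` no longer returns a list. An equivalent balance-and-shuffle step written for Python 3 (the arrays below are made up):

```python
# Downsample the real class to the size of the fake class, then shuffle
# features and labels together with a single permutation index.
import numpy as np

X_fake = np.random.rand(120, 5)
X_real = np.random.rand(300, 5)

num = len(X_fake)                                   # balance the two classes
X = np.concatenate([X_real[:num], X_fake], axis=0)
y = np.concatenate([np.zeros(num), np.ones(num)])   # real = 0, fake = 1

perm = np.random.permutation(X.shape[0])            # shuffle rows and labels together
X, y = X[perm], y[perm]
```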
y=[] +# with open('./'+data_type+'/LIWCFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# # linear_model.LogisticRegression(random_state=22), +# # MultinomialNB(), +# # tree.DecisionTreeClassifier(random_state=21), +# # RandomForestClassifier(random_state=22), +# # XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# # svm.SVC(kernel='linear', class_weight='balanced'), +# # GradientBoostingClassifier(random_state=22), +# # BaggingClassifier(random_state=22), +# # KNeighborsClassifier() +# ] +# clf_names = [ +# # 'Logistic Regression', +# # 'Naive Bayes', +# # 'Decision Tree', +# # 'Random Forest', +# # 'XGBoost', +# 'AdaBoost', +# # 'SVM', +# # 'GradientBoosting', +# # 'Bagging Clf', +# # 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# tr = [0.2,0.4,0.6] +# for t in tr: +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf, clf_name, df, cols, X, y,t) +# with pd.option_context('expand_frame_repr', False): +# print (df) +# df.to_csv('./LIWC_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) +# +# if __name__ == '__main__': +# data_type = 'BuzzFeed' +# # LIWC_Representation(data_type) +# # LIWC_Prediction2('BuzzFeed') +# # LIWC_Prediction2('PolitiFact') +# LIWC_Prediction2_curve('BuzzFeed') +# LIWC_Prediction2_curve('PolitiFact') +# print \ No newline at end of file diff --git a/baseline/RST-VSM.py b/baseline/RST-VSM.py index c85bdf4..5a87e93 100644 --- a/baseline/RST-VSM.py +++ b/baseline/RST-VSM.py @@ -1,250 +1,250 @@ -# This is an implementation of Rhetorical Structure Theory for Vector Space Model -# The basic idea is from the paper: Identification of Truth and Deception in Text: Application of Vector Space Model to Rhetorical Structure Theory -from os import listdir -from os.path import isfile, join -from random import shuffle - -import numpy as np -import pandas as pd -from sklearn import linear_model -from sklearn import preprocessing -from sklearn import svm -from sklearn import tree -from sklearn.base import clone -from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import BaggingClassifier -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import accuracy_score, f1_score -from sklearn.metrics import precision_score, recall_score -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.neighbors import KNeighborsClassifier -from xgboost import XGBClassifier - - -def RSTRepresentation(data_type, out_file): - # dir_path = './'+data_type+'/' - dir_path = data_type - - f_out = open(out_file,'w+') - all_relations = set() - org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] 
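The RST baseline being commented out here (and re-implemented below as `dump_both_ordered_rst_representation` in `baseline_feature_extraction.py`) reduces each parsed document to counts of its RST relation labels over a vocabulary shared across all documents. A minimal sketch of that representation on made-up inputs:

```python
# Per-document RST relation counts, vectorized over the union of relations.
from collections import Counter

# Made-up relation sequences standing in for the parsed RST output files.
parsed_docs = {
    "politifact_fake_001": ["Elaboration", "Attribution", "Elaboration", "Contrast"],
    "politifact_real_042": ["Attribution", "Background"],
}

# Shared relation vocabulary: every relation seen in any document.
all_relations = sorted({rel for rels in parsed_docs.values() for rel in rels})

# One fixed-length count vector per document, written tab-separated just like
# the feature files these scripts produce.
for doc_id, relations in parsed_docs.items():
    counts = Counter(relations)
    vector = [counts.get(rel, 0) for rel in all_relations]
    print(doc_id + "\t" + "\t".join(str(v) for v in vector))
```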
- News_RSTFeats = dict() - for of in org_files: - ID = of[:of.index('.txt')] - file_name = dir_path+'/'+of - relation_num = dict() - with open(file_name) as f_rst: - for line in f_rst: - line = line.replace('\'','') - line = line.replace(' ','') - tmp_relation = line.split(',')[3] - relation = tmp_relation[:-2] - all_relations.add(relation) - if relation in relation_num: - num = relation_num[relation] - num+=1 - relation_num[relation] = num - else: - relation_num[relation]=1 - News_RSTFeats[ID] = relation_num - - all_relations = list(all_relations) - print(all_relations) - for news, rn in News_RSTFeats.items(): - f_out.write(news+'\t') - feats = [] - for al in all_relations: - if al in rn: - num = rn[al] - else: - num=0 - feats.append(num) - f_out.write('\t'.join(str(x) for x in feats)) - f_out.write('\n') - f_out.close() - - -def RSTPrediction2(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/RSTFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - MultinomialNB(), - tree.DecisionTreeClassifier(random_state=21), - RandomForestClassifier(random_state=22), - XGBClassifier(), - AdaBoostClassifier(random_state=22), - svm.SVC(kernel='linear', class_weight='balanced'), - GradientBoostingClassifier(random_state=22), - BaggingClassifier(random_state=22), - KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - 'Naive Bayes', - 'Decision Tree', - 'Random Forest', - 'XGBoost', - 'AdaBoost', - 'SVM', - 'GradientBoosting', - 'Bagging Clf', - 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf,clf_name,df,cols,X,y,0.8) - - print(df) - df.to_csv('./RST_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) - - -def test(clf, clf_name, df, cols, X, y,train_ratio): - acc = [] - prec = [] - recall = [] - f1 = [] - for i in range(5): - X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) - clf.fit(X_train, y_train) - y_pred = clf.predict(X_test) - acc.append(accuracy_score(y_test, y_pred)) - prec.append(precision_score(y_test, y_pred)) - recall.append(recall_score(y_test, y_pred)) - f1.append(f1_score(y_test, y_pred)) - tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), - np.std(recall), np.average(f1), np.std(f1)]], columns=cols) - df = df.append(tmp) - return df - -def RSTPrediction2_curve(data_type): - X_real = [] - y_real = [] - X_fake = [] - y_fake = [] - X=[] - y=[] - with open('./'+data_type+'/RSTFeats.txt') as f_rst: - for line in f_rst: - line = line.strip() - line_str = line.split('\t') - ID = line_str[0] - feats = [float(x) for x in line_str[1:]] - if 
'Real' in ID: - X_real.append(feats) - y_real.append(0) - else: - X_fake.append(feats) - y_fake.append(1) - ## Balance fake and true news - num = len(y_fake) - X_real = X_real[:num] - y_real = y_real[:num] - for i in range(num): - X.append(X_real[i]) - X.append(X_fake[i]) - y.append(y_real[i]) - y.append(y_fake[i]) - - X = np.array(X) - y = np.array(y) - # # shuffle the rows - arry = range(X.shape[0]) - shuffle(arry) - X = X[arry, :] - y = y[arry] - clfs = [ - linear_model.LogisticRegression(random_state=22), - # MultinomialNB(), - # tree.DecisionTreeClassifier(random_state=21), - # RandomForestClassifier(random_state=22), - # XGBClassifier(), - AdaBoostClassifier(random_state=22), - # svm.SVC(kernel='linear', class_weight='balanced'), - # GradientBoostingClassifier(random_state=22), - # BaggingClassifier(random_state=22), - # KNeighborsClassifier() - ] - clf_names = [ - 'Logistic Regression', - # 'Naive Bayes', - # 'Decision Tree', - # 'Random Forest', - # 'XGBoost', - 'AdaBoost', - # 'SVM', - # 'GradientBoosting', - # 'Bagging Clf', - # 'KNeighbors Clf' - ] - - X = preprocessing.normalize(X) - cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] - - df = pd.DataFrame(columns=cols) - df = df.set_index('alg') - tr = [0.2,0.4,0.6] - for t in tr: - for i in range(len(clfs)): - clf = clone(clfs[i]) - clf_name = clf_names[i] - df = test(clf, clf_name, df, cols, X, y,t) - with pd.option_context('expand_frame_repr', False): - print (df) - df.to_csv('./RST_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) - -if __name__ == '__main__': - data_type = 'PolitiFact' - - RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_fake", - "data/baseline_features/rst/raw_parsed_data/politifact_fake_rst_features.txt") - RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_real", - "data/baseline_features/rst/raw_parsed_data/politifact_real_rst_features.txt") - - # RSTRepresentation(data_type) - # RSTPrediction2('BuzzFeed') - # RSTPrediction2('PolitiFact') - # RSTPrediction2_curve('BuzzFeed') - # RSTPrediction2_curve('PolitiFact') \ No newline at end of file +# # This is an implementation of Rhetorical Structure Theory for Vector Space Model +# # The basic idea is from the paper: Identification of Truth and Deception in Text: Application of Vector Space Model to Rhetorical Structure Theory +# from os import listdir +# from os.path import isfile, join +# from random import shuffle +# +# import numpy as np +# import pandas as pd +# from sklearn import linear_model +# from sklearn import preprocessing +# from sklearn import svm +# from sklearn import tree +# from sklearn.base import clone +# from sklearn.ensemble import AdaBoostClassifier +# from sklearn.ensemble import BaggingClassifier +# from sklearn.ensemble import GradientBoostingClassifier +# from sklearn.ensemble import RandomForestClassifier +# from sklearn.metrics import accuracy_score, f1_score +# from sklearn.metrics import precision_score, recall_score +# from sklearn.model_selection import train_test_split +# from sklearn.naive_bayes import MultinomialNB +# from sklearn.neighbors import KNeighborsClassifier +# from xgboost import XGBClassifier +# +# +# def RSTRepresentation(data_type, out_file): +# # dir_path = './'+data_type+'/' +# dir_path = data_type +# +# f_out = open(out_file,'w+') +# all_relations = set() +# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# News_RSTFeats = dict() +# for of in org_files: +# ID = 
of[:of.index('.txt')] +# file_name = dir_path+'/'+of +# relation_num = dict() +# with open(file_name) as f_rst: +# for line in f_rst: +# line = line.replace('\'','') +# line = line.replace(' ','') +# tmp_relation = line.split(',')[3] +# relation = tmp_relation[:-2] +# all_relations.add(relation) +# if relation in relation_num: +# num = relation_num[relation] +# num+=1 +# relation_num[relation] = num +# else: +# relation_num[relation]=1 +# News_RSTFeats[ID] = relation_num +# +# all_relations = list(all_relations) +# print(all_relations) +# for news, rn in News_RSTFeats.items(): +# f_out.write(news+'\t') +# feats = [] +# for al in all_relations: +# if al in rn: +# num = rn[al] +# else: +# num=0 +# feats.append(num) +# f_out.write('\t'.join(str(x) for x in feats)) +# f_out.write('\n') +# f_out.close() +# +# +# def RSTPrediction2(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/RSTFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# MultinomialNB(), +# tree.DecisionTreeClassifier(random_state=21), +# RandomForestClassifier(random_state=22), +# XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# svm.SVC(kernel='linear', class_weight='balanced'), +# GradientBoostingClassifier(random_state=22), +# BaggingClassifier(random_state=22), +# KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# 'Naive Bayes', +# 'Decision Tree', +# 'Random Forest', +# 'XGBoost', +# 'AdaBoost', +# 'SVM', +# 'GradientBoosting', +# 'Bagging Clf', +# 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf,clf_name,df,cols,X,y,0.8) +# +# print(df) +# df.to_csv('./RST_'+data_type+'_results.csv', header=True,sep='\t',columns=cols) +# +# +# def test(clf, clf_name, df, cols, X, y,train_ratio): +# acc = [] +# prec = [] +# recall = [] +# f1 = [] +# for i in range(5): +# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio) +# clf.fit(X_train, y_train) +# y_pred = clf.predict(X_test) +# acc.append(accuracy_score(y_test, y_pred)) +# prec.append(precision_score(y_test, y_pred)) +# recall.append(recall_score(y_test, y_pred)) +# f1.append(f1_score(y_test, y_pred)) +# tmp = pd.DataFrame([[clf_name, np.average(acc), np.std(acc), np.average(prec), np.std(prec), np.average(recall), +# np.std(recall), np.average(f1), np.std(f1)]], columns=cols) +# df = df.append(tmp) +# return df +# +# def RSTPrediction2_curve(data_type): +# X_real = [] +# y_real = [] +# X_fake = [] +# y_fake = [] +# X=[] +# y=[] +# with open('./'+data_type+'/RSTFeats.txt') as f_rst: +# for line in f_rst: +# line = line.strip() +# line_str = 
line.split('\t') +# ID = line_str[0] +# feats = [float(x) for x in line_str[1:]] +# if 'Real' in ID: +# X_real.append(feats) +# y_real.append(0) +# else: +# X_fake.append(feats) +# y_fake.append(1) +# ## Balance fake and true news +# num = len(y_fake) +# X_real = X_real[:num] +# y_real = y_real[:num] +# for i in range(num): +# X.append(X_real[i]) +# X.append(X_fake[i]) +# y.append(y_real[i]) +# y.append(y_fake[i]) +# +# X = np.array(X) +# y = np.array(y) +# # # shuffle the rows +# arry = range(X.shape[0]) +# shuffle(arry) +# X = X[arry, :] +# y = y[arry] +# clfs = [ +# linear_model.LogisticRegression(random_state=22), +# # MultinomialNB(), +# # tree.DecisionTreeClassifier(random_state=21), +# # RandomForestClassifier(random_state=22), +# # XGBClassifier(), +# AdaBoostClassifier(random_state=22), +# # svm.SVC(kernel='linear', class_weight='balanced'), +# # GradientBoostingClassifier(random_state=22), +# # BaggingClassifier(random_state=22), +# # KNeighborsClassifier() +# ] +# clf_names = [ +# 'Logistic Regression', +# # 'Naive Bayes', +# # 'Decision Tree', +# # 'Random Forest', +# # 'XGBoost', +# 'AdaBoost', +# # 'SVM', +# # 'GradientBoosting', +# # 'Bagging Clf', +# # 'KNeighbors Clf' +# ] +# +# X = preprocessing.normalize(X) +# cols = ['alg', 'avg_acc', 'std_acc','avg_prec','std_prec','avg_rec','std_rec', 'avg_f1','std_f1'] +# +# df = pd.DataFrame(columns=cols) +# df = df.set_index('alg') +# tr = [0.2,0.4,0.6] +# for t in tr: +# for i in range(len(clfs)): +# clf = clone(clfs[i]) +# clf_name = clf_names[i] +# df = test(clf, clf_name, df, cols, X, y,t) +# with pd.option_context('expand_frame_repr', False): +# print (df) +# df.to_csv('./RST_'+data_type+'_results_curve.csv', header=True,sep='\t',columns=cols) +# +# if __name__ == '__main__': +# data_type = 'PolitiFact' +# +# RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_fake", +# "data/baseline_features/rst/raw_parsed_data/politifact_fake_rst_features.txt") +# RSTRepresentation("data/baseline_features/rst/raw_parsed_data/politifact_real", +# "data/baseline_features/rst/raw_parsed_data/politifact_real_rst_features.txt") +# +# # RSTRepresentation(data_type) +# # RSTPrediction2('BuzzFeed') +# # RSTPrediction2('PolitiFact') +# # RSTPrediction2_curve('BuzzFeed') +# # RSTPrediction2_curve('PolitiFact') \ No newline at end of file diff --git a/baseline_basic_model.py b/baseline_basic_model.py index 88c593a..72f5455 100644 --- a/baseline_basic_model.py +++ b/baseline_basic_model.py @@ -1,9 +1,10 @@ -import numpy as np import pickle +import numpy as np + from analysis_util import get_propagation_graphs, equal_samples -from basic_model import get_basic_model_results, dump_random_forest_feature_importance -from construct_sample_features import get_train_test_split +from basic_model import get_basic_model_results +from construct_sample_features import get_train_test_split, get_TPNF_dataset, get_dataset_feature_names from structure_temp_analysis import ScienceCascadeFeatureHelper @@ -21,12 +22,14 @@ def get_science_dataset_array(news_source): macro_features=include_macro, news_source=news_source, label="real") - return np.concatenate([fake_features, real_features]) + sample_features = np.concatenate([fake_features, real_features]) + pickle.dump(sample_features, open("data/stfn/{}_stfn_features.pkl".format(news_source), "wb")) + return sample_features def get_castillo_features(news_source, castillo_feature_folder="data/castillo/saved_features"): - features = 
pickle.load(open("{}/{}_castillo_features.pkl".format(castillo_feature_folder, news_source), "rb")) + features = pickle.load(open("{}/{}_castillo_features.pkl".format(castillo_feature_folder, news_source), "rb")) features = np.nan_to_num(features) return features @@ -35,14 +38,15 @@ def get_tpnf_features(news_source, feature_folder="data/train_test_data"): return pickle.load(open("{}/{}_micro_macro_struct_temp_linguistic.pkl".format(feature_folder, news_source), "rb")) -def get_liwc_features(news_source, feature_folder = "data/baseline_features/liwc_features"): +def get_liwc_features(news_source, feature_folder="data/baseline_features/liwc_features"): fake_features = np.loadtxt("{}/{}_fake_liwc.csv".format(feature_folder, news_source), delimiter=',') real_features = np.loadtxt("{}/{}_real_liwc.csv".format(feature_folder, news_source), delimiter=',') feature_array = np.concatenate([fake_features, real_features]) return feature_array -def get_rst_features(news_source, rst_feature_folder = "data/baseline_features/rst/raw_parsed_data"): + +def get_rst_features(news_source, rst_feature_folder="data/baseline_features/rst_both/raw_parsed_data"): fake_features = np.loadtxt("{}/{}_fake_rst_features.csv".format(rst_feature_folder, news_source), delimiter='\t') real_features = np.loadtxt("{}/{}_real_rst_features.csv".format(rst_feature_folder, news_source), delimiter='\t') feature_array = np.concatenate([fake_features, real_features]) @@ -50,7 +54,7 @@ def get_rst_features(news_source, rst_feature_folder = "data/baseline_features/r return feature_array -def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn = False): +def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn=False): feature_arrays = [] if tpnf: @@ -76,8 +80,8 @@ def get_sample_feature_array(news_source, tpnf=False, castillo=False, liwc=False return all_feature_array -def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn = False): - sample_feature_array = get_sample_feature_array(news_source, tpnf, castillo, liwc, rst , stfn) +def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, liwc=False, rst=False, stfn=False): + sample_feature_array = get_sample_feature_array(news_source, tpnf, castillo, liwc, rst, stfn) print("Sample feature array dimensions") print(sample_feature_array.shape, flush=True) @@ -89,12 +93,79 @@ def get_baselines_classificaton_result(news_source, tpnf=False, castillo=False, get_basic_model_results(X_train, X_test, y_train, y_test) +def get_baseline_modification_classificaton_result(news_source, data_dir = "data/train_test_data"): + include_micro = True + include_macro = True + + include_structural = True + include_temporal = True + include_linguistic = True + + science_features = get_science_dataset_array(news_source) + science_features = science_features[:, [3,4]] + print("stfn features :", science_features.shape) + + sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, + include_temporal, include_linguistic, use_cache=True) + + sample_feature_array = sample_feature_array[:, :-1] + feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, + include_temporal, include_linguistic) + + print("tpnf features :", sample_feature_array.shape) + + sample_feature_array = np.concatenate([sample_feature_array, science_features], axis=1) + + print("overall 
features dim : ", sample_feature_array.shape) + + print("Sample feature array dimensions") + print(sample_feature_array.shape, flush=True) + + num_samples = int(len(sample_feature_array) / 2) + target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) + + X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) + get_basic_model_results(X_train, X_test, y_train, y_test) + + +def get_domain_adaptation_classification_results(source_news_source, target_news_source, tpnf=False, castillo=False, + liwc=False, rst=False, stfn=False): + train_sample_feature_array = get_sample_feature_array(source_news_source, tpnf, castillo, liwc, rst, stfn) + test_sample_feature_array = get_sample_feature_array(target_news_source, tpnf, castillo, liwc, rst, stfn) + + print("Source Domain : {}".format(source_news_source)) + print("Target Domain : {}".format(target_news_source)) + + print("source : ", train_sample_feature_array.shape) + print("target : ", test_sample_feature_array.shape) + + train_num_samples = int(len(train_sample_feature_array) / 2) + test_num_samples = int(len(test_sample_feature_array) / 2) + + train_target_labels = np.concatenate([np.ones(train_num_samples), np.zeros(train_num_samples)], axis=0) + test_target_labels = np.concatenate([np.ones(test_num_samples), np.zeros(test_num_samples)], axis=0) + + S_X_train, S_X_test, S_y_train, S_y_test = get_train_test_split(train_sample_feature_array, train_target_labels) + T_X_train, T_X_test, T_y_train, T_y_test = get_train_test_split(test_sample_feature_array, test_target_labels) + + # get_basic_model_results(train_sample_feature_array, test_sample_feature_array, train_target_labels, + # test_target_labels) + + get_basic_model_results(S_X_train, T_X_test, S_y_train, T_y_test) + + if __name__ == "__main__": - get_baselines_classificaton_result("gossipcop", tpnf=True, castillo=False, liwc=False, rst=False, stfn=True) + get_baselines_classificaton_result("gossipcop", tpnf=True, castillo=False, liwc=False, rst=False, stfn=False) + + # get_baselines_classificaton_result("gossipcop", tpnf=False, castillo=False, liwc=False, rst=False, stfn=True) + + # get_baseline_modification_classificaton_result("gossipcop") + + # get_domain_adaptation_classification_results("gossipcop", "politifact", tpnf=True, stfn=False, liwc=False, rst=False) # feature_array = get_castillo_features("politifact") # num_samples = int(feature_array.shape[0]/2) # np.savetxt("fake_castillo_features.csv", feature_array[:num_samples], delimiter=",") - # np.savetxt("real_castillo_features.csv", feature_array[num_samples+1:], delimiter=",") + # np.savetxt("real_castillo_features.csv", feature _array[num_samples+1:], delimiter=",") # - # dump_random_forest_feature_importance(feature_array) \ No newline at end of file + # dump_random_forest_feature_importance(feature_array) diff --git a/baseline_feature_extraction.py b/baseline_feature_extraction.py index 36a1412..9939753 100644 --- a/baseline_feature_extraction.py +++ b/baseline_feature_extraction.py @@ -246,6 +246,121 @@ def dump_ordered_rst_representation(rst_folder, news_source, fake_out_file, real f_out.close() +def dump_both_ordered_rst_representation(rst_folder1, rst_folder2, news_source1, news_source2,fake_out_file1, + fake_out_file2, real_out_file1, real_out_file2): + dir_path = rst_folder1 + + all_relations = set() + org_files = [] + + org_files.extend([join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f))]) + + dir_path = rst_folder2 + + 
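The new `get_domain_adaptation_classification_results` trains on one news source's features and evaluates on the other's test split. A minimal sketch of that cross-domain protocol, with random placeholder features and a single `RandomForestClassifier` standing in for the classifier set used by `get_basic_model_results`:

```python
# Cross-domain evaluation sketch: fit on the source domain, test on the target.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)

def make_domain(n_samples, n_features, domain_shift=0.0):
    # First half of the rows plays the "fake" class (label 1), the second half
    # "real" (0), mirroring how the repo stacks fake features above real ones.
    X = rng.normal(loc=domain_shift, size=(n_samples, n_features))
    X[: n_samples // 2] += 0.5  # give the fake class a detectable signal
    y = np.concatenate([np.ones(n_samples // 2), np.zeros(n_samples // 2)])
    return X, y

X_source, y_source = make_domain(400, 20)                     # e.g. gossipcop features
X_target, y_target = make_domain(200, 20, domain_shift=0.3)   # e.g. politifact features

# Train only on the source split, test only on the target split,
# as in get_basic_model_results(S_X_train, T_X_test, S_y_train, T_y_test).
S_X_train, _, S_y_train, _ = train_test_split(X_source, y_source, test_size=0.2, random_state=0)
_, T_X_test, _, T_y_test = train_test_split(X_target, y_target, test_size=0.2, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(S_X_train, S_y_train)
y_pred = clf.predict(T_X_test)
print("cross-domain accuracy: {:.3f}".format(accuracy_score(T_y_test, y_pred)))
print("cross-domain F1      : {:.3f}".format(f1_score(T_y_test, y_pred)))
```

Because the classifier never sees target-domain training data, any drop relative to the within-domain numbers indicates how well the propagation features transfer across news sources.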
org_files.extend([join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f))]) + + News_RSTFeats = dict() + for file_name in org_files: + ID = file_name[file_name.rindex("/")+1:file_name.index('.txt')] + # file_name = dir_path + '/' + of + relation_num = dict() + with open(file_name) as f_rst: + for line in f_rst: + line = line.replace('\'', '') + line = line.replace(' ', '') + tmp_relation = line.split(',')[3] + relation = tmp_relation[:-2] + all_relations.add(relation) + if relation in relation_num: + num = relation_num[relation] + num += 1 + relation_num[relation] = num + else: + relation_num[relation] = 1 + News_RSTFeats[ID] = relation_num + + + + all_relations = list(all_relations) + print(all_relations) + + fake_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source1, "fake"), "rb")) + + f_out = open(fake_out_file1, 'w+') + for news_id in fake_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + fake_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source2, "fake"), "rb")) + + f_out = open(fake_out_file2, 'w+') + for news_id in fake_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + real_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source1, "real"), "rb")) + + f_out = open(real_out_file1, 'w+') + for news_id in real_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + real_ordered_sample_ids = pickle.load( + open("data/baseline_data/{}_{}_sample_news_ordered_ids.pkl".format(news_source2, "real"), "rb")) + + f_out = open(real_out_file2, 'w+') + for news_id in real_ordered_sample_ids: + # for news, rn in News_RSTFeats.items(): + # f_out.write(news + '\t') + rn = News_RSTFeats[news_id] + feats = [] + for al in all_relations: + if al in rn: + num = rn[al] + else: + num = 0 + feats.append(num) + f_out.write('\t'.join(str(x) for x in feats)) + f_out.write('\n') + f_out.close() + + if __name__ == "__main__": # get_news_ids_used_for_propagation_network("politifact") @@ -254,10 +369,18 @@ def dump_ordered_rst_representation(rst_folder, news_source, fake_out_file, real news_source = "gossipcop" - dump_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/gossipcop",news_source, - "data/baseline_features/rst/raw_parsed_data/gossipcop_fake_rst_features.csv", - "data/baseline_features/rst/raw_parsed_data/gossipcop_real_rst_features.csv" - ) + dump_both_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/politifact", + "data/baseline_features/rst/raw_parsed_data/gossipcop", + "politifact", "gossipcop", + "data/baseline_features/rst_both/raw_parsed_data/politifact_fake_rst_features.csv", 
+ "data/baseline_features/rst_both/raw_parsed_data/gossipcop_fake_rst_features.csv", + "data/baseline_features/rst_both/raw_parsed_adata/politifact_real_rst_features.csv", + "data/baseline_features/rst_both/raw_parsed_data/gossipcop_real_rst_features.csv") + + # dump_ordered_rst_representation("data/baseline_features/rst/raw_parsed_data/gossipcop", news_source, + # "data/baseline_features/rst/raw_parsed_data/gossipcop_fake_rst_features.csv", + # "data/baseline_features/rst/raw_parsed_data/gossipcop_real_rst_features.csv" + # ) # dump_LIWC_Representation("data/baseline_features/liwc_features/LIWC2015_{}_fake_text_contents_ordered_new.txt".format(news_source), # "data/baseline_features/liwc_features/{}_fake_liwc.csv".format(news_source)) diff --git a/basic_model.py b/basic_model.py index a99ba19..f8e374c 100644 --- a/basic_model.py +++ b/basic_model.py @@ -1,15 +1,16 @@ +import matplotlib import numpy as np - -from sklearn import preprocessing, svm, clone +from sklearn import preprocessing, svm from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier -from construct_sample_features import get_TPNF_dataset, get_train_test_split, get_dataset_feature_names - -import matplotlib +from analysis_util import get_propagation_graphs, equal_samples +from construct_sample_features import get_TPNF_dataset, get_train_test_split, get_dataset_feature_names, \ + filter_propagation_graphs, get_nx_propagation_graphs +from structure_temp_analysis import ScienceCascadeFeatureHelper matplotlib.use('agg') import matplotlib.pyplot as plt @@ -23,7 +24,7 @@ def get_classifier_by_name(classifier_name): elif classifier_name == "DecisionTreeClassifier": return DecisionTreeClassifier() elif classifier_name == "RandomForestClassifier": - return RandomForestClassifier(n_estimators=100) + return RandomForestClassifier(n_estimators=50) elif classifier_name == "SVM -linear kernel": return svm.SVC(kernel='linear') @@ -89,8 +90,9 @@ def get_basic_model_results(X_train, X_test, y_train, y_test): X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) - classifiers = [GaussianNB(), LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), - svm.SVC(kernel='linear')] + classifiers = [GaussianNB(), LogisticRegression(), DecisionTreeClassifier(), + RandomForestClassifier(n_estimators=100), + svm.SVC()] classifier_names = ["GaussianNB", "LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier", "SVM -linear kernel"] @@ -99,16 +101,59 @@ def get_basic_model_results(X_train, X_test, y_train, y_test): train_model(classifier_names[idx], X_train, X_test, y_train, y_test) -def get_classificaton_results_tpnf(data_dir, news_source): - include_micro = True +def get_classificaton_results_tpnf(data_dir, news_source, time_interval, use_cache=False): + include_micro = False include_macro = True include_structural = True - include_temporal = False - include_linguistic = False + include_temporal = True + include_linguistic = True sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, time_interval, use_cache=use_cache) + + print("Sample feature array dimensions") + print(sample_feature_array.shape, flush=True) + + num_samples = 
int(len(sample_feature_array) / 2) + target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) + + X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) + get_basic_model_results(X_train, X_test, y_train, y_test) + + +def get_science_dataset_array_time_based(news_source, time_interval=None): + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/nx_network_data", news_source) + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + feature_helper = ScienceCascadeFeatureHelper() + include_micro = False + include_macro = True + + if time_interval is not None: + time_limit = time_interval * 60 * 60 + + print("Time limit in seconds : {}".format(time_limit)) + + fake_prop_graph = filter_propagation_graphs(fake_prop_graph, time_limit, reply=False) + real_prop_graph = filter_propagation_graphs(real_prop_graph, time_limit, reply=False) + + print("After time based filtering ") + print("No. of fake samples : {} No. of real samples: {}".format(len(fake_prop_graph), len(real_prop_graph))) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_features = feature_helper.get_features_array(fake_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="fake", use_cache=False) + real_features = feature_helper.get_features_array(real_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="real", use_cache=False) + + return np.concatenate([fake_features, real_features]) + + +def get_classificaton_results_stnf(news_source, time_interval=None): + sample_feature_array = get_science_dataset_array_time_based(news_source, time_interval) print("Sample feature array dimensions") print(sample_feature_array.shape, flush=True) @@ -139,9 +184,9 @@ def dump_random_forest_feature_importance(data_dir, news_source): include_linguistic = True sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, use_cache=True) - sample_feature_array = sample_feature_array[:,:-1] + sample_feature_array = sample_feature_array[:, :-1] feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, include_temporal, include_linguistic) @@ -173,103 +218,60 @@ def dump_random_forest_feature_importance(data_dir, news_source): for f in range(X_train.shape[1]): print("%d. 
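`get_classificaton_results_tpnf` and `get_classificaton_results_stnf` both end in the same evaluation loop: standardize the feature matrix, fit each baseline classifier listed in `get_basic_model_results`, and report the accuracy/precision/recall/F1 metrics imported at the top of the file. A self-contained sketch of that loop on synthetic features:

```python
# Scale the features, then benchmark the baseline classifiers used in the repo.
import numpy as np
from sklearn import preprocessing, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(600, 25))
y = np.concatenate([np.ones(300), np.zeros(300)])
X[y == 1] += 0.4  # weak class signal so the demo is non-trivial

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

classifiers = {
    "GaussianNB": GaussianNB(),
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "SVM": svm.SVC(),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("{:<25} acc={:.3f} prec={:.3f} rec={:.3f} f1={:.3f}".format(
        name,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred)))
```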
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) + matplotlib.rcParams['figure.figsize'] = 5, 2 + # Plot the feature importances of the forest plt.figure() - plt.title("Feature importances - PolitiFact dataset") - - + # plt.title("Feature importances - PolitiFact dataset") plt.bar(range(X_train.shape[1]), importances[indices], color="b", yerr=std[indices], align="center") - plt.xticks(range(X_train.shape[1]), np.array(short_feature_names)[indices], rotation=60, fontsize=9) + plt.xticks(range(X_train.shape[1]), np.array(short_feature_names)[indices], rotation=75, fontsize=9.5) plt.xlim([-1, X_train.shape[1]]) plt.savefig('{}_feature_importance.png'.format(news_source), bbox_inches='tight') plt.show() -# def dump_random_forest_feature_importance(sample_feature_array): -# include_micro = True -# include_macro = True -# -# include_structural = True -# include_temporal = True -# include_linguistic = True -# -# feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# num_samples = int(len(sample_feature_array) / 2) -# target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) -# -# X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) -# -# # scaler = preprocessing.StandardScaler().fit(X_train) -# # -# # X_train = scaler.transform(X_train) -# # X_test = scaler.transform(X_test) -# -# # Build a forest and compute the feature importances -# -# forest = ExtraTreesClassifier(n_estimators=100, -# random_state=0) -# -# forest.fit(X_train, y_train) -# importances = forest.feature_importances_ -# std = np.std([tree.feature_importances_ for tree in forest.estimators_], -# axis=0) -# indices = np.argsort(importances)[::-1] -# -# # Print the feature ranking -# print("Feature ranking:") -# -# for f in range(X_train.shape[1]): -# print("%d. 
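The reworked `dump_random_forest_feature_importance` fits an `ExtraTreesClassifier`, averages the per-tree importances, and plots them with the across-tree standard deviation as error bars. A compact sketch using synthetic features and generic feature names:

```python
# Feature-importance bar plot in the style of dump_random_forest_feature_importance.
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 10))
y = (X[:, 0] + 0.5 * X[:, 3] + rng.normal(scale=0.5, size=500) > 0).astype(int)
feature_names = ["f{}".format(i) for i in range(X.shape[1])]

forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first

plt.figure(figsize=(5, 2))
plt.bar(range(X.shape[1]), importances[indices], color="b", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), np.array(feature_names)[indices], rotation=75, fontsize=9.5)
plt.xlim([-1, X.shape[1]])
plt.savefig("demo_feature_importance.png", bbox_inches="tight")
```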
feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) -# -# # Plot the feature importances of the forest -# plt.figure() -# plt.title("Feature importances - Politifact dataset") -# plt.bar(range(X_train.shape[1]), importances[indices], -# color="b", yerr=std[indices], align="center") -# plt.xticks(range(X_train.shape[1]), indices, rotation=60) -# plt.xlim([-1, X_train.shape[1]]) -# plt.savefig('feature_importance.png', bbox_inches='tight') -# -# plt.show() - - -# def dump_feature_importance(data_dir, news_source): -# include_micro = True -# include_macro = True -# -# include_structural = True -# include_temporal = True -# include_linguistic = True -# -# sample_feature_array = get_TPNF_dataset(data_dir, news_source, include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# feature_names, short_feature_names = get_dataset_feature_names(include_micro, include_macro, include_structural, -# include_temporal, include_linguistic) -# -# num_samples = int(len(sample_feature_array) / 2) -# target_labels = np.concatenate([np.ones(num_samples), np.zeros(num_samples)], axis=0) -# -# X_train, X_test, y_train, y_test = get_train_test_split(sample_feature_array, target_labels) -# -# scaler = preprocessing.StandardScaler().fit(X_train) -# -# X_train = scaler.transform(X_train) -# X_test = scaler.transform(X_test) -# -# classifier = svm.SVC(kernel='linear') -# classifier.fit(X_train, y_train) -# -# plot_feature_importances(classifier.coef_.ravel(), short_feature_names) +def get_science_dataset_array(news_source): + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + feature_helper = ScienceCascadeFeatureHelper() + include_micro = False + include_macro = True + + fake_features = feature_helper.get_features_array(fake_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="fake") + real_features = feature_helper.get_features_array(real_prop_graph, micro_features=include_micro, + macro_features=include_macro, news_source=news_source, + label="real") + + return np.concatenate([fake_features, real_features]) if __name__ == "__main__": - get_classificaton_results_tpnf("data/train_test_data", "gossipcop") + get_classificaton_results_tpnf("data/train_test_data", "gossipcop", time_interval=None, use_cache=False) + + # get_classificaton_results_stnf( "politifact", time_interval=None) + + # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval = None) + + # exit(1) + + time_intervals = [3, 6, 12, 24, 36, 48, 60, 72, 84, 96] + # time_intervals = [3, 6] + # time_intervals = [None] + + # for time_interval in time_intervals: + # print("=============Time Interval : {} ==========".format(time_interval)) + # start_time = time.time() + # # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # + # get_classificaton_results_stnf("politifact", time_interval) + # print("\n\n================Exectuion time - {} ==================================\n".format( + # time.time() - start_time)) # dump_feature_importance("data/train_test_data", "politifact") - # dump_random_forest_feature_importance("data/train_test_data", "politifact") + # dump_random_forest_feature_importance("data/train_test_data", "gossipcop") diff --git a/castillo_features.py 
b/castillo_features.py index 6de25a6..c8e05b7 100644 --- a/castillo_features.py +++ b/castillo_features.py @@ -1,830 +1,830 @@ -import pickle -from os import listdir -from os.path import isfile, join -from pathlib import Path - -from pymongo import MongoClient -from datetime import datetime -import networkx as nx -import numpy as np -from random import shuffle -from sklearn.svm import SVC -from sklearn.model_selection import cross_val_score -import re -from tqdm import tqdm -# from sklearn.ensemble import RandomForestClassifier -# import json -# import random -# from nltk.tokenize import RegexpTokenizer -# from stop_words import get_stop_words -# from nltk.stem.porter import PorterStemmer -# from gensim import corpora -# import gensim -# from sklearn.model_selection import cross_validate -# from sklearn.dummy import DummyClassifier -# from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score -# from sklearn.model_selection import train_test_split -# from sklearn import preprocessing - -from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, create_dir -from misc_process import get_reply_of_replies -from pre_process_util import get_news_articles, load_configuration, get_database_connection -from structure_temp_analysis import StructureFeatureHelper - -all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" - .format("data/pre_process_data/vader_sentiment"), "rb")) - - -# def content_featureAgg(tweets): -# # Current version tweets content are almost the same, not distinguishable -# -# return [] -# -# def networkFeatureAgg(users,user_followers_coll,user_followees_coll): -# -# user_index = dict() -# for i in range(len(users)): -# user_index[users[i]]=i -# -# edge_list = set() -# for au in tqdm(users): -# user_name = au -# all_follower_tmp = list(user_followers_coll.find({'user_name': user_name})) -# if len(all_follower_tmp)!=0: -# all_followers = all_follower_tmp[0]['followers'] -# for aft in all_followers: -# if aft['screen_name'] in user_index: -# edge_list.add((user_name,aft['screen_name'])) -# -# all_followee_tmp = list(user_followees_coll.find({'user_name':user_name})) -# if len(all_followee_tmp)!=0: -# all_followees = all_followee_tmp[0]['followees'] -# for aft in all_followees: -# if aft['screen_name'] in user_index: -# edge_list.add((aft['screen_name'],user_name)) -# G=nx.Graph() -# G.add_edges_from(edge_list) -# node_num = G.number_of_nodes() -# link_num = G.number_of_edges() -# if node_num==0: -# density=0 -# cc=0 -# avg_degree=0 -# else: -# density = link_num/(float(node_num)*float(node_num)) -# cc = nx.average_clustering(G) -# degrees = G.degree() -# avg_degree = sum(degrees.values())/len(degrees.values()) -# return [node_num,link_num,density,cc,avg_degree] -# -# def getSocialEngagements(db,datasource): -# f_out = open('./'+datasource+'/SocialFeats.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] -# else: -# user_profiles_coll = db['TwitterUserProfile2'] -# if datasource=='BuzzFeed': -# user_followers_coll = db['TwitterUserFollowers'] -# else: -# user_followers_coll = db['TwitterUserFollowers2'] -# if datasource=='BuzzFeed': -# user_followees_coll = db['TwitterUserFollowees'] -# else: -# user_followees_coll = db['TwitterUserFollowees2'] -# news_tweets = dict() -# news_users = dict() -# # Fake News / Real News +# import pickle +# from os import listdir +# from os.path import isfile, join +# from pathlib import Path +# +# from pymongo import MongoClient +# 
from datetime import datetime +# import networkx as nx +# import numpy as np +# from random import shuffle +# from sklearn.svm import SVC +# from sklearn.model_selection import cross_val_score +# import re +# from tqdm import tqdm +# # from sklearn.ensemble import RandomForestClassifier +# # import json +# # import random +# # from nltk.tokenize import RegexpTokenizer +# # from stop_words import get_stop_words +# # from nltk.stem.porter import PorterStemmer +# # from gensim import corpora +# # import gensim +# # from sklearn.model_selection import cross_validate +# # from sklearn.dummy import DummyClassifier +# # from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score +# # from sklearn.model_selection import train_test_split +# # from sklearn import preprocessing +# +# from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, create_dir +# from misc_process import get_reply_of_replies +# from pre_process_util import get_news_articles, load_configuration, get_database_connection +# from structure_temp_analysis import StructureFeatureHelper +# +# all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" +# .format("data/pre_process_data/vader_sentiment"), "rb")) +# +# +# # def content_featureAgg(tweets): +# # # Current version tweets content are almost the same, not distinguishable +# # +# # return [] +# # +# # def networkFeatureAgg(users,user_followers_coll,user_followees_coll): +# # +# # user_index = dict() +# # for i in range(len(users)): +# # user_index[users[i]]=i +# # +# # edge_list = set() +# # for au in tqdm(users): +# # user_name = au +# # all_follower_tmp = list(user_followers_coll.find({'user_name': user_name})) +# # if len(all_follower_tmp)!=0: +# # all_followers = all_follower_tmp[0]['followers'] +# # for aft in all_followers: +# # if aft['screen_name'] in user_index: +# # edge_list.add((user_name,aft['screen_name'])) +# # +# # all_followee_tmp = list(user_followees_coll.find({'user_name':user_name})) +# # if len(all_followee_tmp)!=0: +# # all_followees = all_followee_tmp[0]['followees'] +# # for aft in all_followees: +# # if aft['screen_name'] in user_index: +# # edge_list.add((aft['screen_name'],user_name)) +# # G=nx.Graph() +# # G.add_edges_from(edge_list) +# # node_num = G.number_of_nodes() +# # link_num = G.number_of_edges() +# # if node_num==0: +# # density=0 +# # cc=0 +# # avg_degree=0 +# # else: +# # density = link_num/(float(node_num)*float(node_num)) +# # cc = nx.average_clustering(G) +# # degrees = G.degree() +# # avg_degree = sum(degrees.values())/len(degrees.values()) +# # return [node_num,link_num,density,cc,avg_degree] +# # +# # def getSocialEngagements(db,datasource): +# # f_out = open('./'+datasource+'/SocialFeats.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile2'] +# # if datasource=='BuzzFeed': +# # user_followers_coll = db['TwitterUserFollowers'] +# # else: +# # user_followers_coll = db['TwitterUserFollowers2'] +# # if datasource=='BuzzFeed': +# # user_followees_coll = db['TwitterUserFollowees'] +# # else: +# # user_followees_coll = db['TwitterUserFollowees2'] +# # news_tweets = dict() +# # news_users = dict() +# # # Fake News / Real News +# # +# # if datasource=='BuzzFeed': +# # dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' +# # else: +# # dir_path = './Crawler/PolitiFact/PolitiFactTwitterResult' +# # org_files = [f for f in listdir(dir_path) if 
isfile(join(dir_path, f))] +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # tweets.append(tweet_json['text']) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if len(users)>150: +# # users = users[:150] +# # user_features = userFeatureAgg(users, user_profiles_coll) +# # content_features = content_featureAgg(tweets) +# # network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) +# # +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(network_features) +# # f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') +# # print k+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def getSocialEngagementsEarly(db,datasource,delta): +# # early_users = dict() +# # with open('./'+datasource+'/Early/User_'+delta+'.txt') as f_users: +# # for line in f_users: +# # line = line.strip() +# # early_users[line]=1 +# # +# # f_out = open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile2'] +# # if datasource=='BuzzFeed': +# # user_followers_coll = db['TwitterUserFollowers'] +# # else: +# # user_followers_coll = db['TwitterUserFollowers2'] +# # if datasource=='BuzzFeed': +# # user_followees_coll = db['TwitterUserFollowees'] +# # else: +# # user_followees_coll = db['TwitterUserFollowees2'] +# # news_tweets = dict() +# # news_users = dict() +# # # Fake News / Real News +# # +# # if datasource=='BuzzFeed': +# # dir_path = './Crawler/BuzzFeedCrawler/TwitterResult' +# # else: +# # dir_path = './Crawler/PolitiFact/RealTwitterResult' +# # org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # if tweet_json['username'] not in early_users: +# # continue +# # tweets.append(tweet_json['text']) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if len(users)>150: +# # users = users[:150] +# # user_features = userFeatureAgg(users, user_profiles_coll) +# # content_features = content_featureAgg(tweets) +# # network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) +# # +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(network_features) +# # f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') +# # print k+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def userFeature(user, user_profiles_coll): +# # if list(user_profiles_coll.find({'screen_name':user})) ==[]: +# # return [0,0,0,0] +# # tmp = list(user_profiles_coll.find({'screen_name':user}))[0] +# # pnum = tmp['statuses_count'] +# # fnum = tmp['friends_count'] +# # fonum = tmp['followers_count'] +# # create_time = tmp['created_at'] +# # verified = 
tmp['verified'] +# # if verified==False: +# # verified=0 +# # else: +# # verified=1 +# # date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') +# # today = datetime.now() +# # dregister =(today-date_create).days +# # return [pnum,fnum,fonum,dregister,verified] +# # +# # def content_feature(tweet): +# # topic_feature = [] +# # url_num = len([m for m in re.finditer('http', tweet)]) +# # question_flag = 0 +# # if '?' in tweet: +# # question_flag=1 +# # mention_num = len([m for m in re.finditer('@', tweet)]) +# # retweet_count=0 +# # try: +# # retweet_count = float(tweet.split(':::')[1]) +# # except: +# # pass +# # +# # return [url_num,question_flag,mention_num,retweet_count] +# # +# # def getTopicFeature(tweets, num_topic): +# # doc_set = [] +# # for entry in tweets: +# # try: +# # doc_set.append(entry.split(':::')[0]) +# # except: +# # pass +# # +# # texts = [] +# # tokenizer = RegexpTokenizer(r'\w+') +# # en_stop = get_stop_words('en') +# # p_stemmer = PorterStemmer() +# # for i in doc_set: +# # # clean and tokenize document string +# # raw = i.lower() +# # # Filter http +# # raw = raw.replace('http','') +# # tokens = tokenizer.tokenize(raw) +# # # remove stop words from tokensk +# # stopped_tokens = [i for i in tokens if not i in en_stop] +# # # stem tokens +# # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] +# # # add tokens to list +# # texts.append(stemmed_tokens) +# # +# # dictionary = corpora.Dictionary(texts) +# # # convert tokenized documents into a document-term matrix +# # corpus = [dictionary.doc2bow(text) for text in texts] +# # # generate LDA model +# # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=20) +# # +# # topic_distribution = [] +# # for c in corpus: +# # +# # dis = ldamodel[c] +# # tmp_dis = [0 for i in range(num_topic)] +# # for d in dis: +# # tmp_dis[d[0]]=d[1] +# # topic_distribution.append(tmp_dis) +# # return topic_distribution +# # +# # def TweetLevelFeaturs(db): +# # f_out = open('./'+datasource+'/TweetLevelFeatsReal.txt','w+') +# # if datasource=='BuzzFeed': +# # user_profiles_coll = db['TwitterUserProfile'] +# # else: +# # user_profiles_coll = db['TwitterUserProfile1'] +# # # Fake News / Real News +# # dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' +# # org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] +# # news_tweets = dict() +# # news_users = dict() +# # for of in org_files: +# # ID = of[:of.index('.json')] +# # file_name = dir_path+'/'+of +# # tweets = [] +# # users = [] +# # with open(file_name) as f_engagements: +# # for line in f_engagements: +# # line = line.strip() +# # tweet_json = json.loads(line) +# # tweets.append(tweet_json['text']+':::'+str(tweet_json['retweets'])+':::'+str(tweet_json['id'])) +# # users.append(tweet_json['username']) +# # news_tweets[ID]=tweets +# # news_users[ID]=users +# # +# # for k, tweets in news_tweets.items(): +# # users = news_users[k] +# # if 'Real' in k: +# # tw_label='1' ### Using 1 as high credibility +# # else: +# # tw_label='-1' +# # +# # Topic_feats = getTopicFeature(tweets,10) +# # +# # for i in range(len(users)): +# # user = users[i] +# # tweet = tweets[i] +# # tid = tweet.split(':::')[2] +# # user_features = userFeature(user,user_profiles_coll) +# # content_features = content_feature(tweet) +# # all_feats=[] +# # all_feats.extend(user_features) +# # all_feats.extend(content_features) +# # all_feats.extend(Topic_feats[i]) +# # f_out.write(tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in 
all_feats)+'\n') +# # print tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats) +# # f_out.close() +# # +# # def Castillo11(datasource,delta): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/Early/SocialFeats'+delta+'.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # clf = SVC(kernel='linear', class_weight='balanced') +# # # clf = RandomForestClassifier() +# # scoring = ['accuracy','precision', 'recall', 'f1'] +# # print '***'+delta+'***' +# # res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring=scoring) +# # print '\t'.join([str(x) for x in res['test_accuracy']]) +# # # print '\t'.join([str(x) for x in res['test_precision']]) +# # # print '\t'.join([str(x) for x in res['test_recall']]) +# # print '\t'.join([str(x) for x in res['test_f1']]) +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='accuracy') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='f1') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # +# # def Castillo11_2(datasource): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # +# # # X = preprocessing.normalize(X) +# # # clf = RandomForestClassifier() +# # +# # train_sizes = [0.2,0.4,0.6,0.8] +# # for ts in train_sizes: +# # acc = [] +# # prec = [] +# # recall = [] +# # f1 = [] +# # for i in range(3): +# # clf = SVC(kernel='linear', class_weight='balanced') +# # X_train, X_test, y_train, y_test = 
train_test_split(X,y,train_size = ts) +# # clf.fit(X_train,y_train) +# # y_pred = clf.predict(X_test) +# # acc.append(accuracy_score(y_test, y_pred)) +# # prec.append(precision_score(y_test, y_pred)) +# # recall.append(recall_score(y_test, y_pred)) +# # f1.append(f1_score(y_test, y_pred)) +# # +# # print "", sum(acc)/len(acc) +# # print "", sum(prec)/len(prec) +# # print "", sum(recall)/len(recall) +# # print "", sum(f1)/len(f1) +# # print "" +# # +# # def balanced_subsample(x,y,id,subsample_size=1.0): +# # +# # class_xs = [] +# # min_elems = None +# # +# # for yi in np.unique(y): +# # elems = x[(y == yi)] +# # class_xs.append((yi, elems)) +# # if min_elems == None or elems.shape[0] < min_elems: +# # min_elems = elems.shape[0] +# # +# # use_elems = min_elems +# # if subsample_size < 1: +# # use_elems = int(min_elems*subsample_size) +# # +# # xs = [] +# # ys = [] +# # +# # for ci,this_xs in class_xs: +# # if len(this_xs) > use_elems: +# # np.random.shuffle(this_xs) +# # +# # x_ = this_xs[:use_elems] +# # y_ = np.empty(use_elems) +# # y_.fill(ci) +# # +# # xs.append(x_) +# # ys.append(y_) +# # +# # xs = np.concatenate(xs) +# # ys = np.concatenate(ys) +# # +# # return xs,ys +# # +# # def TweetLevelPredict(): +# # all_X = [] +# # all_y = [] +# # all_tid = [] +# # with open('./BuzzFeed/TweetLevelFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # tid = line.split('\t')[0] +# # label = line.split('\t')[1] +# # feats = [float(x) for x in line.split('\t')[2:]] +# # all_X.append(feats) +# # all_y.append(label) +# # all_tid.append(tid) +# # with open('./BuzzFeed/TweetLevelFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # label = line.split('\t')[1] +# # tid = line.split('\t')[0] +# # feats = [float(x) for x in line.split('\t')[2:]] +# # all_X.append(feats) +# # all_y.append(label) +# # all_tid.append(tid) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # tid = np.array(all_tid) +# # Xs,ys = balanced_subsample(X,y,0.01) +# # arry = range(Xs.shape[0]) +# # shuffle(arry) +# # Xs = Xs[arry, :] +# # ys= ys[arry] +# # +# # # clf = RandomForestClassifier(max_depth=2,random_state=0) +# # clf = SVC(kernel='linear', class_weight='balanced',probability=True) +# # # res = cross_val_score(estimator=clf, X=Xs, y=ys, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# # # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # # print res +# # clf.fit(Xs,ys) +# # y_predict = clf.predict(X) +# # print 'Accuracy ' +# # +# # def Dummy(datasource): +# # all_news = [] +# # with open('./'+datasource+'/News.txt') as f_news: +# # for line in f_news: +# # all_news.append(line.strip()) +# # +# # all_X = [] +# # all_y = [] +# # with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: +# # for line in f_fake_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(1) +# # with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: +# # for line in f_real_social: +# # line = line.strip() +# # ID = line.split('\t')[0] +# # if ID in all_news: +# # feats = [float(x) for x in line.split('\t')[1:]] +# # all_X.append(feats) +# # all_y.append(0) +# # X = np.array(all_X) +# # y = np.array(all_y) +# # arry = range(X.shape[0]) +# # shuffle(arry) +# # X = X[arry, :] +# # y = y[arry] +# # clf = DummyClassifier(constant=1) +# # scoring = ['accuracy','precision', 'recall', 'f1'] +# # res = 
cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring=scoring) +# # +# # +# # print '\t'.join([str(x) for x in res['test_accuracy']]) +# # print '\t'.join([str(x) for x in res['test_precision']]) +# # print '\t'.join([str(x) for x in res['test_recall']]) +# # print '\t'.join([str(x) for x in res['test_f1']]) +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# # +# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') +# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) +# # print res +# +# +# def get_message_based_features(reply_id_content_dict): +# num_words = [] +# num_urls = [] +# question_mark_nums = [] +# num_mentions = [] +# +# for reply_id, content in reply_id_content_dict.items(): +# url_num = len([m for m in re.finditer('http', content)]) +# question_flag = 0 +# if '?' in content: +# question_flag = 1 +# mention_num = len([m for m in re.finditer('@', content)]) +# num_word = len(content.split()) +# +# num_words.append(num_word) +# num_urls.append(url_num) +# question_mark_nums.append(question_flag) +# num_mentions.append(mention_num) # -# if datasource=='BuzzFeed': -# dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' -# else: -# dir_path = './Crawler/PolitiFact/PolitiFactTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# tweets.append(tweet_json['text']) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if len(users)>150: -# users = users[:150] -# user_features = userFeatureAgg(users, user_profiles_coll) -# content_features = content_featureAgg(tweets) -# network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) -# -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(network_features) -# f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print k+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def getSocialEngagementsEarly(db,datasource,delta): -# early_users = dict() -# with open('./'+datasource+'/Early/User_'+delta+'.txt') as f_users: -# for line in f_users: -# line = line.strip() -# early_users[line]=1 -# -# f_out = open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] -# else: -# user_profiles_coll = db['TwitterUserProfile2'] -# if datasource=='BuzzFeed': -# user_followers_coll = db['TwitterUserFollowers'] -# else: -# user_followers_coll = db['TwitterUserFollowers2'] -# if datasource=='BuzzFeed': -# user_followees_coll = db['TwitterUserFollowees'] -# else: -# user_followees_coll = db['TwitterUserFollowees2'] -# news_tweets = dict() -# 
news_users = dict() -# # Fake News / Real News +# try: +# mean_num_words = np.mean(num_words) +# except: +# mean_num_words = 0 +# +# try: +# mean_num_urls = np.mean(num_urls) +# except: +# mean_num_urls = 0 # -# if datasource=='BuzzFeed': -# dir_path = './Crawler/BuzzFeedCrawler/TwitterResult' -# else: -# dir_path = './Crawler/PolitiFact/RealTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# if tweet_json['username'] not in early_users: -# continue -# tweets.append(tweet_json['text']) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if len(users)>150: -# users = users[:150] -# user_features = userFeatureAgg(users, user_profiles_coll) -# content_features = content_featureAgg(tweets) -# network_features = networkFeatureAgg(users,user_followers_coll,user_followees_coll) -# -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(network_features) -# f_out.write(k+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print k+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def userFeature(user, user_profiles_coll): -# if list(user_profiles_coll.find({'screen_name':user})) ==[]: -# return [0,0,0,0] -# tmp = list(user_profiles_coll.find({'screen_name':user}))[0] -# pnum = tmp['statuses_count'] -# fnum = tmp['friends_count'] -# fonum = tmp['followers_count'] -# create_time = tmp['created_at'] -# verified = tmp['verified'] -# if verified==False: -# verified=0 -# else: -# verified=1 -# date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') -# today = datetime.now() -# dregister =(today-date_create).days -# return [pnum,fnum,fonum,dregister,verified] -# -# def content_feature(tweet): -# topic_feature = [] -# url_num = len([m for m in re.finditer('http', tweet)]) -# question_flag = 0 -# if '?' 
in tweet: -# question_flag=1 -# mention_num = len([m for m in re.finditer('@', tweet)]) -# retweet_count=0 # try: -# retweet_count = float(tweet.split(':::')[1]) +# mean_question_mark_nums = np.mean(question_mark_nums) # except: -# pass -# -# return [url_num,question_flag,mention_num,retweet_count] -# -# def getTopicFeature(tweets, num_topic): -# doc_set = [] -# for entry in tweets: -# try: -# doc_set.append(entry.split(':::')[0]) -# except: -# pass -# -# texts = [] -# tokenizer = RegexpTokenizer(r'\w+') -# en_stop = get_stop_words('en') -# p_stemmer = PorterStemmer() -# for i in doc_set: -# # clean and tokenize document string -# raw = i.lower() -# # Filter http -# raw = raw.replace('http','') -# tokens = tokenizer.tokenize(raw) -# # remove stop words from tokensk -# stopped_tokens = [i for i in tokens if not i in en_stop] -# # stem tokens -# stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] -# # add tokens to list -# texts.append(stemmed_tokens) -# -# dictionary = corpora.Dictionary(texts) -# # convert tokenized documents into a document-term matrix -# corpus = [dictionary.doc2bow(text) for text in texts] -# # generate LDA model -# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topic, id2word=dictionary, passes=20) -# -# topic_distribution = [] -# for c in corpus: -# -# dis = ldamodel[c] -# tmp_dis = [0 for i in range(num_topic)] -# for d in dis: -# tmp_dis[d[0]]=d[1] -# topic_distribution.append(tmp_dis) -# return topic_distribution -# -# def TweetLevelFeaturs(db): -# f_out = open('./'+datasource+'/TweetLevelFeatsReal.txt','w+') -# if datasource=='BuzzFeed': -# user_profiles_coll = db['TwitterUserProfile'] +# mean_question_mark_nums = 0 +# +# try: +# mean_num_mentions = np.mean(num_mentions) +# except: +# mean_num_mentions = 0 +# +# return [mean_num_words, mean_num_urls, mean_question_mark_nums, mean_num_mentions] +# +# +# def get_content_topic_based_features(reply_id_content_dict): +# positive_words = [] +# negative_words = [] +# neutral_words = [] +# sentiment_scores = [] +# reply_lengths = [] +# +# for reply_id, content in reply_id_content_dict.items(): +# if reply_id in reply_id_content_dict: +# sentiment_info = all_reply_id_sentiment_score_dict[reply_id] +# positive_words.append(sentiment_info["pos"]) +# negative_words.append(sentiment_info["neg"]) +# neutral_words.append(sentiment_info["neu"]) +# sentiment_scores.append(sentiment_info["compound"]) +# reply_lengths.append(len(content)) +# +# try: +# mean_positive_words = np.mean(positive_words) +# except: +# mean_positive_words = 0 +# +# try: +# mean_negative_words = np.mean(negative_words) +# except: +# mean_negative_words = 0 +# +# try: +# mean_neutral_words = np.mean(neutral_words) +# except: +# mean_neutral_words = 0 +# +# try: +# mean_sentiment_score = np.mean(sentiment_scores) +# except: +# mean_sentiment_score = 0 +# +# try: +# mean_reply_length = np.mean(reply_lengths) +# except: +# mean_reply_length = 0 +# +# return [len(reply_id_content_dict), mean_positive_words, mean_negative_words, mean_neutral_words, +# mean_sentiment_score, mean_reply_length] +# +# +# def get_user_aggregate_features(db, is_fake, user_ids): +# posts_num = [] +# friends_num = [] +# followers_num = [] +# days_register = [] +# +# if is_fake: +# label_user_collection = db.fake_twitter_user_profile # else: -# user_profiles_coll = db['TwitterUserProfile1'] -# # Fake News / Real News -# dir_path = './Crawler/BuzzFeedCrawler/RealTwitterResult' -# org_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))] -# 
news_tweets = dict() -# news_users = dict() -# for of in org_files: -# ID = of[:of.index('.json')] -# file_name = dir_path+'/'+of -# tweets = [] -# users = [] -# with open(file_name) as f_engagements: -# for line in f_engagements: -# line = line.strip() -# tweet_json = json.loads(line) -# tweets.append(tweet_json['text']+':::'+str(tweet_json['retweets'])+':::'+str(tweet_json['id'])) -# users.append(tweet_json['username']) -# news_tweets[ID]=tweets -# news_users[ID]=users -# -# for k, tweets in news_tweets.items(): -# users = news_users[k] -# if 'Real' in k: -# tw_label='1' ### Using 1 as high credibility +# label_user_collection = db.real_twitter_user_profile +# +# user_profile_collection = db.twitter_user_profile +# +# # np.random.shuffle(user_ids) +# +# for user_id in tqdm(user_ids): +# +# user_object = label_user_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object is None: +# user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# +# if user_object is None: +# print('user {} not found'.format(user_id)) # else: -# tw_label='-1' -# -# Topic_feats = getTopicFeature(tweets,10) -# -# for i in range(len(users)): -# user = users[i] -# tweet = tweets[i] -# tid = tweet.split(':::')[2] -# user_features = userFeature(user,user_profiles_coll) -# content_features = content_feature(tweet) -# all_feats=[] -# all_feats.extend(user_features) -# all_feats.extend(content_features) -# all_feats.extend(Topic_feats[i]) -# f_out.write(tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats)+'\n') -# print tid+'\t'+tw_label+'\t'+'\t'.join(str(f) for f in all_feats) -# f_out.close() -# -# def Castillo11(datasource,delta): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/Early/SocialFeats'+delta+'.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/Early/SocialFeatsReal'+delta+'.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# clf = SVC(kernel='linear', class_weight='balanced') -# # clf = RandomForestClassifier() -# scoring = ['accuracy','precision', 'recall', 'f1'] -# print '***'+delta+'***' -# res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring=scoring) -# print '\t'.join([str(x) for x in res['test_accuracy']]) -# # print '\t'.join([str(x) for x in res['test_precision']]) -# # print '\t'.join([str(x) for x in res['test_recall']]) -# print '\t'.join([str(x) for x in res['test_f1']]) -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='accuracy') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, 
scoring='precision') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# # -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# # res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=0, n_jobs=-1, scoring='f1') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# -# def Castillo11_2(datasource): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# -# # X = preprocessing.normalize(X) -# # clf = RandomForestClassifier() -# -# train_sizes = [0.2,0.4,0.6,0.8] -# for ts in train_sizes: -# acc = [] -# prec = [] -# recall = [] -# f1 = [] -# for i in range(3): -# clf = SVC(kernel='linear', class_weight='balanced') -# X_train, X_test, y_train, y_test = train_test_split(X,y,train_size = ts) -# clf.fit(X_train,y_train) -# y_pred = clf.predict(X_test) -# acc.append(accuracy_score(y_test, y_pred)) -# prec.append(precision_score(y_test, y_pred)) -# recall.append(recall_score(y_test, y_pred)) -# f1.append(f1_score(y_test, y_pred)) -# -# print "", sum(acc)/len(acc) -# print "", sum(prec)/len(prec) -# print "", sum(recall)/len(recall) -# print "", sum(f1)/len(f1) -# print "" -# -# def balanced_subsample(x,y,id,subsample_size=1.0): -# -# class_xs = [] -# min_elems = None -# -# for yi in np.unique(y): -# elems = x[(y == yi)] -# class_xs.append((yi, elems)) -# if min_elems == None or elems.shape[0] < min_elems: -# min_elems = elems.shape[0] -# -# use_elems = min_elems -# if subsample_size < 1: -# use_elems = int(min_elems*subsample_size) -# -# xs = [] -# ys = [] -# -# for ci,this_xs in class_xs: -# if len(this_xs) > use_elems: -# np.random.shuffle(this_xs) -# -# x_ = this_xs[:use_elems] -# y_ = np.empty(use_elems) -# y_.fill(ci) -# -# xs.append(x_) -# ys.append(y_) -# -# xs = np.concatenate(xs) -# ys = np.concatenate(ys) -# -# return xs,ys -# -# def TweetLevelPredict(): -# all_X = [] -# all_y = [] -# all_tid = [] -# with open('./BuzzFeed/TweetLevelFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# tid = line.split('\t')[0] -# label = line.split('\t')[1] -# feats = [float(x) for x in line.split('\t')[2:]] -# all_X.append(feats) -# all_y.append(label) -# all_tid.append(tid) -# with open('./BuzzFeed/TweetLevelFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# label = line.split('\t')[1] -# tid = line.split('\t')[0] -# feats = [float(x) for x in line.split('\t')[2:]] -# all_X.append(feats) -# all_y.append(label) -# all_tid.append(tid) -# X = np.array(all_X) -# y = np.array(all_y) -# tid = np.array(all_tid) -# Xs,ys = balanced_subsample(X,y,0.01) -# arry = range(Xs.shape[0]) -# shuffle(arry) -# Xs = Xs[arry, :] 
-# ys= ys[arry] -# -# # clf = RandomForestClassifier(max_depth=2,random_state=0) -# clf = SVC(kernel='linear', class_weight='balanced',probability=True) -# # res = cross_val_score(estimator=clf, X=Xs, y=ys, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') -# # res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# # print res -# clf.fit(Xs,ys) -# y_predict = clf.predict(X) -# print 'Accuracy ' -# -# def Dummy(datasource): -# all_news = [] -# with open('./'+datasource+'/News.txt') as f_news: -# for line in f_news: -# all_news.append(line.strip()) -# -# all_X = [] -# all_y = [] -# with open('./'+datasource+'/SocialFeats.txt') as f_fake_social: -# for line in f_fake_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(1) -# with open('./'+datasource+'/SocialFeatsReal.txt') as f_real_social: -# for line in f_real_social: -# line = line.strip() -# ID = line.split('\t')[0] -# if ID in all_news: -# feats = [float(x) for x in line.split('\t')[1:]] -# all_X.append(feats) -# all_y.append(0) -# X = np.array(all_X) -# y = np.array(all_y) -# arry = range(X.shape[0]) -# shuffle(arry) -# X = X[arry, :] -# y = y[arry] -# clf = DummyClassifier(constant=1) -# scoring = ['accuracy','precision', 'recall', 'f1'] -# res = cross_validate(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring=scoring) -# -# -# print '\t'.join([str(x) for x in res['test_accuracy']]) -# print '\t'.join([str(x) for x in res['test_precision']]) -# print '\t'.join([str(x) for x in res['test_recall']]) -# print '\t'.join([str(x) for x in res['test_f1']]) -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='accuracy') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='precision') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='recall') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res -# -# res = cross_val_score(estimator=clf, X=X, y=y, cv=5, verbose=1, n_jobs=-1, scoring='f1') -# res = "%0.3f +/- %0.3f" % ( np.mean(res), np.std(res)) -# print res - - -def get_message_based_features(reply_id_content_dict): - num_words = [] - num_urls = [] - question_mark_nums = [] - num_mentions = [] - - for reply_id, content in reply_id_content_dict.items(): - url_num = len([m for m in re.finditer('http', content)]) - question_flag = 0 - if '?' 
in content: - question_flag = 1 - mention_num = len([m for m in re.finditer('@', content)]) - num_word = len(content.split()) - - num_words.append(num_word) - num_urls.append(url_num) - question_mark_nums.append(question_flag) - num_mentions.append(mention_num) - - try: - mean_num_words = np.mean(num_words) - except: - mean_num_words = 0 - - try: - mean_num_urls = np.mean(num_urls) - except: - mean_num_urls = 0 - - try: - mean_question_mark_nums = np.mean(question_mark_nums) - except: - mean_question_mark_nums = 0 - - try: - mean_num_mentions = np.mean(num_mentions) - except: - mean_num_mentions = 0 - - return [mean_num_words, mean_num_urls, mean_question_mark_nums, mean_num_mentions] - - -def get_content_topic_based_features(reply_id_content_dict): - positive_words = [] - negative_words = [] - neutral_words = [] - sentiment_scores = [] - reply_lengths = [] - - for reply_id, content in reply_id_content_dict.items(): - if reply_id in reply_id_content_dict: - sentiment_info = all_reply_id_sentiment_score_dict[reply_id] - positive_words.append(sentiment_info["pos"]) - negative_words.append(sentiment_info["neg"]) - neutral_words.append(sentiment_info["neu"]) - sentiment_scores.append(sentiment_info["compound"]) - reply_lengths.append(len(content)) - - try: - mean_positive_words = np.mean(positive_words) - except: - mean_positive_words = 0 - - try: - mean_negative_words = np.mean(negative_words) - except: - mean_negative_words = 0 - - try: - mean_neutral_words = np.mean(neutral_words) - except: - mean_neutral_words = 0 - - try: - mean_sentiment_score = np.mean(sentiment_scores) - except: - mean_sentiment_score = 0 - - try: - mean_reply_length = np.mean(reply_lengths) - except: - mean_reply_length = 0 - - return [len(reply_id_content_dict), mean_positive_words, mean_negative_words, mean_neutral_words, - mean_sentiment_score, mean_reply_length] - - -def get_user_aggregate_features(db, is_fake, user_ids): - posts_num = [] - friends_num = [] - followers_num = [] - days_register = [] - - if is_fake: - label_user_collection = db.fake_twitter_user_profile - else: - label_user_collection = db.real_twitter_user_profile - - user_profile_collection = db.twitter_user_profile - - # np.random.shuffle(user_ids) - - for user_id in tqdm(user_ids): - - user_object = label_user_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object is None: - user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - - if user_object is None: - print('user {} not found'.format(user_id)) - else: - if "profile_info" in user_object: - pnum = user_object["profile_info"]['statuses_count'] - fnum = user_object["profile_info"]['friends_count'] - fonum = user_object["profile_info"]['followers_count'] - create_time = user_object["profile_info"]['created_at'] - date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') - today = datetime.now() - dregister = (today - date_create).days - posts_num.append(pnum) - friends_num.append(fnum) - followers_num.append(fonum) - days_register.append(dregister) - - try: - avg_posts_num = sum(posts_num) / len(posts_num) - except: - avg_posts_num = 0 - try: - avg_friends_num = sum(friends_num) / len(friends_num) - except: - avg_friends_num = 0 - try: - avg_followers_num = sum(followers_num) / 
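The message-level helpers in this hunk reduce every reply of a news item to a handful of averaged statistics (word count, URL count, question marks, mentions), guarding against empty reply sets. A compact sketch of that aggregation, assuming reply_id_content_dict maps reply ids to reply text; the sentiment lookups are left out:

# Sketch: Castillo-style message features averaged over the replies of one news item.
# reply_id_content_dict is assumed to map reply id -> reply text.
import re

import numpy as np


def message_based_features(reply_id_content_dict):
    num_words, num_urls, question_flags, num_mentions = [], [], [], []

    for content in reply_id_content_dict.values():
        num_urls.append(len(re.findall(r"http", content)))    # crude URL count
        question_flags.append(1 if "?" in content else 0)     # contains a question
        num_mentions.append(len(re.findall(r"@", content)))   # @-mentions
        num_words.append(len(content.split()))

    def safe_mean(values):
        # Empty reply sets yield 0 instead of NaN, mirroring the try/except guards above.
        return float(np.mean(values)) if values else 0.0

    return [safe_mean(num_words), safe_mean(num_urls),
            safe_mean(question_flags), safe_mean(num_mentions)]


print(message_based_features({"1": "why? @user http://x.co", "2": "looks true"}))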
len(followers_num) - except: - avg_followers_num = 0 - try: - avg_days_register = sum(days_register) / len(days_register) - except: - avg_days_register = 0 - - return [avg_posts_num, avg_friends_num, avg_followers_num, avg_days_register] - - -def get_castillo_features(db, news_source, raw_data_dir, label, prop_graphs): - raw_data = pickle.load(open("{}/{}_{}_castillo_raw_data.pkl".format(raw_data_dir, news_source, label), "rb")) - - all_features = [] - - for news in raw_data: - sample_feature = [] - sample_feature.extend(get_user_aggregate_features(db, label == "fake", news["user_ids"])) - sample_feature.extend(get_message_based_features(news["reply_id_content_dict"])) - sample_feature.extend(get_content_topic_based_features(news["reply_id_content_dict"])) - - all_features.append(sample_feature) - - structure_feature_helper = StructureFeatureHelper() - structure_features = structure_feature_helper.get_features_array(prop_graphs, micro_features=False, - macro_features=True) - - other_features = get_numpy_array(all_features) - structure_features = get_numpy_array(structure_features)[:, [0, 1, 2]] - print("Other features shape") - print(other_features.shape) - - print("Structure features shape") - print(structure_features.shape) - return np.concatenate([other_features, structure_features], axis=1) - - -def dump_castillo_features(db, news_source, raw_data_dir, feature_out_dir, prop_graphs_dir): - fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) - fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) - - create_dir(feature_out_dir) - - fake_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "fake", fake_prop_graph) - real_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "real", real_prop_graph) - - all_castillo_features = np.concatenate([fake_castillo_features, real_castillo_features]) - - print("All castillo features") - print(all_castillo_features.shape, flush=True) - - pickle.dump(all_castillo_features, open("{}/{}_castillo_features.pkl".format(feature_out_dir, news_source), "wb")) - - -def get_raw_feature_for_news(news): - data = {} - - user_ids = set() - - reply_id_content_dict = dict() - - for tweet in news["tweets"]: - user_ids.add(tweet["user_id"]) - get_reply_of_replies(tweet["reply"], reply_id_content_dict) - - data["id"] = news["id"] - data["user_ids"] = list(user_ids) - data["reply_id_content_dict"] = reply_id_content_dict - - return data - - -def get_castillo_data(data_dir, prop_graphs, news_source, label): - prop_graphs_ids = [] - for news_graph in prop_graphs: - prop_graphs_ids.append(news_graph.tweet_id) - - castillo_raw_data = [None] * len(prop_graphs_ids) - - prop_graphs_ids_set = set(prop_graphs_ids) - - file_path = "{}/{}_{}_news_complete_dataset.json".format(data_dir, news_source, label) - - for news in get_news_articles(file_path): - news_id = news["id"] - if news_id in prop_graphs_ids_set: - news_id_index = prop_graphs_ids.index(news_id) - castillo_raw_data[news_id_index] = get_raw_feature_for_news(news) - - return castillo_raw_data - - -def get_castillo_raw_data(data_dir, prop_graphs_dir, out_dir, news_source): - fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) - - fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) - - create_dir(out_dir) - - fake_castillo_raw_data = get_castillo_data(data_dir, fake_prop_graph, news_source, "fake") - real_castillo_raw_data = get_castillo_data(data_dir, 
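The user-level aggregation above averages profile statistics (posts, friends, followers, account age) over the users who spread a story, parsing Twitter's classic created_at format. A sketch of that computation over plain profile dicts; the list below stands in for the documents normally fetched from the MongoDB user-profile collections:

# Sketch: average user-profile statistics for the spreaders of one news item.
# `profiles` is an in-memory stand-in for the twitter_user_profile collections.
from datetime import datetime

import numpy as np

profiles = [
    {"statuses_count": 1200, "friends_count": 310, "followers_count": 95,
     "created_at": "Mon Jan 01 10:00:00 +0000 2018"},
    {"statuses_count": 40, "friends_count": 12, "followers_count": 3,
     "created_at": "Wed Jun 05 08:30:00 +0000 2019"},
]

posts, friends, followers, account_age_days = [], [], [], []

for profile in profiles:
    created = datetime.strptime(profile["created_at"], "%a %b %d %H:%M:%S +0000 %Y")
    account_age_days.append((datetime.now() - created).days)
    posts.append(profile["statuses_count"])
    friends.append(profile["friends_count"])
    followers.append(profile["followers_count"])

# One aggregate vector per news item: [avg posts, avg friends, avg followers, avg account age].
print([float(np.mean(values)) if values else 0.0
       for values in (posts, friends, followers, account_age_days)])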
real_prop_graph, news_source, "real") - - pickle.dump(fake_castillo_raw_data, open("{}/{}_fake_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) - pickle.dump(real_castillo_raw_data, open("{}/{}_real_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) - - -def get_castillo_feature_array(news_source, castillo_feature_dir): - file_path = "{}/{}_real_castillo_raw_data.pkl".format(castillo_feature_dir, news_source) - file_obj = Path(file_path) - - if file_obj.exists(): - return pickle.load(open(file_path, "wb")) - - return None - - - -if __name__ == '__main__': - config = load_configuration("project.config") - db = get_database_connection(config) - news_source = "politifact" - - for news_source in ["politifact", "gossipcop"]: - # get_castillo_raw_data("data/engagement_data_latest", "data/saved_new_no_filter", "data/castillo/raw_data", - # news_source) - # - # print("Raw data dumped", flush=True) - - dump_castillo_features(db, news_source, "data/castillo/raw_data", "data/castillo/saved_features", - "data/saved_new_no_filter") - - print("Castillo features for {} dumped".format(news_source), flush=True) +# if "profile_info" in user_object: +# pnum = user_object["profile_info"]['statuses_count'] +# fnum = user_object["profile_info"]['friends_count'] +# fonum = user_object["profile_info"]['followers_count'] +# create_time = user_object["profile_info"]['created_at'] +# date_create = datetime.strptime(create_time, '%a %b %d %H:%M:%S +0000 %Y') +# today = datetime.now() +# dregister = (today - date_create).days +# posts_num.append(pnum) +# friends_num.append(fnum) +# followers_num.append(fonum) +# days_register.append(dregister) +# +# try: +# avg_posts_num = sum(posts_num) / len(posts_num) +# except: +# avg_posts_num = 0 +# try: +# avg_friends_num = sum(friends_num) / len(friends_num) +# except: +# avg_friends_num = 0 +# try: +# avg_followers_num = sum(followers_num) / len(followers_num) +# except: +# avg_followers_num = 0 +# try: +# avg_days_register = sum(days_register) / len(days_register) +# except: +# avg_days_register = 0 +# +# return [avg_posts_num, avg_friends_num, avg_followers_num, avg_days_register] +# +# +# def get_castillo_features(db, news_source, raw_data_dir, label, prop_graphs): +# raw_data = pickle.load(open("{}/{}_{}_castillo_raw_data.pkl".format(raw_data_dir, news_source, label), "rb")) +# +# all_features = [] +# +# for news in raw_data: +# sample_feature = [] +# sample_feature.extend(get_user_aggregate_features(db, label == "fake", news["user_ids"])) +# sample_feature.extend(get_message_based_features(news["reply_id_content_dict"])) +# sample_feature.extend(get_content_topic_based_features(news["reply_id_content_dict"])) +# +# all_features.append(sample_feature) +# +# structure_feature_helper = StructureFeatureHelper() +# structure_features = structure_feature_helper.get_features_array(prop_graphs, micro_features=False, +# macro_features=True) +# +# other_features = get_numpy_array(all_features) +# structure_features = get_numpy_array(structure_features)[:, [0, 1, 2]] +# print("Other features shape") +# print(other_features.shape) +# +# print("Structure features shape") +# print(structure_features.shape) +# return np.concatenate([other_features, structure_features], axis=1) +# +# +# def dump_castillo_features(db, news_source, raw_data_dir, feature_out_dir, prop_graphs_dir): +# fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) +# fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) +# +# 
create_dir(feature_out_dir) +# +# fake_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "fake", fake_prop_graph) +# real_castillo_features = get_castillo_features(db, news_source, raw_data_dir, "real", real_prop_graph) +# +# all_castillo_features = np.concatenate([fake_castillo_features, real_castillo_features]) +# +# print("All castillo features") +# print(all_castillo_features.shape, flush=True) +# +# pickle.dump(all_castillo_features, open("{}/{}_castillo_features.pkl".format(feature_out_dir, news_source), "wb")) +# +# +# def get_raw_feature_for_news(news): +# data = {} +# +# user_ids = set() +# +# reply_id_content_dict = dict() +# +# for tweet in news["tweets"]: +# user_ids.add(tweet["user_id"]) +# get_reply_of_replies(tweet["reply"], reply_id_content_dict) +# +# data["id"] = news["id"] +# data["user_ids"] = list(user_ids) +# data["reply_id_content_dict"] = reply_id_content_dict +# +# return data +# +# +# def get_castillo_data(data_dir, prop_graphs, news_source, label): +# prop_graphs_ids = [] +# for news_graph in prop_graphs: +# prop_graphs_ids.append(news_graph.tweet_id) +# +# castillo_raw_data = [None] * len(prop_graphs_ids) +# +# prop_graphs_ids_set = set(prop_graphs_ids) +# +# file_path = "{}/{}_{}_news_complete_dataset.json".format(data_dir, news_source, label) +# +# for news in get_news_articles(file_path): +# news_id = news["id"] +# if news_id in prop_graphs_ids_set: +# news_id_index = prop_graphs_ids.index(news_id) +# castillo_raw_data[news_id_index] = get_raw_feature_for_news(news) +# +# return castillo_raw_data +# +# +# def get_castillo_raw_data(data_dir, prop_graphs_dir, out_dir, news_source): +# fake_prop_graph, real_prop_graph = get_propagation_graphs(prop_graphs_dir, news_source) +# +# fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) +# +# create_dir(out_dir) +# +# fake_castillo_raw_data = get_castillo_data(data_dir, fake_prop_graph, news_source, "fake") +# real_castillo_raw_data = get_castillo_data(data_dir, real_prop_graph, news_source, "real") +# +# pickle.dump(fake_castillo_raw_data, open("{}/{}_fake_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) +# pickle.dump(real_castillo_raw_data, open("{}/{}_real_castillo_raw_data.pkl".format(out_dir, news_source), "wb")) +# +# +# def get_castillo_feature_array(news_source, castillo_feature_dir): +# file_path = "{}/{}_real_castillo_raw_data.pkl".format(castillo_feature_dir, news_source) +# file_obj = Path(file_path) +# +# if file_obj.exists(): +# return pickle.load(open(file_path, "wb")) +# +# return None +# +# +# +# if __name__ == '__main__': +# config = load_configuration("project.config") +# db = get_database_connection(config) +# news_source = "politifact" +# +# for news_source in ["politifact", "gossipcop"]: +# # get_castillo_raw_data("data/engagement_data_latest", "data/saved_new_no_filter", "data/castillo/raw_data", +# # news_source) +# # +# # print("Raw data dumped", flush=True) +# +# dump_castillo_features(db, news_source, "data/castillo/raw_data", "data/castillo/saved_features", +# "data/saved_new_no_filter") +# +# print("Castillo features for {} dumped".format(news_source), flush=True) diff --git a/construct_sample_features.py b/construct_sample_features.py index 2fd4981..027940e 100644 --- a/construct_sample_features.py +++ b/construct_sample_features.py @@ -9,7 +9,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split -from analysis_util import get_propagation_graphs, equal_samples +from 
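One detail worth flagging in the retired get_castillo_feature_array helper above: it calls pickle.load on a file opened with mode "wb", which truncates the file instead of reading it. A minimal sketch of the intended read/write pairing, with an illustrative path:

# Sketch: pickles are written with "wb" and read with "rb";
# opening with "wb" before pickle.load truncates the file instead of reading it.
import pickle
from pathlib import Path

cache_path = Path("politifact_real_castillo_raw_data.pkl")  # illustrative path

with cache_path.open("wb") as f:          # write mode for dumping
    pickle.dump({"example": [1, 2, 3]}, f)

if cache_path.exists():
    with cache_path.open("rb") as f:      # read mode for loading
        print(pickle.load(f))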
analysis_util import get_propagation_graphs, equal_samples, remove_prop_graph_noise, get_noise_news_ids +from data_processing.save_dataset import load_from_nx_graphs from linguistic_analysis import get_all_linguistic_features, LinguisticFeatureHelper from structure_temp_analysis import get_all_structural_features, StructureFeatureHelper, ScienceCascadeFeatureHelper, \ get_first_post_time @@ -33,7 +34,7 @@ def get_dataset(news_source, load_dataset=False, micro_features=True, macro_feat target_labels = pickle.load(open("{}_target_labels.pkl".format(news_source), "rb")) else: - fake_prop_graph, real_prop_graph = get_propagation_graphs(news_source) + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs(news_source) fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) print("fake samples len : {} real samples len : {}".format(len(fake_prop_graph), len(real_prop_graph))) @@ -92,19 +93,20 @@ def get_dataset_file_name(file_dir, news_source, include_micro=True, include_mac def get_TPNF_dataset(out_dir, news_source, include_micro=True, include_macro=True, include_structural=None, include_temporal=None, - include_linguistic=None): + include_linguistic=None, time_interval=None, use_cache=False): file_name = get_dataset_file_name(out_dir, news_source, include_micro, include_macro, include_structural, include_temporal, include_linguistic) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) else: fake_sample_features, real_sample_features = get_dataset_feature_array(news_source, include_micro, include_macro, include_structural, - include_temporal, include_linguistic) + include_temporal, include_linguistic, + time_interval) sample_features = np.concatenate([fake_sample_features, real_sample_features], axis=0) pickle.dump(sample_features, open(file_name, "wb")) @@ -138,11 +140,11 @@ def get_dataset_feature_names(include_micro=True, include_macro=True, include_st return feature_names_all, short_feature_names_all -def is_valid_graph(prop_graph: tweet_node): +def is_valid_graph(prop_graph: tweet_node, retweet = True, reply = True): """ Check if the prop graph has alteast one retweet or reply""" for post_node in prop_graph.children: - if len(post_node.reply_children) > 0 or len(post_node.retweet_children) > 0: + if (retweet and len(post_node.reply_children) > 0) or (reply and len(post_node.retweet_children) > 0): return True return False @@ -190,10 +192,29 @@ def filter_propagation_graphs(graphs, limit_time): return result_graphs +def get_nx_propagation_graphs(data_folder, news_source): + fake_propagation_graphs = load_from_nx_graphs(data_folder, news_source, "fake") + real_propagation_graphs = load_from_nx_graphs(data_folder, news_source, "real") + + print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + + # fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids()) + # real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids()) + + # fake_news_ids = [graph.news_id for graph in fake_propagation_graphs] + # real_news_ids = [graph.news_id for graph in real_propagation_graphs] + + print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("After filtering no. 
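In the is_valid_graph change above, the new retweet flag gates reply_children while the reply flag gates retweet_children, so the two lists appear to be crossed. Below is a sketch of the check with each flag paired to its matching child list; the tweet_node class here is a minimal stand-in for util.util.tweet_node:

# Sketch: a cascade counts as valid if it has at least one retweet (when retweet=True)
# or at least one reply (when reply=True) under any of its immediate post nodes.
class tweet_node:
    def __init__(self):
        self.children = []
        self.retweet_children = []
        self.reply_children = []


def is_valid_graph(prop_graph, retweet=True, reply=True):
    """Check if the propagation graph has at least one retweet or reply."""
    for post_node in prop_graph.children:
        if (retweet and len(post_node.retweet_children) > 0) or \
                (reply and len(post_node.reply_children) > 0):
            return True
    return False


root, post = tweet_node(), tweet_node()
root.children.append(post)
post.reply_children.append(tweet_node())
print(is_valid_graph(root, retweet=False, reply=True))   # True: it has a reply
print(is_valid_graph(root, retweet=True, reply=False))   # False: no retweets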
of REAL prop graphs: {}".format(len(real_propagation_graphs))) + print(flush=True) + + return fake_propagation_graphs, real_propagation_graphs + def get_dataset_feature_array(news_source, include_micro=True, include_macro=True, include_structural=None, include_temporal=None, - include_linguistic=None): - fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + include_linguistic=None, time_inteval = None): + fake_prop_graph, real_prop_graph = get_nx_propagation_graphs("data/nx_network_data", news_source) fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) @@ -222,6 +243,8 @@ def get_dataset_feature_array(news_source, include_micro=True, include_macro=Tru macro_features=include_macro, news_source=news_source, label="real") + feature_names = feature_helper.get_feature_names(micro_features= include_micro, macro_features= include_macro) + print(feature_names) if fake_features is not None and real_features is not None: fake_feature_all.append(fake_features) real_feature_all.append(real_features) diff --git a/data_processing/data_process.py b/data_processing/data_process.py new file mode 100644 index 0000000..b843a86 --- /dev/null +++ b/data_processing/data_process.py @@ -0,0 +1,912 @@ +import errno +import os +import pickle +import queue +import time +from math import ceil +from pathlib import Path + +import networkx as nx +import numpy as np +import scipy.sparse as sp +from gensim.models import KeyedVectors +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from tqdm import tqdm + +from twitter_tokenize import twitter_tokenize +from util.util import tweet_node + + +def construct_networkx_graph(graph: tweet_node, network_type): + G = nx.DiGraph() + + tweet_id_node_id_dict = dict() + + G.add_node(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict)) + + if network_type == "retweet": + for node in graph.retweet_children: + add_networkxx_retweet_data(G, node, tweet_id_node_id_dict) + G.add_edge(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + else: + for node in graph.reply_children: + add_network_reply_data(G, node, tweet_id_node_id_dict) + G.add_edge(get_tweet_id_node_id_mapping(graph.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + + return G, tweet_id_node_id_dict + + +def get_tweet_id_node_id_mapping(tweet_id, tweet_id_node_id_dict): + if tweet_id not in tweet_id_node_id_dict: + tweet_id_node_id_dict[tweet_id] = len(tweet_id_node_id_dict) + + return tweet_id_node_id_dict[tweet_id] + + +def add_networkxx_retweet_data(nx_graph: nx.DiGraph, node: tweet_node, tweet_id_node_id_dict: dict): + nx_graph.add_node(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict)) + + for child in node.retweet_children: + add_networkxx_retweet_data(nx_graph, child, tweet_id_node_id_dict) + nx_graph.add_edge(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict), + get_tweet_id_node_id_mapping(child.tweet_id, tweet_id_node_id_dict)) + + +def add_network_reply_data(nx_graph: nx.DiGraph, node: tweet_node, tweet_id_node_id_dict: dict): + nx_graph.add_node(node.tweet_id) + + for child in node.reply_children: + add_network_reply_data(nx_graph, child, tweet_id_node_id_dict) + nx_graph.add_edge(get_tweet_id_node_id_mapping(node.tweet_id, tweet_id_node_id_dict), + 
get_tweet_id_node_id_mapping(child.tweet_id, tweet_id_node_id_dict)) + + +def get_noise_news_ids(): + with open("data/news_id_ignore_list") as file: + lines = file.readlines() + return [line.strip() for line in lines] + + +def get_propagation_graphs(data_folder, news_source): + fake_propagation_graphs = load_prop_graph(data_folder, news_source, "fake") + # fake_propagation_graphs = [] + real_propagation_graphs = load_prop_graph(data_folder, news_source, "real") + + print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + + fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids()) + real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids()) + + print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs))) + print("After filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs))) + print(flush=True) + + return fake_propagation_graphs, real_propagation_graphs + + +def load_prop_graph(data_folder, news_source, news_label): + news_graphs = pickle.load(open("{}/{}_{}_news_prop_graphs.pkl".format(data_folder, news_source, news_label), "rb")) + return news_graphs + + +def remove_prop_graph_noise(news_graphs, noise_ids): + noise_ids = set(noise_ids) + return [graph for graph in news_graphs if graph.tweet_id not in noise_ids] + + +def sort_tweet_node_object_by_created_time(tweet_nodes: list): + tweet_nodes.sort(key=lambda x: x.created_time) + + return tweet_nodes + + +def equal_samples(sample1, sample2): + target_len = min(len(sample1), len(sample2)) + + np.random.seed(0) + + np.random.shuffle(sample1) + np.random.shuffle(sample2) + + return sample1[:target_len], sample2[:target_len] + + +def filter_propagation_graphs(graphs, limit_time, retweet=True, reply=True): + result_graphs = [] + + for prop_graph in graphs: + filtered_prop_graph = remove_node_by_time(prop_graph, limit_time) + if is_valid_graph(filtered_prop_graph, retweet, reply): + result_graphs.append(filtered_prop_graph) + + return result_graphs + + +def is_valid_graph(prop_graph: tweet_node, retweet=True, reply=True): + """ Check if the prop graph has at least one retweet or reply""" + + for post_node in prop_graph.children: + if (retweet and len(post_node.retweet_children) > 0) or (reply and len(post_node.reply_children) > 0): + return True + + return False + + +def get_first_post_time(node: tweet_node): + first_post_time = time.time() + + for child in node.children: + first_post_time = min(first_post_time, child.created_time) + + return first_post_time + + +def remove_node_by_time(graph: tweet_node, limit_time): + start_time = get_first_post_time(graph) + end_time = start_time + limit_time + + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.children + + retweet_children = set(node.retweet_children) + reply_children = set(node.reply_children) + + for child in children.copy(): + + if child.created_time <= end_time: + q.put(child) + else: + node.children.remove(child) + try: + retweet_children.remove(child) + except KeyError: # Element not found in the set + pass + try: + reply_children.remove(child) + except KeyError: # Element not found in the set + pass + + node.retweet_children = list(retweet_children) + node.reply_children = list(reply_children) + + return graph + + +def get_all_propagation_graphs(news_source="politifact", 
time_interval=None, args=None): + if Path.is_file(Path("data/{}_graphs_data.pkl".format(news_source))): + graph_data = pickle.load(open("data/{}_graphs_data.pkl".format(news_source), "rb")) + return graph_data + + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/prop_graph_save", news_source) + + # fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + # fake_prop_graph = fake_prop_graph[:100] + # real_prop_graph = real_prop_graph[:100] + + if time_interval is not None: + time_limit = time_interval * 60 * 60 + + print("Time limit in seconds : {}".format(time_limit)) + + fake_prop_graph = filter_propagation_graphs(fake_prop_graph, time_limit, reply=False) + real_prop_graph = filter_propagation_graphs(real_prop_graph, time_limit, reply=False) + + print("After time based filtering ") + print("No. of fake samples : {} No. of real samples: {}".format(len(fake_prop_graph), len(real_prop_graph))) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + all_network_xx_graphs = [] + all_tweet_id_node_ids_dicts = [] + all_tweet_id_text_dict = dict() + one_hot_labels = [] + + labels = [] + + max_num_nodes = 0 + + graph_hidden_states = [] + + news_article_text_contents = [] + + for graph in fake_prop_graph: + get_textual_features(graph, all_tweet_id_text_dict) + news_article_text_contents.append(graph.text) + # TODO: Uncomment after dumping time series data - prune graphs for network generation + # graph = prune_graph_by_max_nodes_time(graph, args.max_num_node) + graph, sample_tweet_id_node_id_dict = construct_networkx_graph(graph, "retweet") + all_network_xx_graphs.append(graph) + max_num_nodes = max(max_num_nodes, nx.number_of_nodes(graph)) + all_tweet_id_node_ids_dicts.append(sample_tweet_id_node_id_dict) + one_hot_labels.append([1, 0]) + labels.append(1) + + for graph in real_prop_graph: + get_textual_features(graph, all_tweet_id_text_dict) + news_article_text_contents.append(graph.text) + # TODO: Uncomment after dumping time series data - prune graphs for network generation + # graph = prune_graph_by_max_nodes_time(graph, args.max_num_node) + graph, sample_tweet_id_node_id_dict = construct_networkx_graph(graph, "retweet") + all_network_xx_graphs.append(graph) + max_num_nodes = max(max_num_nodes, nx.number_of_nodes(graph)) + all_tweet_id_node_ids_dicts.append(sample_tweet_id_node_id_dict) + one_hot_labels.append([0, 1]) + labels.append(0) + + print("max number of nodes : {}".format(max_num_nodes)) + + # TODO: Construct hidden state of the network using Glove embedding + + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.200d.w2vformat.txt" + + model_path = "data/glove.twitter.27B.200d.w2vformat.txt" + + glove_model = get_gensim_model(model_path) + + for news_article in news_article_text_contents: + news_feature = get_tweet_latent_embeddings(news_article, glove_model) + news_feature = np.expand_dims(np.array(news_feature), axis=1).transpose() + graph_hidden_states.append(news_feature) + + # return all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts + # return all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, \ + # np.concatenate(graph_hidden_states) + + graph_data = [all_network_xx_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, 
np.concatenate(graph_hidden_states)] + + pickle.dump(graph_data, open("data/{}_graphs_data.pkl".format(news_source), "wb")) + + return graph_data + + +def get_textual_features(graph: tweet_node, tweet_id_text_dict): + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + tweet_id_text_dict[node.tweet_id] = node.text + for child in node.retweet_children: + q.put(child) + + +def nodes_stats(all_network_xx_graphs): + node_sizes = [] + + for graph in all_network_xx_graphs: + node_sizes.append(nx.number_of_nodes(graph)) + + print("Min : {}".format(min(node_sizes))) + print("Max : {}".format(max(node_sizes))) + print("Mean : {}".format(np.mean(node_sizes))) + print("STD: {} ".format(np.std(node_sizes))) + print("Total nodes : {}".format(np.sum(node_sizes))) + + +def filter_graphs(all_network_xx_graphs, max_nodes): + graphs = [] + for graph in all_network_xx_graphs: + nodes_count = nx.number_of_nodes(graph) + if nodes_count <= max_nodes: + graphs.append(graph) + + return graphs + + +def get_nodes_count(node: tweet_node, edge_type="retweet"): + if node is None: + return 0 + + node_count = 0 + + if edge_type == "retweet": + children = node.retweet_children + elif edge_type == "reply": + children = node.reply_children + else: + children = node.children + + for child in children: + node_count += get_nodes_count(child, edge_type) + + return node_count + 1 + + +def get_K_node_time(graph, max_nodes): + node_creation_times = [] + + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.retweet_children + + for child in children: + q.put(child) + node_creation_times.append(child.created_time) + + node_creation_times.sort() + + return node_creation_times[max_nodes - 1] + + +def prune_graph_by_max_nodes_time(graph, max_nodes): + if get_nodes_count(graph) < max_nodes: + return graph + + node_k_time = get_K_node_time(graph, max_nodes) + + return remove_node_by_end_time(graph, node_k_time) + + +def remove_node_by_end_time(graph: tweet_node, end_time): + q = queue.Queue() + + q.put(graph) + + while q.qsize() != 0: + node = q.get() + + children = node.children + + for child in list(children): + + if child.created_time <= end_time: + q.put(child) + else: + node.children.remove(child) + try: + node.retweet_children.remove(child) + except ValueError: # Element not found in the list + pass + try: + node.reply_children.remove(child) + except ValueError: # Element not found in the list + pass + + return graph + + +def reverse_dict(tweet_id_node_id_dict): + node_id_tweet_id_dict = dict() + + for key, value in tweet_id_node_id_dict.items(): + node_id_tweet_id_dict[value] = key + + return node_id_tweet_id_dict + + +def get_batch_pooling_matrix(graphs): + node_sizes = [] + + for graph in graphs: + nx.nodes(graph) + node_sizes.append(nx.number_of_nodes(graph)) + + num_graphs = len(graphs) + num_nodes = np.sum(node_sizes) + + pooling_matrix = np.zeros((num_graphs, num_nodes)) + + start = 0 + + indexes = [] + + for idx, graph in enumerate(graphs): + indexes.append(start) + + start += len(nx.nodes(graph)) + + indexes.append(start) + + for idx in range(num_graphs): + pooling_matrix[idx, range(indexes[idx], indexes[idx + 1])] = (1 / (indexes[idx + 1] - indexes[idx])) + + return pooling_matrix + + +def get_overall_adjoint_matrix(graphs): + node_sizes = [] + + for graph in graphs: + nx.nodes(graph) + node_sizes.append(nx.number_of_nodes(graph)) + + num_graphs = len(graphs) + num_nodes = np.sum(node_sizes) + + print("num of nodes : {}".format(num_nodes)) + + 
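+ # Note: the nested Python list built just below materialises a dense num_nodes x num_nodes matrix, which is quadratic in memory.
+ # A sparse alternative (only a sketch, assuming each graph's node ids are the consecutive integers 0..n-1 produced by get_tweet_id_node_id_mapping) could be:
+ #     adj_matrix = sp.block_diag([nx.to_scipy_sparse_matrix(g) for g in graphs]).tocoo()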
adj_matrix = [[0 for i in range(num_nodes)] for k in range(num_nodes)] + + start = 0 + + indexes = [] + + for idx, graph in tqdm(enumerate(graphs)): + edges = nx.to_edgelist(graph) + indexes.append(start) + for edge in tqdm(edges): + u = edge[0] + v = edge[1] + + u += start + v += start + + adj_matrix[u][v] = 1 + + start += len(nx.nodes(graph)) + + adj_matrix = np.matrix(adj_matrix) + adj_matrix = sp.coo_matrix(adj_matrix) + + # sp.save_npz("politifact_adj_matrix_basic", adj_matrix) + return adj_matrix + + +def get_all_documents(news_source, all_tweet_id_text_dict): + tweet_ids = [] + documents = [] + + for tweet_id, text in all_tweet_id_text_dict.items(): + tweet_ids.append(tweet_id) + + if str(news_source) in str(tweet_id): + documents.append(" ") + print("Root node tweet id : {}".format(tweet_id)) + else: + documents.append(text) + + vectorizer = TfidfVectorizer(max_features=2000, stop_words="english") + vectorizer.fit(documents) + + pickle.dump(vectorizer, open("{}_doc_vectorizer.pkl".format(news_source), "wb")) + + transformed_docs = vectorizer.transform(documents).todense() + + from sklearn.decomposition import PCA + + pca = PCA(n_components=10) + transformed_docs = pca.fit_transform(transformed_docs) + + print("tranformed docs ", transformed_docs.shape) + + single_node_embeddings = transformed_docs[0, :].transpose() + + print("single doc transofmred doc", single_node_embeddings.shape, flush=True) + + all_tweet_id_text_dict = dict() + + for idx in range(transformed_docs.shape[0]): + all_tweet_id_text_dict[tweet_ids[idx]] = transformed_docs[idx, :] + + return all_tweet_id_text_dict + + +def get_all_documents_glove_embeddings(news_source, all_tweet_id_text_dict): + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.100d.w2vformat.txt" + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.25d.w2vformat.txt" + + # model_path = "/home/dmahudes/temporal_event_analysis/pre_train_model/glove.twitter.27B.200d.w2vformat.txt" + model_path = "data/glove.twitter.27B.200d.w2vformat.txt" + + glove_model = get_gensim_model(model_path) + + tweet_id_embddings_dict = dict() + + for tweet_id, text in tqdm(all_tweet_id_text_dict.items()): + if str(news_source) in str(tweet_id): + tweet_id_embddings_dict[tweet_id] = np.zeros((200,)) + print("root tweet id : {}".format(tweet_id), flush=True) + + else: + tweet_id_embddings_dict[tweet_id] = get_tweet_latent_embeddings(text, glove_model) + + pickle.dump(tweet_id_embddings_dict, open("{}_tweet_id_glove_embeddings_dict.pkl".format(news_source), "wb")) + + # vectorizer = TfidfVectorizer(max_features=5000, stop_words="english") + # vectorizer.fit(documents) + + # pickle.dump(vectorizer, open("{}_doc_vectorizer.pkl".format(news_source), "wb")) + + # transformed_docs = vectorizer.transform(documents).todense() + + # print("tranformed docs ", transformed_docs.shape) + # + # single_node_embeddings = transformed_docs[0, :].transpose() + # + # print("single doc transofmred doc", single_node_embeddings.shape, flush=True) + # + # all_tweet_id_text_dict = dict() + + # for idx in range(transformed_docs.shape[0]): + # all_tweet_id_text_dict[tweet_ids[idx]] = transformed_docs[idx, :] + + return tweet_id_embddings_dict + + +def get_feature_matrix(graphs, tweet_id_feature_dict, graph_tweet_id_node_id_dicts): + node_features = [] + + for idx in range(len(graphs)): + graph = graphs[idx] + tweet_id_node_id_dict = graph_tweet_id_node_id_dicts[idx] + node_id_tweet_id_dict = 
reverse_dict(tweet_id_node_id_dict) + for node_id in nx.nodes(graph): + # print("node id", node_id) + tweet_id = node_id_tweet_id_dict[node_id] + tweet_feature = np.array(tweet_id_feature_dict[tweet_id]).transpose() + # print("tweet feature ", tweet_feature.shape) + # node_features.append(tweet_feature) + node_features.append(np.expand_dims(np.array(tweet_id_feature_dict[tweet_id]), axis=1).transpose()) + # print("no. of nodes : {}".format(len(node_features))) + # return sp.csr_matrix(np.concatenate(node_features, axis=1).transpose()) + + return sp.csr_matrix(np.concatenate(node_features)) + + +def get_glove_feature_matrix(graphs, tweet_id_feature_dict, graph_tweet_id_node_id_dicts): + node_features = [] + + for idx in range(len(graphs)): + graph = graphs[idx] + tweet_id_node_id_dict = graph_tweet_id_node_id_dicts[idx] + node_id_tweet_id_dict = reverse_dict(tweet_id_node_id_dict) + for node_id in nx.nodes(graph): + # print("node id", node_id) + tweet_id = node_id_tweet_id_dict[node_id] + tweet_feature = np.expand_dims(np.array(tweet_id_feature_dict[tweet_id]), axis=1).transpose() + if len(tweet_feature.shape) > 1: + if tweet_feature.shape[0] != 1 or tweet_feature.shape[1] != 200: + print("tweet feature : ", tweet_feature.shape) + else: + tweet_feature = np.zeros((1, 200)) + print(tweet_feature.shape) + + node_features.append(tweet_feature) + + # print("no. of nodes : {}".format(len(node_features))) + + print("batch_embedding_size before concat ", len(node_features), node_features[0].shape) + + return sp.csr_matrix(np.concatenate(node_features)) + + +def create_dir(dir_name): + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name) + except OSError as exc: # Guard against race condition + if exc.errno != errno.EEXIST: + raise + + +# converts matrices into tuples +def to_tuple(mat): + if not sp.isspmatrix_coo(mat): + mat = mat.tocoo() + idxs = np.vstack((mat.row, mat.col)).transpose() + values = mat.data + shape = mat.shape + return idxs, values, shape + + +# converts sparse matrices into tuples +def sparse_to_tuple(sparse_mat): + if isinstance(sparse_mat, list): + for i in range(len(sparse_mat)): + sparse_mat[i] = to_tuple(sparse_mat[i]) + else: + sparse_mat = to_tuple(sparse_mat) + return sparse_mat + + +# row-normalizes the feature matrix and converts it into a tuple +def process_features(features: object) -> object: + features /= features.sum(1).reshape(-1, 1) + features[np.isnan(features) | np.isinf(features)] = 0 # needed for the global node features, which are all zeros. 
+ return sparse_to_tuple(sp.csr_matrix(features)) + + +# renormalization trick for the adjacency matrix +def normalize_adj(adj, symmetric=True): + if symmetric: + d = sp.diags(np.power(np.array(adj.sum(1)), -0.5).flatten(), 0) + a_norm = adj.dot(d).transpose().dot(d).tocsr() + else: + d = sp.diags(np.power(np.array(adj.sum(1)), -1.0).flatten(), 0) + a_norm = d.dot(adj).tocsr() + return sp.csr_matrix(a_norm) + + +# normalizes the adjacency matrix and converts it into a tuple +def preprocess_adj(adj, is_gcn, symmetric=True): + if is_gcn: + adj = adj + sp.eye(adj.shape[0]) # every node gets itself as a neighbour (self-loop), as required by GCN + adj = normalize_adj(adj, symmetric) + return sparse_to_tuple(adj) + + +def get_input_for_batches(news_source, batch_size, time_interval, input_dim): + all_network_x_graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts = get_all_propagation_graphs( + news_source, time_interval) + + # all_tweet_id_text_dict = get_all_documents(news_source, all_tweet_id_text_dict) + + # tweet_id_embeddings_dict = all_tweet_id_text_dict + + tweet_id_embeddings_dict = get_all_documents_glove_embeddings(news_source, all_tweet_id_text_dict) + + # tweet_id_embeddings_dict = pickle.load(open("{}_tweet_id_glove_embeddings_dict.pkl".format(news_source), "rb")) + + print("all_network_x_graphs count : {}".format(len(all_network_x_graphs))) + print("all_tweet_id_text_dict count : {}".format(len(one_hot_labels))) + print("all_tweet_id_node_ids_dicts count: {}".format(len(all_tweet_id_node_ids_dicts))) + + train_network_x_graphs, test_network_x_graphs, train_one_hot_labels, test_one_hot_labels, train_tweet_id_node_ids_dicts, test_tweet_id_node_ids_dicts = train_test_split( + all_network_x_graphs, one_hot_labels, all_tweet_id_node_ids_dicts, stratify=labels, + test_size=0.2, random_state=42) + + # all_network_x_graphs = train_network_x_graphs + # labels = train_one_hot_labels + # all_tweet_id_node_ids_dicts = train_tweet_id_node_ids_dicts + + dump_batch_inputs(batch_size, news_source, "train", train_network_x_graphs, train_tweet_id_node_ids_dicts, + train_one_hot_labels, tweet_id_embeddings_dict, time_interval, input_dim) + + dump_batch_inputs(batch_size, news_source, "test", test_network_x_graphs, test_tweet_id_node_ids_dicts, + test_one_hot_labels, tweet_id_embeddings_dict, time_interval, input_dim) + + # dump_glove_feature_batch_embeddings(batch_size, news_source, "train", train_network_x_graphs, + # train_tweet_id_node_ids_dicts, train_one_hot_labels, tweet_id_embeddings_dict, time_interval) + # dump_glove_feature_batch_embeddings(batch_size, news_source, "test", test_network_x_graphs, + # test_tweet_id_node_ids_dicts, + # test_one_hot_labels, tweet_id_embeddings_dict, time_interval) + + +def dump_glove_feature_batch_embeddings(batch_size, news_source, split_label, all_network_x_graphs, + all_tweet_id_node_ids_dicts, labels, + all_tweet_id_text_dict, time_interval): + data_dir = "data/time_batch_data" + create_dir(data_dir) + + data_dir = "{}/batch_{}".format(data_dir, time_interval) + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, news_source) + + create_dir(data_dir) + + data_dir = "{}/glove_feat".format(data_dir) + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, split_label) + create_dir(data_dir) + + num_samples = len(labels) + + num_batches = int(ceil(num_samples / batch_size)) + + for idx in tqdm(range(num_batches)): + start_idx = idx * batch_size + end_idx = start_idx + batch_size + + batch_graphs = 
all_network_x_graphs[start_idx: end_idx] + batch_labels = labels[start_idx: end_idx] + batch_mapping_dicts = all_tweet_id_node_ids_dicts[start_idx: end_idx] + + batch_node_features = get_glove_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + # print("node feature matrix shape ", batch_node_features.shape) + + batch_node_features = process_features(batch_node_features) + + batch_input = [batch_node_features] + # batch_inputs.append(batch_input) + + pickle.dump(batch_input, open("{}/batch_{}.pkl".format(data_dir, idx), "wb")) + + +def dump_batch_inputs(batch_size, news_source, split_label, all_network_x_graphs, all_tweet_id_node_ids_dicts, labels, + all_tweet_id_text_dict, time_interval, input_dim): + data_dir = "data/time_batch_data" + create_dir(data_dir) + + data_dir = "{}/batch_{}".format(data_dir, time_interval) + create_dir(data_dir) + + data_dir = "{}/{}_{}".format(data_dir, news_source, input_dim) + + create_dir(data_dir) + + data_dir = "{}/{}".format(data_dir, split_label) + create_dir(data_dir) + + batch_inputs = [] + + num_samples = len(labels) + + num_batches = int(ceil(num_samples / batch_size)) + + for idx in tqdm(range(num_batches)): + start_idx = idx * batch_size + end_idx = start_idx + batch_size + + batch_graphs = all_network_x_graphs[start_idx: end_idx] + batch_labels = labels[start_idx: end_idx] + batch_mapping_dicts = all_tweet_id_node_ids_dicts[start_idx: end_idx] + + batch_adj_matrix = get_overall_adjoint_matrix(batch_graphs) + batch_pooling_matrix = get_batch_pooling_matrix(batch_graphs) + + batch_adj_matrix = preprocess_adj(batch_adj_matrix, True, False) + + # batch_node_features = get_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + batch_node_features = get_glove_feature_matrix(batch_graphs, all_tweet_id_text_dict, batch_mapping_dicts) + + print("node feature matrix shape ", batch_node_features.shape) + + batch_node_features = process_features(batch_node_features) + + batch_input = [batch_adj_matrix, batch_node_features, batch_labels, batch_pooling_matrix] + # batch_inputs.append(batch_input) + + pickle.dump(batch_input, open("{}/batch_{}.pkl".format(data_dir, idx), "wb")) + + # pickle.dump(batch_inputs, open("{}_batched_inputs.pkl".format(news_source), "wb")) + # return batch_inputs + + +def get_gensim_model(model_path): + model = KeyedVectors.load_word2vec_format(model_path, binary=False) + + return model + + +def get_tweet_latent_embeddings(text_contents, model): + word_embeddings = [] + + tokens = twitter_tokenize(text_contents) + + for token in tokens.split(): + try: + word_embeddings.append(model[token]) + except: + pass + + if len(word_embeddings) > 0: + try: + return np.mean(word_embeddings, axis=0) + except: + return np.zeros((200,)) + + return np.zeros((200,)) + + +def analyze_dataset(news_source): + graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts = get_all_propagation_graphs( + news_source) + + # graphs = filter_graphs(graphs, 1500) + + graph_sizes = [] + + for graph in graphs: + graph_sizes.append(nx.number_of_nodes(graph)) + + import matplotlib + matplotlib.use('agg') + import matplotlib.pyplot as plt + + plt.hist(graph_sizes, normed=True, bins=30) + + plt.savefig("figures/{}_graph_distribution.png".format(news_source)) + + +def get_random_bfs_sequence(G): + start_id = 0 + dictionary = dict(nx.bfs_successors(G, start_id)) + start = [start_id] + + max_prev_nodes = 0 + + while len(start) > 0: + next = [] + + while len(start) > 0: + current = start.pop(0) + 
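+ # 'dictionary' maps each node to its BFS successors; 'start' is the current frontier and 'next' collects the following level, so max_prev_nodes ends up as the maximum BFS breadth seen.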
neighbor = dictionary.get(current) + + if neighbor is not None: + next = next + neighbor + + max_prev_nodes = max(max_prev_nodes, len(next)) + + start = next + + # print("max previous nodes : {}".format(max_prev_nodes)) + return max_prev_nodes + + +if __name__ == "__main__": + news_source = "gossipcop" + + news_source = "politifact" + + # analyze_dataset(news_source) + + # time_intervals = [12, 24, 36, 48, 60, 72, 84, 96] + + # time_intervals = [12, 24, 36, 48, 60, 72, 84, 96] + + # time_intervals = [12, 24, 36,48, 60, 72, 84, 96] + # + input_dim = 200 + # + time_intervals = [3, 6] + # + # # time_intervals = [None] + # + for time_interval in time_intervals: + print("=============Time Interval : {} ==========".format(time_interval)) + start_time = time.time() + # get_classificaton_results_tpnf("data/train_test_data", "politifact", time_interval) + # get_classificaton_results_tpnf("data/train_test_data", "gossipcop", time_interval) + get_input_for_batches(news_source, 8, time_interval, input_dim) + + print("\n\n================Exectuion time - {} ==================================\n".format( + time.time() - start_time)) + + # graphs, all_tweet_id_text_dict, one_hot_labels, labels, all_tweet_id_node_ids_dicts, hidden_state = get_all_propagation_graphs( + # news_source="gossipcop", args = Args()) + + # max_nodes = 5000 + + # graphs = filter_graphs(graphs, max_nodes) + + # max_breadths = [] + # + # for graph in graphs: + # max_breadths.append(get_random_bfs_sequence(graph)) + # + # print("Mean : {}".format(np.mean(max_breadths))) + # print("Max : {}".format(max(max_breadths))) + # print("Min : {}".format(min(max_breadths))) + # print(np.histogram(max_breadths)) + + exit(1) + + all_network_x_graphs, all_tweet_id_text_dict, labels = get_all_propagation_graphs(news_source) + + all_tweet_id_text_dict = get_all_documents(news_source, all_tweet_id_text_dict) + + pickle.dump(all_network_x_graphs, open("{}_all_networkx_graphs.pkl".format(news_source), "wb")) + pickle.dump(all_tweet_id_text_dict, open("{}_graphs_text_dict.pkl".format(news_source), "wb")) + pickle.dump(labels, open("{}_labels.pkl".format(news_source), "wb")) + + # all_network_xx_graphs = filter_graphs(all_network_x_graphs, 2000) + + # adj_matrix = get_overall_adjoint_matrix(all_network_x_graphs) + + # nodes_stats(all_network_xx_graphs) + # + # print(len(all_network_xx_graphs)) diff --git a/elmo_feature_extraction.py b/elmo_feature_extraction.py deleted file mode 100644 index 391fa38..0000000 --- a/elmo_feature_extraction.py +++ /dev/null @@ -1,117 +0,0 @@ -import pickle - -import numpy as np - -from allennlp.modules.elmo import Elmo, batch_to_ids -import torch -from nltk import TweetTokenizer -from torch.autograd import Variable - - -def get_batches(batch_size, params): - total_len = len(params) - for batch_i in range(int(np.ceil(total_len / batch_size))): - start_i = batch_i * batch_size - - yield params[start_i:start_i + batch_size] - - -def get_elmo_sentence_embeddings(documents): - options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" - weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" - - elmo = Elmo(options_file, weight_file, 1, dropout=0) - - # use batch_to_ids to convert sentences to character ids - # sentences = [['First', 'sentence', '.'], ['Another', '.']] - - batches_lat_embeddings = [] - - device = torch.device('cuda') - - batch_size = 
128 - - elmo = elmo.to(device) - - for batch_idx, doc_batch in enumerate(get_batches(batch_size, documents)): - character_ids = batch_to_ids(doc_batch) - character_ids = character_ids.to(device) - # - - embeddings = elmo(character_ids) - - # sentence_embeddings = torch.sum(embeddings['elmo_representations'][0], dim=1) - layer_1_rep = get_weights_from_layers(masks=embeddings["mask"], elmo_rep=embeddings['elmo_representations'][0]) - - batches_lat_embeddings.append(layer_1_rep) - - print("batch idx : {} completed...".format(batch_idx), flush=True) - - return np.concatenate(batches_lat_embeddings, axis=0) - - # layer_2_rep = get_weights_from_layers(masks = embeddings["mask"],elmo_rep = embeddings['elmo_representations'][1]) - # - # return np.concatenate([layer_1_rep, layer_2_rep], axis=1) - - -def get_weights_from_layers(masks, elmo_rep): - batch_size = masks.shape[0] - max_seq_len = masks.shape[1] - - # masks = masks.unsqueeze(1) - - elmo_rep = elmo_rep.view(elmo_rep.shape[0] * elmo_rep.shape[1], 1024) - - # mask_weighted_rep = torch.matmul(masks, elmo_rep) - masks = masks.view(masks.shape[0] * masks.shape[1]) - - masks = masks.view(-1, 1).repeat(1, 1024) - # mask_weighted_rep = torch.matmul(masks.float(), elmo_rep) - - mask_weighted_rep = masks.float() * elmo_rep - mask_weighted_rep = mask_weighted_rep.view(batch_size, max_seq_len, 1024) - - sentence_embeddings = torch.sum(mask_weighted_rep, dim=1) - - return Variable(sentence_embeddings).data.cpu().numpy() - - -def dump_elmo_features(data_dir, news_source, label, out_dir): - reply_id_content_dict = pickle.load( - open("{}/{}_{}_reply_id_content_dict.pkl".format(data_dir, news_source, label), "rb")) - - reply_contents = [] - - reply_arr_idx_dict = dict() - - idx = 0 - - tokenizer = TweetTokenizer(strip_handles=True) - for reply_id, content in reply_id_content_dict.items(): - reply_arr_idx_dict[reply_id] = idx - reply_contents.append(tokenizer.tokenize(content)) - idx += 1 - - sentence_lat_embeddings = get_elmo_sentence_embeddings(reply_contents) - - pickle.dump(sentence_lat_embeddings, - open("{}/{}_{}_elmo_lat_embeddings.pkl".format(out_dir, news_source, label), "wb")) - pickle.dump(reply_arr_idx_dict, - open("{}/{}_{}_reply_id_latent_mat_index.pkl".format(out_dir, news_source, label), "wb")) - - -if __name__ == "__main__": - # sentences = [['First', 'sentence', '.'], []] - # sentence_lat_embeddings = get_elmo_sentence_embeddings(sentences) - - # print("============ Dumping fake data ============") - # dump_elmo_features("data/pre_process_data", "politifact", "fake", "data/pre_process_data/elmo_features") - # - # print("============ Dumping real data ============") - # dump_elmo_features("data/pre_process_data", "politifact", "real", "data/pre_process_data/elmo_features") - - print("============ Dumping fake data ============") - dump_elmo_features("data/pre_process_data", "gossipcop", "fake", "data/pre_process_data/elmo_features") - - print("============ Dumping real data ============") - dump_elmo_features("data/pre_process_data", "gossipcop", "real", "data/pre_process_data/elmo_features") diff --git a/linguistic_analysis.py b/linguistic_analysis.py index b51843b..79bb401 100644 --- a/linguistic_analysis.py +++ b/linguistic_analysis.py @@ -15,15 +15,19 @@ from util.constants import REPLY_NODE, POST_NODE from util.util import tweet_node -all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" - .format("data/pre_process_data/vader_sentiment"), "rb")) +all_reply_id_sentiment_score_dict = dict() -def 
tweet_text_sentiment(reply_id): - if reply_id in all_reply_id_sentiment_score_dict: - return all_reply_id_sentiment_score_dict[reply_id]["compound"] - else: - return 0 +# +# all_reply_id_sentiment_score_dict = pickle.load(open("{}/all_reply_id_sentiment_result.pkl" +# .format("data/pre_process_data/vader_sentiment"), "rb")) + + +# def tweet_text_sentiment(reply_id): +# if reply_id in all_reply_id_sentiment_score_dict: +# return all_reply_id_sentiment_score_dict[reply_id]["compound"] +# else: +# return 0 # def tweet_text_sentiment(text): @@ -55,8 +59,8 @@ def get_first_reply_nodes_average_sentiment(prop_graph: tweet_node): q.put(child) if child.node_type == REPLY_NODE and node.node_type == POST_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(child.tweet_id)) + if child.sentiment: + reply_diff_values.append(child.sentiment) if len(reply_diff_values) == 0: return 0 @@ -76,8 +80,8 @@ def get_reply_nodes_average_sentiment(prop_graph: tweet_node): q.put(child) if node.node_type == REPLY_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(node.tweet_id)) + if node.sentiment: + reply_diff_values.append(node.sentiment) if len(reply_diff_values) == 0: return 0 @@ -147,8 +151,7 @@ def get_reply_nodes_sentiment_ratio(prop_graph: tweet_node): q.put(child) if node.node_type == REPLY_NODE: - if node.text: - reply_diff_values.append(tweet_text_sentiment(node.tweet_id)) + reply_diff_values.append(node.sentiment) if len(reply_diff_values) == 0: return 0 @@ -192,7 +195,7 @@ def get_all_linguistic_features(news_graphs, micro_features, macro_features): get_deepest_cascade_first_level_reply_sentiment] for function_reference in reply_function_references: - features_set = get_stats_for_features(news_graphs, function_reference, print=False, feature_name=None) + features_set = get_stats_for_features(news_graphs, function_reference, print=True, feature_name=None) all_features.append(features_set) return np.transpose(get_numpy_array(all_features)) @@ -236,8 +239,7 @@ def get_micro_feature_method_references(self): get_reply_nodes_average_sentiment, get_first_reply_nodes_average_sentiment, get_deepest_cascade_reply_nodes_avg_sentiment, - get_deepest_cascade_first_level_reply_sentiment, - get_supporting_opposing_replies_ratio] + get_deepest_cascade_first_level_reply_sentiment] return method_refs @@ -246,13 +248,12 @@ def get_micro_feature_method_names(self): "Average sentiment of all replies", "Average sentiment of first level replies", "Average sentiment of replies in deepest cascade", - "Average setiment of first level replies in deepest cascade", - "Supporting or opposing ratio"] + "Average setiment of first level replies in deepest cascade"] return feature_names def get_micro_feature_short_names(self): - feature_names = ["L1", "L2", "L3", "L4", "L5","L6"] + feature_names = ["L1", "L2", "L3", "L4", "L5", "L6"] return feature_names def get_macro_feature_method_references(self): @@ -271,13 +272,14 @@ def get_macro_feature_short_names(self): feature_names = [] return feature_names - def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, file_dir="data/train_test_data"): + def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, + file_dir="data/train_test_data", use_cache=False): function_refs = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): 
return pickle.load(open(file_name, "rb")) if micro_features: @@ -288,19 +290,21 @@ def get_features_array(self, prop_graphs, micro_features, macro_features, news_s all_features = [] - for idx in range(len(function_refs) - 1): + for idx in range(len(function_refs)): features_set = get_sample_feature_value(prop_graphs, function_refs[idx]) all_features.append(features_set) - all_features.append(get_feature_involving_additional_args(prop_graphs, function_refs[-1],news_source, label)) + # all_features.append(get_feature_involving_additional_args(prop_graphs, function_refs[-1], news_source, label)) feature_array = np.transpose(get_numpy_array(all_features)) + + # feature_array = feature_array[:, :-1] + pickle.dump(feature_array, open(file_name, "wb")) return feature_array - def get_feature_involving_additional_args(prop_graphs, function_reference, news_source, label): feature_values = [] for prop_graph in prop_graphs: diff --git a/misc_process.py b/misc_process.py index 6a619c0..e070f75 100644 --- a/misc_process.py +++ b/misc_process.py @@ -1,764 +1,764 @@ -import csv -import json -import mmap -import os -import pickle -import queue -import re -import shutil -import string -import sys -import traceback -from datetime import datetime -from pathlib import Path - -import datefinder -import requests -from bs4 import BeautifulSoup -from newspaper import Article -from pymongo import UpdateOne -from tqdm import tqdm -import newspaper - -from analysis_util import get_propagation_graphs -from baseline_feature_extraction import dump_LIWC_Representation -from pre_process_util import load_configuration, get_database_connection, get_news_articles -from util.constants import RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE -from util.util import tweet_node - - -def get_reply_of_replies(replies: list, result_dict: dict): - for reply in replies: - if reply: - if "engagement" in reply: - get_reply_of_replies(reply["engagement"]["tweet_replies"], result_dict) - - result_dict[reply["id"]] = reply["text"] - - -def get_web_archieve_results(search_url): - try: - archieve_url = "http://web.archive.org/cdx/search/cdx?url={}&output=json".format(search_url) - - response = requests.get(archieve_url) - response_json = json.loads(response.content) - - response_json = response_json[1:] - - return response_json - - except: - return None - - -def get_website_url_from_arhieve(url): - archieve_results = get_web_archieve_results(url) - if archieve_results: - modified_url = "https://web.archive.org/web/{}/{}".format(archieve_results[0][1], archieve_results[0][2]) - return modified_url - else: - return url - - -def dump_friends_file_as_lines(dataset_file, out_file): - pattern = re.compile(rb'{([^{}]+)}', - re.DOTALL | re.IGNORECASE | re.MULTILINE) - - with open(out_file, "w", 100) as out_file: - with open(dataset_file, 'r') as f: - with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: - for match in pattern.findall(m): - data = "{" + str(match.decode('utf-8')) + "}\n" - out_file.write(data) - - -def dump_social_network_to_db(db, folder): - friends_coll = db.twitter_user_friends_collection - - batch_update_list = [] - - files = os.listdir(folder) - user_names = set([file[:file.find(".csv")] for file in files]) - - print("actual files : {}".format(len(user_names)), flush=True) - - saved_user_names = set(friends_coll.distinct("user_name")) - print("saved user names : {}".format(len(saved_user_names)), flush=True) - - user_names = user_names.difference(saved_user_names) - - print("user names to be saved : 
{}".format(len(user_names)), flush=True) - - for idx, user_name in enumerate(user_names): - try: - friends_user_names = get_friends_names("{}/{}.csv".format(folder, user_name)) - - batch_update_list.append(UpdateOne({"user_name": user_name}, - {"$set": {"user_name": user_name, "friends_name": friends_user_names}}, - upsert=True)) - - if idx % 10000 == 0: - try: - friends_coll.bulk_write(batch_update_list, ordered=False) - except: - print("Exception") - traceback.print_exc(file=sys.stdout) - - batch_update_list = [] - print("bulk update {}".format(idx), flush=True) - - except Exception as ex: - print("Exception in file : {}/{} : {}".format(folder, user_name, str(ex))) - traceback.print_exc(file=sys.stdout) - - if len(batch_update_list) > 0: - friends_coll.bulk_write(batch_update_list, ordered=False) - print("bulk update", flush=True) - - print("completed dumping for folder {}".format(folder)) - - -def get_user_to_fetch(all_user_file, user_ids_user_name_dict, db): - user_names = set(json.load(open(all_user_file))["user_names"]) - - friends_coll = db.twitter_user_friends_collection - - fake_friends_collection = db.fake_twitter_user_followees - real_friends_collection = db.real_twitter_user_followees - - fake_users_ids = set(fake_friends_collection.distinct("user_id")) - real_users_ids = set(real_friends_collection.distinct("user_id")) - - all_user_ids = set() - all_user_ids.update(fake_users_ids) - all_user_ids.update(real_users_ids) - - id_fetched_user_names = set() - - user_ids_user_name_dict = json.load(open(user_ids_user_name_dict)) - - for user_id, user_name in user_ids_user_name_dict.items(): - if int(user_id) in all_user_ids: - id_fetched_user_names.add(user_name) - - print("actual files : {}".format(len(user_names)), flush=True) - - saved_user_names = set(friends_coll.distinct("user_name")) - print("saved user names : {}".format(len(saved_user_names)), flush=True) - - user_names = user_names.difference(saved_user_names) - - print("user names to be collected : {}".format(len(user_names)), flush=True) - - print("ID fetched users : {}".format(len(id_fetched_user_names))) - - user_names = user_names.difference(id_fetched_user_names) - - print("Final set of user names to be fetched : {}".format(len(user_names))) - - json.dump({"user_names": list(user_names)}, open("politifact_user_names_to_collect.json", "w")) - - -def chunks(l, n): - """Yield successive n-sized chunks from l.""" - for i in range(0, len(l), n): - yield l[i:i + n] - - -def dump_user_friends_data(db, user_names_file, dump_out_file): - user_names = json.load(open(user_names_file))["user_names"] - friends_collection = db.twitter_user_friends_collection - with open(dump_out_file, "w", 1000) as file: - for user_name_chunk in chunks(list(user_names), 10000): - for user_info in friends_collection.find({"user_name": {"$in": user_name_chunk}}, {"_id": 0}): - file.write(json.dumps(user_info)) - file.write("\n") - - print("Compeleted dumping {}".format(dump_out_file)) - - -def dump_user_id_friends_data(db, user_id_dict_file, dump_out_file): - user_id_name_dict = json.load(open(user_id_dict_file)) - - user_ids = user_id_name_dict.keys() - - user_ids = [int(user_id) for user_id in user_ids] - - user_ids = set(user_ids) - - fake_friends_collection = db.fake_twitter_user_followees - real_friends_collection = db.real_twitter_user_followees - - with open(dump_out_file, "w", 1000) as file: - - for user_ids_chunk in chunks(list(user_ids), 10000): - for user_info in fake_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 
0}): - user_ids.remove(user_info["user_id"]) - file.write(json.dumps(user_info) + "\n") - - for user_ids_chunk in chunks(list(user_ids), 10000): - for user_info in real_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): - user_ids.remove(user_info["user_id"]) - file.write(json.dumps(user_info) + "\n") - - print("Compeleted dumping {}".format(dump_out_file)) - - -def get_friends_names(friends_file): - try: - with open(friends_file, encoding="UTF-8") as file: - lines = file.readlines() - lines = [line.strip() for line in lines] - return lines[1:] - - except: - return [] - - -def write_file_if_not_exist(output_folder, user_id_followee_json_data): - file_path = "{}/{}.json".format(output_folder, user_id_followee_json_data["user_id"]) - if not os.path.exists(file_path): - json.dump(user_id_followee_json_data, open(file_path, "w")) - - -def write_file_user_name_if_not_exist(output_folder, user_name_followee_json_data): - file_path = "{}/{}.json".format(output_folder, user_name_followee_json_data["user_name"]) - if not os.path.exists(file_path): - json.dump(user_name_followee_json_data, open(file_path, "w")) - - -def dump_social_network_user_id_single_file(input_ids_file, output_folder): - with open(input_ids_file) as file: - for line in tqdm(file): - write_file_if_not_exist(output_folder, json.loads(line)) - - -def dump_social_network_user_name_single_file(input_names_file, output_folder): - with open(input_names_file) as file: - for line in tqdm(file): - write_file_user_name_if_not_exist(output_folder, json.loads(line)) - - -def download_news_article(url): - news_article = Article(url) - news_article.download() - news_article.parse() - return news_article - - -def get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, news_id_source_date_dict): - """ - Check the different dates and choose the right date for filtering noise - :param news_id_publish_time: - :param news_id_fact_statement_date_dict: - :param news_id_source_date_dict: - :return: - """ - all_news_ids = news_id_fact_statement_date_dict.keys() - - news_id_selected_filter_date = dict() - - for news_id in all_news_ids: - if news_id in news_id_publish_time_dict: - news_id_selected_filter_date[news_id] = news_id_publish_time_dict[news_id].timestamp() - elif news_id in news_id_source_date_dict: - news_id_selected_filter_date[news_id] = news_id_source_date_dict[news_id].timestamp() - elif news_id in news_id_fact_statement_date_dict: - news_id_selected_filter_date[news_id] = datetime.strptime(news_id_fact_statement_date_dict[news_id], - "%Y-%m-%d").timestamp() - - return news_id_selected_filter_date - - -def get_news_articles_published_time(db, is_fake): - news_id_publish_time_dict = dict() - - if is_fake: - news_source_article_collection = db.fake_news_source_article - else: - news_source_article_collection = db.real_news_source_article - - for news_source in news_source_article_collection.find({"news_source": "politifact"}): - news_id = news_source["id"] - if news_source and news_source["publish_date"]: - news_id_publish_time_dict[news_id] = news_source["publish_date"] - - return news_id_publish_time_dict - - -# def get_news_articles_published_time(dataset_file): +# import csv +# import json +# import mmap +# import os +# import pickle +# import queue +# import re +# import shutil +# import string +# import sys +# import traceback +# from datetime import datetime +# from pathlib import Path +# +# import datefinder +# import requests +# from bs4 import BeautifulSoup +# from newspaper 
import Article +# from pymongo import UpdateOne +# from tqdm import tqdm +# import newspaper +# +# from analysis_util import get_propagation_graphs +# from baseline_feature_extraction import dump_LIWC_Representation +# from pre_process_util import load_configuration, get_database_connection, get_news_articles +# from util.constants import RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE +# from util.util import tweet_node +# +# +# def get_reply_of_replies(replies: list, result_dict: dict): +# for reply in replies: +# if reply: +# if "engagement" in reply: +# get_reply_of_replies(reply["engagement"]["tweet_replies"], result_dict) +# +# result_dict[reply["id"]] = reply["text"] +# +# +# def get_web_archieve_results(search_url): +# try: +# archieve_url = "http://web.archive.org/cdx/search/cdx?url={}&output=json".format(search_url) +# +# response = requests.get(archieve_url) +# response_json = json.loads(response.content) +# +# response_json = response_json[1:] +# +# return response_json +# +# except: +# return None +# +# +# def get_website_url_from_arhieve(url): +# archieve_results = get_web_archieve_results(url) +# if archieve_results: +# modified_url = "https://web.archive.org/web/{}/{}".format(archieve_results[0][1], archieve_results[0][2]) +# return modified_url +# else: +# return url +# +# +# def dump_friends_file_as_lines(dataset_file, out_file): +# pattern = re.compile(rb'{([^{}]+)}', +# re.DOTALL | re.IGNORECASE | re.MULTILINE) +# +# with open(out_file, "w", 100) as out_file: +# with open(dataset_file, 'r') as f: +# with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m: +# for match in pattern.findall(m): +# data = "{" + str(match.decode('utf-8')) + "}\n" +# out_file.write(data) +# +# +# def dump_social_network_to_db(db, folder): +# friends_coll = db.twitter_user_friends_collection +# +# batch_update_list = [] +# +# files = os.listdir(folder) +# user_names = set([file[:file.find(".csv")] for file in files]) +# +# print("actual files : {}".format(len(user_names)), flush=True) +# +# saved_user_names = set(friends_coll.distinct("user_name")) +# print("saved user names : {}".format(len(saved_user_names)), flush=True) +# +# user_names = user_names.difference(saved_user_names) +# +# print("user names to be saved : {}".format(len(user_names)), flush=True) +# +# for idx, user_name in enumerate(user_names): +# try: +# friends_user_names = get_friends_names("{}/{}.csv".format(folder, user_name)) +# +# batch_update_list.append(UpdateOne({"user_name": user_name}, +# {"$set": {"user_name": user_name, "friends_name": friends_user_names}}, +# upsert=True)) +# +# if idx % 10000 == 0: +# try: +# friends_coll.bulk_write(batch_update_list, ordered=False) +# except: +# print("Exception") +# traceback.print_exc(file=sys.stdout) +# +# batch_update_list = [] +# print("bulk update {}".format(idx), flush=True) +# +# except Exception as ex: +# print("Exception in file : {}/{} : {}".format(folder, user_name, str(ex))) +# traceback.print_exc(file=sys.stdout) +# +# if len(batch_update_list) > 0: +# friends_coll.bulk_write(batch_update_list, ordered=False) +# print("bulk update", flush=True) +# +# print("completed dumping for folder {}".format(folder)) +# +# +# def get_user_to_fetch(all_user_file, user_ids_user_name_dict, db): +# user_names = set(json.load(open(all_user_file))["user_names"]) +# +# friends_coll = db.twitter_user_friends_collection +# +# fake_friends_collection = db.fake_twitter_user_followees +# real_friends_collection = db.real_twitter_user_followees +# +# fake_users_ids = 
set(fake_friends_collection.distinct("user_id")) +# real_users_ids = set(real_friends_collection.distinct("user_id")) +# +# all_user_ids = set() +# all_user_ids.update(fake_users_ids) +# all_user_ids.update(real_users_ids) +# +# id_fetched_user_names = set() +# +# user_ids_user_name_dict = json.load(open(user_ids_user_name_dict)) +# +# for user_id, user_name in user_ids_user_name_dict.items(): +# if int(user_id) in all_user_ids: +# id_fetched_user_names.add(user_name) +# +# print("actual files : {}".format(len(user_names)), flush=True) +# +# saved_user_names = set(friends_coll.distinct("user_name")) +# print("saved user names : {}".format(len(saved_user_names)), flush=True) +# +# user_names = user_names.difference(saved_user_names) +# +# print("user names to be collected : {}".format(len(user_names)), flush=True) +# +# print("ID fetched users : {}".format(len(id_fetched_user_names))) +# +# user_names = user_names.difference(id_fetched_user_names) +# +# print("Final set of user names to be fetched : {}".format(len(user_names))) +# +# json.dump({"user_names": list(user_names)}, open("politifact_user_names_to_collect.json", "w")) +# +# +# def chunks(l, n): +# """Yield successive n-sized chunks from l.""" +# for i in range(0, len(l), n): +# yield l[i:i + n] +# +# +# def dump_user_friends_data(db, user_names_file, dump_out_file): +# user_names = json.load(open(user_names_file))["user_names"] +# friends_collection = db.twitter_user_friends_collection +# with open(dump_out_file, "w", 1000) as file: +# for user_name_chunk in chunks(list(user_names), 10000): +# for user_info in friends_collection.find({"user_name": {"$in": user_name_chunk}}, {"_id": 0}): +# file.write(json.dumps(user_info)) +# file.write("\n") +# +# print("Compeleted dumping {}".format(dump_out_file)) +# +# +# def dump_user_id_friends_data(db, user_id_dict_file, dump_out_file): +# user_id_name_dict = json.load(open(user_id_dict_file)) +# +# user_ids = user_id_name_dict.keys() +# +# user_ids = [int(user_id) for user_id in user_ids] +# +# user_ids = set(user_ids) +# +# fake_friends_collection = db.fake_twitter_user_followees +# real_friends_collection = db.real_twitter_user_followees +# +# with open(dump_out_file, "w", 1000) as file: +# +# for user_ids_chunk in chunks(list(user_ids), 10000): +# for user_info in fake_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): +# user_ids.remove(user_info["user_id"]) +# file.write(json.dumps(user_info) + "\n") +# +# for user_ids_chunk in chunks(list(user_ids), 10000): +# for user_info in real_friends_collection.find({"user_id": {"$in": user_ids_chunk}}, {"_id": 0}): +# user_ids.remove(user_info["user_id"]) +# file.write(json.dumps(user_info) + "\n") +# +# print("Compeleted dumping {}".format(dump_out_file)) +# +# +# def get_friends_names(friends_file): +# try: +# with open(friends_file, encoding="UTF-8") as file: +# lines = file.readlines() +# lines = [line.strip() for line in lines] +# return lines[1:] +# +# except: +# return [] +# +# +# def write_file_if_not_exist(output_folder, user_id_followee_json_data): +# file_path = "{}/{}.json".format(output_folder, user_id_followee_json_data["user_id"]) +# if not os.path.exists(file_path): +# json.dump(user_id_followee_json_data, open(file_path, "w")) +# +# +# def write_file_user_name_if_not_exist(output_folder, user_name_followee_json_data): +# file_path = "{}/{}.json".format(output_folder, user_name_followee_json_data["user_name"]) +# if not os.path.exists(file_path): +# json.dump(user_name_followee_json_data, open(file_path, 
"w")) +# +# +# def dump_social_network_user_id_single_file(input_ids_file, output_folder): +# with open(input_ids_file) as file: +# for line in tqdm(file): +# write_file_if_not_exist(output_folder, json.loads(line)) +# +# +# def dump_social_network_user_name_single_file(input_names_file, output_folder): +# with open(input_names_file) as file: +# for line in tqdm(file): +# write_file_user_name_if_not_exist(output_folder, json.loads(line)) +# +# +# def download_news_article(url): +# news_article = Article(url) +# news_article.download() +# news_article.parse() +# return news_article +# +# +# def get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, news_id_source_date_dict): +# """ +# Check the different dates and choose the right date for filtering noise +# :param news_id_publish_time: +# :param news_id_fact_statement_date_dict: +# :param news_id_source_date_dict: +# :return: +# """ +# all_news_ids = news_id_fact_statement_date_dict.keys() +# +# news_id_selected_filter_date = dict() +# +# for news_id in all_news_ids: +# if news_id in news_id_publish_time_dict: +# news_id_selected_filter_date[news_id] = news_id_publish_time_dict[news_id].timestamp() +# elif news_id in news_id_source_date_dict: +# news_id_selected_filter_date[news_id] = news_id_source_date_dict[news_id].timestamp() +# elif news_id in news_id_fact_statement_date_dict: +# news_id_selected_filter_date[news_id] = datetime.strptime(news_id_fact_statement_date_dict[news_id], +# "%Y-%m-%d").timestamp() +# +# return news_id_selected_filter_date +# +# +# def get_news_articles_published_time(db, is_fake): +# news_id_publish_time_dict = dict() +# +# if is_fake: +# news_source_article_collection = db.fake_news_source_article +# else: +# news_source_article_collection = db.real_news_source_article +# +# for news_source in news_source_article_collection.find({"news_source": "politifact"}): +# news_id = news_source["id"] +# if news_source and news_source["publish_date"]: +# news_id_publish_time_dict[news_id] = news_source["publish_date"] +# +# return news_id_publish_time_dict +# +# +# # def get_news_articles_published_time(dataset_file): +# # dataset = get_news_articles(dataset_file) +# # news_id_publish_time = dict() +# # count = 0 +# # print("total no. 
of articles : {}".format(len(dataset))) +# # for news in dataset: +# # if "publish_date" in news["text_content"] and news["text_content"]["publish_date"]: +# # count += 1 +# # print(news["text_content"]["publish_date"]) +# # +# # # if "url" in news["text_content"]: +# # # try: +# # # formatted_url = news["text_content"]["url"].lstrip("'").rstrip("'").lstrip("/") +# # # +# # # print("Formatted url : {}".format(formatted_url)) +# # # +# # # news_article = download_news_article(formatted_url) +# # # print("News id : {} publish data : {}".format(news["id"], news_article.publish_date), flush=True) +# # # news_id_publish_time[news["id"]] = news_article.publish_date.timestamp() +# # # except Exception as ex: +# # # print(ex) +# # print("old wrong present publish date count : {}".format(count)) +# # return news_id_publish_time +# +# +# def get_publish_date_from_sources_politifact(db, is_fake): +# if is_fake: +# news_collection = db.fake_news_collection +# news_format_collection = db.fake_news_format +# else: +# news_collection = db.real_news_collection +# news_format_collection = db.real_news_format +# +# news_id_fact_statement_date_dict = dict() +# +# news_id_source_date_dict = dict() +# +# for news_format in news_format_collection.find({"news_source": "politifact"}): +# news_id = news_format["id"] +# +# news_id_int = int(news_id.replace("politifact", "")) +# +# news = news_collection.find_one({"id": news_id_int}) +# +# publish_date = get_formatted_news_publish_date(news) +# +# try: +# if publish_date: +# publish_date = next(publish_date) +# +# if publish_date: +# news_id_source_date_dict[news_id] = publish_date +# except StopIteration: +# pass +# +# news_id_fact_statement_date_dict[news_id] = news["statement_date"] +# +# return news_id_fact_statement_date_dict, news_id_source_date_dict +# +# +# def get_formatted_news_publish_date(fake_news): +# try: +# id = fake_news['id'] +# source_html = fake_news['sources'] +# sources_soup = BeautifulSoup(source_html) +# sources = sources_soup.find_all('p') +# if not sources: +# sources = sources_soup.find_all('div') +# statement = '' +# url = '' +# +# ## Using the first source that contains href as the fake news source if source is not removed +# ## This is not always true +# +# date_matches = None +# for i in range(len(sources)): +# if sources[i].find('a') is not None: +# statement_tmp = sources[i].text +# +# date_matches = datefinder.find_dates(statement_tmp) +# statements = re.findall(r'\"(.+?){\,,.}\"', statement_tmp) +# if len(statements) == 0: +# statement = sources[i].a.text +# +# # TODO: Verify this logic is proper +# splits = statement_tmp.split(',') +# for split in splits: +# if len(statement) < len(split): +# statement = split +# +# # TODO: Why encoding is required - encoding considers quotes of string also - understand why? +# # statement = statement.encode('utf-8') +# else: +# # TODO: Why encoding is required - encoding considers quotes of string also - understand why? 
+# # statement = statements[0].encode('utf-8') +# statement = statements[0] +# pass +# +# # TODO: Check if it is proper +# statement = str(statement).translate(str.maketrans('', '', string.punctuation)) +# +# # statement_new = statement.translate(str.maketrans('', '', string.punctuation)) # move punctuations +# +# url = sources[i].a['href'] +# break +# +# # TODO: Check if the condition is proper +# if statement == '' or len(statement.split(' ')) <= 3: +# return None +# +# return date_matches +# +# except: +# return None +# +# +# def get_politifact_tweet_filter_dates(db, is_fake): +# news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact( +# db, is_fake=is_fake) +# news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=is_fake) +# +# news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, +# news_id_source_date_dict) +# +# return news_id_filter_date_dict +# +# +# def get_replies_from_dataset(dataset_dir, news_source, label, out_dir): +# dataset_file = "{}/{}_{}_news_complete_dataset.json".format(dataset_dir, news_source, label) # dataset = get_news_articles(dataset_file) -# news_id_publish_time = dict() -# count = 0 -# print("total no. of articles : {}".format(len(dataset))) +# +# reply_id_content_dict = dict() +# # for news in dataset: -# if "publish_date" in news["text_content"] and news["text_content"]["publish_date"]: -# count += 1 -# print(news["text_content"]["publish_date"]) -# -# # if "url" in news["text_content"]: -# # try: -# # formatted_url = news["text_content"]["url"].lstrip("'").rstrip("'").lstrip("/") -# # -# # print("Formatted url : {}".format(formatted_url)) -# # -# # news_article = download_news_article(formatted_url) -# # print("News id : {} publish data : {}".format(news["id"], news_article.publish_date), flush=True) -# # news_id_publish_time[news["id"]] = news_article.publish_date.timestamp() -# # except Exception as ex: -# # print(ex) -# print("old wrong present publish date count : {}".format(count)) -# return news_id_publish_time - - -def get_publish_date_from_sources_politifact(db, is_fake): - if is_fake: - news_collection = db.fake_news_collection - news_format_collection = db.fake_news_format - else: - news_collection = db.real_news_collection - news_format_collection = db.real_news_format - - news_id_fact_statement_date_dict = dict() - - news_id_source_date_dict = dict() - - for news_format in news_format_collection.find({"news_source": "politifact"}): - news_id = news_format["id"] - - news_id_int = int(news_id.replace("politifact", "")) - - news = news_collection.find_one({"id": news_id_int}) - - publish_date = get_formatted_news_publish_date(news) - - try: - if publish_date: - publish_date = next(publish_date) - - if publish_date: - news_id_source_date_dict[news_id] = publish_date - except StopIteration: - pass - - news_id_fact_statement_date_dict[news_id] = news["statement_date"] - - return news_id_fact_statement_date_dict, news_id_source_date_dict - - -def get_formatted_news_publish_date(fake_news): - try: - id = fake_news['id'] - source_html = fake_news['sources'] - sources_soup = BeautifulSoup(source_html) - sources = sources_soup.find_all('p') - if not sources: - sources = sources_soup.find_all('div') - statement = '' - url = '' - - ## Using the first source that contains href as the fake news source if source is not removed - ## This is not always true - - date_matches = None - for i in range(len(sources)): - if 
sources[i].find('a') is not None: - statement_tmp = sources[i].text - - date_matches = datefinder.find_dates(statement_tmp) - statements = re.findall(r'\"(.+?){\,,.}\"', statement_tmp) - if len(statements) == 0: - statement = sources[i].a.text - - # TODO: Verify this logic is proper - splits = statement_tmp.split(',') - for split in splits: - if len(statement) < len(split): - statement = split - - # TODO: Why encoding is required - encoding considers quotes of string also - understand why? - # statement = statement.encode('utf-8') - else: - # TODO: Why encoding is required - encoding considers quotes of string also - understand why? - # statement = statements[0].encode('utf-8') - statement = statements[0] - pass - - # TODO: Check if it is proper - statement = str(statement).translate(str.maketrans('', '', string.punctuation)) - - # statement_new = statement.translate(str.maketrans('', '', string.punctuation)) # move punctuations - - url = sources[i].a['href'] - break - - # TODO: Check if the condition is proper - if statement == '' or len(statement.split(' ')) <= 3: - return None - - return date_matches - - except: - return None - - -def get_politifact_tweet_filter_dates(db, is_fake): - news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact( - db, is_fake=is_fake) - news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=is_fake) - - news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, - news_id_source_date_dict) - - return news_id_filter_date_dict - - -def get_replies_from_dataset(dataset_dir, news_source, label, out_dir): - dataset_file = "{}/{}_{}_news_complete_dataset.json".format(dataset_dir, news_source, label) - dataset = get_news_articles(dataset_file) - - reply_id_content_dict = dict() - - for news in dataset: - for tweet in news["tweets"]: - get_reply_of_replies(tweet["reply"], reply_id_content_dict) - - pickle.dump(reply_id_content_dict, - open("{}/{}_{}_reply_id_content_dict.pkl".format(out_dir, news_source, label), "wb")) - - -def dump_all_botometer_results(db): - screen_name_botometer_score_dict = dict() - - for user_score in db.twitter_user_botometer_results.find(): - screen_name_botometer_score_dict[user_score["screen_name"]] = user_score["result"] - - pickle.dump(screen_name_botometer_score_dict, open("all_user_botometer_scores.pkl", "wb")) - - -def dump_all_user_profile_info(db, is_fake, label): - user_id_profile_info = dict() - - all_users_ids = pickle.load(open("all_prop_graph_{}_user.pkl".format(label), "rb")) - - if is_fake: - user_profile_collection = db.fake_twitter_user_profile - else: - user_profile_collection = db.real_twitter_user_profile - - for user_id in tqdm(all_users_ids): - user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object is None: - user_object = db.twitter_user_profile.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, - "profile_info.friends_count": 1, - "profile_info.followers_count": 1, - "profile_info.created_at": 1}) - if user_object and "profile_info" in user_object: - user_id_profile_info[user_id] = user_object["profile_info"] - - print("No. 
of users found : {}".format(len(user_id_profile_info))) - - pickle.dump(user_id_profile_info, open("all_{}_user_profile_info.pkl".format(label), "wb")) - - -def get_user_aggregate_features(db, is_fake, user_names): - dump_folder = "/home/dmahudes/fake_user_profiles" - - if is_fake: - label_user_collection = db.fake_twitter_user_profile - else: - label_user_collection = db.real_twitter_user_profile - - user_profile_collection = db.twitter_user_profile - - # np.random.shuffle(user_ids) - - for user_name in tqdm(user_names): - - user_object = label_user_collection.find_one({"screen_name": user_name}, {"screen_name": 1, "user_id": 1, - "profile_info": 1, "_id": 0}) - if user_object is None: - user_object = user_profile_collection.find_one({"user_id": user_name}, {"screen_name": 1, "user_id": 1, - "profile_info": 1, "_id": 0}) - - if user_object is None: - print('user {} not found'.format(user_name)) - else: - json.dump(user_object, open("{}/{}.json".format(dump_folder, user_name), "w")) - - -def remove_escape_characters(text_content): - text_content = text_content.replace(',', ' ') - text_content = text_content.replace('\n', ' ') - text_content = text_content.replace('\t', ' ') - words = text_content.split(" ") - return " ".join(words[:1000]) - - -def get_missing_rst_news_content(): - news_source = "gossipcop" - - file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) - # rst_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_features/rst/raw_parsed_data/{}".format( - # news_source) - # - # out_folder = "data/baseline_features/rst/raw_parsed_data/{}_kai".format(news_source) - - fake_news_ids = list() - - real_news_ids = list() - - all_news_folder = "data/baseline_data_kai/all_{}".format(news_source) - - kai_data_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_data_kai/kai_{}".format( - news_source) - - missing_files = set() - with open(file, encoding="UTF-8") as file: - reader = csv.reader(file, delimiter='\t', ) - next(reader) - - for news in reader: - - if news[1] == '1': - fake_news_ids.append(news[0]) - else: - real_news_ids.append(news[0]) - - expected_file = "{}/{}.txt.brackets".format(all_news_folder, news[0]) - out_file = "{}/{}.txt.brackets".format(kai_data_folder, news[0]) - - file = Path(expected_file) - todofile = Path("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0])) - if file.is_file(): - shutil.copy(expected_file, out_file) - elif todofile.is_file(): - pass - else: - missing_files.add(expected_file) - with open("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0]), "w", - encoding="UTF-8") as out_file: - out_file.write(remove_escape_characters(news[2])) - # file = Path(expected_file) - # if file.is_file(): - # with open("{}/{}.txt".format(out_folder, news[0]), "w", encoding="UTF-8") as out_file: - # out_file.write(remove_escape_characters(news[2])) - # else: - # missing_files.add(news[0]) - - pickle.dump(fake_news_ids, - open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "fake"), "wb")) - pickle.dump(real_news_ids, - open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "real"), "wb")) - - print("No. 
of missing files : {}".format(len(missing_files))) - - -def get_files_for_liwc_parsing(): - news_source = "gossipcop" - - file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) - - fake_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_fake_liwc_data.csv".format(news_source), "w", - encoding="UTF-8") - - real_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_real_liwc_data.csv".format(news_source), "w", - encoding="UTF-8") - - fake_csv_writer = csv.writer(fake_data_file) - real_csv_writer = csv.writer(real_data_file) - - with open(file, encoding="UTF-8") as file: - reader = csv.reader(file, delimiter='\t', ) - next(reader) - - for news in reader: - csv_row = [news[0], remove_escape_characters(news[2])] - - if news[1] == '1': - fake_csv_writer.writerow(csv_row) - else: - real_csv_writer.writerow(csv_row) - - fake_data_file.close() - real_data_file.close() - - -def get_users_in_network(prop_graph: tweet_node, edge_type=None): - q = queue.Queue() - - q.put(prop_graph) - - users_list = set() - - while q.qsize() != 0: - node = q.get() - - if node.user_id is not None: - users_list.add(node.user_id) - - if edge_type == RETWEET_EDGE: - children = node.retweet_children - elif edge_type == REPLY_EDGE: - children = node.reply_children - else: - children = node.children - - for child in children: - q.put(child) - - return users_list - - -def get_node_ids_in_network_by_type(prop_graph: tweet_node, edge_type=None, node_type=None): - q = queue.Queue() - - q.put(prop_graph) - - node_ids_set = set() - - while q.qsize() != 0: - node = q.get() - - if node.tweet_id is not None and node.node_type == node_type: - node_ids_set.add(node.tweet_id) - - if edge_type == RETWEET_EDGE: - children = node.retweet_children - elif edge_type == REPLY_EDGE: - children = node.reply_children - else: - children = node.children - - for child in children: - q.put(child) - - return node_ids_set - - -def get_tweets_ids_in_prop_network(prop_graph: tweet_node): - tweet_ids = set() - - for child in prop_graph.children: - tweet_ids.add(child.tweet_id) - - return tweet_ids - - -def prop_network_stats(news_source): - fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) - - tweet_ids = set() - retweet_ids = set() - reply_ids = set() - user_ids = set() - - for prop_graph in fake_prop_graph: - tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) - retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) - reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) - user_ids.update(get_users_in_network(prop_graph)) - - for prop_graph in real_prop_graph: - tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) - retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) - reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) - user_ids.update(get_users_in_network(prop_graph)) - - print("News source : {}".format(news_source)) - print("No. of tweets : {}".format(len(tweet_ids))) - print("No. of retweet ids : {}".format(len(retweet_ids))) - print("No. of reply ids : {}".format(len(reply_ids))) - print("Nol. 
of user : {}".format(len(user_ids))) - - -if __name__ == "__main__": - config = load_configuration("project.config") - db = get_database_connection(config) - - # prop_network_stats("politifact") - # prop_network_stats("gossipcop") - - # get_files_for_liwc_parsing() - - news_source = "politifact" - dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_fake_liwc_data.txt".format(news_source), - "data/baseline_data_kai/liwc/extracted_featuers/{}_fake_liwc_features.csv".format(news_source)) - - dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_real_liwc_data.txt".format(news_source), - "data/baseline_data_kai/liwc/extracted_featuers/{}_real_liwc_features.csv".format(news_source)) - - # get_missing_rst_news_content() - # get_user_aggregate_features(db, is_fake=True, - # user_names=["News1Lightning", "OfeliasHeaven", "jimbradyispapa", "CraigRozniecki", - # "yojudenz", - # "GinaLawriw", "GossipCop", "GossipCopIntern", "findsugarmummy", - # "DJDavidNewsroom"]) - # dump_all_user_profile_info(db, is_fake=True, label="fake") - # dump_all_user_profile_info(db, is_fake=False, label="real") - - exit(1) - - # get_replies_from_dataset("data/engagement_data_latest","politifact","fake","data/pre_process_data") - # get_replies_from_dataset("data/engagement_data_latest", "politifact", "real", "data/pre_process_data") - - get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "fake", "data/pre_process_data") - get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "real", "data/pre_process_data") - - # news_id_filter_date_dict = get_politifact_tweet_filter_dates(db, is_fake=True) - # - # print(len(news_id_filter_date_dict)) - # - # news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact(db, - # is_fake=False) - # news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=False) - # - # # news_id_publish_time = get_news_articles_published_time( - # # "data/engagement_data/politifact_fake_news_dataset_format.json") - # - # news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, - # news_id_source_date_dict) - # - # print("Source news id len : {}".format(len(news_id_source_date_dict))) - # print("Statement news id len : {}".format(len(news_id_fact_statement_date_dict))) - # print("publish news ids len : {}".format(len(news_id_publish_time_dict))) - # print("News id propagation network filter date len : {}".format(len(news_id_filter_date_dict))) - # - # exit(1) - - # dump_social_network_user_id_single_file("data/social_network_data/gossipcop_user_ids_friends_network.txt", - # "/Users/deepak/Desktop/social_network_single_files/user_ids_files" ) - # - # dump_social_network_user_name_single_file("data/social_network_data/gossipcop_user_names_friends_network.txt", - # "/Users/deepak/Desktop/social_network_single_files/user_names_files") - - # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", - # "data/social_network_data/politifact_user_names_friends_network.txt") - # - # dump_user_friends_data(db, "data/format/gossipcop_prop_user_names.json", - # "data/social_network_data/gossipcop_user_names_friends_network.txt") - - # dump_user_id_friends_data(db, "data/format/politifact_user_id_user_name_dict.json", - # "data/social_network_data/politifact_user_ids_friends_network.txt") - # - # dump_user_id_friends_data(db, "data/format/gossipcop_user_id_user_name_dict.json", - # 
"data/social_network_data/gossipcop_user_ids_friends_network.txt") - - # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", - # "data/social_network_data/politifact_user_names_friends_network.txt") - - # get_user_to_fetch("data/format/politifact_prop_user_names.json", - # "data/format/politifact_user_id_user_name_dict.json", - # db) - - # dump_friends_file_as_lines("/home/dmahudes/FakeNewsPropagation/data/politifact_real_user_friends_ids_complete.txt", - # "/home/dmahudes/FakeNewsPropagation/data/format/politifact_real_user_friends_ids_complete_format.txt") - - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data 2") - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data") - # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/home/ubuntu/social_network_crawl/data") - # dump_social_network_to_db(db, - # "/home/dmahudes/FakeNewsPropagation/data/network_data/home/ubuntu/social_network_crawl/data") +# for tweet in news["tweets"]: +# get_reply_of_replies(tweet["reply"], reply_id_content_dict) +# +# pickle.dump(reply_id_content_dict, +# open("{}/{}_{}_reply_id_content_dict.pkl".format(out_dir, news_source, label), "wb")) +# +# +# def dump_all_botometer_results(db): +# screen_name_botometer_score_dict = dict() +# +# for user_score in db.twitter_user_botometer_results.find(): +# screen_name_botometer_score_dict[user_score["screen_name"]] = user_score["result"] +# +# pickle.dump(screen_name_botometer_score_dict, open("all_user_botometer_scores.pkl", "wb")) +# +# +# def dump_all_user_profile_info(db, is_fake, label): +# user_id_profile_info = dict() +# +# all_users_ids = pickle.load(open("all_prop_graph_{}_user.pkl".format(label), "rb")) +# +# if is_fake: +# user_profile_collection = db.fake_twitter_user_profile +# else: +# user_profile_collection = db.real_twitter_user_profile +# +# for user_id in tqdm(all_users_ids): +# user_object = user_profile_collection.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object is None: +# user_object = db.twitter_user_profile.find_one({"user_id": user_id}, {"profile_info.statuses_count": 1, +# "profile_info.friends_count": 1, +# "profile_info.followers_count": 1, +# "profile_info.created_at": 1}) +# if user_object and "profile_info" in user_object: +# user_id_profile_info[user_id] = user_object["profile_info"] +# +# print("No. 
of users found : {}".format(len(user_id_profile_info))) +# +# pickle.dump(user_id_profile_info, open("all_{}_user_profile_info.pkl".format(label), "wb")) +# +# +# def get_user_aggregate_features(db, is_fake, user_names): +# dump_folder = "/home/dmahudes/fake_user_profiles" +# +# if is_fake: +# label_user_collection = db.fake_twitter_user_profile +# else: +# label_user_collection = db.real_twitter_user_profile +# +# user_profile_collection = db.twitter_user_profile +# +# # np.random.shuffle(user_ids) +# +# for user_name in tqdm(user_names): +# +# user_object = label_user_collection.find_one({"screen_name": user_name}, {"screen_name": 1, "user_id": 1, +# "profile_info": 1, "_id": 0}) +# if user_object is None: +# user_object = user_profile_collection.find_one({"user_id": user_name}, {"screen_name": 1, "user_id": 1, +# "profile_info": 1, "_id": 0}) +# +# if user_object is None: +# print('user {} not found'.format(user_name)) +# else: +# json.dump(user_object, open("{}/{}.json".format(dump_folder, user_name), "w")) +# +# +# def remove_escape_characters(text_content): +# text_content = text_content.replace(',', ' ') +# text_content = text_content.replace('\n', ' ') +# text_content = text_content.replace('\t', ' ') +# words = text_content.split(" ") +# return " ".join(words[:1000]) +# +# +# def get_missing_rst_news_content(): +# news_source = "gossipcop" +# +# file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) +# # rst_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_features/rst/raw_parsed_data/{}".format( +# # news_source) +# # +# # out_folder = "data/baseline_features/rst/raw_parsed_data/{}_kai".format(news_source) +# +# fake_news_ids = list() +# +# real_news_ids = list() +# +# all_news_folder = "data/baseline_data_kai/all_{}".format(news_source) +# +# kai_data_folder = "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation/data/baseline_data_kai/kai_{}".format( +# news_source) +# +# missing_files = set() +# with open(file, encoding="UTF-8") as file: +# reader = csv.reader(file, delimiter='\t', ) +# next(reader) +# +# for news in reader: +# +# if news[1] == '1': +# fake_news_ids.append(news[0]) +# else: +# real_news_ids.append(news[0]) +# +# expected_file = "{}/{}.txt.brackets".format(all_news_folder, news[0]) +# out_file = "{}/{}.txt.brackets".format(kai_data_folder, news[0]) +# +# file = Path(expected_file) +# todofile = Path("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0])) +# if file.is_file(): +# shutil.copy(expected_file, out_file) +# elif todofile.is_file(): +# pass +# else: +# missing_files.add(expected_file) +# with open("data/baseline_data_kai/{}_missed/{}.json".format(news_source, news[0]), "w", +# encoding="UTF-8") as out_file: +# out_file.write(remove_escape_characters(news[2])) +# # file = Path(expected_file) +# # if file.is_file(): +# # with open("{}/{}.txt".format(out_folder, news[0]), "w", encoding="UTF-8") as out_file: +# # out_file.write(remove_escape_characters(news[2])) +# # else: +# # missing_files.add(news[0]) +# +# pickle.dump(fake_news_ids, +# open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "fake"), "wb")) +# pickle.dump(real_news_ids, +# open("data/baseline_data_kai/{}_{}_sample_news_ordered_ids.pkl".format(news_source, "real"), "wb")) +# +# print("No. 
of missing files : {}".format(len(missing_files))) +# +# +# def get_files_for_liwc_parsing(): +# news_source = "gossipcop" +# +# file = "/Users/deepak/Downloads/{}_content_no_ignore.tsv".format(news_source) +# +# fake_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_fake_liwc_data.csv".format(news_source), "w", +# encoding="UTF-8") +# +# real_data_file = open("data/baseline_data_kai/liwc/raw_data/{}_real_liwc_data.csv".format(news_source), "w", +# encoding="UTF-8") +# +# fake_csv_writer = csv.writer(fake_data_file) +# real_csv_writer = csv.writer(real_data_file) +# +# with open(file, encoding="UTF-8") as file: +# reader = csv.reader(file, delimiter='\t', ) +# next(reader) +# +# for news in reader: +# csv_row = [news[0], remove_escape_characters(news[2])] +# +# if news[1] == '1': +# fake_csv_writer.writerow(csv_row) +# else: +# real_csv_writer.writerow(csv_row) +# +# fake_data_file.close() +# real_data_file.close() +# +# +# def get_users_in_network(prop_graph: tweet_node, edge_type=None): +# q = queue.Queue() +# +# q.put(prop_graph) +# +# users_list = set() +# +# while q.qsize() != 0: +# node = q.get() +# +# if node.user_id is not None: +# users_list.add(node.user_id) +# +# if edge_type == RETWEET_EDGE: +# children = node.retweet_children +# elif edge_type == REPLY_EDGE: +# children = node.reply_children +# else: +# children = node.children +# +# for child in children: +# q.put(child) +# +# return users_list +# +# +# def get_node_ids_in_network_by_type(prop_graph: tweet_node, edge_type=None, node_type=None): +# q = queue.Queue() +# +# q.put(prop_graph) +# +# node_ids_set = set() +# +# while q.qsize() != 0: +# node = q.get() +# +# if node.tweet_id is not None and node.node_type == node_type: +# node_ids_set.add(node.tweet_id) +# +# if edge_type == RETWEET_EDGE: +# children = node.retweet_children +# elif edge_type == REPLY_EDGE: +# children = node.reply_children +# else: +# children = node.children +# +# for child in children: +# q.put(child) +# +# return node_ids_set +# +# +# def get_tweets_ids_in_prop_network(prop_graph: tweet_node): +# tweet_ids = set() +# +# for child in prop_graph.children: +# tweet_ids.add(child.tweet_id) +# +# return tweet_ids +# +# +# def prop_network_stats(news_source): +# fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) +# +# tweet_ids = set() +# retweet_ids = set() +# reply_ids = set() +# user_ids = set() +# +# for prop_graph in fake_prop_graph: +# tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) +# retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) +# reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) +# user_ids.update(get_users_in_network(prop_graph)) +# +# for prop_graph in real_prop_graph: +# tweet_ids.update(get_tweets_ids_in_prop_network(prop_graph)) +# retweet_ids.update(get_node_ids_in_network_by_type(prop_graph, RETWEET_EDGE, RETWEET_NODE)) +# reply_ids.update(get_node_ids_in_network_by_type(prop_graph, REPLY_EDGE, REPLY_NODE)) +# user_ids.update(get_users_in_network(prop_graph)) +# +# print("News source : {}".format(news_source)) +# print("No. of tweets : {}".format(len(tweet_ids))) +# print("No. of retweet ids : {}".format(len(retweet_ids))) +# print("No. of reply ids : {}".format(len(reply_ids))) +# print("Nol. 
of user : {}".format(len(user_ids))) +# +# +# if __name__ == "__main__": +# config = load_configuration("project.config") +# db = get_database_connection(config) +# +# # prop_network_stats("politifact") +# # prop_network_stats("gossipcop") +# +# # get_files_for_liwc_parsing() +# +# news_source = "politifact" +# dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_fake_liwc_data.txt".format(news_source), +# "data/baseline_data_kai/liwc/extracted_featuers/{}_fake_liwc_features.csv".format(news_source)) +# +# dump_LIWC_Representation("data/baseline_data_kai/liwc/liwc_results/{}_real_liwc_data.txt".format(news_source), +# "data/baseline_data_kai/liwc/extracted_featuers/{}_real_liwc_features.csv".format(news_source)) +# +# # get_missing_rst_news_content() +# # get_user_aggregate_features(db, is_fake=True, +# # user_names=["News1Lightning", "OfeliasHeaven", "jimbradyispapa", "CraigRozniecki", +# # "yojudenz", +# # "GinaLawriw", "GossipCop", "GossipCopIntern", "findsugarmummy", +# # "DJDavidNewsroom"]) +# # dump_all_user_profile_info(db, is_fake=True, label="fake") +# # dump_all_user_profile_info(db, is_fake=False, label="real") +# +# exit(1) +# +# # get_replies_from_dataset("data/engagement_data_latest","politifact","fake","data/pre_process_data") +# # get_replies_from_dataset("data/engagement_data_latest", "politifact", "real", "data/pre_process_data") +# +# get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "fake", "data/pre_process_data") +# get_replies_from_dataset("data/engagement_data_latest", "gossipcop", "real", "data/pre_process_data") +# +# # news_id_filter_date_dict = get_politifact_tweet_filter_dates(db, is_fake=True) +# # +# # print(len(news_id_filter_date_dict)) +# # +# # news_id_fact_statement_date_dict, news_id_source_date_dict = get_publish_date_from_sources_politifact(db, +# # is_fake=False) +# # news_id_publish_time_dict = get_news_articles_published_time(db, is_fake=False) +# # +# # # news_id_publish_time = get_news_articles_published_time( +# # # "data/engagement_data/politifact_fake_news_dataset_format.json") +# # +# # news_id_filter_date_dict = get_dataset_publication_date(news_id_publish_time_dict, news_id_fact_statement_date_dict, +# # news_id_source_date_dict) +# # +# # print("Source news id len : {}".format(len(news_id_source_date_dict))) +# # print("Statement news id len : {}".format(len(news_id_fact_statement_date_dict))) +# # print("publish news ids len : {}".format(len(news_id_publish_time_dict))) +# # print("News id propagation network filter date len : {}".format(len(news_id_filter_date_dict))) +# # +# # exit(1) +# +# # dump_social_network_user_id_single_file("data/social_network_data/gossipcop_user_ids_friends_network.txt", +# # "/Users/deepak/Desktop/social_network_single_files/user_ids_files" ) +# # +# # dump_social_network_user_name_single_file("data/social_network_data/gossipcop_user_names_friends_network.txt", +# # "/Users/deepak/Desktop/social_network_single_files/user_names_files") +# +# # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", +# # "data/social_network_data/politifact_user_names_friends_network.txt") +# # +# # dump_user_friends_data(db, "data/format/gossipcop_prop_user_names.json", +# # "data/social_network_data/gossipcop_user_names_friends_network.txt") +# +# # dump_user_id_friends_data(db, "data/format/politifact_user_id_user_name_dict.json", +# # "data/social_network_data/politifact_user_ids_friends_network.txt") +# # +# # dump_user_id_friends_data(db, 
"data/format/gossipcop_user_id_user_name_dict.json", +# # "data/social_network_data/gossipcop_user_ids_friends_network.txt") +# +# # dump_user_friends_data(db, "data/format/politifact_prop_user_names.json", +# # "data/social_network_data/politifact_user_names_friends_network.txt") +# +# # get_user_to_fetch("data/format/politifact_prop_user_names.json", +# # "data/format/politifact_user_id_user_name_dict.json", +# # db) +# +# # dump_friends_file_as_lines("/home/dmahudes/FakeNewsPropagation/data/politifact_real_user_friends_ids_complete.txt", +# # "/home/dmahudes/FakeNewsPropagation/data/format/politifact_real_user_friends_ids_complete_format.txt") +# +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data 2") +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/data") +# # dump_social_network_to_db(db, "/Users/deepak/Desktop/twint_collect/home/ubuntu/social_network_crawl/data") +# # dump_social_network_to_db(db, +# # "/home/dmahudes/FakeNewsPropagation/data/network_data/home/ubuntu/social_network_crawl/data") diff --git a/stat_test.py b/stat_test.py index aafabdf..1dadc05 100644 --- a/stat_test.py +++ b/stat_test.py @@ -197,6 +197,24 @@ def get_box_plots_mod(samples1, samples2, save_folder, title=None, file_name=Non if __name__ == "__main__": - get_box_plots_mod(np.random.rand(200, ), np.random.rand(200, ), + import seaborn as sns + + all_data = np.transpose(np.array([np.random.rand(2000, ), np.random.rand(2000, )])) + labels = ['Fake', 'Real'] + df = pd.DataFrame(all_data, columns=labels) + my_pal = {"Fake": "pink", "Real": "lightblue", } + + plt.xticks(fontsize=12) + plt.yticks(fontsize=12) + + + # sns.set(style="whitegrid") + tips = sns.load_dataset("tips") + ax = sns.violinplot(data=df, palette=my_pal, width=0.3, showfliers=False) + + plt.show() + exit(1) + + get_box_plots_mod(np.random.rand(2000, ), np.random.rand(2000, ), "/Users/deepak/Desktop/DMML/GitRepo/FakeNewsPropagation", "T10", "T10") diff --git a/structure_temp_analysis.py b/structure_temp_analysis.py index 49ad4c7..77a7323 100644 --- a/structure_temp_analysis.py +++ b/structure_temp_analysis.py @@ -8,15 +8,15 @@ from analysis_util import get_propagation_graphs, equal_samples, get_numpy_array, BaseFeatureHelper, \ get_sample_feature_value -from stat_test import perform_t_test, plot_normal_distributions, get_box_plots +from stat_test import perform_t_test, get_box_plots from util.constants import NEWS_ROOT_NODE, RETWEET_EDGE, REPLY_EDGE, RETWEET_NODE, REPLY_NODE from util.util import tweet_node -user_id_profile_info_dict = dict() -user_id_profile_info_dict.update( - pickle.load(open("data/pre_process_data/user_features/all_fake_user_profile_info.pkl", "rb"))) -user_id_profile_info_dict.update( - pickle.load(open("data/pre_process_data/user_features/all_real_user_profile_info.pkl", "rb"))) +# user_id_profile_info_dict = dict() +# user_id_profile_info_dict.update( +# pickle.load(open("data/pre_process_data/user_features/all_fake_user_profile_info.pkl", "rb"))) +# user_id_profile_info_dict.update( +# pickle.load(open("data/pre_process_data/user_features/all_real_user_profile_info.pkl", "rb"))) def get_post_tweet_deepest_cascade(prop_graph: tweet_node, edge_type=RETWEET_EDGE): @@ -560,51 +560,51 @@ def get_fraction_of_unique_users(prop_graph: tweet_node, edge_type=None): def get_num_bot_users(prop_graph: tweet_node): - global user_id_bot_score_dict - retweeting_users = set(get_user_names_retweeting_in_prop_graph(prop_graph)) + q = queue.Queue() + q.put(prop_graph) num_bot_users = 0 - for 
user_name in retweeting_users: - if user_name in user_id_bot_score_dict: - botometer_score = user_id_bot_score_dict[user_name] - if "scores" in botometer_score: - if botometer_score['scores']['universal'] > 0.5: + + while q.qsize() != 0: + node = q.get() + + for child in node.retweet_children: + q.put(child) + if child.node_type == RETWEET_NODE and child.user_id is not None: + if child.botometer_score and child.botometer_score > 0.5: num_bot_users += 1 - else: - print("user {} not found ".format(user_name)) return num_bot_users def get_fraction_of_bot_users_retweeting(prop_graph: tweet_node): - global user_id_bot_score_dict - retweeting_users = set(get_user_names_retweeting_in_prop_graph(prop_graph)) + q = queue.Queue() + q.put(prop_graph) num_bot_users = 1 - human_users = 1 - for user_name in retweeting_users: - if user_name in user_id_bot_score_dict: - botometer_score = user_id_bot_score_dict[user_name] - if "scores" in botometer_score: - if botometer_score['scores']['universal'] > 0.5: - num_bot_users += 1 - else: - human_users += 1 + num_human_users = 1 - return num_bot_users / (human_users+ num_bot_users) + while q.qsize() != 0: + node = q.get() + + for child in node.retweet_children: + q.put(child) + if child.node_type == RETWEET_NODE and child.user_id is not None: + if child.botometer_score: + if child.botometer_score > 0.5: + num_bot_users += 1 + else: + num_human_users += 1 + + return num_bot_users / (num_human_users+ num_bot_users) def get_prop_graphs_num_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None): global user_id_bot_score_dict - user_id_bot_score_dict = pickle.load( - open("data/pre_process_data/botometer_scores/all_user_botometer_scores.pkl", "rb")) return get_sample_feature_value(prop_graphs, get_num_bot_users) def get_prop_graphs_fraction_of_bot_users_retweeting(prop_graphs: tweet_node, edge_type=None): - global user_id_bot_score_dict - user_id_bot_score_dict = pickle.load( - open("data/pre_process_data/botometer_scores/all_user_botometer_scores.pkl", "rb")) return get_sample_feature_value(prop_graphs, get_fraction_of_bot_users_retweeting) @@ -1036,13 +1036,13 @@ def get_macro_feature_short_names(self): return feature_names def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache = False): all_features = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: @@ -1115,13 +1115,13 @@ def get_macro_feature_short_names(self): return feature_names def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None, - file_dir="data/train_test_data"): + file_dir="data/train_test_data", use_cache = False): all_features = [] file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir) data_file = Path(file_name) - if data_file.is_file(): + if use_cache and data_file.is_file(): return pickle.load(open(file_name, "rb")) if micro_features: diff --git a/temporal_analysis.py b/temporal_analysis.py index bcb4d79..e91c058 100644 --- a/temporal_analysis.py +++ b/temporal_analysis.py @@ -331,6 +331,27 @@ def get_macro_feature_short_names(self): if __name__ == "__main__": + temporal_feature_helper = TemporalFeatureHelper() + + news_source = "gossipcop" + + fake_prop_graph, 
real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", news_source) + + fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) + + fake_features = temporal_feature_helper.get_features_array(fake_prop_graph, micro_features=True, + macro_features=True, news_source=news_source, + label="fake", use_cache=True) + real_features = temporal_feature_helper.get_features_array(real_prop_graph, micro_features=True, + macro_features=True, news_source=news_source, + label="real", use_cache=True) + + temporal_feature_helper.save_blox_plots_for_features(fake_feature_array=fake_features, + real_feature_array=real_features, micro_features=True, + macro_features=True, save_folder="data/feature_images/gossipcop_violin") + + exit(1) + fake_prop_graph, real_prop_graph = get_propagation_graphs("data/saved_new_no_filter", "politifact") fake_prop_graph, real_prop_graph = equal_samples(fake_prop_graph, real_prop_graph) diff --git a/util/graph_dumper.py b/util/graph_dumper.py index ba059fc..ad4b50d 100644 --- a/util/graph_dumper.py +++ b/util/graph_dumper.py @@ -1,62 +1,62 @@ -from util.util import tweet_node - - -def dumps_graph(root_node: tweet_node, params): - tweet_info_object_dict = dict() - edges_list = [] - nodes_list = [] - - tweet_id_node_id_dict = dict() - - add_tweet_node_if_not_exists(tweet_id_node_id_dict, root_node, nodes_list, tweet_info_object_dict, params) - - root_node_id = tweet_id_node_id_dict[root_node.tweet_id] - - for child in root_node.children: - child_node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, child, nodes_list, tweet_info_object_dict, - params) - - edges_list.append(get_edge(root_node_id, child_node_id)) - - dump_children_network(child, nodes_list, edges_list, tweet_id_node_id_dict, tweet_info_object_dict, params) - - legend_node_id = len(tweet_id_node_id_dict)+1 - return [tweet_info_object_dict, nodes_list, edges_list] - - -def get_edge(parent_node_id, child_node_id): - return {"from": parent_node_id, "to": child_node_id} - - -def add_tweet_node_if_not_exists(tweet_id_node_id_dict, node: tweet_node, nodes_list, tweet_info_object_dict: dict, - params): - if node.tweet_id not in tweet_id_node_id_dict: - tweet_id_node_id_dict[node.tweet_id] = len(tweet_id_node_id_dict) + 1 - - nodes_list.append({"id": tweet_id_node_id_dict[node.tweet_id], "tweet_id": str(node.tweet_id), - "label": tweet_id_node_id_dict[node.tweet_id], - "color": params["node_color"][node.node_type]}) - - tweet_info_object_dict[str(node.tweet_id)] = node.get_contents() - - return tweet_id_node_id_dict[node.tweet_id] - - -def dump_children_network(node, nodes_list: list, edge_list: list, tweet_id_node_id_dict: dict, - tweet_info_object_dict: dict, params): - node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, tweet_info_object_dict, params) - - for child in node.children: - dump_children_network(child, nodes_list, edge_list, tweet_id_node_id_dict, tweet_info_object_dict, params) - child_id = tweet_id_node_id_dict[child.tweet_id] - - edge_list.append(get_edge(node_id, child_id)) - -# def dump_reply_network(node: tweet_node, nodes_list: list, edge_list: list, tweet_info_object_dict: dict): -# node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, params) -# -# for child in node.reply_children: -# dump_retweet_network(child, nodes_list, edge_list, tweet_id_node_id_dict, params) +# from util.util import tweet_node +# +# +# def dumps_graph(root_node: tweet_node, params): +# tweet_info_object_dict = dict() +# edges_list 
= [] +# nodes_list = [] +# +# tweet_id_node_id_dict = dict() +# +# add_tweet_node_if_not_exists(tweet_id_node_id_dict, root_node, nodes_list, tweet_info_object_dict, params) +# +# root_node_id = tweet_id_node_id_dict[root_node.tweet_id] +# +# for child in root_node.children: +# child_node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, child, nodes_list, tweet_info_object_dict, +# params) +# +# edges_list.append(get_edge(root_node_id, child_node_id)) +# +# dump_children_network(child, nodes_list, edges_list, tweet_id_node_id_dict, tweet_info_object_dict, params) +# +# legend_node_id = len(tweet_id_node_id_dict)+1 +# return [tweet_info_object_dict, nodes_list, edges_list] +# +# +# def get_edge(parent_node_id, child_node_id): +# return {"from": parent_node_id, "to": child_node_id} +# +# +# def add_tweet_node_if_not_exists(tweet_id_node_id_dict, node: tweet_node, nodes_list, tweet_info_object_dict: dict, +# params): +# if node.tweet_id not in tweet_id_node_id_dict: +# tweet_id_node_id_dict[node.tweet_id] = len(tweet_id_node_id_dict) + 1 +# +# nodes_list.append({"id": tweet_id_node_id_dict[node.tweet_id], "tweet_id": str(node.tweet_id), +# "label": tweet_id_node_id_dict[node.tweet_id], +# "color": params["node_color"][node.node_type]}) +# +# tweet_info_object_dict[str(node.tweet_id)] = node.get_contents() +# +# return tweet_id_node_id_dict[node.tweet_id] +# +# +# def dump_children_network(node, nodes_list: list, edge_list: list, tweet_id_node_id_dict: dict, +# tweet_info_object_dict: dict, params): +# node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, tweet_info_object_dict, params) +# +# for child in node.children: +# dump_children_network(child, nodes_list, edge_list, tweet_id_node_id_dict, tweet_info_object_dict, params) # child_id = tweet_id_node_id_dict[child.tweet_id] # # edge_list.append(get_edge(node_id, child_id)) +# +# # def dump_reply_network(node: tweet_node, nodes_list: list, edge_list: list, tweet_info_object_dict: dict): +# # node_id = add_tweet_node_if_not_exists(tweet_id_node_id_dict, node, nodes_list, params) +# # +# # for child in node.reply_children: +# # dump_retweet_network(child, nodes_list, edge_list, tweet_id_node_id_dict, params) +# # child_id = tweet_id_node_id_dict[child.tweet_id] +# # +# # edge_list.append(get_edge(node_id, child_id)) diff --git a/util/util.py b/util/util.py index 41ebc97..87cb236 100644 --- a/util/util.py +++ b/util/util.py @@ -4,7 +4,7 @@ class tweet_node: - def __init__(self, tweet_id, text, created_time, user_name, user_id, news_id, node_type): + def __init__(self, tweet_id, text = None, created_time = None, user_name = None, user_id = None, news_id = None, node_type = None, botometer_score = None, sentiment= None): self.tweet_id = tweet_id self.text = text self.created_time = created_time @@ -22,16 +22,18 @@ def __init__(self, tweet_id, text, created_time, user_name, user_id, news_id, no self.children = set() - self.sentiment = None - self.stance = None - self.topic_vector = None + self.sentiment = sentiment + # self.stance = None + # self.topic_vector = None - self.original_object = None + # self.original_object = None self.parent_node = None self.node_type = node_type + self.botometer_score = botometer_score + def __eq__(self, other): return self.tweet_id == other.tweet_id