Commit

Clean up code & updated README
mdepak committed May 25, 2020
1 parent 9aef078 commit 8469e5f
Showing 27 changed files with 8,146 additions and 4,977 deletions.
32 changes: 22 additions & 10 deletions README.md
@@ -1,29 +1,41 @@
 # Fake News Propagation
 
-Code for paper "Hierarchical Propagation Networks for Fake News Detection: Investigation and Exploitation" ICWSM 2020 https://arxiv.org/abs/1903.09196
+Code for the ICWSM 2020 paper "Hierarchical Propagation Networks for Fake News Detection: Investigation and Exploitation" (https://arxiv.org/abs/1903.09196)
 
+### Dataset
+Hierarchical propagation networks are constructed from the news samples in the FakeNewsNet dataset.
+To adhere to Twitter's privacy policy, user information is anonymized and tweet contents are not shared.
+
+The dataset is formatted as `networkx` graph JSON
+and is available in the `data` directory, categorized by news source (Politifact/GossipCop) and label.
 
-###To Run:
+Each node in a graph carries a randomized tweet id, the exact epoch timestamp, the node type, and a randomized user id.
+Additionally, nodes in the retweet networks (macro networks) carry a bot-score attribute, and
+nodes in the reply-chain networks (micro networks) carry the sentiment of the tweet content.
 
+For the results reported in the paper, the randomly down-sampled news ids provided in `data/sample_ids` are used.
 
-###References
+### To Run:
+To use this dataset, unzip `nx_network_data.zip` into the `data` directory; `load_dataset.py` contains an example that loads the dataset.
+
+To extract features and run the model, install the dependencies in `requirements.txt` and use the main function of `basic_model.py`. The function `get_classificaton_results_tpnf_by_time`
+in `basic_model` can be used to prune the graphs by time and work on the pruned dataset.
 
-If you use this dataset, please cite the following papers:
+### References/Citation
+If you use this dataset or code, please cite the following papers:
 
-@article{shu2019hierarchical,
+`@article{shu2019hierarchical,
   title={Hierarchical propagation networks for fake news detection: Investigation and exploitation},
   author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Liu, Huan},
   journal={arXiv preprint arXiv:1903.09196},
   year={2019}
-}
+}`
 
-@article{shu2018fakenewsnet,
+`@article{shu2018fakenewsnet,
   title={FakeNewsNet: A Data Repository with News Content, Social Context and Dynamic Information for Studying Fake News on Social Media},
   author={Shu, Kai and Mahudeswaran, Deepak and Wang, Suhang and Lee, Dongwon and Liu, Huan},
   journal={arXiv preprint arXiv:1809.01286},
   year={2018}
-}
+}`
 
-(C) 2019 Arizona Board of Regents on Behalf of ASU
+(C) 2020 Arizona Board of Regents on Behalf of ASU
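
Beyond the diff itself, the updated README points to `load_dataset.py` as the loading example. As orientation, here is a minimal sketch of reading these `networkx` JSON graphs; the `data/nx_network_data/<source>/<label>/` layout and the node-link serialization are assumptions, so defer to `load_dataset.py` for the real code.

```python
import glob
import json

from networkx.readwrite import json_graph


def load_graphs(source="politifact", label="fake"):
    """Load propagation graphs for one news source and label.

    The path pattern is an assumption about how nx_network_data.zip
    unpacks; adjust it to match load_dataset.py.
    """
    graphs = []
    for path in glob.glob("data/nx_network_data/{}/{}/*.json".format(source, label)):
        with open(path) as f:
            graphs.append(json_graph.node_link_graph(json.load(f)))
    return graphs


graphs = load_graphs()
# Node attributes per the README: randomized tweet/user ids, epoch
# timestamps, node type, plus bot scores (macro) or sentiment (micro).
print(len(graphs), next(iter(graphs[0].nodes(data=True))))
```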
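Likewise, a hypothetical invocation of the entry point the README names: `get_classificaton_results_tpnf_by_time` exists in `basic_model.py`, but its signature is not visible in this diff, so the argument below is a guess.

```python
# Hypothetical usage; check the real signature in basic_model.py.
from basic_model import get_classificaton_results_tpnf_by_time

if __name__ == "__main__":
    # Classify with temporal propagation-network features on graphs
    # pruned by elapsed time, per the README's description.
    get_classificaton_results_tpnf_by_time("politifact")
```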
36 changes: 16 additions & 20 deletions analysis_util.py
@@ -52,7 +52,7 @@ def get_dump_file_name(self, news_source, micro_features, macro_features, label,
         return "{}/{}.pkl".format(file_dir, "_".join(file_tags))
 
     def get_features_array(self, prop_graphs, micro_features, macro_features, news_source=None, label=None,
-                           file_dir="data/train_test_data", use_cache=False):
+                           file_dir="data/features", use_cache=False):
         function_refs = []
 
         file_name = self.get_dump_file_name(news_source, micro_features, macro_features, label, file_dir)
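
The `use_cache` flag and the `.pkl` name built by `get_dump_file_name` suggest a pickle cache of extracted feature arrays keyed on source, feature set, and label. Since the body of `get_features_array` is truncated in this view, the following is only a sketch of that caching pattern, not the repository's implementation.

```python
import os
import pickle


def load_or_compute(file_name, compute_fn, use_cache=False):
    # Reuse the pickled feature array when present; otherwise compute and cache it.
    if use_cache and os.path.exists(file_name):
        with open(file_name, "rb") as f:
            return pickle.load(f)
    features = compute_fn()
    dir_name = os.path.dirname(file_name)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with open(file_name, "wb") as f:
        pickle.dump(features, f)
    return features
```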
@@ -227,21 +227,21 @@ def equal_samples(sample1, sample2):
     return sample1[:target_len], sample2[:target_len]


-def get_propagation_graphs(data_folder, news_source):
-    fake_propagation_graphs = load_prop_graph(data_folder, news_source, "fake")
-    real_propagation_graphs = load_prop_graph(data_folder, news_source, "real")
-
-    print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
-    print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
-
-    fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids())
-    real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids())
-
-    print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
-    print("After filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
-    print(flush=True)
-
-    return fake_propagation_graphs, real_propagation_graphs
+# def get_propagation_graphs(data_folder, news_source):
+#     fake_propagation_graphs = load_prop_graph(data_folder, news_source, "fake")
+#     real_propagation_graphs = load_prop_graph(data_folder, news_source, "real")
+#
+#     print("Before filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
+#     print("Before filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
+#
+#     fake_propagation_graphs = remove_prop_graph_noise(fake_propagation_graphs, get_noise_news_ids())
+#     real_propagation_graphs = remove_prop_graph_noise(real_propagation_graphs, get_noise_news_ids())
+#
+#     print("After filtering no. of FAKE prop graphs: {}".format(len(fake_propagation_graphs)))
+#     print("After filtering no. of REAL prop graphs: {}".format(len(real_propagation_graphs)))
+#     print(flush=True)
+#
+#     return fake_propagation_graphs, real_propagation_graphs


def get_numpy_array(list_of_list):
@@ -259,7 +259,3 @@ def print_stat_values(feature_name, values, short_feature_name=""):
     print("Max value : {}".format(max(values)))
     print("Mean value : {}".format(np.mean(np.array(values))))
     print("=========================================")
-
-
-
-
280 changes: 0 additions & 280 deletions baseline/LIWC.py

This file was deleted.

