-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathdetect_events.py
61 lines (53 loc) · 2.82 KB
/
detect_events.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# coding: utf-8
# std
import timeit
import argparse
# mabed
from mabed.corpus import Corpus
from mabed.mabed import MABED
import mabed.utils as utils
__author__ = "Adrien Guille"
__email__ = "[email protected]"
if __name__ == '__main__':
p = argparse.ArgumentParser(description='Perform mention-anomaly-based event detection (MABED)')
p.add_argument('i', metavar='input', type=str, help='Input csv file')
p.add_argument('k', metavar='top_k_events', type=int, help='Number of top events to detect')
p.add_argument('--sw', metavar='stopwords', type=str, help='Stop-word list', default='stopwords/twitter_en.txt')
p.add_argument('--sep', metavar='separator', type=str, help='CSV separator', default='\t')
p.add_argument('--o', metavar='output', type=str, help='Output pickle file', default=None)
p.add_argument('--maf', metavar='min_absolute_frequency', type=int, help='Minimum absolute word frequency, default to 10', default=10)
p.add_argument('--mrf', metavar='max_relative_frequency', type=float, help='Maximum absolute word frequency, default to 0.4', default=0.4)
p.add_argument('--tsl', metavar='time_slice_length', type=int, help='Time-slice length, default to 30 (minutes)', default=30)
p.add_argument('--p', metavar='p', type=int, help='Number of candidate words per event, default to 10', default=10)
p.add_argument('--t', metavar='theta', type=float, help='Theta, default to 0.6', default=0.6)
p.add_argument('--s', metavar='sigma', type=float, help='Sigma, default to 0.6', default=0.6)
args = p.parse_args()
print('Parameters:')
print(' Corpus: %s\n k: %d\n Stop-words: %s\n Min. abs. word frequency: %d\n Max. rel. word frequency: %f' %
(args.i, args.k, args.sw, args.maf, args.mrf))
print(' p: %d\n theta: %f\n sigma: %f' % (args.p, args.t, args.s))
print('Loading corpus...')
start_time = timeit.default_timer()
my_corpus = Corpus(args.i, args.sw, args.maf, args.mrf, args.sep)
elapsed = timeit.default_timer() - start_time
print('Corpus loaded in %f seconds.' % elapsed)
time_slice_length = args.tsl
print('Partitioning tweets into %d-minute time-slices...' % time_slice_length)
start_time = timeit.default_timer()
my_corpus.discretize(time_slice_length)
elapsed = timeit.default_timer() - start_time
print('Partitioning done in %f seconds.' % elapsed)
print('Running MABED...')
k = args.k
p = args.p
theta = args.t
sigma = args.s
start_time = timeit.default_timer()
mabed = MABED(my_corpus)
mabed.run(k=k, p=p, theta=theta, sigma=sigma)
mabed.print_events()
elapsed = timeit.default_timer() - start_time
print('Event detection performed in %f seconds.' % elapsed)
if args.o is not None:
utils.save_events(mabed, args.o)
print('Events saved in %s' % args.o)