loadArXiv.py
"""
sets up some basic functions and loads up useful data
assume run from Latent-Dirichlet-Allocation/ folder...
"""
import numpy as np
import scipy as sp
import os,sys
from liblda.low2corpus import Low2Corpus
from liblda.LDAmodel import LdaModel
# distances
from scipy.stats.distributions import entropy as spKLdiv
from liblda.math.distances import KLdiv, JSdiv
# Phi based mappings
from liblda.ILDA.hungarian_algorithm import getCostMatrix, find_closest
# Theta based mappings
from liblda.ILDA.hungarian_algorithm import getCostMatrix2, find_closest2
# data exploration, plotting and reporting
from liblda.topicviz.show_top import show_top
from liblda.topicviz.show_top import top_words_for_topic
import pylab as p
#####
##### MAIN SETTINGS FOR DATA SET
######################################################################
DATASET_NAME = "ArXiv16k"
print " LOADING DATA for: " + DATASET_NAME
DATA_PARENT_DIR="/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR+"vocab.txt"
DOCS_FILE = DATA_PARENT_DIR+"arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR+"arXiv_train_ids.txt"
TEST_DOCS_FILE = DATA_PARENT_DIR+"arXiv_test_docs.txt"
TEST_IDS_FILE = DATA_PARENT_DIR+"arXiv_test_ids.txt"
######################################################################
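# Sanity check (added sketch, not in the original script): fail early with a
# clear message if a data file is missing, e.g. when run from the wrong
# working directory or on a machine without the data set.
for _f in (VOCAB_FILE, DOCS_FILE, IDS_FILE, TEST_DOCS_FILE, TEST_IDS_FILE):
    if not os.path.exists(_f):
        sys.exit("Expected data file not found: " + _f)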
# loaders....
# training set: vocab, corpus and doc-index -> arXiv-id map
vocab_list = [w.strip() for w in open(VOCAB_FILE, 'r').readlines()]
arXiv_corpus = Low2Corpus(DOCS_FILE)
arXiv_corpus.setVocabFromList(vocab_list)
arXiv_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))
# same for the test set
arXiv_test_corpus = Low2Corpus(TEST_DOCS_FILE)
arXiv_test_corpus.setVocabFromList(vocab_list)
arXiv_test_corpus.doCounts()
test_id_list = [w.strip() for w in open(TEST_IDS_FILE, 'r').readlines()]
test_doc2id = dict(enumerate(test_id_list))
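# Usage note (added): doc2id maps a document's row index in the corpus to
# its arXiv identifier, e.g.
#   doc2id[0]       -> arXiv id of the first training document
#   test_doc2id[0]  -> arXiv id of the first test document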
# the original to compare with
#phiT60_1 = np.load("../runs/repeatedT60-1/phi.npy")
#thetaT60_1 = np.load("../runs/repeatedT60-1/theta.npy")
#zT60_1 = np.load("../runs/repeatedT60-1/z.npy")
# Mon 29 Aug 2011 12:02:14 EDT
# testing log-likelihood
# hydrate from dir
morig = LdaModel( numT=60, corpus=arXiv_corpus, alpha=0.01, beta=0.01)
morig.allocate_arrays()
morig.read_dw_alphabetical()
#rd = os.path.join( RUNDIRS_ROOT, "../runs/repeatedT60-1/" )
rd = "/Users/ivan/Homes/master/Documents/Projects/runs/repeatedT60-1/"
morig.load_from_rundir(rd)
# same for merged topic model
mstar = LdaModel( numT=60, corpus=arXiv_corpus, alpha=0.01, beta=0.01)
mstar.allocate_arrays()
mstar.read_dw_alphabetical()
rd = "/Users/ivan/Homes/master/Documents/Projects/runs/new_merging_gibbs0"
mstar.load_from_rundir(rd)
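# Added sketch (not part of the original run): with both models hydrated, the
# imports above point at the intended comparison, matching mstar's topics to
# morig's by their word distributions (Phi) and inspecting the top words.
# The exact liblda call signatures and the `.phi` attribute are assumptions
# here, so the sketch is left commented out:
#
#   costM = getCostMatrix(morig.phi, mstar.phi)   # pairwise topic distances (e.g. JSdiv)
#   mapping = find_closest(costM)                 # best-matching morig topic per mstar topic
#   show_top(morig, num=15)                       # top words per topic, for eyeballing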