data_loadstore.py
#==============================================================================
# Loading and saving datasets
#==============================================================================
import numpy as np
#np.set_printoptions(threshold=np.inf) #View full arrays in console
import pickle
from random import shuffle
data_folder = './'
def load_any_data(filename):
'''
General case to load any data from filename
    Can be stored in .npz or .pkl or .pkl.gz form (the code below handles the .npz case saved by build_mldata; see the pickle-loading sketch after this function)
Output tuple: xtr,ytr,xva,yva,xte,yte
'''
loaded = np.load(filename)
xtr = loaded['xtr']
ytr = loaded['ytr']
xva = loaded['xva']
yva = loaded['yva']
xte = loaded['xte']
yte = loaded['yte']
return (xtr, ytr, xva, yva, xte, yte)
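# A minimal sketch for the .pkl / .pkl.gz cases mentioned in the docstring above. It assumes
# (hypothetically) that such pickle files store the same (xtr, ytr, xva, yva, xte, yte) tuple;
# adjust if the real layout differs. The function name is illustrative.
def load_any_data_sketch(filename):
    '''
    Illustrative loader covering .npz, .pkl and .pkl.gz (assumed tuple layout for pickles)
    '''
    if filename.endswith('.pkl.gz'):
        import gzip
        with gzip.open(filename, 'rb') as f:
            return pickle.load(f) #assumed to already be the 6-tuple
    elif filename.endswith('.pkl'):
        with open(filename, 'rb') as f:
            return pickle.load(f) #assumed to already be the 6-tuple
    else: #.npz as saved by build_mldata below
        loaded = np.load(filename)
        return (loaded['xtr'], loaded['ytr'], loaded['xva'], loaded['yva'], loaded['xte'], loaded['yte'])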
#==============================================================================
# Reuters RCV1-v2
#==============================================================================
'''
WORKFLOW:
Download relevant files from http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
Online Appendix 2 - Topics
Online Appendix 8 - Article IDs with topics
Online Appendix 12 - Article IDs with tokens
Rename OA8 to article_categories_all.dat
Delete all categories except 2nd level in OA2 and call it categories_level2.txt
Run organize_data() to get data.dat
Run build_data(cat_filename = 'categories_level2', build_data_filename = 'data_level2') to get data with only level2 articles. Takes a while
Run label_frequency(cat_filename = 'categories_level2', data_filename = 'data_level2')
This gives some very frequent categories and some very infrequent categories (see stuff.xlsx)
I deleted C15, C42, GOBIT, GVOTE, GMIL from categories_level2 and saved in categories_level2_final50
Run build_data(cat_filename = 'categories_level2_final50', build_data_filename = 'data_level2_final50') to get data with only relevant articles. Takes a while
Run label_frequency(cat_filename = 'categories_level2_final50', data_filename = 'data_level2_final50')
This gives a DIFFERENT distribution (see stuff.xlsx)
Since some applicable cats have been deleted, some articles which were previously ineligible due to having 2 or more applicable cats have now become eligible
So the article count for all remaining eligible cats will go up
Eg: Article 50959 had C15 and GTOUR, but with C15 being deleted, it only has GTOUR, so becomes eligible and increases the count for GTOUR
Run most_common_tokens(numtokens=2000, data_filename = 'data_level2_final50')
Saves the max occurring tokens with counts as a list of tuples in a pkl file
Max 2000 occurring tokens with counts are also saved in stuff.xlsx
If fewer tokens are to be considered (say 400), just load the pkl file, extract the 1st 400 elements, and save that as a new pkl file (see the sketch after most_common_tokens below)
Run build_mldata(numval=50000, numtest=100000, numtokens=xxx, data_filename = 'data_level2_final50') to get final npz data
xxx can be 2000 or 400, etc
'''
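# A hedged sketch of the scripted part of the WORKFLOW above, assuming the manual steps
# (downloading the appendices, renaming OA8, editing the category files, and pruning
# categories into categories_level2_final50.txt) have already been done by hand.
# The filenames match the ones used in the WORKFLOW comment; the function name is illustrative.
def run_rcv1_pipeline_sketch(numtokens=2000):
    organize_data() #build data.dat from the five token files
    build_data(cat_filename='categories_level2', build_data_filename='data_level2')
    print(label_frequency(cat_filename='categories_level2', data_filename='data_level2'))
    #...manual step: prune categories and save categories_level2_final50.txt...
    build_data(cat_filename='categories_level2_final50', build_data_filename='data_level2_final50')
    print(label_frequency(cat_filename='categories_level2_final50', data_filename='data_level2_final50'))
    most_common_tokens(numtokens=numtokens, data_filename='data_level2_final50')
    build_mldata(numval=50000, numtest=100000, numtokens=numtokens, data_filename='data_level2_final50')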
def organize_data():
'''
Combine data from the 5 original files in Online Appendix 12 (http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm)
    into a single file data.dat
Delete .W and blank lines
Combine all tokens in 1 article into a single line
    Every entry in data.dat has:
1st line: .I <article ID>
2nd line: <list of tokens>
'''
with open(data_folder + 'dataset_RCV1/data.dat','w') as fw:
for ftbr in ['train','test_pt0','test_pt1','test_pt2','test_pt3']:
with open(data_folder + 'dataset_RCV1/lyrl2004_tokens_{0}.dat'.format(ftbr),'r') as fr:
for line in fr:
if line[:2]=='.W':
continue
elif line[:2]=='.I':
fw.write(line)
newline = '' #start a new line which will have all tokens in the article
elif line=='\n':
fw.write(newline + '\n') #end the new line for the article
else:
newline += line[:-1] + ' ' #add tokens to new line
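# Illustrative before/after for organize_data() (the token text is made up; '.I 2286' is the
# first real article ID). A raw lyrl2004_tokens entry like
#   .I 2286
#   .W
#   recov
#   recov excit
#   <blank line>
# becomes two lines in data.dat:
#   .I 2286
#   recov recov excit
# with a trailing space left after the last token.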
def extract_categories(filename = 'categories_level2_final50'):
'''
Extracts categories from filename and returns them as a list
'''
with open(data_folder + 'dataset_RCV1/{0}.txt'.format(filename),'r') as f:
catlines = f.readlines()
numcats = len(catlines)
cats = numcats*[''] #list to hold all categories
for i in range(numcats):
for char in catlines[i][23:]: #this is where the category name starts from
if char==' ': #category name ends
break
else: #category name continues
cats[i] += char
return cats
def build_data(cat_filename = 'categories_level2_final50', build_data_filename = 'data_level2_final50'):
'''
Get data from data.dat and categories from cat_filename
    Delete all articles which don't have exactly one correct category
    Save in build_data_filename a new version of the tokens which only has articles with exactly one correct category
'''
cats = set(extract_categories(filename = cat_filename))
with open(data_folder + 'dataset_RCV1/data.dat','r') as frtok:
tokenlines = frtok.readlines()
idlines = tokenlines[0:len(tokenlines):2] #only hold article ID lines
with open(data_folder + 'dataset_RCV1/article_categories_all.dat','r') as frcat:
artcatlines = frcat.readlines()
current_cats = set() #hold categories of current article
current_art = '2286' #1st article id
fullindex = 0
with open(data_folder + 'dataset_RCV1/{0}.dat'.format(build_data_filename),'w') as fw:
for i in range(len(artcatlines)):
if i%10000==0: print(i) #progress
previous_art = current_art
cat, current_art, _ = artcatlines[i].split(' ') #category, article id, 1
if previous_art == current_art: #still in same article
current_cats.add(cat)
else: #on to next article
num_correct_cats = cats.intersection(current_cats) #see how many categories are correct
current_cats = set((cat,)) #refresh current_cats and add category of new article (current_art) to it
if len(num_correct_cats)==1: #if there's only 1 correct category
try:
index = idlines.index('.I {0}\n'.format(previous_art)) #find line number of the applicable article
except ValueError:
continue
fullindex += index #fullindex keeps track of absolute index (index is relative index since idlines is getting truncated every time)
fw.write(tokenlines[fullindex*2][:-1] + ' ' + num_correct_cats.pop() + ' \n') #write ".I <article ID> <article category> "
fw.write(tokenlines[fullindex*2+1]) #write the tokens
del idlines[:index] #make future searches simpler, since articles are in increasing order. This will make idlines start at the article which just got added
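# Note: the loop above only evaluates an article's categories once the next article ID is
# seen, so the very last article in article_categories_all.dat is never written out, even
# if it has exactly one correct category.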
def label_frequency(cat_filename = 'categories_level2_final50', data_filename = 'data_level2_final50'):
'''
Find number of articles in data_filename in which each category from cat_filename occurs
'''
cats = extract_categories(filename = cat_filename)
    catfreqs = dict.fromkeys(cats, 0) #dictionary where each key is a category from cats; values will be overwritten with frequencies below
with open(data_folder + 'dataset_RCV1/{0}.dat'.format(data_filename),'r') as f:
tokenlines = f.readlines()
idlines = tokenlines[0:len(tokenlines):2] #only lines with article IDs
ids = ''.join(idlines) #combine into single string
for key in catfreqs:
catfreqs[key] = ids.count(' '+key+' \n')
catfreqs_sort = [(k,catfreqs[k]) for k in sorted(catfreqs, key=catfreqs.get, reverse=True)] #sort according to dict values, i.e. highest frequency first
return catfreqs_sort
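# Illustrative usage of label_frequency(): inspect the most and least frequent categories of
# the pruned label set (the actual counts are tabulated in stuff.xlsx per the WORKFLOW note).
def print_label_frequencies_example():
    freqs = label_frequency(cat_filename='categories_level2_final50', data_filename='data_level2_final50')
    print('Most frequent:', freqs[:5]) #list of (category, article count) tuples, highest count first
    print('Least frequent:', freqs[-5:])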
def most_common_tokens(numtokens=2000, data_filename = 'data_level2_final50'):
'''
Find how many articles each token occurs in, in data_filename
Save 'numtokens' tokens with the highest counts as a list of tuples
Note that this counts how many unique articles a token is present in, not the total count of a token
    For example, 'brazil' can occur 10 times in each of 5 articles, but it will rank lower than 'india' which occurs once in each of 6 articles
'''
tokendict = {}
linecounter = 0
with open(data_folder + 'dataset_RCV1/{0}.dat'.format(data_filename),'r') as f:
for line in f:
if line[:2]=='.I' or line=='\n': #don't consider ID lines
continue
else: #only consider token lines
linecounter += 1
line = line[:-1] #remove newline character at end
for token in np.unique(line.split(' ')): #get unique tokens in article
if token in tokendict:
tokendict[token] += 1
else:
tokendict[token] = 1
if linecounter%100000==0: print(linecounter) #progress
tokendict.pop('') #this token gets added by default to all articles
maxtokens_withcounts = [(k,tokendict[k]) for k in sorted(tokendict, key=tokendict.get, reverse=True)][:numtokens] #sort tokendict dictionary according to values, take the highest 'numtokens' values, return as a list of tuples
# maxtokens = maxtokens_withcounts.keys() #these are the most commonly occurring tokens
    with open(data_folder + 'dataset_RCV1/maxtokens_withcounts_{0}.pkl'.format(numtokens),'wb') as f: #binary mode for pickle
        pickle.dump(maxtokens_withcounts, f) #use this file in future to skip the long processing time of this method
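# Sketch of the trimming step described in the WORKFLOW comment: reuse the saved 2000-token
# pickle to produce a smaller token list (e.g. 400) without re-counting the whole dataset.
# The function name is illustrative.
def trim_maxtokens_sketch(numtokens_small=400, numtokens_full=2000):
    with open(data_folder + 'dataset_RCV1/maxtokens_withcounts_{0}.pkl'.format(numtokens_full),'rb') as f:
        full = pickle.load(f) #list of (token, count) tuples, highest count first
    with open(data_folder + 'dataset_RCV1/maxtokens_withcounts_{0}.pkl'.format(numtokens_small),'wb') as f:
        pickle.dump(full[:numtokens_small], f)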
def build_mldata(numval=50000, numtest=100000, numtokens=2000, data_filename = 'data_level2_final50'):
    '''
    Build the actual data as numpy arrays from data_filename and the saved
    maxtokens_withcounts_<numtokens>.pkl token list
    numval, numtest: how many samples to have in the validation and test sets
    numtokens: number of most common tokens to use as features
    '''
cats = extract_categories()
dcats = {key:value for (value,key) in enumerate(cats)} #create dictionary where each key is an element in list and the corresponding value is its index
numcats = len(cats)
del cats
    with open(data_folder + 'dataset_RCV1/maxtokens_withcounts_{0}.pkl'.format(numtokens),'rb') as f: #binary mode for pickle
        alltokens_withcounts = pickle.load(f) #list of tuples
alltokens = [at[0] for at in alltokens_withcounts] #only get token names, ignore counts
shuffle(alltokens) #break up the order of the most frequent token being at pos0, next at pos1, and so on. In-place shuffle
dalltokens = {key:value for (value,key) in enumerate(alltokens)} #create dictionary where each key is an element in list and the corresponding value is its index
del alltokens
with open(data_folder + 'dataset_RCV1/{0}.dat'.format(data_filename),'r') as f:
numarticles = len(f.readlines())//2
xdata = np.zeros((numarticles,numtokens),dtype='float32') #samples, features
ydata = np.zeros((numarticles,numcats),dtype='uint8') #samples, categories
article_counter = 0
    with open(data_folder + 'dataset_RCV1/{0}.dat'.format(data_filename),'r') as f: #reopen for a 2nd pass; the earlier open consumed the whole file just to count articles
for line in f:
if line[:2]=='.I':
_,_,cat,_ = line.split(' ') #.I, id, category, \n
ydata[article_counter,dcats[cat]] = 1
else:
tokens = line.split(' ')
for token in tokens:
try:
xdata[article_counter,dalltokens[token]] += 1.
except KeyError:
continue
article_counter += 1
if article_counter%10000==0: print(article_counter) #progress
xdata = np.log10(1+xdata) #applying log transformation as in Hinton 2012 - 'Improving neural networks by preventing co-adaptation of feature detectors'
    '''
    Note that the numtokens and numcats axes are not in any order that would influence ML algorithms
    For example, numcats is in alphabetical order, but the category frequencies are independent of alphabetical order
    numtokens is in purely random order due to shuffling
    But numarticles is in time order:
    According to the original paper, the 1st few samples (earliest in time) should be used for tr, and the remaining (later in time) for te
    To keep that, use:
    xtr = xdata[:-(numval+numtest)]
    xva = xdata[-(numval+numtest):-numtest]
    xte = xdata[-numtest:]
    and likewise for ytr,yva,yte
    Instead I'll divide the data randomly (not chronologically), so I'll shuffle along the numarticles axis and then split
    '''
    shuff = np.random.permutation(numarticles)
    xdata, ydata = xdata[shuff], ydata[shuff]
    xte = xdata[:numtest]
    yte = ydata[:numtest]
    xva = xdata[numtest:numtest+numval]
    yva = ydata[numtest:numtest+numval]
    xtr = xdata[numtest+numval:]
    ytr = ydata[numtest+numval:]
    np.savez_compressed(data_folder + 'dataset_RCV1/rcv1_{0}.npz'.format(numtokens), xtr=xtr,ytr=ytr, xva=xva,yva=yva, xte=xte,yte=yte)
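# Illustrative end-to-end check (assumes the WORKFLOW above has already produced
# data_level2_final50.dat and maxtokens_withcounts_2000.pkl): build the final arrays,
# reload them, and verify shapes and one-hot labels. The function name is illustrative.
def build_and_check_example(numtokens=2000):
    build_mldata(numval=50000, numtest=100000, numtokens=numtokens, data_filename='data_level2_final50')
    xtr, ytr, xva, yva, xte, yte = load_any_data(data_folder + 'dataset_RCV1/rcv1_{0}.npz'.format(numtokens))
    print(xtr.shape, xva.shape, xte.shape) #(num train, numtokens), (50000, numtokens), (100000, numtokens)
    assert np.all(ytr.sum(axis=1) == 1) #exactly one category per article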