-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpageviews.py
48 lines (38 loc) · 1.41 KB
/
pageviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import gzip, pywikibot, os, sys
#/mnt/nfs/dumps-labstore1006.wikimedia.org/xmldatadumps/public/other/pageviews/2018/2018-05/
#os.chdir(r'projects/viewstats')
# Output log, opened in append mode so every run adds to the existing file.
logfile = open('jun14-2018.txt', mode='a', encoding='utf-8')
def encode_if_necessary(b):
    """Return *b* as text, decoding it from UTF-8 when it is a bytes object.

    Despite its name, this helper *decodes*: ``bytes`` input (including
    instances of ``bytes`` subclasses) is turned into ``str``; any other
    value is handed back untouched.

    Args:
        b: A line read from a file; either ``bytes`` or already ``str``.

    Returns:
        The decoded ``str`` for bytes input, otherwise *b* itself.

    Raises:
        UnicodeDecodeError: If *b* is bytes that are not valid UTF-8.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of bytes.
    if isinstance(b, bytes):
        return b.decode('utf8')
    return b
# Disabled debugging snippet, kept as a bare string literal so it never runs.
# If enabled, it would print (via pywikibot.output) up to 500 lines that start
# with 'lv' from a single dump file.
# NOTE(review): `counter` is never initialised inside the snippet, so it would
# raise NameError if re-enabled as written.
'''
with gzip.open('pageviews-20180501-000000.gz','r') as f:
for line in f:
line = encode_if_necessary(line).replace('\n','')
if not line.startswith('lv'): continue
if counter == 500: break
counter += 1
pywikibot.output(line)
'''
#
# Walk the June 2018 pageview dump directory, and for every hourly dump of
# 2018-06-14 append each line that starts with 'lv' to the log file, prefixed
# with the timestamp part of the dump's file name and a tab.
try:
    for dirpath, _subdirs, filenames in os.walk(r'/mnt/nfs/dumps-labstore1006.wikimedia.org/xmldatadumps/public/other/pageviews/2018/2018-06/'):
        for file in filenames:
            # Only the pageview dumps for 14 June 2018 are of interest.
            if 'pageviews-' not in file or '20180614' not in file:
                continue
            fullname = os.path.join(dirpath, file)
            # The handle gets its own name so it cannot be confused with the
            # list of file names that the enclosing loop is iterating over.
            with gzip.open(fullname, 'r') as dump:
                print(file)
                # e.g. 'pageviews-20180614-000000.gz' -> '20180614-000000'
                f_name = file.replace('pageviews-', '').replace('.gz', '')
                thisfiledata = []
                counter = 0
                for line in dump:
                    line = encode_if_necessary(line).replace('\n', '')
                    if not line.startswith('lv'):
                        continue
                    # Progress indicator: one number per 5000 matching lines
                    # (starting with 0 at the first match), flushed at once so
                    # it shows up promptly when output is redirected.
                    if counter % 5000 == 0:
                        print(counter, flush=True)
                    counter += 1
                    thisfiledata.append('{}\t{}'.format(f_name, line))
            # Write one batch per dump; skip dumps without any match so the
            # log does not collect stray blank lines.
            if thisfiledata:
                logfile.write('{}\n'.format('\n'.join(thisfiledata)))
finally:
    # Flush and release the log even if a dump turns out to be unreadable.
    logfile.close()