This repository has been archived by the owner on Jul 11, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwikinews.py
97 lines (80 loc) · 3.45 KB
/
wikinews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Extract {{Cytat}}, {{CytatLewy}} and {{CytatPrawy}} quotations from Polish Wikinews pages.
import sys
sys.path.append('/home/alkamid/wikt/pywikipedia')
#sys.path.append('/home/adam/wikt/pywikipedia')
import codecs
from pywikibot import Category
import pywikibot
from pywikibot import pagegenerators
import re
import math
import datetime
# Patterns for the three quotation templates; each captures the raw text
# between the template name's first pipe and the closing braces.
_RE_CYTAT = re.compile(r'{{[cC]ytat\|(.*?)}}', re.DOTALL)
_RE_CYTAT_LEWY = re.compile(r'{{[cC]ytatLewy\|(.*?)}}', re.DOTALL)
_RE_CYTAT_PRAWY = re.compile(r'{{[cC]ytatPrawy\|(.*?)}}', re.DOTALL)

# Patterns applied to the captured parameter text of a {{Cytat}} match.
_RE_TRESC = re.compile(
    r'(.*?)($|\|2=|\|3=|\|4=|\|5=|\|[0-9]*px\|[0-9]*px\|)', re.DOTALL)
_RE_AUTOR = re.compile(r'(4=|\|[0-9]*px\|[0-9]*px\|)(.*?)($|\|)', re.DOTALL)
_RE_ZRODLO = re.compile(r'5=(.*)', re.DOTALL)

_LINK_LINE = "\n:'''link''': [[%s]]<br/><br/>"


def _append_text(path, text):
    """Append *text* to the file at *path* as UTF-8; do nothing if empty."""
    if not text:
        return
    # Text mode with an explicit encoding: the file object does the encoding,
    # and the context manager guarantees the handle is closed.
    with open(path, 'a', encoding='utf-8') as out:
        out.write(text)


def _format_cytat(params, title):
    """Return the output entry for one {{Cytat}} parameter string.

    *params* is the text captured by ``_RE_CYTAT``; *title* is the title of
    the page the quotation was found on.
    """
    pieces = []
    tresc = _RE_TRESC.search(params)
    autor = _RE_AUTOR.search(params)
    zrodlo = _RE_ZRODLO.search(params)
    if tresc:
        pieces.append("\n'''treść''': %s" % tresc.group(1))
    if autor:
        autor_line = "\n:'''autor''': %s" % autor.group(2)
        pieces.append(autor_line)
        # Console output kept from the original script.
        print(autor_line)
    if zrodlo:
        pieces.append("\n:'''źródło''': %s" % zrodlo.group(1))
    pieces.append(_LINK_LINE % title)
    return ''.join(pieces)


def _format_plain(body, title):
    """Return the output entry for a {{CytatLewy}}/{{CytatPrawy}} match."""
    return "\n'''treść''': %s" % body + _LINK_LINE % title


def main():
    """Collect quotation templates from every page of Polish Wikinews.

    For each page yielded by ``AllpagesPageGenerator``:

    * redirects and pages whose text cannot be fetched are recorded in
      ``log/wikinews.txt``;
    * ``{{Cytat|...}}`` templates are split into content / author / source
      and appended to ``output/wikinews.txt``;
    * ``{{CytatLewy|...}}`` and ``{{CytatPrawy|...}}`` templates are appended
      unparsed to ``output/wikinews_lewy.txt`` and
      ``output/wikinews_prawy.txt`` respectively.

    All files are opened in append mode and written as UTF-8.
    """
    site = pywikibot.Site('pl', 'wikinews')
    for page in pagegenerators.AllpagesPageGenerator(site=site):
        try:
            text = page.get()
        except pywikibot.IsRedirectPage:
            _append_text("log/wikinews.txt",
                         '\n*[[%s]] - przekierowanie' % page.title())
            continue
        except pywikibot.Error:
            print('[[%s]] - błąd' % page.title())
            _append_text("log/wikinews.txt",
                         '\n*[[%s]] - błąd' % page.title())
            continue

        title = page.title()

        entries = []
        for params in _RE_CYTAT.findall(text):
            # Console output kept from the original script.
            print(params)
            print('\n\n')
            entries.append(_format_cytat(params, title))
        _append_text("output/wikinews.txt", ''.join(entries))

        _append_text(
            "output/wikinews_lewy.txt",
            ''.join(_format_plain(body, title)
                    for body in _RE_CYTAT_LEWY.findall(text)))
        _append_text(
            "output/wikinews_prawy.txt",
            ''.join(_format_plain(body, title)
                    for body in _RE_CYTAT_PRAWY.findall(text)))
if __name__ == "__main__":
    # Script entry point: pywikibot.stopme() is called whether main()
    # returns normally or raises.
    try:
        main()
    finally:
        pywikibot.stopme()