-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchecktitles.py
73 lines (55 loc) · 2.01 KB
/
checktitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re, pywikibot, os
import toolforge
from customFuncs import get_quarry as quarry
from collections import Counter
# Wiki handle used further down to load and save the report page.
site = pywikibot.Site("lv", "wikipedia")
# Database connection used by run_query() (also its default `connection`).
# NOTE(review): presumably 'lvwiki_p' is the replica database and 'analytics'
# the cluster to connect to — confirm against the toolforge library docs.
conn = toolforge.connect('lvwiki_p','analytics')
def encode_if_necessary(b):
    """Return *b* as text, decoding it from UTF-8 when it is binary data.

    Args:
        b: A value taken from a database row. ``bytes`` (including
            subclasses) and ``bytearray`` values are decoded as UTF-8;
            anything else is passed through untouched.

    Returns:
        The decoded ``str`` for binary input, otherwise *b* itself.

    Raises:
        UnicodeDecodeError: If the binary input is not valid UTF-8.
    """
    # isinstance() rather than an exact type() comparison so that bytes
    # subclasses and bytearray values are decoded too.
    if isinstance(b, (bytes, bytearray)):
        return b.decode('utf8')
    return b
def run_query(query, connection=conn):
    """Execute *query* on *connection* and return all result rows.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        connection: Object providing ``cursor()``; defaults to the
            module-level ``conn``.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the executed query.

    Raises:
        SystemExit: If the user interrupts the query with Ctrl-C.
    """
    # Imported here because the module header does not import ``sys``;
    # without it the KeyboardInterrupt handler would raise NameError.
    import sys

    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
    except KeyboardInterrupt:
        sys.exit()
    finally:
        # Release the cursor whether the query succeeded or not.
        if cursor is not None:
            cursor.close()
    return rows
#
# Titles of every page with page_is_redirect=0 in namespace 0.
SQLMAIN = """select page_title
from page
where page_is_redirect=0 and page_namespace=0"""

query_res = run_query(SQLMAIN, conn)

# First column of each row, decoded to text, with underscores shown as spaces.
data = [
    encode_if_necessary(row[0]).replace('_', ' ')
    for row in query_res
]

# How many titles share each lowercased spelling.
lowercased_unique = Counter(title.lower() for title in data)

# Lowercased spellings that occur more than once, in sorted order.
dublicates = sorted(
    lowered for lowered, count in lowercased_unique.items() if count > 1
)
# Characters treated as ordinary in a title; the pattern built from them
# matches any character outside this set.
# See https://www.mediawiki.org/wiki/Manual:$wgLegalTitleChars
# Raw string: the backslash escapes below are meant for the regex engine and
# are not valid Python string escapes.
stringtocheck = r'''"0-9A-Za-zĀČĒĢĪĶĻŅŠŪŽāčēģīķļņšūž\. ,!%\-\(\)—:'\/+&\?№ßö'''
badthings = ['"', "'"]
regexr = "[^{}]".format(stringtocheck)

# Compile once instead of re-resolving the patterns for every title.
_unusual_char_re = re.compile(regexr)
_digit_dash_digit_re = re.compile(r'\d[-–]\d')
_quote_chars = tuple(badthings)

# Wikitext list items for the report page, one per finding.
tosave = []
for title in data:
    # Title both starts and ends with a quotation mark
    # (message: "quotation marks at start+end").
    # startswith/endswith avoid an IndexError on an empty string.
    if title.startswith(_quote_chars) and title.endswith(_quote_chars):
        tosave.append('* [[{}]]: pēdiņas sākumā+beigās'.format(title))

    # Characters outside the ordinary set. These matches are collected but
    # never added to `tosave`: `trt` is reassigned right below, so this scan
    # currently has no effect on the report.
    # NOTE(review): the message reads "two spaces" ("divas atstarpes") while
    # the condition tests for a single space — confirm the intended literal
    # before re-enabling this report.
    trt = _unusual_char_re.findall(title)
    if trt:
        if ' ' in title:
            trt.append('* [[{}]]: divas atstarpes'.format(title))

    # Digit, hyphen or en dash, digit (e.g. "1-2" or "1–2").
    # NOTE(review): assumes this check runs for every title, independent of
    # the character scan above — confirm against the deployed script.
    trt = _digit_dash_digit_re.findall(title)
    if trt:
        tosave.append("* [[{}]]: {}".format(title, ', '.join(trt)))
#
# Group the titles by lowercased spelling in a single pass, so that each
# duplicate is a dictionary lookup instead of a rescan of the whole title
# list. Titles keep the order they have in `data`.
titles_by_lowercase = {}
for title in data:
    titles_by_lowercase.setdefault(title.lower(), []).append(title)

# One report line per lowercased spelling shared by several titles, listing
# every title that has that spelling.
for entry in dublicates:
    candidates = titles_by_lowercase.get(entry, [])
    if candidates:
        tosave.append("* {}: [[{}]]".format(entry, ']], [['.join(candidates)))

# Replace the report page's text with the collected findings and save it.
page = pywikibot.Page(site, "Dalībnieks:Edgars2007/Nestandarta nosaukumi")
page.text = '\n'.join(tosave)
page.save(summary="upd", botflag=False)