-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmissing_ref_content.py
117 lines (91 loc) · 4.12 KB
/
missing_ref_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
import pywikibot as pw
import mwparserfromhell as mwp
import helpers
from helpers import convert_to
# Handles for the Armenian ('hy'), Russian ('ru') and English ('en')
# Wikipedias, unpacked in the same order as the language codes passed in.
# NOTE(review): presumably pywikibot Site objects -- confirm against
# helpers.get_wikipedias.
hywiki, ruwiki, enwiki = helpers.get_wikipedias('hy', 'ru', 'en')
def extract_references_without_content(page_text):
    """Return the names of named ``<ref>`` tags that never carry a body.

    The wikitext is parsed with mwparserfromhell and every ``<ref>`` tag that
    has a ``name`` attribute is grouped by that name.  A name is reported when
    all of its occurrences are self-closing (``<ref name="x" />``), i.e. the
    reference is used but no occurrence on the page defines its content.

    :param page_text: wikitext of the page to inspect.
    :return: list of reference names, in order of first appearance.
    """
    tags_by_name = {}
    for node in mwp.parse(page_text).filter_tags():
        # Only named <ref> tags are of interest; skip everything else.
        if not (node.tag == 'ref' and node.has('name')):
            continue
        tags_by_name.setdefault(str(node.get('name').value), []).append(node)

    # A name is "missing" when not a single occurrence has a body.
    return [
        name
        for name, nodes in tags_by_name.items()
        if all(node.self_closing for node in nodes)
    ]
# Cache of parsed wikitext keyed by the raw text, so that looking up several
# reference names in the same revision parses that revision only once.
# process_page() rebinds this name to a fresh dict for every page it handles.
parse_map = {}


def get_reference_with_content_by_name(page_text, ref_name):
    """Return the first ``<ref name=ref_name>`` tag in ``page_text`` with a body.

    Only ``<ref>`` tags are considered (matching the filter used by
    ``extract_references_without_content``), so an unrelated tag that happens
    to carry the same ``name`` attribute is never returned.

    :param page_text: wikitext to search; also used as the cache key.
    :param ref_name: exact value of the ``name`` attribute to look for.
    :return: the matching mwparserfromhell tag node, or ``None`` when no
        non-self-closing ``<ref>`` with that name exists.
    """
    parsed = parse_map.get(page_text)
    if parsed is None:
        parsed = mwp.parse(page_text)
        parse_map[page_text] = parsed

    for tag in parsed.filter_tags():
        is_named_ref_with_body = (
            tag.tag == 'ref' and tag.has('name') and not tag.self_closing
        )
        if is_named_ref_with_body and str(tag.get('name').value) == ref_name:
            return tag
    return None
def get_from_rev(revision, ref_name):
    """Look up a reference with content inside a single revision.

    The revision is expected to expose its wikitext at
    ``revision['slots']['main']['*']``.  If any of those keys is absent, or
    the wikitext does not even contain ``ref_name`` as a substring, the
    revision is skipped without parsing it.

    :param revision: mapping describing one page revision.
    :param ref_name: name of the reference to look for.
    :return: ``(tag, revid)`` when a ``<ref>`` with that name and a body is
        found, otherwise ``(None, None)``.
    """
    not_found = (None, None)

    if 'slots' not in revision:
        return not_found
    slots = revision['slots']
    if 'main' not in slots:
        return not_found
    main_slot = slots['main']
    if '*' not in main_slot:
        return not_found

    wikitext = main_slot['*']
    # Cheap substring test first; parsing is only worth it on a hit.
    if ref_name not in wikitext:
        return not_found

    tag = get_reference_with_content_by_name(wikitext, ref_name)
    return (tag, revision['revid']) if tag else not_found
def is_power_of_two(num):
    """Tell whether ``num`` is a positive integer power of two (1, 2, 4, ...).

    Clearing the lowest set bit with ``num & (num - 1)`` leaves zero exactly
    when at most one bit is set; the ``num > 0`` check then rules out zero
    and negative values.
    """
    clears_to_zero = (num & (num - 1)) == 0
    return num > 0 if clears_to_zero else False
def _sampled_revisions(wiki_page, total=32):
    """Return the 1st, 2nd, 4th, 8th, ... of the first ``total`` revisions.

    Revisions are requested with their content.  An empty list is returned
    when ``wiki_page`` is falsy or does not exist.
    """
    if not wiki_page or not wiki_page.exists():
        return []
    return [
        revision
        for position, revision in enumerate(
            wiki_page.revisions(content=True, total=total), start=1)
        if is_power_of_two(position)
    ]


def _find_reference(revisions, ref_name):
    """Return ``(tag, revid)`` from the first revision defining ``ref_name``."""
    for revision in revisions:
        tag, revid = get_from_rev(revision, ref_name)
        if tag is not None:
            return tag, revid
    return None, None


def _fill_empty_reference(text, ref_name, replacement):
    """Replace the first self-closing ``<ref name=ref_name/>`` in ``text``."""
    # The name is escaped so characters such as "(", "+" or "." are matched
    # literally; whitespace around "=" and either quote style are accepted.
    pattern = (r'<ref\s*name\s*=\s*["\']?' + re.escape(ref_name)
               + r'["\']?\s*/>')
    # A function replacement keeps backslashes in the reference body from
    # being interpreted as regex template escapes.
    return re.sub(pattern, lambda _match: replacement, text, count=1)


def _build_summary(en_revs, ru_revs):
    """Build the edit summary linking every source revision exactly once."""
    links = [f'[[:en:Special:PermaLink/{revid}]]'
             for revid in dict.fromkeys(en_revs)]
    links += [f'[[:ru:Special:PermaLink/{revid}]]'
              for revid in dict.fromkeys(ru_revs)]
    return 'Ծանոթագրություններ ըստ՝ ' + ', '.join(links)


def process_page(page):
    """Fill in named references that are used on ``page`` but never defined.

    For every reference name that only occurs as a self-closing tag, a sample
    of revisions of the corresponding English page is searched for a ``<ref>``
    with that name and a body; if none is found the Russian page is searched
    the same way.  The first self-closing occurrence on ``page`` is replaced
    with the tag that was found, and the page is saved with a summary linking
    the revisions the content was taken from.

    :param page: the page to repair; its ``text`` is modified and saved only
        when at least one reference was filled in.
    """
    # Start every page with an empty parse cache so it cannot grow unbounded.
    global parse_map
    parse_map = {}

    original_text = page.text
    missing_refs = extract_references_without_content(original_text)
    if not missing_refs:
        return

    # NOTE(review): convert_to presumably returns (page on target wiki, item);
    # only the first element is used here.
    enpage, _ = convert_to(page, enwiki)
    # Revisions are downloaded once per page, not once per missing reference.
    en_revisions = _sampled_revisions(enpage)
    ru_revisions = None  # fetched lazily, only if the English page falls short

    text = original_text
    en_revs = []
    ru_revs = []
    for ref_name in missing_refs:
        tag, revid = _find_reference(en_revisions, ref_name)
        source_revs = en_revs
        if tag is None:
            if ru_revisions is None:
                rupage, _ = convert_to(page, ruwiki)
                ru_revisions = _sampled_revisions(rupage)
            tag, revid = _find_reference(ru_revisions, ref_name)
            source_revs = ru_revs
        if tag is None:
            continue

        updated = _fill_empty_reference(text, ref_name, str(tag))
        if updated != text:
            # Credit a revision only when its content actually went in.
            text = updated
            source_revs.append(revid)

    if text != original_text:
        page.text = text
        page.save(_build_summary(en_revs, ru_revs), botflag=False)
# Script driver: walk the hywiki maintenance category named below and try to
# repair every member page, iterating the members in reverse order.
cat = pw.Category(hywiki, 'Կատեգորիա:Դատարկ ծանոթագրություններով հոդվածներ')
for member in cat.members(reverse=True):
    process_page(member)