book_data.py
#!/usr/local/bin/python3
import re
import os
import io
import sys
import common
import pywikibot
ISBN_13 = 'P212'
ISBN_10 = 'P957'
OCLC_ID = 'P243'
PAGE_NUM_ID = 'P1104'
BOOK_TEMPLATE = 'Infobox book'
ISBN_PROPS = { 10: ISBN_10, 13: ISBN_13 }
ALL_PROPS = (PAGE_NUM_ID, OCLC_ID, ISBN_13, ISBN_10)
# For basic validation of the structure of both ISBN-10 and ISBN-13
RE_ISBN = re.compile(r'^(97(8|9))?\d{9}(\d|X)$', re.I)
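# A few illustrative (hypothetical) values and what the pattern does
# with them once hyphens are stripped:
#   RE_ISBN.match('9780306406157')   # 13 digits -> match (ISBN-13)
#   RE_ISBN.match('030640615X')      # 10 chars, X check digit -> match (ISBN-10)
#   RE_ISBN.match('12345')           # wrong length -> None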

def main(limit):
    site = pywikibot.Site('en', 'wikipedia')
    repo = site.data_repository()
    page = pywikibot.Page(site, BOOK_TEMPLATE, ns=10)
    pages = page.getReferences(follow_redirects=False,
                               only_template_inclusion=True,
                               namespaces=[0],
                               total=limit)
    # This does the heavy work...
    allData = getData(pages, limit)
    # Now push everything to the repo, prop by prop
    for prop in ALL_PROPS:
        data = allData[prop]
        if data:
            result = common.addMultipleClaims(data, prop, check_value=False, summary='')
            print(f"Finished for {prop}. Updated {result['added']} items, "
                  f"{result['skipped']} were skipped")

def getPageNum(templates):
    page_num = getValueRaw(templates, 'pages')
    if not page_num:
        return None
    if page_num.isdigit():
        return page_num
    num = re.findall(r'\d+', page_num)
    # There are sometimes multiple values, probably for different
    # editions; these are hard to extract programmatically from a
    # free-form string, so only use the value when there is exactly one
    return num[0] if len(num) == 1 else None
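# Illustrative (hypothetical) 'pages' values and results:
#   '320'                           -> '320'
#   'xii, 320'                      -> '320' (single number found)
#   '320 (hardcover), 288 (paper)'  -> None  (ambiguous: two numbers)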

def getISBN(templates):
    isbn = getValueRaw(templates, 'isbn')
    if isbn:
        isbn = isbn.strip()
        raw = isbn.replace('-', '')
        ntype = ISBN_PROPS.get(len(raw))
        if ntype and RE_ISBN.match(raw):
            return isbn, ntype
    return None

def getOCLC(templates):
    ocnum = getValueRaw(templates, 'oclc')
    if ocnum:
        ocnum = ocnum.strip()
        return ocnum if re.match(r'^\d{1,14}$', ocnum) else None
    return None

def getValueRaw(templates, name):
    for t in templates:
        if t[0] == BOOK_TEMPLATE:
            return t[1].get(name, False)
    return False
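# For reference, `page.raw_extracted_templates` yields (name, params)
# pairs roughly shaped like the following; the values here are
# hypothetical and only illustrate what getValueRaw receives:
#   [('Infobox book', OrderedDict([('name', 'Some Title'),
#                                  ('pages', '320'),
#                                  ('isbn', '978-0-00-000000-2')]))]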

def checkClaims(claimIDs, page):
    """
    Check the repo to see which of these claims are still missing
    from the item. Also verify that the item is of the right
    instance type for the modifications.
    """
    item = common.getDataItem(page)
    if item is None:
        # Treat everything as existing if there is no data item at all
        return [], []
    claims = item.get()['claims']
    P31 = claims.get('P31', [])
    if P31:
        instance = P31[0].toJSON()
        # The datavalue of a wikibase-item claim looks like
        # {'entity-type': 'item', 'numeric-id': 571, ...}
        i_item = instance['mainsnak']['datavalue']['value']
        # 'book (Q571)', 'version, edition, or translation (Q3331189)'
        if int(i_item.get('numeric-id')) not in (571, 3331189):
            return [], []
    res = list()
    for claimID in claimIDs:
        if claimID not in claims:
            res.append(claimID)
    return res, item

def getData(pages, limit):
    """
    Initialize the `data` dict with the properties that we will work on.
    The dictionary is returned with all the data collected so far,
    arranged in the following format:
    data = {
        prop1: [ [ val, item ], [ val, item ] ],
        prop2: [ [ val, item ], [ val, item ] ],
        prop3: [ [ val, item ], [ val, item ] ]
    }
    `val` is an actual value extracted from the pages. Each val has a
    corresponding data item object, which is where it belongs on the
    repo (the data item of the source wiki page). The prop keys are the
    property IDs the data belongs to.
    """
    # NB: dict.fromkeys(ALL_PROPS, []) would share a single list
    # between all keys, so build a fresh list per property instead
    data = {prop: [] for prop in ALL_PROPS}
    path = os.path.join(os.path.dirname(__file__), '__local__', 'P1104_titles.txt')
    count = 0
    # 'a+' creates the file if missing and opens positioned at EOF,
    # so rewind to the start before reading
    with open(path, mode='a+') as file:
        file.seek(0, io.SEEK_SET)
        lines = file.readlines()
    titles = [t.strip() for t in lines]
    for page in pages:
        title = page.title()
        if title not in titles:
            continue
        ids, item = checkClaims(ALL_PROPS, page)
        if ids == [] or item == []:
            continue
        temps = page.raw_extracted_templates
        for prop in ids:
            res = extractValue(prop, temps)
            if res:
                # Special handling for ISBN (13|10): getISBN returns
                # (value, prop), so unpack the pair we held from the
                # point of extraction
                if prop in ISBN_PROPS.values():
                    res, prop = res
                data[prop].append([res, item])
        count += 1
        if count == limit:
            break
    return data

def extractValue(prop, temps):
    if prop == PAGE_NUM_ID:
        return getPageNum(temps)
    elif prop == OCLC_ID:
        return getOCLC(temps)
    elif prop in ISBN_PROPS.values():
        return getISBN(temps)
    raise Exception(f'Unsupported prop {prop}')

def sparql_query():
    """SPARQL query alternative"""
    return """\
SELECT ?item
WHERE {
  VALUES ?type { wd:Q571 wd:Q7725634 }
  ?item wdt:P31 ?type .
  MINUS { ?item wdt:P1104 [] }
}"""

if __name__ == '__main__':
    limit = int(sys.argv[1])
    main(limit)