-
Notifications
You must be signed in to change notification settings - Fork 2
/
xkcd-scraper.py
126 lines (98 loc) · 3.7 KB
/
xkcd-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from SPARQLWrapper import SPARQLWrapper, JSON
# Language codes for which every newly created item gets a label (L<code>)
# and an "xkcd <number>" alias (A<code>).
languages = ['fr', 'en', 'br', 'de']
def wikidata_sparql_query(query):
    """
    Send a SPARQL query to the Wikidata Query Service.

    Returns the response as converted by SPARQLWrapper from the JSON
    return format.
    """
    client = SPARQLWrapper(
        "https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    client.setReturnFormat(JSON)
    client.setQuery(query)
    return client.query().convert()
def ordinal(value):
    """
    Converts an integer (or its string representation) to its English
    ordinal form, e.g. 1 -> "1st", 12 -> "12th", "23" -> "23rd".

    Anything int() cannot convert (including None) is returned unchanged.
    Negative numbers take the suffix of their absolute value ("-1st").
    """
    try:
        number = int(value)
    except (TypeError, ValueError):
        # Not a number: hand the input back untouched.
        return value

    # Work on the magnitude: Python's % is always non-negative for a
    # positive modulus, so -1 % 10 == 9 would pick the wrong suffix.
    magnitude = abs(number)
    if magnitude % 100 // 10 == 1:
        # 10-19 (and 110-119, ...) always take "th".
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(magnitude % 10, "th")
    return u"%d%s" % (number, suffix)
def statement(prop, value, source):
    """
    Builds one tab-separated line in the format QuickStatements expects:
    LAST, the property, the value, then S854 followed by the source
    wrapped in double quotes.
    """
    fields = [
        "LAST",
        "{}".format(prop),
        "{}".format(value),
        "S854",
        "\"{}\"".format(source),
    ]
    return "\t".join(fields)
# Get what is already in Wikidata to ignore it
imported_episodes = []
results = wikidata_sparql_query("""
SELECT DISTINCT ?episode ?episodeLabel ?number WHERE {
?episode wdt:P31 wd:Q838795 .
?episode wdt:P361 wd:Q13915 .
?episode wdt:P433 ?number .
SERVICE wikibase:label {
bd:serviceParam wikibase:language "en" .
}
} ORDER BY xsd:integer(?number)
""")
for r in results["results"]["bindings"]:
    try:
        imported_episodes.append(int(r["number"]["value"]))
    except (KeyError, TypeError, ValueError):
        # Missing or non-numeric number binding: report it and carry on.
        # Only these errors are caught so that Ctrl-C and real bugs are
        # not silently swallowed.
        print("{} has no episode number.".format(r))

# default=0 keeps the script working when the query returns no usable
# episode number at all (max() of an empty list raises ValueError).
latest_imported_episode = max(imported_episodes, default=0)

# Report gaps below the highest imported number; a set makes each
# membership test O(1) instead of scanning the list.
known_episodes = set(imported_episodes)
for i in range(1, latest_imported_episode):
    # There is no episode 404.
    if i not in known_episodes and i != 404:
        print('Episode {} is missing :('.format(i))
# Get the episodes list
root_url = 'http://www.xkcd.com'
index_url = root_url + '/archive/index.html'
# The timeout stops the script from hanging on a stalled connection, and
# raise_for_status() avoids parsing an HTTP error page as the archive.
response = requests.get(index_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
episodes = soup.select('div#middleContainer a')
# Process the links in the opposite order to the one used on the page.
episodes.reverse()

for e in episodes:
    title = "\"" + e.get_text() + "\""
    urlbit = e.attrs.get('href') or ""
    episode_url = root_url + urlbit
    episodenumber = urlbit.replace("/", "")

    # Skip any link whose href is not a plain strip number instead of
    # crashing on int().
    try:
        number = int(episodenumber)
    except ValueError:
        continue
    # Only strips newer than the latest one already in Wikidata are emitted.
    if number <= latest_imported_episode:
        continue

    # The link's title attribute is split on "-" and each part zero-padded
    # to two digits to build the QuickStatements time value (/11 = day
    # precision). A link without that attribute gets no date statement.
    raw_date = e.attrs.get('title')
    date = None
    if raw_date:
        date = "+0000000" + '-'.join(
            "{0:0>2}".format(v) for v in raw_date.split("-")
        ) + "T00:00:00Z/11"

    print("CREATE")
    for lang in languages:
        print("LAST\tL{}\t{}".format(lang, title))
        print("LAST\tA{}\t\"xkcd {}\"".format(lang, episodenumber))
    print("LAST\tDfr\t\"strip de xkcd n°{}\"".format(episodenumber))
    print("LAST\tDde\t\"Folge des Webcomics xkcd\"")
    print("LAST\tDen\t\"{} strip of the webcomic xkcd\"".format(
        ordinal(episodenumber)))
    print(statement("P31", "Q838795", episode_url))  # instance of
    # The title is monolingual text (en:"..."), stored under P1476 rather
    # than as a second "instance of" value.
    print(statement("P1476", "en:" + title, episode_url))  # title
    print(statement("P361", "Q13915", episode_url))  # part of
    print(statement("P433", '"' + episodenumber + '"', episode_url))  # nb
    if date is not None:
        print(statement("P577", date, episode_url))  # date
    print(statement("P50", "Q285048", episode_url))  # Author: R. Munroe
    print(statement("P2699", '"' + episode_url + '"', episode_url))  # URL
    print(statement("P364", "Q1860", episode_url))  # Language: English
    print(statement("P275", "Q6936496", episode_url))  # Licence: CC-BY-NC
    print("")