browse_wikipedia.py
from bs4 import BeautifulSoup as bs
import requests
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
"""Remark: Browsing the web is relatively slow. This is because the function
requests.get() is relatively slow. Try to lower the limit value in order
to get a deeper graph."""
def get_hrefs(name, limit=100):
    """
    @name is a string which corresponds to the name of the Wikipedia entry.
    @limit is an integer.
    The first @limit pages which link to the page @name are returned (at most).
    """
    page = "https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/"\
           + name + "&namespace=0&limit=" + str(limit)
    page_response = requests.get(page)
    page_content = bs(page_response.content, "html.parser")
    # Return all hrefs (anchor tags) found in the page's content
    return page_content.find_all("a")
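# A hedged usage sketch (not part of the original file; the entry name is an
# arbitrary example and a network connection is required):
#   links = get_hrefs("Graph_theory", limit=25)
#   print(len(links))  # number of anchor tags found on the WhatLinksHere page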
def get_views(name):
    """Returns the number of views in the past 30 days of the Wikipedia entry
    with name @name."""
    page = "https://en.wikipedia.org/w/index.php?title="\
           + name + "&action=info"
    page_response = requests.get(page)
    page_content = bs(page_response.content, "html.parser")
    try:
        # Crude string parsing: isolate the number following the
        # "Page views in the past 30 days" label on the info page
        page_views = str(page_content).split("Page views in the past 30 days")\
            [1].split("\">")[1].split("<")[0]
    except IndexError:
        # This error arises when a Wikipedia entry is faulty
        page_views = str(0)
    return int(page_views.replace(",", ""))
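# A hedged usage sketch (not part of the original file; needs network access,
# and the entry name is an arbitrary example):
#   print(get_views("Graph_theory"))  # 30-day view count as an integer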
def get_predecessors(name, connections, num_pred, limit=100):
    """Get Wiki entries which link to the entry
    https://en.wikipedia.org/wiki/name. In case they are not yet contained in
    the dictionary @connections, they are written into it.
    @num_pred is the maximal number of entries added to connections.
    @limit is the length of the list of entries which link to the entry
    corresponding to @name."""
    if name not in connections.keys():
        links = get_hrefs(name, limit=limit)
        # Restrict to hrefs to Wikipedia main articles (excludes for instance
        # references and discussions).
        strng_link = "a href=\"/wiki/"
        themes = []
        count = 0
        for link in links:
            if strng_link in str(link):
                # Isolate the name of the Wikipedia entry
                theme = str(link).split("<a href=\"/wiki/")[1].split("\"")[0]
                if theme == name:
                    # Prevent self-loops
                    continue
                if (":" not in theme and "User" not in theme
                        and "Help" not in theme and count < num_pred):
                    # ":" not in theme excludes references and discussions
                    themes.append(theme)
                    count += 1
        # Make the list unique:
        themes = list(np.unique(themes))
        connections[name] = themes
    return connections
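# A hedged usage sketch (not part of the original file; the entry name and
# parameter values are arbitrary examples):
#   connections = get_predecessors("Graph_theory", {}, num_pred=5, limit=50)
#   # connections now maps "Graph_theory" to a list of up to 5 linking entries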
def get_pred_ranked(name, connections, num_pred, limit=100):
    """Get the @num_pred most viewed Wiki entries which link to the entry
    "https://en.wikipedia.org/wiki/name". In case they are not yet contained in
    the dictionary @connections, they are written into it.
    @num_pred is the maximal number of entries added to connections.
    @limit is the length of the list of entries which link to the entry
    corresponding to @name."""
    if name not in connections.keys():
        links = get_hrefs(name, limit=limit)
        # Restrict to hrefs to Wikipedia main articles (excludes for instance
        # references and discussions).
        strng_link = "a href=\"/wiki/"
        themes = []
        for link in links:
            if strng_link in str(link):
                # Isolate the name of the Wikipedia entry
                theme = str(link).split("<a href=\"/wiki/")[1].split("\"")[0]
                if theme == name:
                    # Prevent self-loops
                    continue
                if ":" not in theme and "User" not in theme and "Help" not in theme:
                    # ":" not in theme excludes references and discussions
                    page_theme_views = get_views(theme)
                    if len(themes) < num_pred:
                        # Fill the themes list up
                        themes.append((theme, page_theme_views))
                    else:
                        # If the themes list is full, replace the entry with
                        # the least views
                        views = [t[1] for t in themes]
                        if page_theme_views > min(views):
                            ind_min = views.index(min(views))
                            del themes[ind_min]
                            themes.append((theme, page_theme_views))
        # Make the list unique:
        themes = list(np.unique([t[0] for t in themes]))
        connections[name] = themes
    return connections
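# A hedged usage sketch (not part of the original file). Note that this
# variant is considerably slower than get_predecessors, since get_views()
# is called once per candidate link:
#   connections = get_pred_ranked("Graph_theory", {}, num_pred=5, limit=50)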
def get_all_predecessors(connections, num_pred, method=get_predecessors, limit=100):
    """Extend the dictionary of keys=sites, values=predecessors by the
    predecessors of the predecessors (the latter thereby become keys in the
    new dictionary)."""
    # First, get all predecessors (values) of the dictionary and collect them
    # in a unique(!) list.
    words_list = list()
    for site in connections:
        for predec in connections[site]:
            if predec not in connections and predec not in words_list:
                words_list.append(predec)
    # Now, determine the predecessors for each (unique) word
    for word in words_list:
        connections = method(word, connections, num_pred, limit=limit)
    return connections
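# A hedged usage sketch (not part of the original file): each call deepens the
# graph by one further level of predecessors:
#   connections = get_predecessors("Graph_theory", {}, num_pred=5)
#   connections = get_all_predecessors(connections, num_pred=5)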
def inc2adj(incidence_list):
    """Transforms a dictionary into the adjacency matrix of a directed graph.
    All elements of the dictionary, i.e. keys and values, are interpreted as
    nodes of the graph. The values of a key are interpreted as ingoing nodes
    of the node associated with the key. Furthermore, the dictionary node_values
    is returned. It associates to each site name (key) a unique integer (value)."""
    # The number of nodes corresponds to the number of unique entries in the
    # incidence list (keys + values).
    num_nodes = len(set(list(incidence_list.keys())
                        + [value for key in incidence_list.keys()
                           for value in incidence_list[key]]))
    adj = np.zeros((num_nodes, )*2)
    # Make a dictionary which assigns a unique integer to each entry in the
    # incidence list. Begin by numbering the keys.
    node_values = {key: i for i, key in enumerate(incidence_list.keys())}
    # Values which are not keys themselves get numbers after the last key
    count = len(incidence_list)
    for key in incidence_list.keys():
        for value in incidence_list[key]:
            if value in node_values:
                # If the value already has a number, put a 1 in the adjacency matrix
                adj[node_values[key], node_values[value]] = 1
            else:
                # Number the value and put it in the adjacency matrix
                node_values[value] = count
                adj[node_values[key], count] = 1
                count += 1
    return adj, node_values
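# A worked mini-example (not part of the original file): for {"A": ["B", "C"]}
# the nodes are numbered A=0, B=1, C=2, and the values B and C become ingoing
# nodes of A, i.e. the edges B -> A and C -> A:
#   adj, node_values = inc2adj({"A": ["B", "C"]})
#   # adj == [[0., 1., 1.], [0., 0., 0.], [0., 0., 0.]]
#   # node_values == {"A": 0, "B": 1, "C": 2}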
def plot_digraph(adjacency, node_values):
    """Plots a digraph from an adjacency matrix using the networkx module.
    Needs a dictionary node_values with node names as keys."""
    plt.figure()
    # Place the nodes evenly on the unit circle
    num_nodes = len(adjacency)
    positions = [(np.cos(2 * np.pi * k / num_nodes),
                  np.sin(2 * np.pi * k / num_nodes))
                 for k in range(num_nodes)]
    G = nx.DiGraph()
    for i in range(num_nodes):
        G.add_node(i, pos=positions[i])
        for k in range(num_nodes):
            if adjacency[i, k] > 0:
                # adjacency[i, k] > 0 encodes the directed edge k -> i
                G.add_edge(k, i)
    pos = nx.get_node_attributes(G, 'pos')
    # Map each node number back to its site name
    labeldict = {num: name for name, num in node_values.items()}
    # Draw network
    edges = G.edges()
    nx.draw_networkx_labels(G, pos, labels=labeldict, font_size=10, font_weight='normal')
    nx.draw_networkx_nodes(G, pos, node_color='pink')
    nx.draw_networkx_edges(G, pos, edgelist=edges, alpha=0.4, arrowsize=20)
    plt.show()
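# A hedged usage sketch (not part of the original file; opens a matplotlib
# window):
#   adj, node_values = inc2adj({"A": ["B"], "B": ["A"]})
#   plot_digraph(adj, node_values)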
def reduce_graph(adjacency, node_labels, centrality, rate):
    """This function reduces a large adjacency matrix according to some
    measure. Measures can be node degree, eigenvector centrality, or more
    generally any centrality measure. All nodes which have a centrality
    value above rate*max(centrality) are included in the new graph."""
    # Determine the nodes whose centrality value lies above the threshold
    values = centrality(adjacency)
    M = max(values)
    node_list = [k for k in range(len(adjacency)) if values[k] > M*rate]
    node_list.sort(key=lambda x: values[x], reverse=True)
    # Get labels of the corresponding nodes
    aux_dict = {value: key for key, value in node_labels.items()}
    node_labels_new = {aux_dict[node]: i for i, node in enumerate(node_list)}
    # Build the reduced adjacency matrix with associated node labels
    adjacency_reduced = np.zeros((len(node_list), )*2)
    for count, node in enumerate(node_list):
        adjacency_reduced[count, :] = [adjacency[node, num] for num in node_list]
    return adjacency_reduced, node_labels_new
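# A hedged usage sketch (not part of the original file): keep only the nodes
# whose in-degree exceeds half of the maximum in-degree:
#   adj_red, labels_red = reduce_graph(adj, node_values, degree_centrality, 0.5)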
def degree_centrality(adjacency, side='right'):
    """Compute the degree centrality of the graph given by the adjacency
    matrix. By default, the in-degree centrality is computed. If side='left',
    the out-degree centrality is computed."""
    if side == 'left':
        adjacency = adjacency.transpose()
    # Row k sums the ingoing edges of node k (adjacency[i, k] encodes k -> i)
    return np.array([sum(adjacency[k, :]) for k in range(len(adjacency))])
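# A worked mini-example (not part of the original file): with the convention
# adjacency[i, k] = 1 for an edge k -> i, node 0 below has in-degree 1 and
# node 1 has in-degree 0:
#   degree_centrality(np.array([[0., 1.], [0., 0.]]))  # -> array([1., 0.])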
def eigenvector_centrality(adjacency, side='right'):
    """Return the eigenvector centrality of an adjacency matrix. The matrix
    must be given as a (square) numpy array. With the convention
    adjacency[i, k] = 1 for an edge k -> i, the default right eigenvector
    corresponds to the in-degree; if side='left', the left eigenvector is
    computed, corresponding to the out-degree."""
    if side == 'left':
        adjacency = adjacency.transpose()
    w, v = np.linalg.eig(adjacency)
    # Find the index of the eigenvalue of largest modulus (for a nonnegative
    # matrix this eigenvalue is real and nonnegative by the Perron-Frobenius
    # theorem)
    ind = np.argmax(np.abs(w))
    eigenvector = np.round(v[:, ind].real, 12)
    return [abs(entry) for entry in eigenvector]
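# The block below is a minimal end-to-end sketch of how the functions above
# fit together. It is illustrative and not part of the original script: the
# start entry "Graph_theory" and all parameter values are arbitrary
# assumptions, and running it requires network access.
if __name__ == "__main__":
    # Seed the crawl with a single entry and fetch its predecessors
    connections = get_predecessors("Graph_theory", {}, num_pred=5, limit=50)
    # Expand the graph by one further level of predecessors
    connections = get_all_predecessors(connections, num_pred=5, limit=50)
    # Turn the incidence dictionary into an adjacency matrix, reduce it by
    # in-degree centrality, and plot the result
    adj, node_values = inc2adj(connections)
    adj_red, labels_red = reduce_graph(adj, node_values, degree_centrality, 0.5)
    plot_digraph(adj_red, labels_red)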