# example.py
# Forked from predict-idlab/pyRDF2Vec.
import random
import os
import numpy as np
import rdflib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE
from rdf2vec.converters import rdflib_to_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer
import warnings
# Silence every warning category from here on so the console output of this
# example is limited to the results it prints itself.
warnings.filterwarnings(action='ignore')

# Seed both the standard-library and the NumPy global random number
# generators with the same fixed value so repeated runs draw the same
# random numbers from them.
random.seed(42)
np.random.seed(42)
#########################################################################
#                             DATA LOADING                              #
#########################################################################

# Read the tab-separated train and test splits. The 'bond' column holds the
# entity identifiers and 'label_mutagenic' holds the class of each entity.
train_data = pd.read_csv('sample/MUTAG_train.tsv', sep='\t')
test_data = pd.read_csv('sample/MUTAG_test.tsv', sep='\t')

# Wrap every entity identifier in an rdflib.URIRef; labels stay as the
# pandas column they were read from.
train_entities = list(map(rdflib.URIRef, train_data['bond']))
train_labels = train_data['label_mutagenic']
test_entities = list(map(rdflib.URIRef, test_data['bond']))
test_labels = test_data['label_mutagenic']

# Combined views: train rows first, test rows after. The rest of the script
# slices these by len(train_entities), so this order matters.
all_entities = [*train_entities, *test_entities]
all_labels = [*train_labels, *test_labels]

# Predicates that carry the target label. They are passed to rdflib_to_kg so
# that, per the original note, triples using them are left out of the graph
# (otherwise the label would leak into the embeddings).
label_predicates = ['http://dl-learner.org/carcinogenesis#isMutagenic']

# Load the OWL file and convert it into the KnowledgeGraph object used by
# the walkers.
kg = rdflib_to_kg('sample/mutag.owl', label_predicates=label_predicates)
#########################################################################
#                          CREATING EMBEDDINGS                          #
#########################################################################

# Random-walk extraction strategy. The original note describes these as
# walks of depth 2.
# NOTE(review): the second argument (4) presumably limits how many walks are
# extracted -- confirm against the RandomWalker signature.
random_walker = RandomWalker(2, 4)

# Build the transformer around that single walker and embed every train and
# test entity in one fit_transform call.
transformer = RDF2VecTransformer(walkers=[random_walker], sg=1)
walk_embeddings = transformer.fit_transform(kg, all_entities)

# all_entities lists the train entities first, so the leading n_train
# embeddings belong to the train split and the remainder to the test split.
n_train = len(train_entities)
train_embeddings, test_embeddings = (
    walk_embeddings[:n_train],
    walk_embeddings[n_train:],
)
#########################################################################
#                            FIT CLASSIFIER                             #
#########################################################################

# Fit a support vector machine on the train embeddings.
clf = SVC(random_state=42)
clf.fit(train_embeddings, train_labels)

# Predict the test split once and reuse the result for both metrics instead
# of running the classifier over the same embeddings twice.
test_predictions = clf.predict(test_embeddings)

# Print the accuracy on the same line as its caption, then the confusion
# matrix on the following lines.
print('Support Vector Machine: Accuracy = ', end='')
print(accuracy_score(test_labels, test_predictions))
print(confusion_matrix(test_labels, test_predictions))
#########################################################################
#                              T-SNE PLOT                               #
#########################################################################

# Reduce the RDF2Vec embeddings of all entities (train followed by test)
# with t-SNE so they can be drawn in a scatter plot.
walk_tsne = TSNE(random_state=42)
X_walk_tsne = walk_tsne.fit_transform(walk_embeddings)

# Assign one colour per distinct label. The labels are sorted first because
# iterating a bare set gives no ordering guarantee, and the legend below
# hard-codes which colour stands for which class.
colors = ['r', 'g']
color_map = {
    label: colors[i] for i, label in enumerate(sorted(set(all_labels)))
}

# Number of train points; everything from this index onwards is test data.
n_train = len(train_entities)
train_colors = [color_map[label] for label in all_labels[:n_train]]
test_colors = [color_map[label] for label in all_labels[n_train:]]

# Plot the train embeddings as filled markers.
plt.figure(figsize=(10, 4))
plt.scatter(
    X_walk_tsne[:n_train, 0],
    X_walk_tsne[:n_train, 1],
    edgecolors=train_colors,
    facecolors=train_colors,
)

# Plot the test embeddings as hollow markers.
plt.scatter(
    X_walk_tsne[n_train:, 0],
    X_walk_tsne[n_train:, 1],
    edgecolors=test_colors,
    facecolors='none',
)

# Annotate two hand-picked points with the last path segment of their URI.
# The label text is placed in axes-fraction coordinates, staggered by the
# loop index so the two labels do not overlap.
for i, ix in enumerate([25, 35]):
    plt.annotate(
        all_entities[ix].split('/')[-1],
        xy=(X_walk_tsne[ix, 0], X_walk_tsne[ix, 1]), xycoords='data',
        xytext=(0.1 * i, 0.05 + 0.1 * i),
        fontsize=8, textcoords='axes fraction',
        arrowprops=dict(arrowstyle="->", facecolor='black')
    )

# Create a legend from empty proxy scatters.
# NOTE(review): 'r' is shown as '-' and 'g' as '+', which assumes the
# negative label sorts before the positive one (e.g. 0 before 1) -- confirm
# against the label values in the TSV files.
plt.scatter([], [], edgecolors='r', facecolors='r', label='train -')
plt.scatter([], [], edgecolors='g', facecolors='g', label='train +')
plt.scatter([], [], edgecolors='r', facecolors='none', label='test -')
plt.scatter([], [], edgecolors='g', facecolors='none', label='test +')
# 'upper right' is the valid Matplotlib location string; 'top right' is not
# an accepted value for loc.
plt.legend(loc='upper right', ncol=2)

# Save the figure before showing it, then display it.
plt.title('pyRDF2Vec', fontsize=32)
plt.axis('off')
plt.savefig('embeddings.png')
plt.show()