Skip to content

Commit

Permalink
Upload necessary data and preprocessing code
Browse files Browse the repository at this point in the history
  • Loading branch information
skeletondyh committed Jun 16, 2021
1 parent 182fe83 commit 02ca17f
Show file tree
Hide file tree
Showing 16 changed files with 462 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
lp/data/
lp/preprocessed/Amazon/unconnected_pairs_offset.npy
lp/preprocessed/Amazon/neg_ratings_offset.npy
lp/preprocessed/Douban_Movie/neg_ratings_offset.npy
lp/preprocessed/Douban_Movie/unconnected_pairs_offset.npy
lp/preprocessed/Yelp/neg_ratings_offset.npy
lp/preprocessed/Yelp/unconnected_pairs_offset.npy

nc/data/
40 changes: 40 additions & 0 deletions lp/gen_neg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np
import scipy.sparse as sp
import os
import torch
import sys

def main(prefix):
    """Create negative link-prediction pairs matching the positive split sizes.

    Loads the positive train/val/test pairs (``pos_pairs_offset.npz``), the
    pool of explicitly negative ratings (``neg_ratings_offset.npy``) and the
    pool of unconnected node pairs (``unconnected_pairs_offset.npy``) from
    *prefix*.  If there are fewer negative ratings than positive pairs, the
    deficit is filled with randomly sampled unconnected pairs.  The negatives
    are then shuffled and split into train/val/test sets whose sizes mirror
    the positive splits, and saved as ``neg_pairs_offset.npz`` under *prefix*.

    Uses NumPy's global RNG; seed it beforehand for reproducibility.

    Raises:
        ValueError: if the unconnected-pair pool is too small to cover the
            deficit, or the final negative count does not match the positive
            count.
    """
    pos_pairs_offset = np.load(os.path.join(prefix, "pos_pairs_offset.npz"))
    unconnected_pairs_offset = np.load(os.path.join(prefix, "unconnected_pairs_offset.npy"))
    neg_ratings_offset = np.load(os.path.join(prefix, "neg_ratings_offset.npy"))

    train_len = pos_pairs_offset['train'].shape[0]
    val_len = pos_pairs_offset['val'].shape[0]
    test_len = pos_pairs_offset['test'].shape[0]
    pos_len = train_len + val_len + test_len

    if pos_len > neg_ratings_offset.shape[0]:
        # Not enough explicit negative ratings: top up with unconnected pairs.
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if unconnected_pairs_offset.shape[0] <= pos_len:
            raise ValueError("unconnected-pair pool too small to cover the negative deficit")
        indices = np.arange(unconnected_pairs_offset.shape[0])
        np.random.shuffle(indices)
        makeup = indices[:pos_len - neg_ratings_offset.shape[0]]
        neg_ratings_offset = np.concatenate((neg_ratings_offset, unconnected_pairs_offset[makeup]), axis=0)
    if pos_len != neg_ratings_offset.shape[0]:
        raise ValueError("negative pair count does not match positive pair count")

    # Shuffle the pooled negatives, then carve out splits of the same sizes
    # as the positive train/val/test sets.
    indices = np.arange(neg_ratings_offset.shape[0])
    np.random.shuffle(indices)
    np.savez(os.path.join(prefix, "neg_pairs_offset"), train=neg_ratings_offset[indices[:train_len]],
             val=neg_ratings_offset[indices[train_len:train_len + val_len]],
             test=neg_ratings_offset[indices[train_len + val_len:pos_len]])

if __name__ == '__main__':
    # Usage: python gen_neg.py <dataset> <seed>
    dataset_name, seed = sys.argv[1], int(sys.argv[2])
    np.random.seed(seed)
    main(os.path.join("./preprocessed/", dataset_name))

#! Yelp 2
#! Amazon 4
#! Douban_Movie 6
377 changes: 377 additions & 0 deletions lp/preprocess.py

Large diffs are not rendered by default.

Binary file added lp/preprocessed/Amazon/adjs_offset.pkl
Binary file not shown.
Binary file added lp/preprocessed/Amazon/neg_pairs_offset.npz
Binary file not shown.
Binary file added lp/preprocessed/Amazon/node_types.npy
Binary file not shown.
Binary file added lp/preprocessed/Amazon/pos_pairs_offset.npz
Binary file not shown.
Binary file added lp/preprocessed/Douban_Movie/adjs_offset.pkl
Binary file not shown.
Binary file added lp/preprocessed/Douban_Movie/neg_pairs_offset.npz
Binary file not shown.
Binary file added lp/preprocessed/Douban_Movie/node_types.npy
Binary file not shown.
Binary file added lp/preprocessed/Douban_Movie/pos_pairs_offset.npz
Binary file not shown.
Binary file added lp/preprocessed/Yelp/adjs_offset.pkl
Binary file not shown.
Binary file added lp/preprocessed/Yelp/neg_pairs_offset.npz
Binary file not shown.
Binary file added lp/preprocessed/Yelp/node_types.npy
Binary file not shown.
Binary file added lp/preprocessed/Yelp/pos_pairs_offset.npz
Binary file not shown.
36 changes: 36 additions & 0 deletions nc/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import sys
import numpy as np
import torch
import pickle as pkl

# Per-dataset lists of edge-type indices (into the `edges` list loaded from
# edges.pkl).  NOTE(review): not referenced anywhere in this file — presumably
# imported by the training code; confirm against callers before changing.
cstr_nc = {
    "DBLP" : [1, 4],
    "ACM" : [0, 2, 4],
    "IMDB" : [0, 2, 4]
}

def main(dataset):
    """Derive and save per-node type labels for one node-classification dataset.

    Loads ``edges.pkl`` (a list of scipy sparse adjacency matrices) from
    ``./data/<dataset>`` and partitions the node ids into three groups:
    source rows of ``edges[0]`` and ``edges[2]`` (type 0), target columns of
    ``edges[0]`` (type 1) and target columns of ``edges[2]`` (type 2).
    Assumes node ids are laid out contiguously in that group order — this is
    only partially verified by the count checks below.  Writes the labels to
    ``./data/<dataset>/node_types.npy`` as int32.

    Raises:
        ValueError: if the three groups do not partition the node id range.
    """
    prefix = os.path.join("./data/", dataset)
    # `with` already closes the file; the original's explicit f.close() was redundant.
    with open(os.path.join(prefix, "edges.pkl"), "rb") as f:
        edges = pkl.load(f)

    node_types = np.zeros((edges[0].shape[0],), dtype=np.int32)

    # a: distinct source nodes; b: targets of edges[0]; c: targets of edges[2].
    a = np.unique(list(edges[0].tocoo().row) + list(edges[2].tocoo().row))
    b = np.unique(edges[0].tocoo().col)
    c = np.unique(edges[2].tocoo().col)
    print(a.shape[0], b.shape[0], c.shape[0])
    # Explicit raises instead of `assert` (stripped under `python -O`):
    # the three groups must be disjoint and together cover every node.
    if a.shape[0] + b.shape[0] + c.shape[0] != node_types.shape[0]:
        raise ValueError("node groups do not sum to the total node count")
    if np.unique(np.concatenate((a, b, c))).shape[0] != node_types.shape[0]:
        raise ValueError("node groups overlap or leave gaps")

    node_types[a.shape[0]:a.shape[0] + b.shape[0]] = 1
    node_types[a.shape[0] + b.shape[0]:] = 2
    if node_types.sum() != b.shape[0] + 2 * c.shape[0]:
        raise ValueError("node type assignment is inconsistent")
    np.save(os.path.join(prefix, "node_types"), node_types)

if __name__ == "__main__":
    # Generate node_types.npy for each node-classification dataset, in order.
    for dataset_name in ("DBLP", "ACM", "IMDB"):
        main(dataset_name)

0 comments on commit 02ca17f

Please sign in to comment.