-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathDataset.py
118 lines (100 loc) · 4.24 KB
/
Dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import scipy.sparse as sp
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
import fastrand
from numpy import random
import time
class Dataset(object):
    """Load the tab-separated interaction files of a recommendation dataset.

    Attributes set by the constructor:
        totalData: ``uid``/``iid`` frame of the "total" file; its rows are
            dropped at the end of ``__init__`` to release memory.
        train: ``uid``/``iid`` frame of the training file.
        trainMatrix: sparse DOK matrix with 1.0 at every training (user, item).
        valRatings, testRatings: lists of ``[user, item]`` pairs.
        negatives: one list of integer ids per line of the negatives file.
        numUsers, numItems: distinct ``uid`` / ``iid`` counts in the total file.
        userCache: ``{uid: [iid, ...]}`` built from the training frame.
        itemCache: ``{iid: [uid, ...]}`` for every ``iid`` in ``range(numItems)``.
        totalTrainUsers, totalTrainItems: sets of ids seen in the training frame.
    """

    def __init__(self, totalFilename, trainFilename, valFilename, testFilename, negativesFilename):
        """Read every file and build the derived lookup structures.

        Args:
            totalFilename: TSV with a header containing ``uid`` and ``iid``.
            trainFilename: TSV with a header containing ``uid`` and ``iid``;
                it is also re-read as a sparse matrix.
            valFilename: rating file for validation (first line is skipped).
            testFilename: rating file for testing (first line is skipped).
            negativesFilename: negatives file; must yield as many entries as
                the test rating file.

        Raises:
            AssertionError: if test ratings and negatives differ in length.
        """
        print("Reading Dataset")
        self.totalData = pd.read_csv(totalFilename, sep='\t')[['uid', 'iid']]
        self.train = pd.read_csv(trainFilename, sep='\t')[['uid', 'iid']]
        self.trainMatrix = self.load_rating_file_as_matrix(trainFilename)
        self.valRatings = self.load_rating_file_as_list(valFilename)
        self.testRatings = self.load_rating_file_as_list(testFilename)
        self.negatives = self.load_negative_file(negativesFilename)
        assert len(self.testRatings) == len(self.negatives), (
            "test ratings (%d) and negatives (%d) differ in length"
            % (len(self.testRatings), len(self.negatives))
        )
        self.numUsers = len(self.totalData.uid.unique())
        self.numItems = len(self.totalData.iid.unique())
        # The caches depend on numItems, so they are built after the counts.
        self.userCache = self.getuserCache()
        self.itemCache = self.getitemCache()
        self.totalTrainUsers = set(self.train.uid.unique())
        self.totalTrainItems = set(self.train.iid.unique())
        # nnz is the sparse-matrix way to count stored entries; it does not
        # rely on the DOK format happening to support len().
        print("[Rating] numUsers: %d, numItems: %d, numRatings: %d]"
              % (self.numUsers, self.numItems, self.trainMatrix.nnz))
        # Free memory
        self.totalData.drop(self.totalData.index, inplace=True)

    @staticmethod
    def _read_fields(filename, skip_header):
        """Yield the tab-split fields of every non-blank line in *filename*."""
        with open(filename, "r") as f:
            if skip_header:
                next(f, None)
            for line in f:
                # Blank lines (typically a trailing newline at end of file)
                # carry no record and would otherwise break int() parsing.
                if line.strip():
                    yield line.split("\t")

    def load_rating_file_as_list(self, filename):
        """Return ``[[user, item], ...]`` parsed from a rating file.

        The first line is treated as a header and discarded; the first two
        tab-separated fields of each remaining line are parsed as integers.
        """
        return [
            [int(fields[0]), int(fields[1])]
            for fields in self._read_fields(filename, skip_header=True)
        ]

    def load_negative_file(self, filename):
        """Return one list of integer ids per non-blank line of *filename*.

        No header line is skipped. The first tab-separated field of each line
        is ignored and every following non-empty field is parsed as an int.
        """
        return [
            [int(token) for token in fields[1:] if token.strip()]
            for fields in self._read_fields(filename, skip_header=False)
        ]

    def load_rating_file_as_matrix(self, filename):
        """Return a float32 DOK matrix with 1.0 at each (user, item) in the file.

        The shape is ``(max_user + 1, max_item + 1)``, with both maxima
        floored at 0, so a file without records yields a 1x1 matrix.
        """
        # Parse once and reuse the pairs for both sizing and filling.
        pairs = self.load_rating_file_as_list(filename)
        num_users, num_items = 0, 0
        for user, item in pairs:
            num_users = max(num_users, user)
            num_items = max(num_items, item)
        mat = sp.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)
        for user, item in pairs:
            mat[user, item] = 1.0
        return mat

    def getuserCache(self):
        """Return ``{uid: [iid, ...]}`` for every user in the training frame.

        Users appear in order of first occurrence and each item list keeps
        the row order of the training frame.
        """
        train = self.train
        user_items = {}
        # Single pass over the rows instead of one full scan per user.
        for uid, iid in zip(train.uid.values, train.iid.values.tolist()):
            user_items.setdefault(uid, []).append(iid)
        return user_items

    def getitemCache(self):
        """Return ``{iid: [uid, ...]}`` for every ``iid`` in ``range(numItems)``.

        Items with no training interaction map to an empty list; item ids
        outside ``range(numItems)`` are not included.
        """
        train = self.train
        item_users = {}
        # Single pass over the rows instead of one full scan per item.
        for uid, iid in zip(train.uid.values.tolist(), train.iid.values.tolist()):
            item_users.setdefault(iid, []).append(uid)
        # A fresh list per missing item avoids sharing one mutable default.
        return {iid: item_users.get(iid, []) for iid in range(self.numItems)}
class Dataset_TransCF(Dataset):
    """Expose the rows of a triple table through ``len()`` and integer indexing.

    NOTE(review): this file imports ``Dataset`` from ``torch.utils.data`` and
    later defines its own class of the same name, so the base used here is the
    later binding - confirm that this is the intended parent.
    """

    def __init__(self, totalData):
        # Stored as given; assumed to be a 2-D array-like with at least three
        # columns that supports ``table[row, col]`` indexing - TODO confirm.
        self.totalData = totalData

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.totalData)

    def __getitem__(self, idx):
        """Return row *idx* as a dict with keys ``'u'``, ``'i'`` and ``'j'``.

        Columns 0, 1 and 2 map to ``'u'``, ``'i'`` and ``'j'``; presumably a
        user, an item and a second item - verify against the caller.
        """
        table = self.totalData
        u, i, j = table[idx, 0], table[idx, 1], table[idx, 2]
        return {'u': u, 'i': i, 'j': j}