Commit

Merge pull request #26 from boostcampaitech6/Feat/24-CAR_templates
[FEAT] Build CAR model template #24
ChangZero authored Feb 21, 2024
2 parents e1044cb + c584322 commit d012c48
Showing 6 changed files with 575 additions and 0 deletions.
48 changes: 48 additions & 0 deletions context/src/main.py
@@ -0,0 +1,48 @@
import os
import torch
import torch.nn as nn
import wandb

import argparse
from module.trainer import trainer
from module.utils import set_seed, parse_args_boolean, logging_conf, get_logger

logger = get_logger(logger_conf=logging_conf)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_path", default="../../data/train/", type=str, help="Data path")
    parser.add_argument("--output_dir", default="outputs/", type=str, help="Submission path")

    parser.add_argument("--model", default="fm", type=str, help="Model name")
    parser.add_argument("--dropout_rate", default=0.01, type=float, help="Dropout rate")
    parser.add_argument("--weight_decay", default=1e-5, type=float, help="Weight decay")
    parser.add_argument("--lr", default=0.001, type=float, help="Learning rate")
    parser.add_argument("--batch_size", default=1024, type=int, help="Batch size")
    parser.add_argument("--epochs", default=5, type=int, help="Number of epochs")

    parser.add_argument("--num_nag_samples", default=50, type=int, help="Number of negative samples")
    parser.add_argument("--embed_dim", default=8, type=int, help="Embedding dimension of each feature")
    parser.add_argument("--mlp_dims", nargs="+", type=int, default=(30, 20, 10), help="Hidden layer dimensions of DeepFM's MLP network, e.g. --mlp_dims 30 20 10")

    parser.add_argument("--seed", default=42, type=int, help="Seed")

    parser.add_argument("--wandb", type=parse_args_boolean, default=True, help="Whether to log to WandB")
    args = parser.parse_args()

    return args


def main():
    # os.makedirs(args.model_dir, exist_ok=True)

    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)
    trainer(args)


if __name__ == "__main__":
    main()
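
As a quick, hedged sanity check of the CLI above: the snippet below simulates a command line and inspects the parsed namespace. It assumes the working directory is context/src/ so that parse_args is importable from main; the argument values are only examples.

import sys
from main import parse_args  # assumes the working directory is context/src/

# Simulate: python main.py --model deepfm --mlp_dims 30 20 10
sys.argv = ["main.py", "--model", "deepfm", "--mlp_dims", "30", "20", "10"]
args = parse_args()
print(args.model)     # -> deepfm
print(args.mlp_dims)  # -> [30, 20, 10]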

91 changes: 91 additions & 0 deletions context/src/module/dataset.py
@@ -0,0 +1,91 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset

class RatingDataset(Dataset):
    def __init__(self, input_tensor, target_tensor):
        self.input_tensor = input_tensor.long()
        self.target_tensor = target_tensor.long()

    def __getitem__(self, index):
        return self.input_tensor[index], self.target_tensor[index]

    def __len__(self):
        return self.target_tensor.size(0)


class MakeDataset:
    def __init__(self, rating_data_path, genre_data_path, num_negative=50):
        self.rating_data_path = rating_data_path
        self.genre_data_path = genre_data_path
        self.users = None
        self.items = None
        self.genres = None
        self.data = None
        self.num_negative = num_negative
        self.user_dict = None
        self.item_dict = None
        self.genre_dict = None
        self.raw_rating_df = None
        self.user_group_dfs = None
        self.raw_genre_df = None

    def load_data(self):
        # Load rating data
        self.raw_rating_df = pd.read_csv(self.rating_data_path)
        self.raw_rating_df['rating'] = 1.0  # implicit feedback
        self.raw_rating_df.drop(['time'], axis=1, inplace=True)

        # Load genre data
        self.raw_genre_df = pd.read_csv(self.genre_data_path, sep='\t')
        self.raw_genre_df = self.raw_genre_df.drop_duplicates(subset=['item'])

        # Map genre to ids
        genre_dict = {genre: i for i, genre in enumerate(set(self.raw_genre_df['genre']))}
        self.raw_genre_df['genre'] = self.raw_genre_df['genre'].map(lambda x: genre_dict[x])

        self.users = set(self.raw_rating_df['user'])
        self.items = set(self.raw_rating_df['item'])
        self.genres = set(self.raw_genre_df['genre'])

        self.data = self.raw_rating_df.merge(self.raw_genre_df, on='item', how='inner')

    def create_negative_instances(self):
        self.user_group_dfs = list(self.data.groupby('user')['item'])
        neg_instances = []

        for u, u_items in tqdm(self.user_group_dfs):
            u_items = set(u_items)
            i_user_neg_item = np.random.choice(list(self.items - u_items), self.num_negative, replace=False)
            neg_instances.extend([(u, item, 0) for item in i_user_neg_item])

        neg_df = pd.DataFrame(neg_instances, columns=['user', 'item', 'rating'])
        self.data = pd.concat([self.data, neg_df], ignore_index=True)

    def index_mapping(self):
        self.users = sorted(list(self.users))
        self.items = sorted(list(self.items))
        self.genres = sorted(list(self.genres))

        self.user_dict = {user: i for i, user in enumerate(self.users)}
        self.item_dict = {item: i for i, item in enumerate(self.items)}
        self.genre_dict = {genre: i for i, genre in enumerate(self.genres)}

        self.data['user'] = self.data['user'].map(self.user_dict)
        self.data['item'] = self.data['item'].map(self.item_dict)
        self.data['genre'] = self.data['genre'].map(self.genre_dict)

    def preprocess(self):
        self.load_data()
        self.create_negative_instances()
        self.index_mapping()
        return self.raw_rating_df, self.user_group_dfs, self.raw_genre_df

    def get_statistics(self):
        n_data = len(self.data)
        n_user = len(self.users)
        n_item = len(self.items)
        n_genre = len(self.genres)

        return n_data, n_user, n_item, n_genre, self.user_dict, self.item_dict, self.genre_dict
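
A hedged usage sketch for this module: it wires MakeDataset into RatingDataset and a DataLoader. The file names and the working directory (context/src/) are assumptions, and filling the missing genre of negative samples with 0 is a placeholder choice, not necessarily what module.trainer does.

import torch
from torch.utils.data import DataLoader
from module.dataset import MakeDataset, RatingDataset

# File names below are assumptions about the data directory layout.
dataset = MakeDataset("../../data/train/train_ratings.csv",
                      "../../data/train/genres.tsv",
                      num_negative=50)
raw_rating_df, user_group_dfs, raw_genre_df = dataset.preprocess()
n_data, n_user, n_item, n_genre, user_dict, item_dict, genre_dict = dataset.get_statistics()

# dataset.data now holds index-mapped (user, item, genre, rating) rows.
# Negative instances are created without a genre column, so they are filled
# with 0 here purely as a placeholder (an assumption).
data = dataset.data.fillna({'genre': 0})
X = torch.tensor(data[['user', 'item', 'genre']].values)
y = torch.tensor(data['rating'].values)
loader = DataLoader(RatingDataset(X, y), batch_size=1024, shuffle=True)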
41 changes: 41 additions & 0 deletions context/src/module/inference.py
@@ -0,0 +1,41 @@
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

def inference(model, items_dict, users_dict, raw_genre_df, user_group_dfs, n_item, device):
    # Score every (user, item) pair, then rank the results (31360 x 6807)
    u_list = []
    i_list = []
    ritems_dict = {v: k for k, v in items_dict.items()}
    for u, u_items in tqdm(user_group_dfs):

        # Keep the original user id before encoding
        u_list.append([u] * 10)

        # Encode the user
        u = users_dict[u]
        u_items = set(u_items.map(lambda x: items_dict[x]))  # items the encoded user has already watched

        # Encode user, item, and genre values into the ids the trained model expects
        i_user_col = torch.tensor([u] * n_item)
        i_item_col = torch.tensor(raw_genre_df['item'].map(lambda x: items_dict[x]).values)
        i_genre_col = torch.tensor(raw_genre_df['genre'].values)

        x = torch.cat([i_user_col.unsqueeze(1), i_item_col.unsqueeze(1), i_genre_col.unsqueeze(1)], dim=1)
        x = x.to(device)

        model.eval()
        output_batch = model(x)
        output_batch = output_batch.cpu().detach().numpy()

        output_batch[list(u_items)] = -1  # exclude items the user has already seen
        result_batch = np.argsort(output_batch)[-10:][::-1]  # take the top-10 item indices
        i_list.append(list(map(lambda x: ritems_dict[x], result_batch)))  # decode back to original item ids

    u_list = np.concatenate(u_list)
    i_list = np.concatenate(i_list)
    submit_df = pd.DataFrame(data={'user': u_list, 'item': i_list}, columns=['user', 'item'])
    return submit_df
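
A hedged sketch of how inference() might be wired up after training to produce a submission file; model, item_dict, user_dict, raw_genre_df, user_group_dfs, n_item, and device are assumed to come from the preprocessing and training steps, and the output path simply mirrors the --output_dir default.

import os

# All variables below are assumed to exist from earlier steps (see lead-in).
submit_df = inference(model, item_dict, user_dict, raw_genre_df,
                      user_group_dfs, n_item, device)
os.makedirs("outputs/", exist_ok=True)
submit_df.to_csv(os.path.join("outputs/", "submission.csv"), index=False)
print(submit_df.head(10))  # top-10 recommended items for the first user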

183 changes: 183 additions & 0 deletions context/src/module/model.py
@@ -0,0 +1,183 @@
import numpy as np
import torch
import torch.nn as nn

# FM
class FMLayer(nn.Module):
    def __init__(self, input_dim, factor_dim):
        '''
        Parameter
            input_dim: Input dimension in sparse representation (2652 in MovieLens-100k)
            factor_dim: Factorization dimension
        '''
        super(FMLayer, self).__init__()
        self.v = nn.Parameter(
            torch.empty(input_dim, factor_dim),
            requires_grad=True
        )

    def square(self, x):
        return torch.pow(x, 2)

    def forward(self, x):
        '''
        Parameter
            x: Float tensor of size "(batch_size, input_dim)"
        '''
        square_of_sum = self.square(torch.matmul(x, self.v))               # (batch_size, factor_dim)
        sum_of_square = torch.matmul(self.square(x), self.square(self.v))  # (batch_size, factor_dim)

        return 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1)       # (batch_size,)


class FactorizationMachine(nn.Module):
    def __init__(self, input_dim, factor_dim):
        '''
        Parameter
            input_dim: Input dimension in sparse representation (2652 in MovieLens-100k)
            factor_dim: Factorization dimension
        '''
        super(FactorizationMachine, self).__init__()

        self.linear = nn.Linear(input_dim, 1, bias=True)
        self.fm = FMLayer(input_dim, factor_dim)

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, FMLayer):
                nn.init.normal_(m.v, 0, 0.01)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, input_dim)"
        Return
            y: Float tensor of size "(batch_size)"
        '''
        x = x.float()
        y = self.linear(x).squeeze(1) + self.fm(x)
        y = torch.sigmoid(y)
        return y

# FFM
class FeaturesLinear(nn.Module):

    def __init__(self, field_dims: np.ndarray, output_dim: int = 1):
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``, e.g. (256, 9)
        :return: (batch_size, output_dim=1)
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)   # (256, 9)
        return torch.sum(self.fc(x), dim=1) + self.bias   # self.fc(x): (256, 9, 1); bias: (1,) -> (256, 1)


class FieldAwareFactorizationMachine(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.num_fields = len(field_dims)
        self.embeddings = torch.nn.ModuleList([
            torch.nn.Embedding(sum(field_dims), embed_dim) for _ in range(self.num_fields)
        ])
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        for embedding in self.embeddings:
            torch.nn.init.xavier_uniform_(embedding.weight.data)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``, e.g. (256, 9)
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        xs = [self.embeddings[i](x) for i in range(self.num_fields)]
        ix = list()
        for i in range(self.num_fields - 1):
            for j in range(i + 1, self.num_fields):
                ix.append(xs[j][:, i] * xs[i][:, j])
        ix = torch.stack(ix, dim=1)

        return ix  # (256, 36, 8)


class FieldAwareFactorizationMachineModel(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.ffm = FieldAwareFactorizationMachine(field_dims, embed_dim)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        ffm_term = torch.sum(torch.sum(self.ffm(x), dim=1), dim=1, keepdim=True)
        x = self.linear(x) + ffm_term
        return torch.sigmoid(x.squeeze(1))

# DeepFM
class DeepFM(nn.Module):

    def __init__(self, input_dims, embedding_dim, mlp_dims, drop_rate=0.1):
        super(DeepFM, self).__init__()
        total_input_dim = int(sum(input_dims))  # total input dimension: n_user + n_movie + n_genre

        # Constant bias and first-order term of the FM component
        self.bias = nn.Parameter(torch.zeros((1,)))
        self.fc = nn.Embedding(total_input_dim, 1)

        self.embedding = nn.Embedding(total_input_dim, embedding_dim)
        self.embedding_dim = len(input_dims) * embedding_dim

        # MLP hidden layers
        mlp_layers = []
        for i, dim in enumerate(mlp_dims):
            if i == 0:
                mlp_layers.append(nn.Linear(self.embedding_dim, dim))
            else:
                mlp_layers.append(nn.Linear(mlp_dims[i - 1], dim))
            mlp_layers.append(nn.ReLU(True))
            mlp_layers.append(nn.Dropout(drop_rate))
        mlp_layers.append(nn.Linear(mlp_dims[-1], 1))
        self.mlp_layers = nn.Sequential(*mlp_layers)

    def fm(self, x):
        # x : (batch_size, total_num_input)
        embed_x = self.embedding(x)

        fm_y = self.bias + torch.sum(self.fc(x), dim=1)

        square_of_sum = torch.sum(embed_x, dim=1) ** 2
        sum_of_square = torch.sum(embed_x ** 2, dim=1)
        fm_y += 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        return fm_y

    def mlp(self, x):
        embed_x = self.embedding(x)

        inputs = embed_x.view(-1, self.embedding_dim)
        mlp_y = self.mlp_layers(inputs)
        return mlp_y

    def forward(self, x):
        # FM component
        fm_y = self.fm(x).squeeze(1)

        # Deep component
        mlp_y = self.mlp(x).squeeze(1)

        # Sigmoid turns the sum into a probability for the 1/0 interaction label
        y = torch.sigmoid(fm_y + mlp_y)
        return y
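
A minimal shape-check sketch for the three models above, assuming the working directory is context/src/. The toy field sizes and the manual index offsets for DeepFM are assumptions about how the trainer prepares its inputs; FFM applies its own offsets internally, while the plain FM expects a multi-hot (batch_size, input_dim) representation.

import numpy as np
import torch
from module.model import FactorizationMachine, FieldAwareFactorizationMachineModel, DeepFM

# Toy field sizes (n_user, n_item, n_genre) just for shape checking.
field_dims = np.array([100, 50, 10])
batch = 4
x_fields = torch.randint(0, 10, (batch, 3))   # per-field indices

# FFM takes per-field indices and applies its own offsets internally.
ffm = FieldAwareFactorizationMachineModel(field_dims, embed_dim=8)
print(ffm(x_fields).shape)                    # torch.Size([4])

# DeepFM shares one embedding table over sum(field_dims) rows; offsetting the
# indices manually here is an assumption about the input pipeline.
offsets = torch.tensor([0, 100, 150])
deepfm = DeepFM(field_dims, embedding_dim=8, mlp_dims=(30, 20, 10), drop_rate=0.1)
print(deepfm(x_fields + offsets).shape)       # torch.Size([4])

# The plain FM works on a multi-hot (batch_size, input_dim) representation.
x_dense = torch.zeros(batch, int(field_dims.sum()))
x_dense.scatter_(1, x_fields + offsets, 1.0)
fm = FactorizationMachine(input_dim=int(field_dims.sum()), factor_dim=8)
print(fm(x_dense).shape)                      # torch.Size([4])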
