Merge pull request #26 from boostcampaitech6/Feat/24-CAR_templates
[FEAT] Create CAR model templates #24
Showing 6 changed files with 575 additions and 0 deletions.
@@ -0,0 +1,48 @@
import os
import argparse

import torch

from module.trainer import trainer
from module.utils import set_seed, parse_args_boolean, logging_conf, get_logger

logger = get_logger(logger_conf=logging_conf)
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def parse_mlp_dims(value):
    # Parse a comma-separated CLI string such as "30,20,10" into a tuple of ints
    return tuple(int(dim) for dim in value.split(','))


def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_path", default="../../data/train/", type=str, help="Data path")
    parser.add_argument("--output_dir", default="outputs/", type=str, help="Submission path")

    parser.add_argument("--model", default="fm", type=str, help="Model name")
    parser.add_argument("--dropout_rate", default=0.01, type=float, help="Dropout rate")
    parser.add_argument("--weight_decay", default=1e-5, type=float, help="Weight decay")
    parser.add_argument("--lr", default=0.001, type=float, help="Learning rate")
    parser.add_argument("--batch_size", default=1024, type=int, help="Batch size")
    parser.add_argument("--epochs", default=5, type=int, help="Number of epochs")

    parser.add_argument("--num_nag_samples", default=50, type=int, help="Number of negative samples")
    parser.add_argument("--embed_dim", default=8, type=int, help="Embedded feature dimension")
    parser.add_argument('--mlp_dims', type=parse_mlp_dims, default=(30, 20, 10), help='Hidden-layer dimensions of the DeepFM MLP network')

    parser.add_argument("--seed", default=42, type=int, help="Seed")

    parser.add_argument('--wandb', type=parse_args_boolean, default=True, help='Whether to log to WandB')
    args = parser.parse_args()

    return args


def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)
    trainer(args)


if __name__ == "__main__":
    main()
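Note: module/utils.py is among the six changed files but is not shown in this section. For orientation, here is a minimal sketch of what the two imported helpers plausibly do; the names come from the import above, the bodies are assumptions:

import argparse
import random

import numpy as np
import torch


def parse_args_boolean(value):
    # Map common CLI strings ("true", "0", ...) to a bool (assumed behavior)
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


def set_seed(seed):
    # Seed every RNG the pipeline touches so runs are reproducible
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)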
@@ -0,0 +1,91 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset


class RatingDataset(Dataset):
    def __init__(self, input_tensor, target_tensor):
        self.input_tensor = input_tensor.long()
        self.target_tensor = target_tensor.long()

    def __getitem__(self, index):
        return self.input_tensor[index], self.target_tensor[index]

    def __len__(self):
        return self.target_tensor.size(0)


class MakeDataset:
    def __init__(self, rating_data_path, genre_data_path, num_negative=50):
        self.rating_data_path = rating_data_path
        self.genre_data_path = genre_data_path
        self.users = None
        self.items = None
        self.genres = None
        self.data = None
        self.num_negative = num_negative
        self.user_dict = None
        self.item_dict = None
        self.genre_dict = None
        self.raw_rating_df = None
        self.user_group_dfs = None
        self.raw_genre_df = None

    def load_data(self):
        # Load rating data
        self.raw_rating_df = pd.read_csv(self.rating_data_path)
        self.raw_rating_df['rating'] = 1.0  # implicit feedback
        self.raw_rating_df.drop(['time'], axis=1, inplace=True)

        # Load genre data, keeping one genre per item
        self.raw_genre_df = pd.read_csv(self.genre_data_path, sep='\t')
        self.raw_genre_df = self.raw_genre_df.drop_duplicates(subset=['item'])

        # Map genres to ids
        genre_dict = {genre: i for i, genre in enumerate(set(self.raw_genre_df['genre']))}
        self.raw_genre_df['genre'] = self.raw_genre_df['genre'].map(lambda x: genre_dict[x])

        self.users = set(self.raw_rating_df['user'])
        self.items = set(self.raw_rating_df['item'])
        self.genres = set(self.raw_genre_df['genre'])

        self.data = self.raw_rating_df.merge(self.raw_genre_df, on='item', how='inner')

    def create_negative_instances(self):
        self.user_group_dfs = list(self.data.groupby('user')['item'])
        neg_instances = []

        for u, u_items in tqdm(self.user_group_dfs):
            u_items = set(u_items)
            i_user_neg_item = np.random.choice(list(self.items - u_items), self.num_negative, replace=False)
            neg_instances.extend([(u, item, 0) for item in i_user_neg_item])

        neg_df = pd.DataFrame(neg_instances, columns=['user', 'item', 'rating'])
        # Attach each negative item's genre so negatives share the positives' schema
        neg_df = neg_df.merge(self.raw_genre_df, on='item', how='left')
        self.data = pd.concat([self.data, neg_df], ignore_index=True)

    def index_mapping(self):
        self.users = sorted(list(self.users))
        self.items = sorted(list(self.items))
        self.genres = sorted(list(self.genres))

        self.user_dict = {user: i for i, user in enumerate(self.users)}
        self.item_dict = {item: i for i, item in enumerate(self.items)}
        self.genre_dict = {genre: i for i, genre in enumerate(self.genres)}

        self.data['user'] = self.data['user'].map(self.user_dict)
        self.data['item'] = self.data['item'].map(self.item_dict)
        self.data['genre'] = self.data['genre'].map(self.genre_dict)

    def preprocess(self):
        self.load_data()
        self.create_negative_instances()
        self.index_mapping()
        return self.raw_rating_df, self.user_group_dfs, self.raw_genre_df

    def get_statistics(self):
        n_data = len(self.data)
        n_user = len(self.users)
        n_item = len(self.items)
        n_genre = len(self.genres)

        return n_data, n_user, n_item, n_genre, self.user_dict, self.item_dict, self.genre_dict
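The diff does not include module/trainer.py, but the two classes above are presumably wired together along these lines; the file names and column order here are assumptions:

import torch
from torch.utils.data import DataLoader

dataset = MakeDataset('../../data/train/train_ratings.csv',  # hypothetical file names
                      '../../data/train/genres.tsv',
                      num_negative=50)
dataset.preprocess()
n_data, n_user, n_item, n_genre, user_dict, item_dict, genre_dict = dataset.get_statistics()

# One row per (user, item, genre) triple; rating is 1 for positives, 0 for negatives
X = torch.tensor(dataset.data[['user', 'item', 'genre']].values)
y = torch.tensor(dataset.data['rating'].values)
loader = DataLoader(RatingDataset(X, y), batch_size=1024, shuffle=True)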
@@ -0,0 +1,41 @@
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm


def inference(model, items_dict, users_dict, raw_genre_df, user_group_dfs, n_item, device):
    # Score every user-item pair and rank the results (31360 x 6807)
    u_list = []
    i_list = []
    ritems_dict = {v: k for k, v in items_dict.items()}

    # The item and genre columns are the same for every user, so encode them once
    i_item_col = torch.tensor(raw_genre_df['item'].map(lambda x: items_dict[x]).values)
    i_genre_col = torch.tensor(raw_genre_df['genre'].values)

    model.eval()
    with torch.no_grad():
        for u, u_items in tqdm(user_group_dfs):
            # Keep the raw user id before encoding (10 rows, one per recommendation)
            u_list.append([u] * 10)

            # Encode the user and the items the user has already watched
            u = users_dict[u]
            u_items = set(u_items.map(lambda x: items_dict[x]))

            # Build the encoded (user, item, genre) rows the model was trained on
            i_user_col = torch.tensor([u] * n_item)
            x = torch.cat([i_user_col.unsqueeze(1), i_item_col.unsqueeze(1), i_genre_col.unsqueeze(1)], dim=1)
            x = x.to(device)

            output_batch = model(x)
            output_batch = output_batch.cpu().numpy()

            output_batch[list(u_items)] = -1  # mask items the user has already seen
            result_batch = np.argsort(output_batch)[-10:][::-1]  # take the top-10 item indices
            i_list.append(list(map(lambda x: ritems_dict[x], result_batch)))  # decode item ids

    u_list = np.concatenate(u_list)
    i_list = np.concatenate(i_list)
    submit_df = pd.DataFrame(data={'user': u_list, 'item': i_list}, columns=['user', 'item'])
    return submit_df
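The returned frame is presumably written out by the trainer; a minimal sketch of that call site, where the output file name is an assumption:

import os

# Hypothetical call site; the real one lives in module/trainer.py (not in this diff)
submit_df = inference(model, item_dict, user_dict, raw_genre_df, user_group_dfs, n_item, device)
submit_df.to_csv(os.path.join(args.output_dir, 'submission.csv'), index=False)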
@@ -0,0 +1,183 @@
import numpy as np
import torch
import torch.nn as nn


# FM
class FMLayer(nn.Module):
    def __init__(self, input_dim, factor_dim):
        '''
        Parameter
            input_dim: Input dimension in sparse representation (2652 in MovieLens-100k)
            factor_dim: Factorization dimension
        '''
        super(FMLayer, self).__init__()
        self.v = nn.Parameter(
            torch.empty(input_dim, factor_dim),
            requires_grad=True
        )

    def square(self, x):
        return torch.pow(x, 2)

    def forward(self, x):
        '''
        Parameter
            x: Float tensor of size "(batch_size, input_dim)"
        '''
        square_of_sum = self.square(torch.matmul(x, self.v))               # (batch_size, factor_dim)
        sum_of_square = torch.matmul(self.square(x), self.square(self.v))  # (batch_size, factor_dim)

        # Pairwise interactions via the O(nk) identity:
        # sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [(sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2]
        return 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1)


class FactorizationMachine(nn.Module):
    def __init__(self, input_dim, factor_dim):
        '''
        Parameter
            input_dim: Input dimension in sparse representation (2652 in MovieLens-100k)
            factor_dim: Factorization dimension
        '''
        super(FactorizationMachine, self).__init__()

        self.linear = nn.Linear(input_dim, 1, bias=True)
        self.fm = FMLayer(input_dim, factor_dim)

        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, FMLayer):
                nn.init.normal_(m.v, 0, 0.01)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, input_dim)"
        Return
            y: Float tensor of size "(batch_size)"
        '''
        x = x.float()
        y = self.linear(x).squeeze(1) + self.fm(x)
        y = torch.sigmoid(y)
        return y
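FMLayer.forward relies on the O(nk) reformulation of the pairwise interaction term; a quick sanity sketch, not part of this diff, comparing it against the brute-force sum over all pairs on a toy input:

layer = FMLayer(input_dim=4, factor_dim=2)
nn.init.normal_(layer.v, 0, 0.1)  # self.v is created uninitialized
x = torch.rand(3, 4)

# Brute force: sum over all pairs i < j of <v_i, v_j> * x_i * x_j
slow = torch.zeros(3)
for i in range(4):
    for j in range(i + 1, 4):
        slow += (layer.v[i] * layer.v[j]).sum() * x[:, i] * x[:, j]

assert torch.allclose(layer(x), slow, atol=1e-5)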
# FFM
class FeaturesLinear(nn.Module):

    def __init__(self, field_dims: np.ndarray, output_dim: int = 1):
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, output_dim)``
        """
        # Shift each field's raw index into its slice of the shared embedding table
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias


class FieldAwareFactorizationMachine(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.num_fields = len(field_dims)
        # One embedding table per field: embeddings[f] holds the vectors a
        # feature uses when interacting with field f
        self.embeddings = torch.nn.ModuleList([
            torch.nn.Embedding(sum(field_dims), embed_dim) for _ in range(self.num_fields)
        ])
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        for embedding in self.embeddings:
            torch.nn.init.xavier_uniform_(embedding.weight.data)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, num_pairs, embed_dim)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        xs = [self.embeddings[i](x) for i in range(self.num_fields)]
        ix = list()
        for i in range(self.num_fields - 1):
            for j in range(i + 1, self.num_fields):
                ix.append(xs[j][:, i] * xs[i][:, j])
        ix = torch.stack(ix, dim=1)

        return ix


class FieldAwareFactorizationMachineModel(nn.Module):

    def __init__(self, field_dims: np.ndarray, embed_dim: int):
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.ffm = FieldAwareFactorizationMachine(field_dims, embed_dim)

    def forward(self, x: torch.Tensor):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        ffm_term = torch.sum(torch.sum(self.ffm(x), dim=1), dim=1, keepdim=True)
        x = self.linear(x) + ffm_term
        return torch.sigmoid(x.squeeze(1))
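For reference, a toy forward pass through the FFM model with made-up field sizes, showing how raw per-field indices flow through the shared embedding tables:

field_dims = np.array([5, 7, 3])           # e.g. n_user, n_item, n_genre (toy values)
ffm_model = FieldAwareFactorizationMachineModel(field_dims, embed_dim=4)
x = torch.tensor([[0, 2, 1], [4, 6, 0]])   # raw per-field indices; offsets are added inside
print(ffm_model(x).shape)                  # torch.Size([2]), one probability per row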
# DeepFM
class DeepFM(nn.Module):

    def __init__(self, input_dims, embedding_dim, mlp_dims, drop_rate=0.1):
        super(DeepFM, self).__init__()
        total_input_dim = int(sum(input_dims))  # total input dimension: n_user + n_item + n_genre

        # Constant bias and first-order (linear) terms of the FM component
        self.bias = nn.Parameter(torch.zeros((1,)))
        self.fc = nn.Embedding(total_input_dim, 1)

        self.embedding = nn.Embedding(total_input_dim, embedding_dim)
        self.embedding_dim = len(input_dims) * embedding_dim

        # MLP hidden layers
        mlp_layers = []
        for i, dim in enumerate(mlp_dims):
            if i == 0:
                mlp_layers.append(nn.Linear(self.embedding_dim, dim))
            else:
                mlp_layers.append(nn.Linear(mlp_dims[i - 1], dim))
            mlp_layers.append(nn.ReLU(True))
            mlp_layers.append(nn.Dropout(drop_rate))
        mlp_layers.append(nn.Linear(mlp_dims[-1], 1))
        self.mlp_layers = nn.Sequential(*mlp_layers)

    def fm(self, x):
        # x: (batch_size, total_num_input)
        embed_x = self.embedding(x)

        fm_y = self.bias + torch.sum(self.fc(x), dim=1)

        square_of_sum = torch.sum(embed_x, dim=1) ** 2
        sum_of_square = torch.sum(embed_x ** 2, dim=1)
        fm_y += 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        return fm_y

    def mlp(self, x):
        embed_x = self.embedding(x)

        inputs = embed_x.view(-1, self.embedding_dim)
        mlp_y = self.mlp_layers(inputs)
        return mlp_y

    def forward(self, x):
        # FM component
        fm_y = self.fm(x).squeeze(1)

        # Deep component
        mlp_y = self.mlp(x).squeeze(1)

        # Sigmoid over the summed logits gives the probability of a positive (1/0) interaction
        y = torch.sigmoid(fm_y + mlp_y)
        return y
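DeepFM looks up all three fields in one shared embedding table, so each field's raw index must be shifted by the sizes of the fields before it; where that shift happens is not part of this diff (presumably module/trainer.py). A minimal usage sketch under that assumption, with toy field sizes:

n_user, n_item, n_genre = 100, 50, 10
deepfm = DeepFM(input_dims=[n_user, n_item, n_genre], embedding_dim=8, mlp_dims=(30, 20, 10))
deepfm.eval()  # disable dropout for a deterministic forward pass

# Assumed offsetting: shift item ids by n_user and genre ids by n_user + n_item
offsets = torch.tensor([0, n_user, n_user + n_item])
x = torch.tensor([[3, 7, 2], [42, 11, 5]]) + offsets
print(deepfm(x))  # two probabilities in (0, 1)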