Commit
V1.0.0 (#1)
* changed name from utils to easy

* removed DS_Store & updated gitignore

* fix install metadata

* fix rec_torch_utils

* fix rec_torch, cfg, notebook
federicosiciliano authored Apr 7, 2024
1 parent e86544d commit ec283e9
Showing 49 changed files with 322 additions and 19 deletions.
Binary file removed .DS_Store
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,7 +1,7 @@
### CUSTOM GITIGNORE
# DS_Store files
# http://stackoverflow.com/questions/107701/how-can-i-remove-ds-store-files-from-a-git-repository
-**/.DS_Store
+.DS_Store

# Config (personal)
cfg/wandb/*
@@ -13,7 +13,7 @@ data/*
doc/*

# Output
-out/model/*
+out/*
*.pth

### DEFAULT GITIGNORE
6 changes: 3 additions & 3 deletions README.md
@@ -43,17 +43,17 @@ Below is an outline of key files and folders you'll find in this project, along

### Folders

-1. **data_utils**
+1. **easy_data**
- Contains utilities for data loading, file management, data and data structure management, data splitting, and statistics.

-2. **exp_utils**
+2. **easy_exp**
- Manages experiments by defining unique IDs based on their configuration.
- Allows for hashing of each ID to check for previously conducted experiments.
- Excludes GPU/CPU usage and training modes from the experiment ID.
- Saves experiments in a specific file along with their relative configuration.
- Includes methods for parsing YAML configs and handles special characters used in them (e.g., through `var.py`).

-3. **torch_utils**
+3. **easy_torch**
- Includes functions for metrics, loading models, and creating trainers in PyTorch Lightning.
- Defines steps, loss, optimizer, and other parameters to use.
- Sets callbacks and dataloaders.
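The easy_exp behavior described above, hashing the configuration into a fixed-length key while leaving non-identifying settings out, can be pictured with a minimal sketch. The function name, the rule that "/"-prefixed keys are skipped, and the SHA-256 choice are illustrative assumptions, not the library's actual code:

```python
# Minimal sketch of a config-hash experiment ID; all names and the
# exclusion rule are assumptions, not easy_exp's real implementation.
import hashlib
import json

def experiment_key(cfg: dict, key_len: int = 16, key_prefix: str = "") -> str:
    """Derive a deterministic experiment ID from a config dict."""
    # Skip experiment bookkeeping and keys flagged as excluded from the ID
    # (e.g. accelerator choice, so CPU and GPU runs map to one experiment).
    hashable = {k: v for k, v in cfg.items()
                if k != "__exp__" and not k.startswith("/")}
    blob = json.dumps(hashable, sort_keys=True).encode("utf-8")
    return key_prefix + hashlib.sha256(blob).hexdigest()[:key_len]
```

Two configs that differ only in excluded keys then hash to the same ID, which is how previously conducted experiments can be detected and skipped.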
19 changes: 19 additions & 0 deletions cfg/config_rec.yaml
@@ -0,0 +1,19 @@
+data_params: data_cfg

+model: model

#EXPERIMENT PARAMETERS - NOT SAVED AS PART OF THE CONFIGURATION
__exp__:
name: prova #Emb_size_beauty #27_1_lookback_ml-1m #provaCaser #name of the experiment, optional, default = "experiment_name"
# project_folder: ../ #project folder, used to locate folders, optional, default = "../"
# key_len: 16 #Length of experiment key, optional, default = 16
# key_prefix: "" #Prefix for experiment key, optional, default = ""
# __imports__: #imports needed for config preparation (e.g. type casting)
# - torchvision
# - easy_data
# - numpy as np #" as " will be recognized
# - name: pandas #" as " will be recognized
# fromlist:
# - DataFrame
# as:
# - DF
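The `+data_params: data_cfg` and `+model: model` lines appear to pull in the sub-configs that this same commit adds under cfg/data_params/ and cfg/model/. A sketch of that resolution, where the "+" semantics and the loader are assumptions read off the folder layout:

```python
# Sketch only: assumes "+key: value" means "load cfg/<key>/<value>.yaml
# into cfg[key]", inferred from the folder layout, not from easy_exp.
import os
import yaml

def load_config(path: str, cfg_root: str = "cfg") -> dict:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    for key in list(cfg):
        if key.startswith("+"):  # e.g. "+model: model" -> cfg/model/model.yaml
            sub_path = os.path.join(cfg_root, key[1:], cfg.pop(key) + ".yaml")
            cfg[key[1:]] = load_config(sub_path, cfg_root)
    return cfg

cfg = load_config("cfg/config_rec.yaml")
```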
29 changes: 29 additions & 0 deletions cfg/data_params/data_cfg.yaml
@@ -0,0 +1,29 @@
name: ml-1m #[amazon_beauty, ml-100k, ml-1m, steam] #the name of the dataset
/data_folder: ../data/raw/ #path of the dataset

# preprocessing
# drop interactions with a rating below this threshold
min_rating: 0 #the minimum rating of the dataset

#filter out low-frequency users and items
min_items_per_user: 5 #the minimum number of items rated by a user
min_users_per_item: 5 #the minimum number of users that have rated an item

densify_index: True #whether to densify the index of the dataset

dataset_params:
sequential_keys: [sid, timestamp, rating] #the sequential keys of the dataset
padding_value: 0 #the padding value for the dataset
lookback: 200 #[20, 50, 100, 200] #the lookback of the dataset
lookforward: 1 #the lookforward of the dataset
stride: null #the stride of the dataset
simultaneous_lookforward: 1 #the simultaneous lookforward of the dataset
out_seq_len: # Number of predictions to keep (i.e. not masked as padding) --> to avoid train/test leakage
train: null #the output sequence length of the training set
val: &val_size 1 #the output sequence length of the validation set
test: &test_size 1 #the output sequence length of the test set

split_method: leave_n_out #how to split the dataset: one of 'leave_n_out', 'hold_out', 'k_fold'
test_sizes: [*test_size,*val_size] #for leave_n_out, "n" is the number of (positive) samples per user held out for the test and validation sets

# random_state: 42 #the random seed for splitting the dataset
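lookback, lookforward, and stride suggest a sliding-window construction over each user's chronological interactions. A rough sketch under that reading (the function and its left-padding behavior are assumptions, not the package's code):

```python
# Rough windowing sketch; illustrative only.
def make_windows(items, lookback=200, lookforward=1, stride=None, padding_value=0):
    """Slice one user's chronological item list into (input, target) pairs."""
    step = stride or 1  # stride: null is read here as "advance one step at a time"
    samples = []
    for end in range(1, len(items) - lookforward + 1, step):
        window = items[max(0, end - lookback):end]
        window = [padding_value] * (lookback - len(window)) + window  # left-pad
        samples.append((window, items[end:end + lookforward]))
    return samples

# make_windows([5, 9, 2, 7], lookback=3) ->
#   ([0, 0, 5], [9]), ([0, 5, 9], [2]), ([5, 9, 2], [7])
```

The out_seq_len block then decides how many of a window's targets survive as supervised positions (one each for validation and test, matching the leave_n_out split above).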
3 changes: 3 additions & 0 deletions cfg/model/emission_tracker/emission_tracker_cfg.yaml
@@ -0,0 +1,3 @@
tracking_mode: process
log_level: critical
/output_dir: ${__exp__.project_folder}out/log/${__exp__.name}/
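tracking_mode, log_level, and output_dir match the constructor arguments of CodeCarbon's EmissionsTracker; the sketch below assumes that backend, since the diff itself never names it. The output_dir value shows the ${...} references resolved with the defaults from cfg/config_rec.yaml (project_folder ../, name prova):

```python
# Assumes the tracker is CodeCarbon's EmissionsTracker; the config does not say so.
from codecarbon import EmissionsTracker

tracker = EmissionsTracker(
    tracking_mode="process",         # measure this process, not the whole machine
    log_level="critical",            # silence all but critical tracker logs
    output_dir="../out/log/prova/",  # ${__exp__.project_folder}out/log/${__exp__.name}/
)
tracker.start()
# ... training ...
emissions_kg = tracker.stop()        # estimated kg of CO2-equivalent
```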
10 changes: 10 additions & 0 deletions cfg/model/loader_params/loader_params_cfg.yaml
@@ -0,0 +1,10 @@
batch_size: 128 #2048
# num_workers: 1
# shuffle: True
persistent_workers: True
pin_memory: True
num_negatives:
train: 1
val: 1
test: 100
padding_value: ${data_params.dataset_params.padding_value}
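The num_negatives split (1/1/100) is the common pattern of one sampled negative per positive while training and a 100-candidate ranking set at evaluation. A sketch of such a sampler, with every name an assumption:

```python
# Illustrative negative sampler; not the repository's dataloader code.
import random

def sample_negatives(positives, num_items, n, padding_value=0):
    """Draw n item ids the user never interacted with (valid ids are
    1..num_items; padding_value=0 is never a real item)."""
    seen = set(positives) | {padding_value}
    negatives = []
    while len(negatives) < n:
        candidate = random.randint(1, num_items)
        if candidate not in seen:
            seen.add(candidate)
            negatives.append(candidate)
    return negatives

# train/val: n=1 negative per positive; test: n=100 candidates to rank
```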
57 changes: 57 additions & 0 deletions cfg/model/model.yaml
@@ -0,0 +1,57 @@
optimizer:
name: Adam
# params:
# lr: 1.0e-3
# betas: [0.9, 0.98]

loss: SequentialBCEWithLogitsLoss

metrics:
- Precision
- Recall
- F1
- MAP
- NDCG
- MRR

log_params:
on_epoch: True
on_step: False

/step_routing:
loss_input_from_batch:
target: relevance
metrics_input_from_batch:
Precision:
relevance: relevance
Recall:
relevance: relevance
F1:
relevance: relevance
MAP:
relevance: relevance
NDCG:
relevance: relevance
MRR:
relevance: relevance
metrics_input_from_model_output:
Precision:
scores: null
Recall:
scores: null
F1:
scores: null
MAP:
scores: null
NDCG:
scores: null
MRR:
scores: null

+loader_params: loader_params_cfg

+trainer_params: trainer_params_cfg

+emission_tracker: emission_tracker_cfg

+rec_model: SASRec
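step_routing reads as a declarative map from batch keys and model outputs to the keyword arguments of the loss and of each metric. A minimal sketch of a step consuming it; the interpretation comes from the YAML shape, not from easy_torch's source:

```python
# Sketch of consuming step_routing; assumed semantics, illustrative only.
def training_step(model, batch, loss_fn, metrics, routing):
    # model_input_from_batch is set per rec_model, e.g. ["in_sid", "out_sid"]
    output = model(*[batch[key] for key in routing["model_input_from_batch"]])
    # loss_input_from_batch {target: relevance} -> loss_fn(target=batch["relevance"], ...)
    kwargs = {arg: batch[key]
              for arg, key in routing["loss_input_from_batch"].items()}
    # "input: null" is read as "pass the raw model output"
    kwargs.update({arg: output if key is None else output[key]
                   for arg, key in routing.get("loss_input_from_model_output", {}).items()})
    loss = loss_fn(**kwargs)
    for name, metric in metrics.items():
        m_kwargs = {arg: batch[key]
                    for arg, key in routing["metrics_input_from_batch"][name].items()}
        # "scores: null" is read as "pass the raw model output as scores"
        m_kwargs.update({arg: output if key is None else output[key]
                         for arg, key in routing["metrics_input_from_model_output"][name].items()})
        metric(**m_kwargs)
    return loss
```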
21 changes: 21 additions & 0 deletions cfg/model/rec_model/BERT4Rec.yaml
@@ -0,0 +1,21 @@
name: BERT4Rec

emb_size: [32, 64, 128, 256, 512] #52 #256
bert_num_blocks: 2
bert_num_heads: 4
dropout_rate: 0.1


^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null

^loader_params:
# num_negatives:
# train: 0
# val: 0
mask_prob: 0.15

__global__:
data_params.dataset_params.lookforward: 0
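mask_prob together with the lookforward: 0 override fits BERT4Rec's cloze objective: items are hidden at random and reconstructed in place instead of being predicted one step ahead. A sketch of that masking step (the mask-token convention is an assumption; implementations typically reserve num_items + 1):

```python
# Cloze-style masking sketch; illustrative, not the repository's code.
import random

def mask_sequence(seq, mask_prob=0.15, mask_token=-1, padding_value=0):
    """Hide ~mask_prob of the non-padding items; labels mark what to recover."""
    inputs, labels = [], []
    for item in seq:
        if item != padding_value and random.random() < mask_prob:
            inputs.append(mask_token)     # model must recover the original item
            labels.append(item)
        else:
            inputs.append(item)
            labels.append(padding_value)  # position ignored by the loss
    return inputs, labels
```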
16 changes: 16 additions & 0 deletions cfg/model/rec_model/CORE.yaml
@@ -0,0 +1,16 @@
name: CORE

emb_size: 50 #size of the latent (embedding) dimension
sess_dropout_rate: 0.2
item_dropout_rate: 0.2

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null

__global__:
data_params.dataset_params.keep_last.train: 1
data_params.dataset_params.keep_last.val: null
data_params.dataset_params.keep_last.test: null
data_params.dataset_params.stride: 10
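The __global__ block reads as dotted-path overrides applied to the merged config whenever this model file is selected, here pinning the dataset's keep_last and stride for CORE. A sketch of that mechanism, with the semantics inferred from the key shapes rather than from easy_exp's source:

```python
# Assumed semantics for "__global__" dotted-path overrides; sketch only.
def apply_global_overrides(cfg: dict) -> dict:
    for dotted, value in cfg.pop("__global__", {}).items():
        *parents, leaf = dotted.split(".")
        node = cfg
        for part in parents:               # walk/create the nested dicts
            node = node.setdefault(part, {})
        node[leaf] = value                 # e.g. data_params.dataset_params.stride = 10
    return cfg
```

The same block appears in Caser, CosRec, NARM, and HGN below, so one override mechanism would serve them all.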
20 changes: 20 additions & 0 deletions cfg/model/rec_model/Caser.yaml
@@ -0,0 +1,20 @@
name: Caser

lookback: ${data_params.dataset_params.lookback} #length of the sequence
emb_size: 512 #size of the latent (embedding) dimension
num_ver_filters: 2 #number of vertical filters
num_hor_filters: 2 #number of horizontal filters
act_conv: Tanh #activation function of convolutional layer (i.e., phi_c in paper)
act_fc: Tanh #activation function of fully connected layer (i.e., phi_a in paper)
drop_rate: 0.5 #dropout rate

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid", "uid"]
loss_input_from_model_output:
input: null

__global__:
data_params.dataset_params.keep_last.train: 1
data_params.dataset_params.keep_last.val: null
data_params.dataset_params.keep_last.test: null
data_params.dataset_params.stride: 10
18 changes: 18 additions & 0 deletions cfg/model/rec_model/CosRec.yaml
@@ -0,0 +1,18 @@
name: CosRec

emb_size: 50 #[32, 64, 128, 256, 512] #50
block_dims: [128, 256]
fc_dim: 150
act_fc: Tanh
dropout_rate: 0.5

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null

__global__:
data_params.dataset_params.keep_last.train: 1
data_params.dataset_params.keep_last.val: null
data_params.dataset_params.keep_last.test: null
data_params.dataset_params.stride: 10
13 changes: 13 additions & 0 deletions cfg/model/rec_model/GRU4Rec.yaml
@@ -0,0 +1,13 @@
name: GRU4Rec

hidden_size: 100
num_layers: 1

dropout_hidden: 0.0
dropout_input: 0.2
emb_size: 50 #128 #[32, 64, 128, 256, 512]

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null
36 changes: 36 additions & 0 deletions cfg/model/rec_model/GRU4Rec_OLD.yaml
@@ -0,0 +1,36 @@
name: GRU4Rec

layers: [100] #sizes of the GRU layers in the network

dropout_p_embed: 0.0
dropout_p_hidden: 0.0
embedding: 0
constrained_embedding: True

# loss: cross-entropy
# batch_size: 64 #TODO: check
#learning_rate: 0.05
#momentum: 0.0
#sample_alpha: 0.75
# n_sample: 2048 TODO: check
# n_epochs: 10 TODO: check
# bpreg: 1.0
# elu_param: 1.0
# logq: 0.0
/^step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_batch:
target: relevance
#out_data: out_sid
loss_input_from_model_output:
input: null
metrics_input_from_batch:
NDCG:
relevance: relevance
MRR:
relevance: relevance
metrics_input_from_model_output:
NDCG:
scores: null
MRR:
scores: null
16 changes: 16 additions & 0 deletions cfg/model/rec_model/HGN.yaml
@@ -0,0 +1,16 @@
name: HGN

lookback: ${data_params.dataset_params.lookback} #length of the sequence
emb_size: [32, 64, 128, 256, 512] #size of the latent (embedding) dimension

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid", "uid"]
loss_input_from_model_output:
input: null

__global__:
data_params.dataset_params.keep_last.train: 1
data_params.dataset_params.keep_last.val: null
data_params.dataset_params.keep_last.test: null
data_params.dataset_params.stride: 10
# data_params.dataset_params.simultaneous_lookforward: 3 #the simultaneous lookforward of the dataset
18 changes: 18 additions & 0 deletions cfg/model/rec_model/NARM.yaml
@@ -0,0 +1,18 @@
name: NARM

hidden_size: 50
emb_size: [32, 64, 128, 256, 512] #512 #[32, 64, 128, 256, 512] #50
n_layers: 1
emb_dropout: 0.25
ct_dropout: 0.5

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null

__global__:
data_params.dataset_params.keep_last.train: 1
data_params.dataset_params.keep_last.val: null
data_params.dataset_params.keep_last.test: null
data_params.dataset_params.stride: 10
11 changes: 11 additions & 0 deletions cfg/model/rec_model/SASRec.yaml
@@ -0,0 +1,11 @@
name: SASRec

num_blocks: 2
emb_size: 50 #50 #[32, 64, 128, 256, 512]
num_heads: 1
dropout_rate: 0.2 #0.2 for ml-1m; 0.5 for amazon_beauty, amazon_games

^/step_routing:
model_input_from_batch: ["in_sid", "out_sid"]
loss_input_from_model_output:
input: null
15 changes: 15 additions & 0 deletions cfg/model/trainer_params/trainer_params_cfg.yaml
@@ -0,0 +1,15 @@
accelerator: cpu #cuda #cpu
/enable_checkpointing: True
max_epochs: 1 #601
callbacks:
- ModelCheckpoint:
/dirpath: ${__exp__.project_folder}out/models/${__exp__.name}/
/filename: best
save_top_k: 1
save_last: True
monitor: val_loss/dataloader_idx_0 #val_loss
mode: min
/logger:
name: CSVLogger
params:
save_dir: ${__exp__.project_folder}out/log/${__exp__.name}/
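These keys line up one-to-one with PyTorch Lightning's Trainer, ModelCheckpoint, and CSVLogger arguments. A sketch of how the block could be materialized, with the ${...} references resolved using the defaults from cfg/config_rec.yaml (project_folder ../, name prova); easy_torch's actual trainer factory may differ:

```python
# Sketch assuming the lightning.pytorch import layout (Lightning >= 2.0).
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

checkpoint = ModelCheckpoint(
    dirpath="../out/models/prova/",   # ${__exp__.project_folder}out/models/${__exp__.name}/
    filename="best",
    save_top_k=1,
    save_last=True,
    monitor="val_loss/dataloader_idx_0",
    mode="min",
)
trainer = Trainer(
    accelerator="cpu",
    max_epochs=1,
    enable_checkpointing=True,
    callbacks=[checkpoint],
    logger=CSVLogger(save_dir="../out/log/prova/"),
)
```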
2 changes: 1 addition & 1 deletion rec_utils/__init__.py → easy_rec/__init__.py
@@ -1,4 +1,4 @@
from . import data_generation_utils, metrics, losses
-from . import rec_torch_utils
+from . import rec_torch
from ._version import __version__ # Import the '__version__' variable from this package
from . import model
File renamed without changes.