From cfd34524395537dfdfc352520ad7387c9f69bd0f Mon Sep 17 00:00:00 2001
From: antoine_tixier
Date: Wed, 11 Mar 2020 13:42:36 +0100
Subject: [PATCH] sgd

---
 NMT/code/grid_search.py |  6 +--
 NMT/code/model.py       | 81 ++++++++++++++++++++---------------------
 2 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/NMT/code/grid_search.py b/NMT/code/grid_search.py
index 9f37a3f..1167a73 100644
--- a/NMT/code/grid_search.py
+++ b/NMT/code/grid_search.py
@@ -59,9 +59,9 @@ def load_pairs(train_or_test):
 num_layers = 1
 bidirectional = False
 
-for att_strategy in ['dot','general','concat']: #['none']:
+for att_strategy in ['concat','general','dot','none']:
 
-    hidden_dim_s = 30
+    hidden_dim_s = 30
 
     if bidirectional:
         if att_strategy == 'dot':
@@ -89,7 +89,7 @@ def load_pairs(train_or_test):
                       max_size = 30, # for the decoder, in prediction mode
                       dropout = 0)
 
-    model.fit(training_set, test_set, lr=0.002, batch_size=64, n_epochs=1, patience=5)
+    model.fit(training_set, test_set, lr=0.1, batch_size=64, n_epochs=200, patience=10, my_optimizer='SGD')
 
     model_name = '_'.join([att_strategy, str(num_layers), str(bidirectional)])
     model.save(path_to_save, model_name)
diff --git a/NMT/code/model.py b/NMT/code/model.py
index 101865b..342273a 100644
--- a/NMT/code/model.py
+++ b/NMT/code/model.py
@@ -84,44 +84,34 @@ def __init__(self, hidden_dim, hidden_dim_s, hidden_dim_t, strategy, bidirection
     
     def forward(self, target_h, source_hs):
         
-        # if self.strategy in ['dot','general']:
-        #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
-        
-        # if self.strategy == 'dot':
-        #     # with this strategy, no trainable parameters are involved
-        #     # here, feat = hidden_dim_t = hidden_dim_s
-        #     target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
-        #     dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
-        #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+        if self.strategy in ['dot','general']:
+            source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
+        
+        if self.strategy == 'dot':
+            # with this strategy, no trainable parameters are involved
+            # here, feat = hidden_dim_t = hidden_dim_s
+            target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
+            dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
+            scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
         
-        # elif self.strategy == 'general':
-        #     target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
-        #     output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
-        #     output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
-        #     dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
-        #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+        elif self.strategy == 'general':
+            target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
+            output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
+            output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
+            dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
+            scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
        
-        # elif self.strategy == 'concat':
-        #     target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
-        #     concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
-        #     scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
-        #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
+        elif self.strategy == 'concat':
+            target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
+            concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
+            scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
+            source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
        
-        # scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
-        # norm_scores = torch.softmax(scores,0) # sequence-wise normalization
-        # source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
-        # weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
-        # ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default
-        
-        target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,feat) -> (seq,batch,feat)
-        concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # source_hs is (seq,batch,feat)
-        scores = self.ff_score(torch.tanh(concat_output)) # (seq,batch,feat) -> (seq,batch,1)
-        scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). dim=2 because we don't want to squeeze the batch dim if batch size = 1
-        norm_scores = torch.softmax(scores,0)
-        source_hs_p = source_hs.permute((2,0,1)) # (seq,batch,feat) -> (feat,seq,batch)
-        weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (feat,seq,batch) (* checks from right to left that the dimensions match)
-        ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (feat,seq,batch) -> (seq,batch,feat) -> (1,batch,feat); keepdim otherwise sum squeezes
-        
+        scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
+        norm_scores = torch.softmax(scores,0) # sequence-wise normalization
+        source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
+        weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
+        ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default
         
         return ct
     
@@ -261,11 +251,17 @@ def forward(self, input, max_size, is_prod):
         
         return to_return
     
     
-    def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
+    def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience, my_optimizer):
        
        parameters = [p for p in self.parameters() if p.requires_grad]
-        optimizer = optim.Adam(parameters, lr=lr)
+        if my_optimizer == 'adam':
+            optimizer = optim.Adam(parameters, lr=lr)
+        elif my_optimizer == 'SGD':
+            optimizer = optim.SGD(parameters, lr=lr) # https://pytorch.org/docs/stable/optim.html#torch.optim.SGD
+        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
+                                                         factor=0.1, patience=5,
+                                                         verbose=True, threshold=0.1) # https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau
        
        
        criterion = torch.nn.CrossEntropyLoss(ignore_index=self.padding_token) # the softmax is inside the loss!
@@ -286,9 +282,9 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
        
        it_times = []
        
        # my fake code
-        for p in self.parameters():
-            if not p.requires_grad:
-                print(p.name, p.data)
+        #for p in self.parameters():
+        #    if not p.requires_grad:
+        #        print(p.name, p.data)
        
        for epoch in range(n_epochs):
@@ -366,7 +362,10 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
            
            if patience_counter>patience:
                break
-        
+            
+            if my_optimizer == 'SGD':
+                scheduler.step(total_loss)
+        
        self.test_toy(test_sents)
        
        self.logs['avg_time_it'] = round(np.mean(it_times),4)
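
For reference, a minimal, self-contained sketch of the three Luong-style scoring strategies ('dot', 'general', 'concat') restored in Attention.forward above. Tensor layouts follow the comments in the diff (source_hs is (seq, batch, hidden_dim_s), target_h is (1, batch, hidden_dim_t)); the concrete dimension values and the nn.Linear definitions standing in for ff_general, ff_concat and ff_score are assumptions, and the final weighted sum uses an unsqueeze-based broadcast instead of the permute-based one in model.py.

# Hedged sketch only: layer sizes below are assumptions, not the values used in model.py.
import torch
import torch.nn as nn

seq, batch = 7, 4
hidden_dim_s = hidden_dim_t = 30   # 'dot' additionally assumes hidden_dim_s == hidden_dim_t
hidden_dim = 20

source_hs = torch.randn(seq, batch, hidden_dim_s)   # encoder states, seq-first
target_h = torch.randn(1, batch, hidden_dim_t)      # current decoder state

ff_general = nn.Linear(hidden_dim_t, hidden_dim_s, bias=False)   # stand-in for self.ff_general
ff_concat = nn.Linear(hidden_dim_s + hidden_dim_t, hidden_dim)   # stand-in for self.ff_concat
ff_score = nn.Linear(hidden_dim, 1)                              # stand-in for self.ff_score

def attention_scores(strategy):
    if strategy == 'dot':
        s = torch.matmul(source_hs.permute(1, 0, 2),   # (batch,seq,feat)
                         target_h.permute(1, 2, 0))    # (batch,feat,1) -> (batch,seq,1)
    elif strategy == 'general':
        proj = ff_general(target_h.permute(1, 0, 2)).permute(0, 2, 1)  # (batch,hidden_dim_s,1)
        s = torch.matmul(source_hs.permute(1, 0, 2), proj)             # (batch,seq,1)
    elif strategy == 'concat':
        target_h_rep = target_h.repeat(seq, 1, 1)                      # (seq,batch,hidden_dim_t)
        energy = torch.tanh(ff_concat(torch.cat((target_h_rep, source_hs), -1)))
        return ff_score(energy)                                        # already (seq,batch,1)
    return s.permute(1, 0, 2)                                          # (seq,batch,1)

for strategy in ['dot', 'general', 'concat']:
    scores = attention_scores(strategy).squeeze(dim=2)     # (seq,batch)
    norm_scores = torch.softmax(scores, 0)                 # normalize over the source sequence
    ct = torch.sum(norm_scores.unsqueeze(-1) * source_hs, 0, keepdim=True)
    print(strategy, ct.shape)                              # torch.Size([1, 4, 30])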
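
And a short sketch of the optimizer change the commit is named after: plain SGD paired with ReduceLROnPlateau, which multiplies the learning rate by factor=0.1 once the monitored loss stops improving for `patience` steps. The tiny model and the dummy loss are placeholders; in fit() the scheduler is only stepped (with the epoch-level total_loss) when my_optimizer == 'SGD'.

# Hedged sketch: the model and the loss are placeholders, not the NMT seq2seq model.
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
parameters = [p for p in model.parameters() if p.requires_grad]

optimizer = optim.SGD(parameters, lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                 factor=0.1, patience=5,
                                                 threshold=0.1)

for epoch in range(20):
    optimizer.zero_grad()
    loss = model(torch.randn(64, 10)).pow(2).mean()   # dummy loss standing in for total_loss
    loss.backward()
    optimizer.step()
    scheduler.step(loss.item())                       # plateau detection on the monitored loss
    print(epoch, optimizer.param_groups[0]['lr'])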