From cfd34524395537dfdfc352520ad7387c9f69bd0f Mon Sep 17 00:00:00 2001
From: antoine_tixier
Date: Wed, 11 Mar 2020 13:42:36 +0100
Subject: [PATCH] sgd

---
 NMT/code/grid_search.py |  6 +--
 NMT/code/model.py       | 81 ++++++++++++++++++++---------------------
 2 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/NMT/code/grid_search.py b/NMT/code/grid_search.py
index 9f37a3f..1167a73 100644
--- a/NMT/code/grid_search.py
+++ b/NMT/code/grid_search.py
@@ -59,9 +59,9 @@ def load_pairs(train_or_test):
 num_layers = 1
 bidirectional = False
 
-for att_strategy in ['dot','general','concat']: #['none']:
+for att_strategy in ['concat','general','dot','none']:
 
-    hidden_dim_s = 30
+    hidden_dim_s = 30
 
     if bidirectional:
         if att_strategy == 'dot':
@@ -89,7 +89,7 @@ def load_pairs(train_or_test):
                       max_size = 30, # for the decoder, in prediction mode
                       dropout = 0)
 
-    model.fit(training_set, test_set, lr=0.002, batch_size=64, n_epochs=1, patience=5)
+    model.fit(training_set, test_set, lr=0.1, batch_size=64, n_epochs=200, patience=10, my_optimizer='SGD')
 
     model_name = '_'.join([att_strategy, str(num_layers), str(bidirectional)])
     model.save(path_to_save, model_name)
diff --git a/NMT/code/model.py b/NMT/code/model.py
index 101865b..342273a 100644
--- a/NMT/code/model.py
+++ b/NMT/code/model.py
@@ -84,44 +84,34 @@ def __init__(self, hidden_dim, hidden_dim_s, hidden_dim_t, strategy, bidirection
     
     def forward(self, target_h, source_hs):
         
-        # if self.strategy in ['dot','general']:
-        #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
-        
-        # if self.strategy == 'dot':
-        #     # with this strategy, no trainable parameters are involved
-        #     # here, feat = hidden_dim_t = hidden_dim_s
-        #     target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
-        #     dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
-        #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+        if self.strategy in ['dot','general']:
+            source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
+        
+        if self.strategy == 'dot':
+            # with this strategy, no trainable parameters are involved
+            # here, feat = hidden_dim_t = hidden_dim_s
+            target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
+            dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
+            scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
         
-        # elif self.strategy == 'general':
-        #     target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
-        #     output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
-        #     output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
-        #     dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
-        #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+        elif self.strategy == 'general':
+            target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
+            output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
+            output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
+            dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
+            scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
        
-        # elif self.strategy == 'concat':
-        #     target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
-        #     concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
-        #     scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
-        #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
+        elif self.strategy == 'concat':
+            target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
+            concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
+            scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
+            source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
        
-        # scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
-        # norm_scores = torch.softmax(scores,0) # sequence-wise normalization
-        # source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
-        # weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
-        # ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default
-        
-        target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,feat) -> (seq,batch,feat)
-        concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # source_hs is (seq,batch,feat)
-        scores = self.ff_score(torch.tanh(concat_output)) # (seq,batch,feat) -> (seq,batch,1)
-        scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). dim=2 because we don't want to squeeze the batch dim if batch size = 1
-        norm_scores = torch.softmax(scores,0)
-        source_hs_p = source_hs.permute((2,0,1)) # (seq,batch,feat) -> (feat,seq,batch)
-        weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (feat,seq,batch) (* checks from right to left that the dimensions match)
-        ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (feat,seq,batch) -> (seq,batch,feat) -> (1,batch,feat); keepdim otherwise sum squeezes
-        
+        scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
+        norm_scores = torch.softmax(scores,0) # sequence-wise normalization
+        source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
+        weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
+        ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default
         
         return ct
     
@@ -261,11 +251,17 @@ def forward(self, input, max_size, is_prod):
         
         return to_return
     
     
-    def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
+    def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience, my_optimizer):
        
        parameters = [p for p in self.parameters() if p.requires_grad]
-        optimizer = optim.Adam(parameters, lr=lr)
+        if my_optimizer == 'adam':
+            optimizer = optim.Adam(parameters, lr=lr)
+        elif my_optimizer == 'SGD':
+            optimizer = optim.SGD(parameters, lr=lr) # https://pytorch.org/docs/stable/optim.html#torch.optim.SGD
+        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
+                                                         factor=0.1, patience=5,
+                                                         verbose=True, threshold=0.1) # https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau
        
        
        criterion = torch.nn.CrossEntropyLoss(ignore_index=self.padding_token) # the softmax is inside the loss!
@@ -286,9 +282,9 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
        
        it_times = []
        
        # my fake code
-        for p in self.parameters():
-            if not p.requires_grad:
-                print(p.name, p.data)
+        #for p in self.parameters():
+        #    if not p.requires_grad:
+        #        print(p.name, p.data)
        
        for epoch in range(n_epochs):
@@ -366,7 +362,10 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
            
            if patience_counter>patience:
                break
-        
+            
+            if my_optimizer == 'SGD':
+                scheduler.step(total_loss)
+        
        self.test_toy(test_sents)
        
        self.logs['avg_time_it'] = round(np.mean(it_times),4)
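
For reference, a minimal, self-contained sketch of the three Luong-style scoring strategies ('dot', 'general', 'concat') restored in Attention.forward above. Tensor layouts follow the comments in the diff (source_hs is (seq, batch, hidden_dim_s), target_h is (1, batch, hidden_dim_t)); the concrete dimension values and the nn.Linear definitions standing in for ff_general, ff_concat and ff_score are assumptions, and the final weighted sum uses an unsqueeze-based broadcast instead of the permute-based one in model.py.

# Hedged sketch only: layer sizes below are assumptions, not the values used in model.py.
import torch
import torch.nn as nn

seq, batch = 7, 4
hidden_dim_s = hidden_dim_t = 30   # 'dot' additionally assumes hidden_dim_s == hidden_dim_t
hidden_dim = 20

source_hs = torch.randn(seq, batch, hidden_dim_s)   # encoder states, seq-first
target_h = torch.randn(1, batch, hidden_dim_t)      # current decoder state

ff_general = nn.Linear(hidden_dim_t, hidden_dim_s, bias=False)   # stand-in for self.ff_general
ff_concat = nn.Linear(hidden_dim_s + hidden_dim_t, hidden_dim)   # stand-in for self.ff_concat
ff_score = nn.Linear(hidden_dim, 1)                              # stand-in for self.ff_score

def attention_scores(strategy):
    if strategy == 'dot':
        s = torch.matmul(source_hs.permute(1, 0, 2),   # (batch,seq,feat)
                         target_h.permute(1, 2, 0))    # (batch,feat,1) -> (batch,seq,1)
    elif strategy == 'general':
        proj = ff_general(target_h.permute(1, 0, 2)).permute(0, 2, 1)  # (batch,hidden_dim_s,1)
        s = torch.matmul(source_hs.permute(1, 0, 2), proj)             # (batch,seq,1)
    elif strategy == 'concat':
        target_h_rep = target_h.repeat(seq, 1, 1)                      # (seq,batch,hidden_dim_t)
        energy = torch.tanh(ff_concat(torch.cat((target_h_rep, source_hs), -1)))
        return ff_score(energy)                                        # already (seq,batch,1)
    return s.permute(1, 0, 2)                                          # (seq,batch,1)

for strategy in ['dot', 'general', 'concat']:
    scores = attention_scores(strategy).squeeze(dim=2)     # (seq,batch)
    norm_scores = torch.softmax(scores, 0)                 # normalize over the source sequence
    ct = torch.sum(norm_scores.unsqueeze(-1) * source_hs, 0, keepdim=True)
    print(strategy, ct.shape)                              # torch.Size([1, 4, 30])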
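
And a short sketch of the optimizer change the commit is named after: plain SGD paired with ReduceLROnPlateau, which multiplies the learning rate by factor=0.1 once the monitored loss stops improving for `patience` steps. The tiny model and the dummy loss are placeholders; in fit() the scheduler is only stepped (with the epoch-level total_loss) when my_optimizer == 'SGD'.

# Hedged sketch: the model and the loss are placeholders, not the NMT seq2seq model.
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
parameters = [p for p in model.parameters() if p.requires_grad]

optimizer = optim.SGD(parameters, lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                 factor=0.1, patience=5,
                                                 threshold=0.1)

for epoch in range(20):
    optimizer.zero_grad()
    loss = model(torch.randn(64, 10)).pow(2).mean()   # dummy loss standing in for total_loss
    loss.backward()
    optimizer.step()
    scheduler.step(loss.item())                       # plateau detection on the monitored loss
    print(epoch, optimizer.param_groups[0]['lr'])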