Commit cfd3452
sgd
Tixierae committed Mar 11, 2020
1 parent e30c612
Showing 2 changed files with 43 additions and 44 deletions.
6 changes: 3 additions & 3 deletions NMT/code/grid_search.py
@@ -59,9 +59,9 @@ def load_pairs(train_or_test):
num_layers = 1
bidirectional = False

- for att_strategy in ['dot','general','concat']: #['none']:
+ for att_strategy in ['concat','general','dot','none']:

-     hidden_dim_s = 30
+     hidden_dim_s = 30

if bidirectional:
if att_strategy == 'dot':
@@ -89,7 +89,7 @@ def load_pairs(train_or_test):
max_size = 30, # for the decoder, in prediction mode
dropout = 0)

- model.fit(training_set, test_set, lr=0.002, batch_size=64, n_epochs=1, patience=5)
+ model.fit(training_set, test_set, lr=0.1, batch_size=64, n_epochs=200, patience=10, my_optimizer='SGD')

model_name = '_'.join([att_strategy, str(num_layers), str(bidirectional)])
model.save(path_to_save, model_name)
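
The new fit call above switches the optimizer from Adam at lr=0.002 to plain SGD at lr=0.1 (with more epochs and a higher patience), selecting it through the new my_optimizer argument. Vanilla SGD generally needs a much larger step size than Adam, hence the jump in learning rate. A minimal sketch of the two constructions, using a dummy parameter list purely for illustration:

import torch
from torch import optim

params = [torch.nn.Parameter(torch.randn(30, 30))]  # stand-in for the model's trainable parameters

old_optimizer = optim.Adam(params, lr=0.002)  # previous setting
new_optimizer = optim.SGD(params, lr=0.1)     # new setting; SGD typically needs a larger lr than Adam
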
81 changes: 40 additions & 41 deletions NMT/code/model.py
@@ -84,44 +84,34 @@ def __init__(self, hidden_dim, hidden_dim_s, hidden_dim_t, strategy, bidirection

def forward(self, target_h, source_hs):

- # if self.strategy in ['dot','general']:
- #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)

- # if self.strategy == 'dot':
- #     # with this strategy, no trainable parameters are involved
- #     # here, feat = hidden_dim_t = hidden_dim_s
- #     target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
- #     dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
- #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+ if self.strategy in ['dot','general']:
+     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)

+ if self.strategy == 'dot':
+     # with this strategy, no trainable parameters are involved
+     # here, feat = hidden_dim_t = hidden_dim_s
+     target_h = target_h.permute(1,2,0) # (1,batch,feat) -> (batch,feat,1)
+     dot_product = torch.matmul(source_hs, target_h) # (batch,seq,feat) * (batch,feat,1) -> (batch,seq,1)
+     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)

- # elif self.strategy == 'general':
- #     target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
- #     output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
- #     output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
- #     dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
- #     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)
+ elif self.strategy == 'general':
+     target_h = target_h.permute(1,0,2) # (1,batch,hidden_dim_t) -> (batch,1,hidden_dim_t)
+     output = self.ff_general(target_h) # -> (batch,1,hidden_dim_s)
+     output = output.permute(0,2,1) # -> (batch,hidden_dim_s,1)
+     dot_product = torch.matmul(source_hs, output) # (batch,seq,hidden_dim_s) * (batch,hidden_dim_s,1) -> (batch,seq,1)
+     scores = dot_product.permute(1,0,2) # -> (seq,batch,1)

- # elif self.strategy == 'concat':
- #     target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
- #     concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
- #     scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
- #     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)
+ elif self.strategy == 'concat':
+     target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,hidden_dim_s) -> (seq,batch,hidden_dim_s)
+     concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # (seq,batch,hidden_dim_s+hidden_dim_t) -> (seq,batch,hidden_dim)
+     scores = self.ff_score(torch.tanh(concat_output)) # -> (seq,batch,1)
+     source_hs = source_hs.permute(1,0,2) # (seq,batch,hidden_dim_s) -> (batch,seq,hidden_dim_s)

- # scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
- # norm_scores = torch.softmax(scores,0) # sequence-wise normalization
- # source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
- # weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
- # ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default

- target_h_rep = target_h.repeat(source_hs.size(0),1,1) # (1,batch,feat) -> (seq,batch,feat)
- concat_output = self.ff_concat(torch.cat((target_h_rep,source_hs),-1)) # source_hs is (seq,batch,feat)
- scores = self.ff_score(torch.tanh(concat_output)) # (seq,batch,feat) -> (seq,batch,1)
- scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). dim=2 because we don't want to squeeze the batch dim if batch size = 1
- norm_scores = torch.softmax(scores,0)
- source_hs_p = source_hs.permute((2,0,1)) # (seq,batch,feat) -> (feat,seq,batch)
- weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (feat,seq,batch) (* checks from right to left that the dimensions match)
- ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (feat,seq,batch) -> (seq,batch,feat) -> (1,batch,feat); keepdim otherwise sum squeezes

+ scores = scores.squeeze(dim=2) # (seq,batch,1) -> (seq,batch). We specify a dimension, because we don't want to squeeze the batch dim in case batch size is equal to 1
+ norm_scores = torch.softmax(scores,0) # sequence-wise normalization
+ source_hs_p = source_hs.permute((2,1,0)) # (batch,seq,hidden_dim_s) -> (hidden_dim_s,seq,batch)
+ weighted_source_hs = (norm_scores * source_hs_p) # (seq,batch) * (hidden_dim_s,seq,batch) -> (hidden_dim_s,seq,batch) (we use broadcasting here - the * operator checks from right to left that the dimensions match)
+ ct = torch.sum(weighted_source_hs.permute((1,2,0)),0,keepdim=True) # (hidden_dim_s,seq,batch) -> (seq,batch,hidden_dim_s) -> (1,batch,hidden_dim_s); we need keepdim as sum squeezes by default

return ct
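
The branches above implement the Luong-style dot, general and concat scoring functions, followed by a sequence-wise softmax and a weighted sum of the encoder states. Below is a small, self-contained shape check of the dot and concat paths with toy dimensions; it mirrors the logic above but is not the repository code (the final weighted sum uses unsqueeze instead of the permute-based broadcasting, which is equivalent here):

import torch

seq, batch, hid = 7, 4, 30                # toy sizes; 'dot' assumes hidden_dim_s == hidden_dim_t
source_hs = torch.randn(seq, batch, hid)  # encoder states, (seq,batch,hidden_dim_s)
target_h = torch.randn(1, batch, hid)     # current decoder state, (1,batch,hidden_dim_t)

# 'dot' strategy: no trainable parameters
s = source_hs.permute(1, 0, 2)                       # (batch,seq,hid)
t = target_h.permute(1, 2, 0)                        # (batch,hid,1)
scores = torch.matmul(s, t).permute(1, 0, 2)         # (batch,seq,1) -> (seq,batch,1)

# 'concat' strategy: two learned layers (randomly initialized here, only for the shape check)
ff_concat = torch.nn.Linear(2 * hid, hid)
ff_score = torch.nn.Linear(hid, 1)
target_h_rep = target_h.repeat(seq, 1, 1)            # (1,batch,hid) -> (seq,batch,hid)
scores_concat = ff_score(torch.tanh(ff_concat(torch.cat((target_h_rep, source_hs), -1))))  # (seq,batch,1)

# shared tail: sequence-wise softmax, then weighted sum of the encoder states
norm_scores = torch.softmax(scores.squeeze(dim=2), 0)                  # (seq,batch)
ct = torch.sum(norm_scores.unsqueeze(2) * source_hs, 0, keepdim=True)  # (1,batch,hid)
print(scores_concat.shape, ct.shape)                 # torch.Size([7, 4, 1]) torch.Size([1, 4, 30])
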

@@ -261,11 +251,17 @@ def forward(self, input, max_size, is_prod):
return to_return


- def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
+ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience, my_optimizer):

parameters = [p for p in self.parameters() if p.requires_grad]

- optimizer = optim.Adam(parameters, lr=lr)
+ if my_optimizer == 'adam':
+     optimizer = optim.Adam(parameters, lr=lr)
+ elif my_optimizer == 'SGD':
+     optimizer = optim.SGD(parameters, lr=lr) # https://pytorch.org/docs/stable/optim.html#torch.optim.SGD
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
+                                                  factor=0.1, patience=5,
+                                                  verbose=True, threshold=0.1) # https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau

criterion = torch.nn.CrossEntropyLoss(ignore_index=self.padding_token) # the softmax is inside the loss!
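
As the comment says, CrossEntropyLoss applies log-softmax internally, so the decoder outputs raw logits, and ignore_index makes padded target positions contribute nothing to the loss. A tiny standalone illustration (the padding index 0 below is an assumption for the example, not necessarily the repository's value of self.padding_token):

import torch

criterion = torch.nn.CrossEntropyLoss(ignore_index=0)  # 0 stands in for self.padding_token here
logits = torch.randn(5, 7)               # 5 target positions, vocabulary of size 7, raw scores
targets = torch.tensor([3, 1, 0, 0, 6])  # the two positions equal to 0 are ignored by the loss
print(criterion(logits, targets).item())
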

@@ -286,9 +282,9 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):
it_times = []

# my fake code
- for p in self.parameters():
-     if not p.requires_grad:
-         print(p.name, p.data)
+ #for p in self.parameters():
+ #    if not p.requires_grad:
+ #        print(p.name, p.data)

for epoch in range(n_epochs):

@@ -366,7 +362,10 @@ def fit(self, trainingDataset, testDataset, lr, batch_size, n_epochs, patience):

if patience_counter>patience:
break


+ if my_optimizer == 'SGD':
+     scheduler.step(total_loss)

self.test_toy(test_sents)

self.logs['avg_time_it'] = round(np.mean(it_times),4)
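
With my_optimizer='SGD', fit now also steps the ReduceLROnPlateau scheduler once per epoch on the epoch's total loss, so the learning rate is multiplied by 0.1 whenever the loss has not improved (relative threshold 0.1) for 5 consecutive epochs. A standalone illustration with a fake, stagnating loss, not taken from the repository:

import torch
from torch import optim

param = torch.nn.Parameter(torch.zeros(1))
optimizer = optim.SGD([param], lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                 factor=0.1, patience=5, threshold=0.1)

for epoch in range(10):
    fake_loss = 10.0           # a loss that never improves
    scheduler.step(fake_loss)  # after `patience` stagnant epochs the lr drops: 0.1 -> 0.01
    print(epoch, optimizer.param_groups[0]['lr'])
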