diff --git a/Assignments/assignment4/MakiNaruto/README.md b/Assignments/assignment4/MakiNaruto/README.md
new file mode 100644
index 0000000..8267252
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/README.md
@@ -0,0 +1,2 @@
+After finishing this assignment, I realized how critical it is to fully understand the input and output dimensions of every part of the model.
+![Word-level NMT model](./word_nmt.jpg)
\ No newline at end of file
diff --git a/Assignments/assignment4/MakiNaruto/__init__.py b/Assignments/assignment4/MakiNaruto/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Assignments/assignment4/MakiNaruto/model_embeddings.py b/Assignments/assignment4/MakiNaruto/model_embeddings.py
new file mode 100644
index 0000000..35459ea
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/model_embeddings.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+model_embeddings.py: Embeddings for the NMT model
+Pencheng Yin
+Sahil Chopra
+Anand Dhoot
+Vera Lin
+"""
+
+import torch.nn as nn
+
+class ModelEmbeddings(nn.Module):
+    """
+    Class that converts input words to their embeddings.
+    """
+    def __init__(self, embed_size, vocab):
+        """
+        Init the Embedding layers.
+
+        @param embed_size (int): Embedding size (dimensionality)
+        @param vocab (Vocab): Vocabulary object containing src and tgt languages
+                              See vocab.py for documentation.
+        """
+        super(ModelEmbeddings, self).__init__()
+        self.embed_size = embed_size
+
+        # default values
+        self.source = None
+        self.target = None
+
+        src_pad_token_idx = vocab.src['<pad>']
+        tgt_pad_token_idx = vocab.tgt['<pad>']
+
+        ### YOUR CODE HERE (~2 Lines)
+        ### TODO - Initialize the following variables:
+        ###     self.source (Embedding Layer for source language)
+        ###     self.target (Embedding Layer for target language)
+        ###
+        ### Note:
+        ###     1. `vocab` object contains two vocabularies:
+        ###            `vocab.src` for source
+        ###            `vocab.tgt` for target
+        ###     2. You can get the length of a specific vocabulary by running:
+        ###             `len(vocab.<specific_vocabulary>)`
+        ###     3. Remember to include the padding token for the specific vocabulary
+        ###        when creating your Embedding.
+        ###
+        ### Use the following docs to properly initialize these variables:
+        ###     Embedding Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
+
+        self.source = nn.Embedding(len(vocab.src), self.embed_size, padding_idx=src_pad_token_idx)
+        self.target = nn.Embedding(len(vocab.tgt), self.embed_size, padding_idx=tgt_pad_token_idx)
+
+        ### END YOUR CODE
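+
+# A quick, illustrative shape check (not part of the assignment; `sents` and the
+# sizes below are made up). nn.Embedding maps a LongTensor of word indices of
+# shape (src_len, b) to a float tensor of shape (src_len, b, embed_size):
+#
+#     embeddings = ModelEmbeddings(embed_size=256, vocab=vocab)
+#     src = vocab.src.to_input_tensor(sents, device='cpu')   # (src_len, b)
+#     X = embeddings.source(src)                             # (src_len, b, 256)
+#
+# Thanks to padding_idx, the `<pad>` row is initialized to zeros and receives
+# no gradient updates, so padding positions stay at the zero vector.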
diff --git a/Assignments/assignment4/MakiNaruto/nmt_model.py b/Assignments/assignment4/MakiNaruto/nmt_model.py
new file mode 100644
index 0000000..bd1552f
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/nmt_model.py
@@ -0,0 +1,526 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+nmt_model.py: NMT Model
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+"""
+from collections import namedtuple
+import sys
+from typing import List, Tuple, Dict, Set, Union
+import torch
+import torch.nn as nn
+import torch.nn.utils
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
+
+from model_embeddings import ModelEmbeddings
+Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
+
+
+class NMT(nn.Module):
+    """ Simple Neural Machine Translation Model:
+        - Bidirectional LSTM Encoder
+        - Unidirectional LSTM Decoder
+        - Global Attention Model (Luong, et al. 2015)
+    """
+    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
+        """ Init NMT Model.
+
+        @param embed_size (int): Embedding size (dimensionality)
+        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
+        @param vocab (Vocab): Vocabulary object containing src and tgt languages
+                              See vocab.py for documentation.
+        @param dropout_rate (float): Dropout probability, for attention
+        """
+        super(NMT, self).__init__()
+        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
+        self.hidden_size = hidden_size
+        self.dropout_rate = dropout_rate
+        self.vocab = vocab
+
+        # default values
+        self.encoder = None
+        self.decoder = None
+        self.h_projection = None
+        self.c_projection = None
+        self.att_projection = None
+        self.combined_output_projection = None
+        self.target_vocab_projection = None
+        self.dropout = None
+        # For sanity check only, not relevant to implementation
+        self.gen_sanity_check = False
+        self.counter = 0
+
+        ### YOUR CODE HERE (~8 Lines)
+        ### TODO - Initialize the following variables:
+        ###     self.encoder (Bidirectional LSTM with bias)
+        ###     self.decoder (LSTM Cell with bias)
+        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
+        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
+        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
+        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
+        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
+        ###     self.dropout (Dropout Layer)
+        ###
+        ### Use the following docs to properly initialize these variables:
+        ###     LSTM:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
+        ###     LSTM Cell:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
+        ###     Linear Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+        ###     Dropout Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
+
+        self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True)
+        self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size)
+        self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
+        self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False)
+        self.dropout = nn.Dropout(p=self.dropout_rate)
+
+        ### END YOUR CODE
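+
+    # Dimension cheat sheet for the layers above (b = batch size, e = embed_size,
+    # h = hidden_size; illustrative summary, mirroring the PDF's notation):
+    #     encoder:                    input (src_len, b, e)  -> hiddens (src_len, b, 2h)
+    #     decoder (LSTMCell):         input Ybar_t (b, e + h) -> (h_t, c_t), each (b, h)
+    #     h_projection / c_projection:       (b, 2h)         -> (b, h)
+    #     att_projection:                    (b, src_len, 2h) -> (b, src_len, h)
+    #     combined_output_projection:        (b, 3h)         -> (b, h)
+    #     target_vocab_projection:           (b, h)          -> (b, |V_tgt|)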
+
+    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
+        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
+        target sentences under the language models learned by the NMT system.
+
+        @param source (List[List[str]]): list of source sentence tokens
+        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`
+
+        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
+                                    log-likelihood of generating the gold-standard target sentence for
+                                    each example in the input batch. Here b = batch size.
+        """
+        # Compute sentence lengths
+        source_lengths = [len(s) for s in source]
+
+        # Convert list of lists into tensors
+        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
+        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)
+
+        ###     Run the network forward:
+        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
+        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
+        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
+        ###     4. Compute log probability distribution over the target vocabulary using the
+        ###        combined_outputs returned by the `self.decode()` function.
+
+        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
+        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
+        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
+        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)
+
+        # Zero out probabilities for positions that are padding in the target text
+        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()
+
+        # Compute log probability of generating true target words
+        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
+        scores = target_gold_words_log_prob.sum(dim=0)
+        return scores
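+
+    # Why torch.gather above (shapes only; the numbers are not the assignment's
+    # data): P is (tgt_len - 1, b, |V|) of log-probs, and we want, at each step,
+    # the log-prob of the gold word only.
+    #     P[t, i]             # (|V|,) log-probs at step t for example i
+    #     target_padded[1:]   # (tgt_len - 1, b) gold indices, with <s> dropped
+    #     gather(...)         # picks P[t, i, target_padded[1 + t, i]]
+    # Multiplying by target_masks[1:] then zeroes out the <pad> positions.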
+
+    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """ Apply the encoder to source sentences to obtain encoder hidden states.
+            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.
+
+        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
+                                        b = batch_size, src_len = maximum source sentence length. Note that
+                                        these have already been sorted in order of longest to shortest sentence.
+        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
+        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
+                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
+        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
+                                                hidden state and cell.
+        """
+        enc_hiddens, dec_init_state = None, None
+
+        ### YOUR CODE HERE (~ 8 Lines)
+        ### TODO:
+        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
+        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
+        ###         that there is no initial hidden state or cell for the encoder.
+        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
+        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
+        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
+        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
+        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
+        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
+        ###         - `init_decoder_hidden`:
+        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
+        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
+        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
+        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
+        ###         - `init_decoder_cell`:
+        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
+        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
+        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
+        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
+        ###
+        ### See the following docs, as you may need to use some of the following functions in your implementation:
+        ###     Pack the padded sequence X before passing to the encoder:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
+        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tensor Permute:
+        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
+
+        X = self.model_embeddings.source(source_padded)             # (src_len, b, e)
+        X = pack_padded_sequence(X, lengths=source_lengths)
+        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)     # per-step hidden states; final h_t and c_t of the LSTM
+        enc_hiddens, length = pad_packed_sequence(enc_hiddens)      # (src_len, b, h*2)
+        enc_hiddens = enc_hiddens.permute(1, 0, 2)                  # (src_len, b, h*2) -> (b, src_len, h*2); tensor.transpose() would also work here.
+                                                                    # The difference: transpose swaps exactly two dimensions, while permute can reorder any number of them.
+
+        init_decoder_hidden = self.h_projection(
+            torch.cat(
+                (last_hidden[0], last_hidden[1]), 1)
+        )                                                           # (b, 2h) -> (b, h)
+
+        init_decoder_cell = self.c_projection(
+            torch.cat(
+                (last_cell[0], last_cell[1]), 1)
+        )                                                           # (b, 2h) -> (b, h)
+
+        dec_init_state = (init_decoder_hidden, init_decoder_cell)
+        ### END YOUR CODE
+
+        return enc_hiddens, dec_init_state
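+
+    # Why pack/pad around the encoder call (illustrative, e.g. lengths [3, 1]):
+    # pack_padded_sequence flattens the batch so the LSTM skips <pad> steps, and
+    # pad_packed_sequence restores the (src_len, b, h*2) layout afterwards.
+    # Without packing, the final (last_hidden, last_cell) of shorter sentences
+    # would be computed from padding positions instead of their real last word.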
+
+    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
+                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
+        """Compute combined output vectors for a batch.
+
+        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
+                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
+        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
+                                     b = batch size, src_len = maximum source sentence length.
+        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
+        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
+                                       tgt_len = maximum target sentence length, b = batch size.
+
+        @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
+                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size
+        """
+        # Chop off the <END> token for max-length sentences.
+        target_padded = target_padded[:-1]
+
+        # Initialize the decoder state (hidden and cell)
+        dec_state = dec_init_state
+
+        # Initialize previous combined output vector o_{t-1} as zero
+        batch_size = enc_hiddens.size(0)
+        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)
+
+        # Initialize a list we will use to collect the combined output o_t on each step
+        combined_outputs = []
+
+        ### YOUR CODE HERE (~9 Lines)
+        ### TODO:
+        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
+        ###         which should be shape (b, src_len, h),
+        ###         where b = batch size, src_len = maximum source length, h = hidden size.
+        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
+        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
+        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
+        ###     3. Use the torch.split function to iterate over the time dimension of Y.
+        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
+        ###             - Squeeze Y_t into a tensor of dimension (b, e).
+        ###             - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension
+        ###             - Use the step function to compute the Decoder's next (cell, state) values
+        ###               as well as the new combined output o_t.
+        ###             - Append o_t to combined_outputs
+        ###             - Update o_prev to the new o_t.
+        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
+        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
+        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
+        ###
+        ### Note:
+        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
+        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
+        ###
+        ### You may find some of these functions useful:
+        ###     Zeros Tensor:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
+        ###     Tensor Splitting (iteration):
+        ###         https://pytorch.org/docs/stable/torch.html#torch.split
+        ###     Tensor Dimension Squeezing:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tensor Stacking:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.stack
+
+        enc_hiddens_proj = self.att_projection(enc_hiddens)
+        Y = self.model_embeddings.target(target_padded)
+        for Y_t_step in torch.split(Y, 1, dim=0):           # each chunk: (1, b, e)
+            Y_t = Y_t_step.squeeze(0)                       # (b, e)
+            Ybar_t = torch.cat((Y_t, o_prev), 1)            # (b, e + h)
+            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
+            combined_outputs.append(o_t)
+            o_prev = o_t
+        ### END YOUR CODE
+        combined_outputs = torch.stack(combined_outputs, dim=0)
+        return combined_outputs
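+
+    # Note on the loop above: the decoder is an LSTMCell, so decode() advances it
+    # one time step at a time, feeding the *gold* previous word (teacher forcing)
+    # concatenated with the previous combined output o_{t-1}. With tgt_len = 5
+    # and b = 32 (made-up sizes), torch.split yields five (1, 32, e) slices, and
+    # torch.stack reassembles the five (32, h) outputs into a (5, 32, h) tensor.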
+
+    def step(self, Ybar_t: torch.Tensor,
+            dec_state: Tuple[torch.Tensor, torch.Tensor],
+            enc_hiddens: torch.Tensor,
+            enc_hiddens_proj: torch.Tensor,
+            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
+        """ Compute one forward step of the LSTM decoder, including the attention computation.
+
+        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
+                                where b = batch size, e = embedding size, h = hidden size.
+        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
+                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
+        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
+                                    src_len = maximum source length, h = hidden size.
+        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
+                                    where b = batch size, src_len = maximum source length, h = hidden size.
+        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
+                                    where b = batch size, src_len is maximum source length.
+
+        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
+                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
+        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
+        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
+                                Note: You will not use this outside of this function.
+                                      We are simply returning this value so that we can sanity check
+                                      your implementation.
+        """
+
+        combined_output = None
+
+        ### YOUR CODE HERE (~3 Lines)
+        ### TODO:
+        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
+        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
+        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
+        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
+        ###
+        ###       Hints:
+        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
+        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
+        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/output shapes!)
+        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
+        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
+        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
+        ###
+        ### Use the following docs to implement this functionality:
+        ###     Batch Multiplication:
+        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
+        ###     Tensor Unsqueeze:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
+        ###     Tensor Squeeze:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
+        dec_state = self.decoder(Ybar_t, dec_state)
+        dec_hidden, dec_cell = dec_state
+        e_t = torch.bmm(
+            enc_hiddens_proj,           # (b, src_len, h)
+            dec_hidden.unsqueeze(2)     # (b, h, 1)
+        ).squeeze(2)                    # (b, src_len)
+        ### END YOUR CODE
+
+        # Set e_t to -inf where enc_masks has 1
+        if enc_masks is not None:
+            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))
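+
+        # Why -inf here (small illustrative example): softmax turns a -inf score
+        # into exactly zero attention weight, so padded source positions
+        # (enc_masks == 1) can never receive attention. E.g.
+        # softmax([-inf, 0.2, 0.5]) puts a hard zero in the first slot, not
+        # merely a small probability.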
+
+        ### YOUR CODE HERE (~6 Lines)
+        ### TODO:
+        ###     1. Apply softmax to e_t to yield alpha_t
+        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
+        ###         attention output vector, a_t.
+        ###             - alpha_t is shape (b, src_len)
+        ###             - enc_hiddens is shape (b, src_len, 2h)
+        ###             - a_t should be shape (b, 2h)
+        ###             - You will need to do some squeezing and unsqueezing.
+        ###         Note: b = batch size, src_len = maximum source length, h = hidden size.
+        ###
+        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
+        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
+        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
+        ###
+        ### Use the following docs to implement this functionality:
+        ###     Softmax:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
+        ###     Batch Multiplication:
+        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
+        ###     Tensor View:
+        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tanh:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
+
+        alpha_t = F.softmax(e_t, dim=1)
+        a_t = torch.bmm(
+            alpha_t.unsqueeze(1),       # (b, 1, src_len)
+            enc_hiddens                 # (b, src_len, 2h)
+        ).squeeze(1)                    # (b, 1, 2h) -> (b, 2h)
+        U_t = torch.cat((dec_hidden, a_t), 1)    # (b, 3h)
+        V_t = self.combined_output_projection(U_t)
+        O_t = self.dropout(torch.tanh(V_t))
+
+        ### END YOUR CODE
+
+        combined_output = O_t
+        return dec_state, combined_output, e_t
+
+    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
+        """ Generate sentence masks for encoder hidden states.
+
+        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
+                                     src_len = max source length, h = hidden size.
+        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
+
+        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
+                                    where b = batch size, src_len = max source length.
+        """
+        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
+        for e_id, src_len in enumerate(source_lengths):
+            enc_masks[e_id, src_len:] = 1
+        return enc_masks.to(self.device)
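+
+    # Illustration of generate_sent_masks (made-up lengths): for a batch with
+    # source_lengths = [3, 1] and src_len = 3, enc_masks is
+    #     [[0., 0., 0.],
+    #      [0., 1., 1.]]
+    # i.e. 1 marks padding positions, which step() later fills with -inf.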
+
+    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
+        """ Given a single source sentence, perform beam search, yielding translations in the target language.
+        @param src_sent (List[str]): a single source sentence (words)
+        @param beam_size (int): beam size
+        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
+        @returns hypotheses (List[Hypothesis]): a list of hypotheses, each hypothesis has two fields:
+                value: List[str]: the decoded target sentence, represented as a list of words
+                score: float: the log-likelihood of the target sentence
+        """
+        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
+
+        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
+        src_encodings_att_linear = self.att_projection(src_encodings)
+
+        h_tm1 = dec_init_vec
+        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)
+
+        eos_id = self.vocab.tgt['</s>']
+
+        hypotheses = [['<s>']]
+        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
+        completed_hypotheses = []
+
+        t = 0
+        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
+            t += 1
+            hyp_num = len(hypotheses)
+
+            exp_src_encodings = src_encodings.expand(hyp_num,
+                                                     src_encodings.size(1),
+                                                     src_encodings.size(2))
+
+            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
+                                                                           src_encodings_att_linear.size(1),
+                                                                           src_encodings_att_linear.size(2))
+
+            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
+            y_t_embed = self.model_embeddings.target(y_tm1)
+
+            x = torch.cat([y_t_embed, att_tm1], dim=-1)
+
+            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
+                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)
+
+            # log probabilities over target words
+            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)
+
+            live_hyp_num = beam_size - len(completed_hypotheses)
+            continuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
+            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuating_hyp_scores, k=live_hyp_num)
+
+            # use floor division: `/` on integer tensors promotes to float on
+            # newer PyTorch versions and would break the indexing below
+            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
+            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)
+
+            new_hypotheses = []
+            live_hyp_ids = []
+            new_hyp_scores = []
+
+            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
+                prev_hyp_id = prev_hyp_id.item()
+                hyp_word_id = hyp_word_id.item()
+                cand_new_hyp_score = cand_new_hyp_score.item()
+
+                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
+                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
+                if hyp_word == '</s>':
+                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
+                                                           score=cand_new_hyp_score))
+                else:
+                    new_hypotheses.append(new_hyp_sent)
+                    live_hyp_ids.append(prev_hyp_id)
+                    new_hyp_scores.append(cand_new_hyp_score)
+
+            if len(completed_hypotheses) == beam_size:
+                break
+
+            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
+            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
+            att_tm1 = att_t[live_hyp_ids]
+
+            hypotheses = new_hypotheses
+            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)
+
+        if len(completed_hypotheses) == 0:
+            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
+                                                   score=hyp_scores[0].item()))
+
+        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
+
+        return completed_hypotheses
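+
+    # How the flat top-k indexing above works (made-up numbers): with 3 live
+    # hypotheses and |V_tgt| = 10, continuating_hyp_scores has 30 entries; a
+    # flat index of 17 decodes to hypothesis 17 // 10 = 1 extended with word
+    # id 17 % 10 = 7. Floor division and modulo therefore recover the
+    # (prev_hyp_id, hyp_word_id) pairs from torch.topk's flat positions.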
+ """ + return self.model_embeddings.source.weight.device + + @staticmethod + def load(model_path: str): + """ Load the model from a file. + @param model_path (str): path to model + """ + params = torch.load(model_path, map_location=lambda storage, loc: storage) + args = params['args'] + model = NMT(vocab=params['vocab'], **args) + model.load_state_dict(params['state_dict']) + + return model + + def save(self, path: str): + """ Save the odel to a file. + @param path (str): path to the model + """ + print('save model parameters to [%s]' % path, file=sys.stderr) + + params = { + 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), + 'vocab': self.vocab, + 'state_dict': self.state_dict() + } + + torch.save(params, path) diff --git a/Assignments/assignment4/MakiNaruto/run.py b/Assignments/assignment4/MakiNaruto/run.py new file mode 100644 index 0000000..8e036be --- /dev/null +++ b/Assignments/assignment4/MakiNaruto/run.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +CS224N 2019-20: Homework 4 +run.py: Run Script for Simple NMT Model +Pencheng Yin +Sahil Chopra +Vera Lin + +Usage: + run.py train --train-src= --train-tgt= --dev-src= --dev-tgt= --vocab= [options] + run.py decode [options] MODEL_PATH TEST_SOURCE_FILE OUTPUT_FILE + run.py decode [options] MODEL_PATH TEST_SOURCE_FILE TEST_TARGET_FILE OUTPUT_FILE + +Options: + -h --help show this screen. + --cuda use GPU + --train-src= train source file + --train-tgt= train target file + --dev-src= dev source file + --dev-tgt= dev target file + --vocab= vocab file + --seed= seed [default: 0] + --batch-size= batch size [default: 32] + --embed-size= embedding size [default: 256] + --hidden-size= hidden size [default: 256] + --clip-grad= gradient clipping [default: 5.0] + --log-every= log every [default: 10] + --max-epoch= max epoch [default: 30] + --input-feed use input feeding + --patience= wait for how many iterations to decay learning rate [default: 5] + --max-num-trial= terminate training after how many trials [default: 5] + --lr-decay= learning rate decay [default: 0.5] + --beam-size= beam size [default: 5] + --sample-size= sample size [default: 5] + --lr= learning rate [default: 0.001] + --uniform-init= uniformly initialize all parameters [default: 0.1] + --save-to= model save path [default: model.bin] + --valid-niter= perform validation after how many iterations [default: 2000] + --dropout= dropout [default: 0.3] + --max-decoding-time-step= maximum number of decoding time steps [default: 70] +""" +import math +import sys +import pickle +import time + + +from docopt import docopt +from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction +from nmt_model import Hypothesis, NMT +import numpy as np +from typing import List, Tuple, Dict, Set, Union +from tqdm import tqdm +from utils import read_corpus, batch_iter +from vocab import Vocab, VocabEntry + +import torch +import torch.nn.utils + + +def evaluate_ppl(model, dev_data, batch_size=32): + """ Evaluate perplexity on dev sentences + @param model (NMT): NMT Model + @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence + @param batch_size (batch size) + @returns ppl (perplixty on dev sentences) + """ + was_training = model.training + model.eval() + + cum_loss = 0. + cum_tgt_words = 0. 
+
+def evaluate_ppl(model, dev_data, batch_size=32):
+    """ Evaluate perplexity on dev sentences
+    @param model (NMT): NMT Model
+    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
+    @param batch_size (int): batch size
+    @returns ppl (float): perplexity on dev sentences
+    """
+    was_training = model.training
+    model.eval()
+
+    cum_loss = 0.
+    cum_tgt_words = 0.
+
+    # no_grad() signals backend to throw away all gradients
+    with torch.no_grad():
+        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
+            loss = -model(src_sents, tgt_sents).sum()
+
+            cum_loss += loss.item()
+            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
+            cum_tgt_words += tgt_word_num_to_predict
+
+        ppl = np.exp(cum_loss / cum_tgt_words)
+
+    if was_training:
+        model.train()
+
+    return ppl
+
+
+def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
+    """ Given decoding results and reference sentences, compute corpus-level BLEU score.
+    @param references (List[List[str]]): a list of gold-standard reference target sentences
+    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
+    @returns bleu_score: corpus-level BLEU score
+    """
+    if references[0][0] == '<s>':
+        references = [ref[1:-1] for ref in references]
+    bleu_score = corpus_bleu([[ref] for ref in references],
+                             [hyp.value for hyp in hypotheses])
+    return bleu_score
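+
+# Note on corpus_bleu's expected nesting (a common stumbling block): each entry
+# of the first argument is a *list of alternative references* for one
+# hypothesis, hence the [[ref] for ref in references] wrapping above. E.g.
+#     corpus_bleu([[['the', 'cat', 'sat', 'down']]], [['the', 'cat', 'sat', 'down']])  # -> 1.0
+# with one hypothesis and a single reference for it.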
+
+def train(args: Dict):
+    """ Train the NMT Model.
+    @param args (Dict): args from cmd line
+    """
+    train_data_src = read_corpus(args['--train-src'], source='src')
+    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
+
+    dev_data_src = read_corpus(args['--dev-src'], source='src')
+    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')
+
+    train_data = list(zip(train_data_src, train_data_tgt))
+    dev_data = list(zip(dev_data_src, dev_data_tgt))
+
+    train_batch_size = int(args['--batch-size'])
+    clip_grad = float(args['--clip-grad'])
+    valid_niter = int(args['--valid-niter'])
+    log_every = int(args['--log-every'])
+    model_save_path = args['--save-to']
+
+    vocab = Vocab.load(args['--vocab'])
+
+    model = NMT(embed_size=int(args['--embed-size']),
+                hidden_size=int(args['--hidden-size']),
+                dropout_rate=float(args['--dropout']),
+                vocab=vocab)
+    model.train()
+
+    uniform_init = float(args['--uniform-init'])
+    if np.abs(uniform_init) > 0.:
+        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
+        for p in model.parameters():
+            p.data.uniform_(-uniform_init, uniform_init)
+
+    vocab_mask = torch.ones(len(vocab.tgt))
+    vocab_mask[vocab.tgt['<pad>']] = 0
+
+    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
+    print('use device: %s' % device, file=sys.stderr)
+
+    model = model.to(device)
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))
+
+    num_trial = 0
+    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
+    cum_examples = report_examples = epoch = valid_num = 0
+    hist_valid_scores = []
+    train_time = begin_time = time.time()
+    print('begin Maximum Likelihood training')
+
+    while True:
+        epoch += 1
+
+        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
+            train_iter += 1
+
+            optimizer.zero_grad()
+
+            batch_size = len(src_sents)
+
+            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
+            batch_loss = example_losses.sum()
+            loss = batch_loss / batch_size
+
+            loss.backward()
+
+            # clip gradient
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
+
+            optimizer.step()
+
+            batch_losses_val = batch_loss.item()
+            report_loss += batch_losses_val
+            cum_loss += batch_losses_val
+
+            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
+            report_tgt_words += tgt_words_num_to_predict
+            cum_tgt_words += tgt_words_num_to_predict
+            report_examples += batch_size
+            cum_examples += batch_size
+
+            if train_iter % log_every == 0:
+                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
+                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
+                                                                                         report_loss / report_examples,
+                                                                                         math.exp(report_loss / report_tgt_words),
+                                                                                         cum_examples,
+                                                                                         report_tgt_words / (time.time() - train_time),
+                                                                                         time.time() - begin_time), file=sys.stderr)
+
+                train_time = time.time()
+                report_loss = report_tgt_words = report_examples = 0.
+
+            # perform validation
+            if train_iter % valid_niter == 0:
+                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
+                                                                                             cum_loss / cum_examples,
+                                                                                             np.exp(cum_loss / cum_tgt_words),
+                                                                                             cum_examples), file=sys.stderr)
+
+                cum_loss = cum_examples = cum_tgt_words = 0.
+                valid_num += 1
+
+                print('begin validation ...', file=sys.stderr)
+
+                # compute dev. ppl and bleu
+                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
+                valid_metric = -dev_ppl
+
+                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)
+
+                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
+                hist_valid_scores.append(valid_metric)
+
+                if is_better:
+                    patience = 0
+                    print('save the currently best model to [%s]' % model_save_path, file=sys.stderr)
+                    model.save(model_save_path)
+
+                    # also save the optimizers' state
+                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
+                elif patience < int(args['--patience']):
+                    patience += 1
+                    print('hit patience %d' % patience, file=sys.stderr)
+
+                    if patience == int(args['--patience']):
+                        num_trial += 1
+                        print('hit #%d trial' % num_trial, file=sys.stderr)
+                        if num_trial == int(args['--max-num-trial']):
+                            print('early stop!', file=sys.stderr)
+                            exit(0)
+
+                        # decay lr, and restore from previously best checkpoint
+                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
+                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)
+
+                        # load model
+                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
+                        model.load_state_dict(params['state_dict'])
+                        model = model.to(device)
+
+                        print('restore parameters of the optimizers', file=sys.stderr)
+                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
+
+                        # set new lr
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr
+
+                        # reset patience
+                        patience = 0
+
+                if epoch == int(args['--max-epoch']):
+                    print('reached maximum number of epochs!', file=sys.stderr)
+                    exit(0)
+
+
+def decode(args: Dict[str, str]):
+    """ Performs decoding on a test set, and saves the best-scoring decoding results.
+    If the target gold-standard sentences are given, the function also computes
+    corpus-level BLEU score.
+ @param args (Dict): args from cmd line + """ + + print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr) + test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src') + if args['TEST_TARGET_FILE']: + print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr) + test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt') + + print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr) + model = NMT.load(args['MODEL_PATH']) + + if args['--cuda']: + model = model.to(torch.device("cuda:0")) + + hypotheses = beam_search(model, test_data_src, + beam_size=int(args['--beam-size']), + max_decoding_time_step=int(args['--max-decoding-time-step'])) + + if args['TEST_TARGET_FILE']: + top_hypotheses = [hyps[0] for hyps in hypotheses] + bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses) + print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr) + + with open(args['OUTPUT_FILE'], 'w') as f: + for src_sent, hyps in zip(test_data_src, hypotheses): + top_hyp = hyps[0] + hyp_sent = ' '.join(top_hyp.value) + f.write(hyp_sent + '\n') + + +def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]: + """ Run beam search to construct hypotheses for a list of src-language sentences. + @param model (NMT): NMT Model + @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set. + @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step) + @param max_decoding_time_step (int): maximum sentence length that Beam search can produce + @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence. + """ + was_training = model.training + model.eval() + + hypotheses = [] + with torch.no_grad(): + for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout): + example_hyps = model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step) + + hypotheses.append(example_hyps) + + if was_training: model.train(was_training) + + return hypotheses + + +def main(): + """ Main func. + """ + args = docopt(__doc__) + + + # Check pytorch version + assert(torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__) + + # seed the random number generators + seed = int(args['--seed']) + torch.manual_seed(seed) + if args['--cuda']: + torch.cuda.manual_seed(seed) + np.random.seed(seed * 13 // 7) + + if args['train']: + train(args) + elif args['decode']: + decode(args) + else: + raise RuntimeError('invalid run mode') + + +if __name__ == '__main__': + main() diff --git a/Assignments/assignment4/MakiNaruto/sanity_check.py b/Assignments/assignment4/MakiNaruto/sanity_check.py new file mode 100644 index 0000000..7cce01c --- /dev/null +++ b/Assignments/assignment4/MakiNaruto/sanity_check.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +CS224N 2019-20: Homework 4 +sanity_check.py: sanity checks for assignment 4 +Sahil Chopra +Michael Hahn <> +Vera Lin + +If you are a student, please don't run overwrite_output_for_sanity_check as it will overwrite the correct output! 
+
+Usage:
+    sanity_check.py 1d
+    sanity_check.py 1e
+    sanity_check.py 1f
+    sanity_check.py overwrite_output_for_sanity_check
+"""
+import sys
+
+import numpy as np
+
+from docopt import docopt
+from utils import batch_iter
+from utils import read_corpus
+from vocab import Vocab, VocabEntry
+
+from nmt_model import NMT
+
+
+import torch
+import torch.nn as nn
+import torch.nn.utils
+
+#----------
+# CONSTANTS
+#----------
+BATCH_SIZE = 5
+EMBED_SIZE = 3
+HIDDEN_SIZE = 3
+DROPOUT_RATE = 0.0
+
+def reinitialize_layers(model):
+    """ Reinitialize the Layer Weights for Sanity Checks.
+    """
+    def init_weights(m):
+        if type(m) == nn.Linear:
+            m.weight.data.fill_(0.3)
+            if m.bias is not None:
+                m.bias.data.fill_(0.1)
+        elif type(m) == nn.Embedding:
+            m.weight.data.fill_(0.15)
+        elif type(m) == nn.Dropout:
+            # NOTE: this constructs a fresh Dropout and discards it, i.e. it is
+            # a no-op (harmless here, since DROPOUT_RATE is 0.0).
+            nn.Dropout(DROPOUT_RATE)
+    with torch.no_grad():
+        model.apply(init_weights)
+
+
+def generate_outputs(model, source, target, vocab):
+    """ Generate outputs.
+    """
+    print ("-"*80)
+    print("Generating Comparison Outputs")
+    reinitialize_layers(model)
+    model.gen_sanity_check = True
+    model.counter = 0
+
+    # Compute sentence lengths
+    source_lengths = [len(s) for s in source]
+
+    # Convert list of lists into tensors
+    source_padded = model.vocab.src.to_input_tensor(source, device=model.device)
+    target_padded = model.vocab.tgt.to_input_tensor(target, device=model.device)
+
+    # Run the model forward
+    with torch.no_grad():
+        enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths)
+        enc_masks = model.generate_sent_masks(enc_hiddens, source_lengths)
+        combined_outputs = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
+
+    # Save Tensors to disk
+    torch.save(enc_hiddens, './sanity_check_en_es_data/enc_hiddens.pkl')
+    torch.save(dec_init_state, './sanity_check_en_es_data/dec_init_state.pkl')
+    torch.save(enc_masks, './sanity_check_en_es_data/enc_masks.pkl')
+    torch.save(combined_outputs, './sanity_check_en_es_data/combined_outputs.pkl')
+    torch.save(target_padded, './sanity_check_en_es_data/target_padded.pkl')
+
+    # 1f
+    # Inputs
+    Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl')
+    enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl')
+    reinitialize_layers(model)
+    # Run Tests
+    with torch.no_grad():
+        dec_state_target, o_t_target, e_t_target = model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj,
+                                                              enc_masks)
+    torch.save(dec_state_target, './sanity_check_en_es_data/dec_state.pkl')
+    torch.save(o_t_target, './sanity_check_en_es_data/o_t.pkl')
+    torch.save(e_t_target, './sanity_check_en_es_data/e_t.pkl')
+
+    model.gen_sanity_check = False
+
+def question_1d_sanity_check(model, src_sents, tgt_sents, vocab):
+    """ Sanity check for question 1d.
+        Compares student output to that of model with dummy data.
+ """ + print("Running Sanity Check for Question 1d: Encode") + print ("-"*80) + + # Configure for Testing + reinitialize_layers(model) + source_lengths = [len(s) for s in src_sents] + source_padded = model.vocab.src.to_input_tensor(src_sents, device=model.device) + + # Load Outputs + enc_hiddens_target = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + dec_init_state_target = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + + # Test + with torch.no_grad(): + enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths) + assert(np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy())), "enc_hiddens is incorrect: it should be:\n {:} but is:\n{}".format(enc_hiddens_target, enc_hiddens_pred) + print("enc_hiddens Sanity Checks Passed!") + assert(np.allclose(dec_init_state_target[0].numpy(), dec_init_state_pred[0].numpy())), "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[0], dec_init_state_pred[0]) + print("dec_init_state[0] Sanity Checks Passed!") + assert(np.allclose(dec_init_state_target[1].numpy(), dec_init_state_pred[1].numpy())), "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[1], dec_init_state_pred[1]) + print("dec_init_state[1] Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1d: Encode!") + print("-"*80) + + +def question_1e_sanity_check(model, src_sents, tgt_sents, vocab): + """ Sanity check for question 1e. + Compares student output to that of model with dummy data. + """ + print("-"*80) + print("Running Sanity Check for Question 1e: Decode") + print("-"*80) + + # Load Inputs + dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') + target_padded = torch.load('./sanity_check_en_es_data/target_padded.pkl') + + # Load Outputs + combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl') + print(combined_outputs_target.shape) + + # Configure for Testing + reinitialize_layers(model) + COUNTER = [0] + def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks): + dec_state = torch.load('./sanity_check_en_es_data/step_dec_state_{}.pkl'.format(COUNTER[0])) + o_t = torch.load('./sanity_check_en_es_data/step_o_t_{}.pkl'.format(COUNTER[0])) + COUNTER[0]+=1 + return dec_state, o_t, None + model.step = stepFunction + + # Run Tests + with torch.no_grad(): + combined_outputs_pred = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) + assert(np.allclose(combined_outputs_pred.numpy(), combined_outputs_target.numpy())), "combined_outputs is incorrect: it should be:\n {} but is:\n{}".format(combined_outputs_target, combined_outputs_pred) + print("combined_outputs Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1e: Decode!") + print("-"*80) + +def question_1f_sanity_check(model, src_sents, tgt_sents, vocab): + """ Sanity check for question 1f. + Compares student output to that of model with dummy data. 
+ """ + print ("-"*80) + print("Running Sanity Check for Question 1f: Step") + print ("-"*80) + reinitialize_layers(model) + + # Inputs + Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl') + dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') + enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl') + + # Output + dec_state_target = torch.load('./sanity_check_en_es_data/dec_state.pkl') + o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl') + e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl') + + # Run Tests + with torch.no_grad(): + dec_state_pred, o_t_pred, e_t_pred= model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj, enc_masks) + assert(np.allclose(dec_state_target[0].numpy(), dec_state_pred[0].numpy())), "decoder_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[0], dec_state_pred[0]) + print("dec_state[0] Sanity Checks Passed!") + assert(np.allclose(dec_state_target[1].numpy(), dec_state_pred[1].numpy())), "decoder_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[1], dec_state_pred[1]) + print("dec_state[1] Sanity Checks Passed!") + assert(np.allclose(o_t_target.numpy(), o_t_pred.numpy())), "combined_output is incorrect: it should be:\n {} but is:\n{}".format(o_t_target, o_t_pred) + print("combined_output Sanity Checks Passed!") + assert(np.allclose(e_t_target.numpy(), e_t_pred.numpy())), "e_t is incorrect: it should be:\n {} but is:\n{}".format(e_t_target, e_t_pred) + print("e_t Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1f: Step!") + print("-"*80) + + +def main(): + """ Main func. + """ + # args = docopt(__doc__) + args = {'1d': False, + '1e': False, + '1f': True, + 'overwrite_output_for_sanity_check': False} + + # print(args) + # Check Python & PyTorch Versions + assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" + assert(torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. 
+
+    # Seed the Random Number Generators
+    seed = 1234
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed * 13 // 7)
+
+    # Load training data & vocabulary
+    train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
+    train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
+    train_data = list(zip(train_data_src, train_data_tgt))
+
+    # Grab the first batch only
+    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
+        break
+    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')
+
+    # Create NMT Model
+    model = NMT(
+        embed_size=EMBED_SIZE,
+        hidden_size=HIDDEN_SIZE,
+        dropout_rate=DROPOUT_RATE,
+        vocab=vocab)
+
+    if args['1d']:
+        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['1e']:
+        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['1f']:
+        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['overwrite_output_for_sanity_check']:
+        generate_outputs(model, src_sents, tgt_sents, vocab)
+    else:
+        raise RuntimeError('invalid run mode')
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/Assignments/assignment4/MakiNaruto/utils.py b/Assignments/assignment4/MakiNaruto/utils.py
new file mode 100644
index 0000000..00f30a7
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/utils.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+nmt.py: NMT Model
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+"""
+
+import math
+from typing import List
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import nltk
+# nltk.download('punkt')
+
+
+def pad_sents(sents, pad_token):
+    """ Pad list of sentences according to the longest sentence in the batch.
+        The paddings should be at the end of each sentence.
+    @param sents (list[list[str]]): list of sentences, where each sentence
+                                    is represented as a list of words
+    @param pad_token (str): padding token
+    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
+        than the max length sentence are padded out with the pad_token, such that
+        each sentence in the batch now has equal length.
+    """
+    sents_padded = []
+
+    ### YOUR CODE HERE (~6 Lines)
+    max_len = max(len(sent) for sent in sents)
+    for sent in sents:
+        sent_len = len(sent)
+        sents_padded.append(sent + (max_len - sent_len) * [pad_token])
+    ### END YOUR CODE
+
+    return sents_padded
+
+
+def read_corpus(file_path, source):
+    """ Read file, where each sentence is delineated by a `\n`.
+    @param file_path (str): path to file containing corpus
+    @param source (str): "tgt" or "src" indicating whether text
+        is of the source language or target language
+    """
+    data = []
+    for line in open(file_path):
+        sent = nltk.word_tokenize(line)
+        # only append <s> and </s> to the target sentence
+        if source == 'tgt':
+            sent = ['<s>'] + sent + ['</s>']
+        data.append(sent)
+
+    return data
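+
+# pad_sents in action (illustrative): with pad_token '<pad>',
+#     pad_sents([['a', 'b', 'c'], ['d']], '<pad>')
+# returns
+#     [['a', 'b', 'c'], ['d', '<pad>', '<pad>']]
+# so every sentence in the batch reaches the length of the longest one.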
+
+def batch_iter(data, batch_size, shuffle=False):
+    """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
+    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
+    @param batch_size (int): batch size
+    @param shuffle (boolean): whether to randomly shuffle the dataset
+    """
+    batch_num = math.ceil(len(data) / batch_size)
+    index_array = list(range(len(data)))
+
+    if shuffle:
+        np.random.shuffle(index_array)
+
+    for i in range(batch_num):
+        indices = index_array[i * batch_size: (i + 1) * batch_size]
+        examples = [data[idx] for idx in indices]
+
+        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
+        src_sents = [e[0] for e in examples]
+        tgt_sents = [e[1] for e in examples]
+
+        yield src_sents, tgt_sents
+
diff --git a/Assignments/assignment4/MakiNaruto/vocab.py b/Assignments/assignment4/MakiNaruto/vocab.py
new file mode 100644
index 0000000..ebe5082
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/vocab.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+vocab.py: Vocabulary Generation
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+
+Usage:
+    vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE
+
+Options:
+    -h --help                  Show this screen.
+    --train-src=<file>         File of training source sentences
+    --train-tgt=<file>         File of training target sentences
+    --size=<int>               vocab size [default: 50000]
+    --freq-cutoff=<int>        frequency cutoff [default: 2]
+"""
+
+from collections import Counter
+from docopt import docopt
+from itertools import chain
+import json
+import torch
+from typing import List
+from utils import read_corpus, pad_sents
+
+
+class VocabEntry(object):
+    """ Vocabulary Entry, i.e. structure containing either
+    src or tgt language terms.
+    """
+    def __init__(self, word2id=None):
+        """ Init VocabEntry Instance.
+        @param word2id (dict): dictionary mapping words 2 indices
+        """
+        if word2id:
+            self.word2id = word2id
+        else:
+            self.word2id = dict()
+            self.word2id['<pad>'] = 0   # Pad Token
+            self.word2id['<s>'] = 1     # Start Token
+            self.word2id['</s>'] = 2    # End Token
+            self.word2id['<unk>'] = 3   # Unknown Token
+        self.unk_id = self.word2id['<unk>']
+        self.id2word = {v: k for k, v in self.word2id.items()}
+
+    def __getitem__(self, word):
+        """ Retrieve word's index. Return the index for the unk
+        token if the word is out of vocabulary.
+        @param word (str): word to look up.
+        @returns index (int): index of word
+        """
+        return self.word2id.get(word, self.unk_id)
+
+    def __contains__(self, word):
+        """ Check if word is captured by VocabEntry.
+        @param word (str): word to look up
+        @returns contains (bool): whether word is contained
+        """
+        return word in self.word2id
+
+    def __setitem__(self, key, value):
+        """ Raise error, if one tries to edit the VocabEntry.
+        """
+        raise ValueError('vocabulary is readonly')
+
+    def __len__(self):
+        """ Compute number of words in VocabEntry.
+        @returns len (int): number of words in VocabEntry
+        """
+        return len(self.word2id)
+
+    def __repr__(self):
+        """ Representation of VocabEntry to be used
+        when printing the object.
+        """
+        return 'Vocabulary[size=%d]' % len(self)
+
+    def id2word(self, wid):
+        """ Return mapping of index to word.
+        Note: the instance attribute `self.id2word` (a dict, set in __init__)
+        shadows this method, so lookups actually go through the dict.
+        @param wid (int): word index
+        @returns word (str): word corresponding to index
+        """
+        return self.id2word[wid]
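+
+    # Usage sketch (illustrative): the special tokens always occupy ids 0-3, so
+    #     v = VocabEntry()
+    #     v['hello']     # -> 3, i.e. unk_id, since 'hello' is unseen
+    #     v.id2word[1]   # -> '<s>' (dict lookup; see the note on id2word above)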
+
+    def add(self, word):
+        """ Add word to VocabEntry, if it is previously unseen.
+        @param word (str): word to add to VocabEntry
+        @return index (int): index that the word has been assigned
+        """
+        if word not in self:
+            wid = self.word2id[word] = len(self)
+            self.id2word[wid] = word
+            return wid
+        else:
+            return self[word]
+
+    def words2indices(self, sents):
+        """ Convert list of words or list of sentences of words
+        into list or list of list of indices.
+        @param sents (list[str] or list[list[str]]): sentence(s) in words
+        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices
+        """
+        if type(sents[0]) == list:
+            return [[self[w] for w in s] for s in sents]
+        else:
+            return [self[w] for w in sents]
+
+    def indices2words(self, word_ids):
+        """ Convert list of indices into words.
+        @param word_ids (list[int]): list of word ids
+        @return sents (list[str]): list of words
+        """
+        return [self.id2word[w_id] for w_id in word_ids]
+
+    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
+        """ Convert list of sentences (words) into tensor with necessary padding for
+        shorter sentences.
+
+        @param sents (List[List[str]]): list of sentences (words)
+        @param device: device on which to load the tensor, i.e. CPU or GPU
+
+        @returns sents_var: tensor of (max_sentence_length, batch_size)
+        """
+        word_ids = self.words2indices(sents)
+        sents_t = pad_sents(word_ids, self['<pad>'])
+        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
+        return torch.t(sents_var)
+
+    @staticmethod
+    def from_corpus(corpus, size, freq_cutoff=2):
+        """ Given a corpus construct a Vocab Entry.
+        @param corpus (list[str]): corpus of text produced by read_corpus function
+        @param size (int): # of words in vocabulary
+        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
+        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
+        """
+        vocab_entry = VocabEntry()
+        word_freq = Counter(chain(*corpus))
+        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
+        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
+              .format(len(word_freq), freq_cutoff, len(valid_words)))
+        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
+        for word in top_k_words:
+            vocab_entry.add(word)
+        return vocab_entry
+
+
+class Vocab(object):
+    """ Vocab encapsulating src and target languages.
+    """
+    def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry):
+        """ Init Vocab.
+        @param src_vocab (VocabEntry): VocabEntry for source language
+        @param tgt_vocab (VocabEntry): VocabEntry for target language
+        """
+        self.src = src_vocab
+        self.tgt = tgt_vocab
+
+    @staticmethod
+    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
+        """ Build Vocabulary.
+        @param src_sents (list[str]): Source sentences provided by read_corpus() function
+        @param tgt_sents (list[str]): Target sentences provided by read_corpus() function
+        @param vocab_size (int): Size of vocabulary for both source and target languages
+        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
+        """
+        assert len(src_sents) == len(tgt_sents)
+
+        print('initialize source vocabulary ..')
+        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)
+
+        print('initialize target vocabulary ..')
+        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)
+
+        return Vocab(src, tgt)
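+
+    # Example of the build -> save -> load round trip (paths are illustrative):
+    #     vocab = Vocab.build(src_sents, tgt_sents, vocab_size=50000, freq_cutoff=2)
+    #     vocab.save('vocab.json')
+    #     vocab = Vocab.load('vocab.json')   # word-to-id maps survive the round trip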
+
+    def save(self, file_path):
+        """ Save Vocab to file as JSON dump.
+        @param file_path (str): file path to vocab file
+        """
+        json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), open(file_path, 'w'), indent=2)
+
+    @staticmethod
+    def load(file_path):
+        """ Load vocabulary from JSON dump.
+        @param file_path (str): file path to vocab file
+        @returns Vocab object loaded from JSON dump
+        """
+        entry = json.load(open(file_path, 'r'))
+        src_word2id = entry['src_word2id']
+        tgt_word2id = entry['tgt_word2id']
+
+        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))
+
+    def __repr__(self):
+        """ Representation of Vocab to be used
+        when printing the object.
+        """
+        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))
+
+
+
+if __name__ == '__main__':
+    args = docopt(__doc__)
+
+    # print(args)
+    print('read in source sentences: %s' % args['--train-src'])
+    print('read in target sentences: %s' % args['--train-tgt'])
+
+    src_sents = read_corpus(args['--train-src'], source='src')
+    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')
+
+    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff']))
+    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))
+
+    vocab.save(args['VOCAB_FILE'])
+    print('vocabulary saved to %s' % args['VOCAB_FILE'])
diff --git a/Assignments/assignment4/MakiNaruto/word_nmt.jpg b/Assignments/assignment4/MakiNaruto/word_nmt.jpg
new file mode 100644
index 0000000..89af8b5
Binary files /dev/null and b/Assignments/assignment4/MakiNaruto/word_nmt.jpg differ