diff --git a/Assignments/assignment4/MakiNaruto/README.md b/Assignments/assignment4/MakiNaruto/README.md
new file mode 100644
index 0000000..8267252
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/README.md
@@ -0,0 +1,2 @@
+After finishing this assignment, I realized how critical it is to fully understand the input and output dimensions of every part of the model.
+![Word-level NMT model](./word_nmt.jpg)
\ No newline at end of file
diff --git a/Assignments/assignment4/MakiNaruto/__init__.py b/Assignments/assignment4/MakiNaruto/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Assignments/assignment4/MakiNaruto/model_embeddings.py b/Assignments/assignment4/MakiNaruto/model_embeddings.py
new file mode 100644
index 0000000..35459ea
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/model_embeddings.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+model_embeddings.py: Embeddings for the NMT model
+Pencheng Yin
+Sahil Chopra
+Anand Dhoot
+Vera Lin
+"""
+
+import torch.nn as nn
+
+class ModelEmbeddings(nn.Module):
+    """
+    Class that converts input words to their embeddings.
+    """
+    def __init__(self, embed_size, vocab):
+        """
+        Init the Embedding layers.
+
+        @param embed_size (int): Embedding size (dimensionality)
+        @param vocab (Vocab): Vocabulary object containing src and tgt languages
+                              See vocab.py for documentation.
+        """
+        super(ModelEmbeddings, self).__init__()
+        self.embed_size = embed_size
+
+        # default values
+        self.source = None
+        self.target = None
+
+        src_pad_token_idx = vocab.src['<pad>']
+        tgt_pad_token_idx = vocab.tgt['<pad>']
+
+        ### YOUR CODE HERE (~2 Lines)
+        ### TODO - Initialize the following variables:
+        ###     self.source (Embedding Layer for source language)
+        ###     self.target (Embedding Layer for target language)
+        ###
+        ### Note:
+        ###     1. `vocab` object contains two vocabularies:
+        ###            `vocab.src` for source
+        ###            `vocab.tgt` for target
+        ###     2. You can get the length of a specific vocabulary by running:
+        ###             `len(vocab.<specific_vocabulary>)`
+        ###     3. Remember to include the padding token for the specific vocabulary
+        ###        when creating your Embedding.
+        ###
+        ### Use the following docs to properly initialize these variables:
+        ###     Embedding Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
+
+        self.source = nn.Embedding(len(vocab.src), self.embed_size, padding_idx=src_pad_token_idx)
+        self.target = nn.Embedding(len(vocab.tgt), self.embed_size, padding_idx=tgt_pad_token_idx)
+
+        ### END YOUR CODE
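+
+# A quick, illustrative shape check (not part of the assignment; `sents` and the
+# sizes below are made up). nn.Embedding maps a LongTensor of word indices of
+# shape (src_len, b) to a float tensor of shape (src_len, b, embed_size):
+#
+#     embeddings = ModelEmbeddings(embed_size=256, vocab=vocab)
+#     src = vocab.src.to_input_tensor(sents, device='cpu')   # (src_len, b)
+#     X = embeddings.source(src)                             # (src_len, b, 256)
+#
+# Thanks to padding_idx, the `<pad>` row is initialized to zeros and receives
+# no gradient updates, so padding positions stay at the zero vector.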
diff --git a/Assignments/assignment4/MakiNaruto/nmt_model.py b/Assignments/assignment4/MakiNaruto/nmt_model.py
new file mode 100644
index 0000000..bd1552f
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/nmt_model.py
@@ -0,0 +1,526 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+nmt_model.py: NMT Model
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+"""
+from collections import namedtuple
+import sys
+from typing import List, Tuple, Dict, Set, Union
+import torch
+import torch.nn as nn
+import torch.nn.utils
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
+
+from model_embeddings import ModelEmbeddings
+Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
+
+
+class NMT(nn.Module):
+    """ Simple Neural Machine Translation Model:
+        - Bidirectional LSTM Encoder
+        - Unidirectional LSTM Decoder
+        - Global Attention Model (Luong, et al. 2015)
+    """
+    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
+        """ Init NMT Model.
+
+        @param embed_size (int): Embedding size (dimensionality)
+        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
+        @param vocab (Vocab): Vocabulary object containing src and tgt languages
+                              See vocab.py for documentation.
+        @param dropout_rate (float): Dropout probability, for attention
+        """
+        super(NMT, self).__init__()
+        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
+        self.hidden_size = hidden_size
+        self.dropout_rate = dropout_rate
+        self.vocab = vocab
+
+        # default values
+        self.encoder = None
+        self.decoder = None
+        self.h_projection = None
+        self.c_projection = None
+        self.att_projection = None
+        self.combined_output_projection = None
+        self.target_vocab_projection = None
+        self.dropout = None
+        # For sanity check only, not relevant to implementation
+        self.gen_sanity_check = False
+        self.counter = 0
+
+        ### YOUR CODE HERE (~8 Lines)
+        ### TODO - Initialize the following variables:
+        ###     self.encoder (Bidirectional LSTM with bias)
+        ###     self.decoder (LSTM Cell with bias)
+        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
+        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
+        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
+        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
+        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
+        ###     self.dropout (Dropout Layer)
+        ###
+        ### Use the following docs to properly initialize these variables:
+        ###     LSTM:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
+        ###     LSTM Cell:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
+        ###     Linear Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+        ###     Dropout Layer:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
+
+        self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True)
+        self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size)
+        self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
+        self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
+        self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False)
+        self.dropout = nn.Dropout(p=self.dropout_rate)
+
+        ### END YOUR CODE
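+
+    # Dimension cheat sheet for the layers above (b = batch size, e = embed_size,
+    # h = hidden_size; illustrative summary, mirroring the PDF's notation):
+    #     encoder:                    input (src_len, b, e)  -> hiddens (src_len, b, 2h)
+    #     decoder (LSTMCell):         input Ybar_t (b, e + h) -> (h_t, c_t), each (b, h)
+    #     h_projection / c_projection:       (b, 2h)         -> (b, h)
+    #     att_projection:                    (b, src_len, 2h) -> (b, src_len, h)
+    #     combined_output_projection:        (b, 3h)         -> (b, h)
+    #     target_vocab_projection:           (b, h)          -> (b, |V_tgt|)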
+
+    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
+        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
+        target sentences under the language models learned by the NMT system.
+
+        @param source (List[List[str]]): list of source sentence tokens
+        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`
+
+        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
+                                    log-likelihood of generating the gold-standard target sentence for
+                                    each example in the input batch. Here b = batch size.
+        """
+        # Compute sentence lengths
+        source_lengths = [len(s) for s in source]
+
+        # Convert list of lists into tensors
+        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
+        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)
+
+        ###     Run the network forward:
+        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
+        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
+        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
+        ###     4. Compute log probability distribution over the target vocabulary using the
+        ###        combined_outputs returned by the `self.decode()` function.
+
+        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
+        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
+        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
+        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)
+
+        # Zero out probabilities for positions that are padding in the target text
+        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()
+
+        # Compute log probability of generating true target words
+        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
+        scores = target_gold_words_log_prob.sum(dim=0)
+        return scores
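+
+    # Why torch.gather above (shapes only; the numbers are not the assignment's
+    # data): P is (tgt_len - 1, b, |V|) of log-probs, and we want, at each step,
+    # the log-prob of the gold word only.
+    #     P[t, i]             # (|V|,) log-probs at step t for example i
+    #     target_padded[1:]   # (tgt_len - 1, b) gold indices, with <s> dropped
+    #     gather(...)         # picks P[t, i, target_padded[1 + t, i]]
+    # Multiplying by target_masks[1:] then zeroes out the <pad> positions.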
+
+    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """ Apply the encoder to source sentences to obtain encoder hidden states.
+            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.
+
+        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
+                                        b = batch_size, src_len = maximum source sentence length. Note that
+                                        these have already been sorted in order of longest to shortest sentence.
+        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
+        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
+                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
+        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
+                                                hidden state and cell.
+        """
+        enc_hiddens, dec_init_state = None, None
+
+        ### YOUR CODE HERE (~ 8 Lines)
+        ### TODO:
+        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
+        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
+        ###         that there is no initial hidden state or cell for the encoder.
+        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
+        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
+        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
+        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
+        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
+        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
+        ###         - `init_decoder_hidden`:
+        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
+        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
+        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
+        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
+        ###         - `init_decoder_cell`:
+        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
+        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
+        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
+        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
+        ###
+        ### See the following docs, as you may need to use some of the following functions in your implementation:
+        ###     Pack the padded sequence X before passing to the encoder:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
+        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tensor Permute:
+        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
+
+        X = self.model_embeddings.source(source_padded)             # (src_len, b, e)
+        X = pack_padded_sequence(X, lengths=source_lengths)
+        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)     # per-step hidden states; final h_t and c_t of the LSTM
+        enc_hiddens, length = pad_packed_sequence(enc_hiddens)      # (src_len, b, h*2)
+        enc_hiddens = enc_hiddens.permute(1, 0, 2)                  # (src_len, b, h*2) -> (b, src_len, h*2); tensor.transpose() would also work here.
+                                                                    # The difference: transpose swaps exactly two dimensions, while permute can reorder any number of them.
+
+        init_decoder_hidden = self.h_projection(
+            torch.cat(
+                (last_hidden[0], last_hidden[1]), 1)
+        )                                                           # (b, 2h) -> (b, h)
+
+        init_decoder_cell = self.c_projection(
+            torch.cat(
+                (last_cell[0], last_cell[1]), 1)
+        )                                                           # (b, 2h) -> (b, h)
+
+        dec_init_state = (init_decoder_hidden, init_decoder_cell)
+        ### END YOUR CODE
+
+        return enc_hiddens, dec_init_state
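+
+    # Why pack/pad around the encoder call (illustrative, e.g. lengths [3, 1]):
+    # pack_padded_sequence flattens the batch so the LSTM skips <pad> steps, and
+    # pad_packed_sequence restores the (src_len, b, h*2) layout afterwards.
+    # Without packing, the final (last_hidden, last_cell) of shorter sentences
+    # would be computed from padding positions instead of their real last word.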
+
+    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
+                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
+        """Compute combined output vectors for a batch.
+
+        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
+                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
+        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
+                                     b = batch size, src_len = maximum source sentence length.
+        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
+        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
+                                       tgt_len = maximum target sentence length, b = batch size.
+
+        @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
+                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size
+        """
+        # Chop off the <END> token for max-length sentences.
+        target_padded = target_padded[:-1]
+
+        # Initialize the decoder state (hidden and cell)
+        dec_state = dec_init_state
+
+        # Initialize previous combined output vector o_{t-1} as zero
+        batch_size = enc_hiddens.size(0)
+        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)
+
+        # Initialize a list we will use to collect the combined output o_t on each step
+        combined_outputs = []
+
+        ### YOUR CODE HERE (~9 Lines)
+        ### TODO:
+        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
+        ###         which should be shape (b, src_len, h),
+        ###         where b = batch size, src_len = maximum source length, h = hidden size.
+        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
+        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
+        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
+        ###     3. Use the torch.split function to iterate over the time dimension of Y.
+        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
+        ###             - Squeeze Y_t into a tensor of dimension (b, e).
+        ###             - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension
+        ###             - Use the step function to compute the Decoder's next (cell, state) values
+        ###               as well as the new combined output o_t.
+        ###             - Append o_t to combined_outputs
+        ###             - Update o_prev to the new o_t.
+        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
+        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
+        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
+        ###
+        ### Note:
+        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
+        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
+        ###
+        ### You may find some of these functions useful:
+        ###     Zeros Tensor:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
+        ###     Tensor Splitting (iteration):
+        ###         https://pytorch.org/docs/stable/torch.html#torch.split
+        ###     Tensor Dimension Squeezing:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tensor Stacking:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.stack
+
+        enc_hiddens_proj = self.att_projection(enc_hiddens)
+        Y = self.model_embeddings.target(target_padded)
+        for Y_t_step in torch.split(Y, 1, dim=0):           # each chunk: (1, b, e)
+            Y_t = Y_t_step.squeeze(0)                       # (b, e)
+            Ybar_t = torch.cat((Y_t, o_prev), 1)            # (b, e + h)
+            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
+            combined_outputs.append(o_t)
+            o_prev = o_t
+        ### END YOUR CODE
+        combined_outputs = torch.stack(combined_outputs, dim=0)
+        return combined_outputs
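+
+    # Note on the loop above: the decoder is an LSTMCell, so decode() advances it
+    # one time step at a time, feeding the *gold* previous word (teacher forcing)
+    # concatenated with the previous combined output o_{t-1}. With tgt_len = 5
+    # and b = 32 (made-up sizes), torch.split yields five (1, 32, e) slices, and
+    # torch.stack reassembles the five (32, h) outputs into a (5, 32, h) tensor.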
+
+    def step(self, Ybar_t: torch.Tensor,
+            dec_state: Tuple[torch.Tensor, torch.Tensor],
+            enc_hiddens: torch.Tensor,
+            enc_hiddens_proj: torch.Tensor,
+            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
+        """ Compute one forward step of the LSTM decoder, including the attention computation.
+
+        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
+                                where b = batch size, e = embedding size, h = hidden size.
+        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
+                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
+        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
+                                    src_len = maximum source length, h = hidden size.
+        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
+                                    where b = batch size, src_len = maximum source length, h = hidden size.
+        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
+                                    where b = batch size, src_len is maximum source length.
+
+        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
+                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
+        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
+        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
+                                Note: You will not use this outside of this function.
+                                      We are simply returning this value so that we can sanity check
+                                      your implementation.
+        """
+
+        combined_output = None
+
+        ### YOUR CODE HERE (~3 Lines)
+        ### TODO:
+        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
+        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
+        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
+        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
+        ###
+        ###       Hints:
+        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
+        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
+        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/output shapes!)
+        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
+        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
+        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
+        ###
+        ### Use the following docs to implement this functionality:
+        ###     Batch Multiplication:
+        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
+        ###     Tensor Unsqueeze:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
+        ###     Tensor Squeeze:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
+        dec_state = self.decoder(Ybar_t, dec_state)
+        dec_hidden, dec_cell = dec_state
+        e_t = torch.bmm(
+            enc_hiddens_proj,           # (b, src_len, h)
+            dec_hidden.unsqueeze(2)     # (b, h, 1)
+        ).squeeze(2)                    # (b, src_len)
+        ### END YOUR CODE
+
+        # Set e_t to -inf where enc_masks has 1
+        if enc_masks is not None:
+            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))
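+
+        # Why -inf here (small illustrative example): softmax turns a -inf score
+        # into exactly zero attention weight, so padded source positions
+        # (enc_masks == 1) can never receive attention. E.g.
+        # softmax([-inf, 0.2, 0.5]) puts a hard zero in the first slot, not
+        # merely a small probability.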
+
+        ### YOUR CODE HERE (~6 Lines)
+        ### TODO:
+        ###     1. Apply softmax to e_t to yield alpha_t
+        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
+        ###         attention output vector, a_t.
+        ###             - alpha_t is shape (b, src_len)
+        ###             - enc_hiddens is shape (b, src_len, 2h)
+        ###             - a_t should be shape (b, 2h)
+        ###             - You will need to do some squeezing and unsqueezing.
+        ###         Note: b = batch size, src_len = maximum source length, h = hidden size.
+        ###
+        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
+        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
+        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
+        ###
+        ### Use the following docs to implement this functionality:
+        ###     Softmax:
+        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
+        ###     Batch Multiplication:
+        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
+        ###     Tensor View:
+        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
+        ###     Tensor Concatenation:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
+        ###     Tanh:
+        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
+
+        alpha_t = F.softmax(e_t, dim=1)
+        a_t = torch.bmm(
+            alpha_t.unsqueeze(1),       # (b, 1, src_len)
+            enc_hiddens                 # (b, src_len, 2h)
+        ).squeeze(1)                    # (b, 1, 2h) -> (b, 2h)
+        U_t = torch.cat((dec_hidden, a_t), 1)    # (b, 3h)
+        V_t = self.combined_output_projection(U_t)
+        O_t = self.dropout(torch.tanh(V_t))
+
+        ### END YOUR CODE
+
+        combined_output = O_t
+        return dec_state, combined_output, e_t
+
+    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
+        """ Generate sentence masks for encoder hidden states.
+
+        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
+                                     src_len = max source length, h = hidden size.
+        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
+
+        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
+                                    where b = batch size, src_len = max source length.
+        """
+        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
+        for e_id, src_len in enumerate(source_lengths):
+            enc_masks[e_id, src_len:] = 1
+        return enc_masks.to(self.device)
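+
+    # Illustration of generate_sent_masks (made-up lengths): for a batch with
+    # source_lengths = [3, 1] and src_len = 3, enc_masks is
+    #     [[0., 0., 0.],
+    #      [0., 1., 1.]]
+    # i.e. 1 marks padding positions, which step() later fills with -inf.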
+
+    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
+        """ Given a single source sentence, perform beam search, yielding translations in the target language.
+        @param src_sent (List[str]): a single source sentence (words)
+        @param beam_size (int): beam size
+        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
+        @returns hypotheses (List[Hypothesis]): a list of hypotheses, each hypothesis has two fields:
+                value: List[str]: the decoded target sentence, represented as a list of words
+                score: float: the log-likelihood of the target sentence
+        """
+        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
+
+        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
+        src_encodings_att_linear = self.att_projection(src_encodings)
+
+        h_tm1 = dec_init_vec
+        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)
+
+        eos_id = self.vocab.tgt['</s>']
+
+        hypotheses = [['<s>']]
+        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
+        completed_hypotheses = []
+
+        t = 0
+        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
+            t += 1
+            hyp_num = len(hypotheses)
+
+            exp_src_encodings = src_encodings.expand(hyp_num,
+                                                     src_encodings.size(1),
+                                                     src_encodings.size(2))
+
+            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
+                                                                           src_encodings_att_linear.size(1),
+                                                                           src_encodings_att_linear.size(2))
+
+            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
+            y_t_embed = self.model_embeddings.target(y_tm1)
+
+            x = torch.cat([y_t_embed, att_tm1], dim=-1)
+
+            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
+                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)
+
+            # log probabilities over target words
+            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)
+
+            live_hyp_num = beam_size - len(completed_hypotheses)
+            continuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
+            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuating_hyp_scores, k=live_hyp_num)
+
+            # use floor division: `/` on integer tensors promotes to float on
+            # newer PyTorch versions and would break the indexing below
+            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
+            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)
+
+            new_hypotheses = []
+            live_hyp_ids = []
+            new_hyp_scores = []
+
+            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
+                prev_hyp_id = prev_hyp_id.item()
+                hyp_word_id = hyp_word_id.item()
+                cand_new_hyp_score = cand_new_hyp_score.item()
+
+                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
+                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
+                if hyp_word == '</s>':
+                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
+                                                           score=cand_new_hyp_score))
+                else:
+                    new_hypotheses.append(new_hyp_sent)
+                    live_hyp_ids.append(prev_hyp_id)
+                    new_hyp_scores.append(cand_new_hyp_score)
+
+            if len(completed_hypotheses) == beam_size:
+                break
+
+            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
+            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
+            att_tm1 = att_t[live_hyp_ids]
+
+            hypotheses = new_hypotheses
+            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)
+
+        if len(completed_hypotheses) == 0:
+            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
+                                                   score=hyp_scores[0].item()))
+
+        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
+
+        return completed_hypotheses
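+
+    # How the flat top-k indexing above works (made-up numbers): with 3 live
+    # hypotheses and |V_tgt| = 10, continuating_hyp_scores has 30 entries; a
+    # flat index of 17 decodes to hypothesis 17 // 10 = 1 extended with word
+    # id 17 % 10 = 7. Floor division and modulo therefore recover the
+    # (prev_hyp_id, hyp_word_id) pairs from torch.topk's flat positions.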
+ """ + return self.model_embeddings.source.weight.device + + @staticmethod + def load(model_path: str): + """ Load the model from a file. + @param model_path (str): path to model + """ + params = torch.load(model_path, map_location=lambda storage, loc: storage) + args = params['args'] + model = NMT(vocab=params['vocab'], **args) + model.load_state_dict(params['state_dict']) + + return model + + def save(self, path: str): + """ Save the odel to a file. + @param path (str): path to the model + """ + print('save model parameters to [%s]' % path, file=sys.stderr) + + params = { + 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), + 'vocab': self.vocab, + 'state_dict': self.state_dict() + } + + torch.save(params, path) diff --git a/Assignments/assignment4/MakiNaruto/run.py b/Assignments/assignment4/MakiNaruto/run.py new file mode 100644 index 0000000..8e036be --- /dev/null +++ b/Assignments/assignment4/MakiNaruto/run.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +CS224N 2019-20: Homework 4 +run.py: Run Script for Simple NMT Model +Pencheng Yin +Sahil Chopra +Vera Lin + +Usage: + run.py train --train-src= --train-tgt= --dev-src= --dev-tgt= --vocab= [options] + run.py decode [options] MODEL_PATH TEST_SOURCE_FILE OUTPUT_FILE + run.py decode [options] MODEL_PATH TEST_SOURCE_FILE TEST_TARGET_FILE OUTPUT_FILE + +Options: + -h --help show this screen. + --cuda use GPU + --train-src= train source file + --train-tgt= train target file + --dev-src= dev source file + --dev-tgt= dev target file + --vocab= vocab file + --seed= seed [default: 0] + --batch-size= batch size [default: 32] + --embed-size= embedding size [default: 256] + --hidden-size= hidden size [default: 256] + --clip-grad= gradient clipping [default: 5.0] + --log-every= log every [default: 10] + --max-epoch= max epoch [default: 30] + --input-feed use input feeding + --patience= wait for how many iterations to decay learning rate [default: 5] + --max-num-trial= terminate training after how many trials [default: 5] + --lr-decay= learning rate decay [default: 0.5] + --beam-size= beam size [default: 5] + --sample-size= sample size [default: 5] + --lr= learning rate [default: 0.001] + --uniform-init= uniformly initialize all parameters [default: 0.1] + --save-to= model save path [default: model.bin] + --valid-niter= perform validation after how many iterations [default: 2000] + --dropout= dropout [default: 0.3] + --max-decoding-time-step= maximum number of decoding time steps [default: 70] +""" +import math +import sys +import pickle +import time + + +from docopt import docopt +from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction +from nmt_model import Hypothesis, NMT +import numpy as np +from typing import List, Tuple, Dict, Set, Union +from tqdm import tqdm +from utils import read_corpus, batch_iter +from vocab import Vocab, VocabEntry + +import torch +import torch.nn.utils + + +def evaluate_ppl(model, dev_data, batch_size=32): + """ Evaluate perplexity on dev sentences + @param model (NMT): NMT Model + @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence + @param batch_size (batch size) + @returns ppl (perplixty on dev sentences) + """ + was_training = model.training + model.eval() + + cum_loss = 0. + cum_tgt_words = 0. 
+
+def evaluate_ppl(model, dev_data, batch_size=32):
+    """ Evaluate perplexity on dev sentences
+    @param model (NMT): NMT Model
+    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
+    @param batch_size (int): batch size
+    @returns ppl (float): perplexity on dev sentences
+    """
+    was_training = model.training
+    model.eval()
+
+    cum_loss = 0.
+    cum_tgt_words = 0.
+
+    # no_grad() signals backend to throw away all gradients
+    with torch.no_grad():
+        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
+            loss = -model(src_sents, tgt_sents).sum()
+
+            cum_loss += loss.item()
+            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
+            cum_tgt_words += tgt_word_num_to_predict
+
+        ppl = np.exp(cum_loss / cum_tgt_words)
+
+    if was_training:
+        model.train()
+
+    return ppl
+
+
+def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
+    """ Given decoding results and reference sentences, compute corpus-level BLEU score.
+    @param references (List[List[str]]): a list of gold-standard reference target sentences
+    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
+    @returns bleu_score: corpus-level BLEU score
+    """
+    if references[0][0] == '<s>':
+        references = [ref[1:-1] for ref in references]
+    bleu_score = corpus_bleu([[ref] for ref in references],
+                             [hyp.value for hyp in hypotheses])
+    return bleu_score
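+
+# Note on corpus_bleu's expected nesting (a common stumbling block): each entry
+# of the first argument is a *list of alternative references* for one
+# hypothesis, hence the [[ref] for ref in references] wrapping above. E.g.
+#     corpus_bleu([[['the', 'cat', 'sat', 'down']]], [['the', 'cat', 'sat', 'down']])  # -> 1.0
+# with one hypothesis and a single reference for it.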
+
+def train(args: Dict):
+    """ Train the NMT Model.
+    @param args (Dict): args from cmd line
+    """
+    train_data_src = read_corpus(args['--train-src'], source='src')
+    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
+
+    dev_data_src = read_corpus(args['--dev-src'], source='src')
+    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')
+
+    train_data = list(zip(train_data_src, train_data_tgt))
+    dev_data = list(zip(dev_data_src, dev_data_tgt))
+
+    train_batch_size = int(args['--batch-size'])
+    clip_grad = float(args['--clip-grad'])
+    valid_niter = int(args['--valid-niter'])
+    log_every = int(args['--log-every'])
+    model_save_path = args['--save-to']
+
+    vocab = Vocab.load(args['--vocab'])
+
+    model = NMT(embed_size=int(args['--embed-size']),
+                hidden_size=int(args['--hidden-size']),
+                dropout_rate=float(args['--dropout']),
+                vocab=vocab)
+    model.train()
+
+    uniform_init = float(args['--uniform-init'])
+    if np.abs(uniform_init) > 0.:
+        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
+        for p in model.parameters():
+            p.data.uniform_(-uniform_init, uniform_init)
+
+    vocab_mask = torch.ones(len(vocab.tgt))
+    vocab_mask[vocab.tgt['<pad>']] = 0
+
+    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
+    print('use device: %s' % device, file=sys.stderr)
+
+    model = model.to(device)
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))
+
+    num_trial = 0
+    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
+    cum_examples = report_examples = epoch = valid_num = 0
+    hist_valid_scores = []
+    train_time = begin_time = time.time()
+    print('begin Maximum Likelihood training')
+
+    while True:
+        epoch += 1
+
+        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
+            train_iter += 1
+
+            optimizer.zero_grad()
+
+            batch_size = len(src_sents)
+
+            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
+            batch_loss = example_losses.sum()
+            loss = batch_loss / batch_size
+
+            loss.backward()
+
+            # clip gradient
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
+
+            optimizer.step()
+
+            batch_losses_val = batch_loss.item()
+            report_loss += batch_losses_val
+            cum_loss += batch_losses_val
+
+            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
+            report_tgt_words += tgt_words_num_to_predict
+            cum_tgt_words += tgt_words_num_to_predict
+            report_examples += batch_size
+            cum_examples += batch_size
+
+            if train_iter % log_every == 0:
+                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
+                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
+                                                                                         report_loss / report_examples,
+                                                                                         math.exp(report_loss / report_tgt_words),
+                                                                                         cum_examples,
+                                                                                         report_tgt_words / (time.time() - train_time),
+                                                                                         time.time() - begin_time), file=sys.stderr)
+
+                train_time = time.time()
+                report_loss = report_tgt_words = report_examples = 0.
+
+            # perform validation
+            if train_iter % valid_niter == 0:
+                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
+                                                                                             cum_loss / cum_examples,
+                                                                                             np.exp(cum_loss / cum_tgt_words),
+                                                                                             cum_examples), file=sys.stderr)
+
+                cum_loss = cum_examples = cum_tgt_words = 0.
+                valid_num += 1
+
+                print('begin validation ...', file=sys.stderr)
+
+                # compute dev. ppl and bleu
+                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
+                valid_metric = -dev_ppl
+
+                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)
+
+                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
+                hist_valid_scores.append(valid_metric)
+
+                if is_better:
+                    patience = 0
+                    print('save the currently best model to [%s]' % model_save_path, file=sys.stderr)
+                    model.save(model_save_path)
+
+                    # also save the optimizers' state
+                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
+                elif patience < int(args['--patience']):
+                    patience += 1
+                    print('hit patience %d' % patience, file=sys.stderr)
+
+                    if patience == int(args['--patience']):
+                        num_trial += 1
+                        print('hit #%d trial' % num_trial, file=sys.stderr)
+                        if num_trial == int(args['--max-num-trial']):
+                            print('early stop!', file=sys.stderr)
+                            exit(0)
+
+                        # decay lr, and restore from previously best checkpoint
+                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
+                        print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)
+
+                        # load model
+                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
+                        model.load_state_dict(params['state_dict'])
+                        model = model.to(device)
+
+                        print('restore parameters of the optimizers', file=sys.stderr)
+                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
+
+                        # set new lr
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr
+
+                        # reset patience
+                        patience = 0
+
+                if epoch == int(args['--max-epoch']):
+                    print('reached maximum number of epochs!', file=sys.stderr)
+                    exit(0)
+
+
+def decode(args: Dict[str, str]):
+    """ Performs decoding on a test set, and saves the best-scoring decoding results.
+    If the target gold-standard sentences are given, the function also computes
+    corpus-level BLEU score.
+ @param args (Dict): args from cmd line + """ + + print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr) + test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src') + if args['TEST_TARGET_FILE']: + print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr) + test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt') + + print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr) + model = NMT.load(args['MODEL_PATH']) + + if args['--cuda']: + model = model.to(torch.device("cuda:0")) + + hypotheses = beam_search(model, test_data_src, + beam_size=int(args['--beam-size']), + max_decoding_time_step=int(args['--max-decoding-time-step'])) + + if args['TEST_TARGET_FILE']: + top_hypotheses = [hyps[0] for hyps in hypotheses] + bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses) + print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr) + + with open(args['OUTPUT_FILE'], 'w') as f: + for src_sent, hyps in zip(test_data_src, hypotheses): + top_hyp = hyps[0] + hyp_sent = ' '.join(top_hyp.value) + f.write(hyp_sent + '\n') + + +def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]: + """ Run beam search to construct hypotheses for a list of src-language sentences. + @param model (NMT): NMT Model + @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set. + @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step) + @param max_decoding_time_step (int): maximum sentence length that Beam search can produce + @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence. + """ + was_training = model.training + model.eval() + + hypotheses = [] + with torch.no_grad(): + for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout): + example_hyps = model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step) + + hypotheses.append(example_hyps) + + if was_training: model.train(was_training) + + return hypotheses + + +def main(): + """ Main func. + """ + args = docopt(__doc__) + + + # Check pytorch version + assert(torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(torch.__version__) + + # seed the random number generators + seed = int(args['--seed']) + torch.manual_seed(seed) + if args['--cuda']: + torch.cuda.manual_seed(seed) + np.random.seed(seed * 13 // 7) + + if args['train']: + train(args) + elif args['decode']: + decode(args) + else: + raise RuntimeError('invalid run mode') + + +if __name__ == '__main__': + main() diff --git a/Assignments/assignment4/MakiNaruto/sanity_check.py b/Assignments/assignment4/MakiNaruto/sanity_check.py new file mode 100644 index 0000000..7cce01c --- /dev/null +++ b/Assignments/assignment4/MakiNaruto/sanity_check.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +CS224N 2019-20: Homework 4 +sanity_check.py: sanity checks for assignment 4 +Sahil Chopra +Michael Hahn <> +Vera Lin + +If you are a student, please don't run overwrite_output_for_sanity_check as it will overwrite the correct output! 
+
+Usage:
+    sanity_check.py 1d
+    sanity_check.py 1e
+    sanity_check.py 1f
+    sanity_check.py overwrite_output_for_sanity_check
+"""
+import sys
+
+import numpy as np
+
+from docopt import docopt
+from utils import batch_iter
+from utils import read_corpus
+from vocab import Vocab, VocabEntry
+
+from nmt_model import NMT
+
+
+import torch
+import torch.nn as nn
+import torch.nn.utils
+
+#----------
+# CONSTANTS
+#----------
+BATCH_SIZE = 5
+EMBED_SIZE = 3
+HIDDEN_SIZE = 3
+DROPOUT_RATE = 0.0
+
+def reinitialize_layers(model):
+    """ Reinitialize the Layer Weights for Sanity Checks.
+    """
+    def init_weights(m):
+        if type(m) == nn.Linear:
+            m.weight.data.fill_(0.3)
+            if m.bias is not None:
+                m.bias.data.fill_(0.1)
+        elif type(m) == nn.Embedding:
+            m.weight.data.fill_(0.15)
+        elif type(m) == nn.Dropout:
+            # NOTE: this constructs a fresh Dropout and discards it, i.e. it is
+            # a no-op (harmless here, since DROPOUT_RATE is 0.0).
+            nn.Dropout(DROPOUT_RATE)
+    with torch.no_grad():
+        model.apply(init_weights)
+
+
+def generate_outputs(model, source, target, vocab):
+    """ Generate outputs.
+    """
+    print ("-"*80)
+    print("Generating Comparison Outputs")
+    reinitialize_layers(model)
+    model.gen_sanity_check = True
+    model.counter = 0
+
+    # Compute sentence lengths
+    source_lengths = [len(s) for s in source]
+
+    # Convert list of lists into tensors
+    source_padded = model.vocab.src.to_input_tensor(source, device=model.device)
+    target_padded = model.vocab.tgt.to_input_tensor(target, device=model.device)
+
+    # Run the model forward
+    with torch.no_grad():
+        enc_hiddens, dec_init_state = model.encode(source_padded, source_lengths)
+        enc_masks = model.generate_sent_masks(enc_hiddens, source_lengths)
+        combined_outputs = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
+
+    # Save Tensors to disk
+    torch.save(enc_hiddens, './sanity_check_en_es_data/enc_hiddens.pkl')
+    torch.save(dec_init_state, './sanity_check_en_es_data/dec_init_state.pkl')
+    torch.save(enc_masks, './sanity_check_en_es_data/enc_masks.pkl')
+    torch.save(combined_outputs, './sanity_check_en_es_data/combined_outputs.pkl')
+    torch.save(target_padded, './sanity_check_en_es_data/target_padded.pkl')
+
+    # 1f
+    # Inputs
+    Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl')
+    enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl')
+    reinitialize_layers(model)
+    # Run Tests
+    with torch.no_grad():
+        dec_state_target, o_t_target, e_t_target = model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj,
+                                                              enc_masks)
+    torch.save(dec_state_target, './sanity_check_en_es_data/dec_state.pkl')
+    torch.save(o_t_target, './sanity_check_en_es_data/o_t.pkl')
+    torch.save(e_t_target, './sanity_check_en_es_data/e_t.pkl')
+
+    model.gen_sanity_check = False
+
+def question_1d_sanity_check(model, src_sents, tgt_sents, vocab):
+    """ Sanity check for question 1d.
+        Compares student output to that of model with dummy data.
+ """ + print("Running Sanity Check for Question 1d: Encode") + print ("-"*80) + + # Configure for Testing + reinitialize_layers(model) + source_lengths = [len(s) for s in src_sents] + source_padded = model.vocab.src.to_input_tensor(src_sents, device=model.device) + + # Load Outputs + enc_hiddens_target = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + dec_init_state_target = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + + # Test + with torch.no_grad(): + enc_hiddens_pred, dec_init_state_pred = model.encode(source_padded, source_lengths) + assert(np.allclose(enc_hiddens_target.numpy(), enc_hiddens_pred.numpy())), "enc_hiddens is incorrect: it should be:\n {:} but is:\n{}".format(enc_hiddens_target, enc_hiddens_pred) + print("enc_hiddens Sanity Checks Passed!") + assert(np.allclose(dec_init_state_target[0].numpy(), dec_init_state_pred[0].numpy())), "dec_init_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[0], dec_init_state_pred[0]) + print("dec_init_state[0] Sanity Checks Passed!") + assert(np.allclose(dec_init_state_target[1].numpy(), dec_init_state_pred[1].numpy())), "dec_init_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_init_state_target[1], dec_init_state_pred[1]) + print("dec_init_state[1] Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1d: Encode!") + print("-"*80) + + +def question_1e_sanity_check(model, src_sents, tgt_sents, vocab): + """ Sanity check for question 1e. + Compares student output to that of model with dummy data. + """ + print("-"*80) + print("Running Sanity Check for Question 1e: Decode") + print("-"*80) + + # Load Inputs + dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') + target_padded = torch.load('./sanity_check_en_es_data/target_padded.pkl') + + # Load Outputs + combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl') + print(combined_outputs_target.shape) + + # Configure for Testing + reinitialize_layers(model) + COUNTER = [0] + def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks): + dec_state = torch.load('./sanity_check_en_es_data/step_dec_state_{}.pkl'.format(COUNTER[0])) + o_t = torch.load('./sanity_check_en_es_data/step_o_t_{}.pkl'.format(COUNTER[0])) + COUNTER[0]+=1 + return dec_state, o_t, None + model.step = stepFunction + + # Run Tests + with torch.no_grad(): + combined_outputs_pred = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) + assert(np.allclose(combined_outputs_pred.numpy(), combined_outputs_target.numpy())), "combined_outputs is incorrect: it should be:\n {} but is:\n{}".format(combined_outputs_target, combined_outputs_pred) + print("combined_outputs Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1e: Decode!") + print("-"*80) + +def question_1f_sanity_check(model, src_sents, tgt_sents, vocab): + """ Sanity check for question 1f. + Compares student output to that of model with dummy data. 
+ """ + print ("-"*80) + print("Running Sanity Check for Question 1f: Step") + print ("-"*80) + reinitialize_layers(model) + + # Inputs + Ybar_t = torch.load('./sanity_check_en_es_data/Ybar_t.pkl') + dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl') + enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl') + enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl') + enc_hiddens_proj = torch.load('./sanity_check_en_es_data/enc_hiddens_proj.pkl') + + # Output + dec_state_target = torch.load('./sanity_check_en_es_data/dec_state.pkl') + o_t_target = torch.load('./sanity_check_en_es_data/o_t.pkl') + e_t_target = torch.load('./sanity_check_en_es_data/e_t.pkl') + + # Run Tests + with torch.no_grad(): + dec_state_pred, o_t_pred, e_t_pred= model.step(Ybar_t, dec_init_state, enc_hiddens, enc_hiddens_proj, enc_masks) + assert(np.allclose(dec_state_target[0].numpy(), dec_state_pred[0].numpy())), "decoder_state[0] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[0], dec_state_pred[0]) + print("dec_state[0] Sanity Checks Passed!") + assert(np.allclose(dec_state_target[1].numpy(), dec_state_pred[1].numpy())), "decoder_state[1] is incorrect: it should be:\n {} but is:\n{}".format(dec_state_target[1], dec_state_pred[1]) + print("dec_state[1] Sanity Checks Passed!") + assert(np.allclose(o_t_target.numpy(), o_t_pred.numpy())), "combined_output is incorrect: it should be:\n {} but is:\n{}".format(o_t_target, o_t_pred) + print("combined_output Sanity Checks Passed!") + assert(np.allclose(e_t_target.numpy(), e_t_pred.numpy())), "e_t is incorrect: it should be:\n {} but is:\n{}".format(e_t_target, e_t_pred) + print("e_t Sanity Checks Passed!") + print("-"*80) + print("All Sanity Checks Passed for Question 1f: Step!") + print("-"*80) + + +def main(): + """ Main func. + """ + # args = docopt(__doc__) + args = {'1d': False, + '1e': False, + '1f': True, + 'overwrite_output_for_sanity_check': False} + + # print(args) + # Check Python & PyTorch Versions + assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" + assert(torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. 
+
+    # Seed the Random Number Generators
+    seed = 1234
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed * 13 // 7)
+
+    # Load training data & vocabulary
+    train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
+    train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
+    train_data = list(zip(train_data_src, train_data_tgt))
+
+    # Grab the first batch only
+    for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
+        break
+    vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')
+
+    # Create NMT Model
+    model = NMT(
+        embed_size=EMBED_SIZE,
+        hidden_size=HIDDEN_SIZE,
+        dropout_rate=DROPOUT_RATE,
+        vocab=vocab)
+
+    if args['1d']:
+        question_1d_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['1e']:
+        question_1e_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['1f']:
+        question_1f_sanity_check(model, src_sents, tgt_sents, vocab)
+    elif args['overwrite_output_for_sanity_check']:
+        generate_outputs(model, src_sents, tgt_sents, vocab)
+    else:
+        raise RuntimeError('invalid run mode')
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/Assignments/assignment4/MakiNaruto/utils.py b/Assignments/assignment4/MakiNaruto/utils.py
new file mode 100644
index 0000000..00f30a7
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/utils.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+nmt.py: NMT Model
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+"""
+
+import math
+from typing import List
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import nltk
+# nltk.download('punkt')
+
+
+def pad_sents(sents, pad_token):
+    """ Pad list of sentences according to the longest sentence in the batch.
+        The paddings should be at the end of each sentence.
+    @param sents (list[list[str]]): list of sentences, where each sentence
+                                    is represented as a list of words
+    @param pad_token (str): padding token
+    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
+        than the max length sentence are padded out with the pad_token, such that
+        each sentence in the batch now has equal length.
+    """
+    sents_padded = []
+
+    ### YOUR CODE HERE (~6 Lines)
+    max_len = max(len(sent) for sent in sents)
+    for sent in sents:
+        sent_len = len(sent)
+        sents_padded.append(sent + (max_len - sent_len) * [pad_token])
+    ### END YOUR CODE
+
+    return sents_padded
+
+
+def read_corpus(file_path, source):
+    """ Read file, where each sentence is delineated by a `\n`.
+    @param file_path (str): path to file containing corpus
+    @param source (str): "tgt" or "src" indicating whether text
+        is of the source language or target language
+    """
+    data = []
+    for line in open(file_path):
+        sent = nltk.word_tokenize(line)
+        # only append <s> and </s> to the target sentence
+        if source == 'tgt':
+            sent = ['<s>'] + sent + ['</s>']
+        data.append(sent)
+
+    return data
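+
+# pad_sents in action (illustrative): with pad_token '<pad>',
+#     pad_sents([['a', 'b', 'c'], ['d']], '<pad>')
+# returns
+#     [['a', 'b', 'c'], ['d', '<pad>', '<pad>']]
+# so every sentence in the batch reaches the length of the longest one.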
+
+def batch_iter(data, batch_size, shuffle=False):
+    """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
+    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
+    @param batch_size (int): batch size
+    @param shuffle (boolean): whether to randomly shuffle the dataset
+    """
+    batch_num = math.ceil(len(data) / batch_size)
+    index_array = list(range(len(data)))
+
+    if shuffle:
+        np.random.shuffle(index_array)
+
+    for i in range(batch_num):
+        indices = index_array[i * batch_size: (i + 1) * batch_size]
+        examples = [data[idx] for idx in indices]
+
+        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
+        src_sents = [e[0] for e in examples]
+        tgt_sents = [e[1] for e in examples]
+
+        yield src_sents, tgt_sents
+
diff --git a/Assignments/assignment4/MakiNaruto/vocab.py b/Assignments/assignment4/MakiNaruto/vocab.py
new file mode 100644
index 0000000..ebe5082
--- /dev/null
+++ b/Assignments/assignment4/MakiNaruto/vocab.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+CS224N 2019-20: Homework 4
+vocab.py: Vocabulary Generation
+Pencheng Yin
+Sahil Chopra
+Vera Lin
+
+Usage:
+    vocab.py --train-src=<file> --train-tgt=<file> [options] VOCAB_FILE
+
+Options:
+    -h --help                  Show this screen.
+    --train-src=<file>         File of training source sentences
+    --train-tgt=<file>         File of training target sentences
+    --size=<int>               vocab size [default: 50000]
+    --freq-cutoff=<int>        frequency cutoff [default: 2]
+"""
+
+from collections import Counter
+from docopt import docopt
+from itertools import chain
+import json
+import torch
+from typing import List
+from utils import read_corpus, pad_sents
+
+
+class VocabEntry(object):
+    """ Vocabulary Entry, i.e. structure containing either
+    src or tgt language terms.
+    """
+    def __init__(self, word2id=None):
+        """ Init VocabEntry Instance.
+        @param word2id (dict): dictionary mapping words 2 indices
+        """
+        if word2id:
+            self.word2id = word2id
+        else:
+            self.word2id = dict()
+            self.word2id['<pad>'] = 0   # Pad Token
+            self.word2id['<s>'] = 1     # Start Token
+            self.word2id['</s>'] = 2    # End Token
+            self.word2id['<unk>'] = 3   # Unknown Token
+        self.unk_id = self.word2id['<unk>']
+        self.id2word = {v: k for k, v in self.word2id.items()}
+
+    def __getitem__(self, word):
+        """ Retrieve word's index. Return the index for the unk
+        token if the word is out of vocabulary.
+        @param word (str): word to look up.
+        @returns index (int): index of word
+        """
+        return self.word2id.get(word, self.unk_id)
+
+    def __contains__(self, word):
+        """ Check if word is captured by VocabEntry.
+        @param word (str): word to look up
+        @returns contains (bool): whether word is contained
+        """
+        return word in self.word2id
+
+    def __setitem__(self, key, value):
+        """ Raise error, if one tries to edit the VocabEntry.
+        """
+        raise ValueError('vocabulary is readonly')
+
+    def __len__(self):
+        """ Compute number of words in VocabEntry.
+        @returns len (int): number of words in VocabEntry
+        """
+        return len(self.word2id)
+
+    def __repr__(self):
+        """ Representation of VocabEntry to be used
+        when printing the object.
+        """
+        return 'Vocabulary[size=%d]' % len(self)
+
+    def id2word(self, wid):
+        """ Return mapping of index to word.
+        Note: the instance attribute `self.id2word` (a dict, set in __init__)
+        shadows this method, so lookups actually go through the dict.
+        @param wid (int): word index
+        @returns word (str): word corresponding to index
+        """
+        return self.id2word[wid]
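+
+    # Usage sketch (illustrative): the special tokens always occupy ids 0-3, so
+    #     v = VocabEntry()
+    #     v['hello']     # -> 3, i.e. unk_id, since 'hello' is unseen
+    #     v.id2word[1]   # -> '<s>' (dict lookup; see the note on id2word above)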
+
+    def add(self, word):
+        """ Add word to VocabEntry, if it is previously unseen.
+        @param word (str): word to add to VocabEntry
+        @return index (int): index that the word has been assigned
+        """
+        if word not in self:
+            wid = self.word2id[word] = len(self)
+            self.id2word[wid] = word
+            return wid
+        else:
+            return self[word]
+
+    def words2indices(self, sents):
+        """ Convert list of words or list of sentences of words
+        into list or list of list of indices.
+        @param sents (list[str] or list[list[str]]): sentence(s) in words
+        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices
+        """
+        if type(sents[0]) == list:
+            return [[self[w] for w in s] for s in sents]
+        else:
+            return [self[w] for w in sents]
+
+    def indices2words(self, word_ids):
+        """ Convert list of indices into words.
+        @param word_ids (list[int]): list of word ids
+        @return sents (list[str]): list of words
+        """
+        return [self.id2word[w_id] for w_id in word_ids]
+
+    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
+        """ Convert list of sentences (words) into tensor with necessary padding for
+        shorter sentences.
+
+        @param sents (List[List[str]]): list of sentences (words)
+        @param device: device on which to load the tensor, i.e. CPU or GPU
+
+        @returns sents_var: tensor of (max_sentence_length, batch_size)
+        """
+        word_ids = self.words2indices(sents)
+        sents_t = pad_sents(word_ids, self['<pad>'])
+        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
+        return torch.t(sents_var)
+
+    @staticmethod
+    def from_corpus(corpus, size, freq_cutoff=2):
+        """ Given a corpus construct a Vocab Entry.
+        @param corpus (list[str]): corpus of text produced by read_corpus function
+        @param size (int): # of words in vocabulary
+        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
+        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
+        """
+        vocab_entry = VocabEntry()
+        word_freq = Counter(chain(*corpus))
+        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
+        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
+              .format(len(word_freq), freq_cutoff, len(valid_words)))
+        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
+        for word in top_k_words:
+            vocab_entry.add(word)
+        return vocab_entry
+
+
+class Vocab(object):
+    """ Vocab encapsulating src and target languages.
+    """
+    def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry):
+        """ Init Vocab.
+        @param src_vocab (VocabEntry): VocabEntry for source language
+        @param tgt_vocab (VocabEntry): VocabEntry for target language
+        """
+        self.src = src_vocab
+        self.tgt = tgt_vocab
+
+    @staticmethod
+    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
+        """ Build Vocabulary.
+        @param src_sents (list[str]): Source sentences provided by read_corpus() function
+        @param tgt_sents (list[str]): Target sentences provided by read_corpus() function
+        @param vocab_size (int): Size of vocabulary for both source and target languages
+        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
+        """
+        assert len(src_sents) == len(tgt_sents)
+
+        print('initialize source vocabulary ..')
+        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)
+
+        print('initialize target vocabulary ..')
+        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)
+
+        return Vocab(src, tgt)
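+
+    # Example of the build -> save -> load round trip (paths are illustrative):
+    #     vocab = Vocab.build(src_sents, tgt_sents, vocab_size=50000, freq_cutoff=2)
+    #     vocab.save('vocab.json')
+    #     vocab = Vocab.load('vocab.json')   # word-to-id maps survive the round trip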
+
+    def save(self, file_path):
+        """ Save Vocab to file as JSON dump.
+        @param file_path (str): file path to vocab file
+        """
+        json.dump(dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id), open(file_path, 'w'), indent=2)
+
+    @staticmethod
+    def load(file_path):
+        """ Load vocabulary from JSON dump.
+        @param file_path (str): file path to vocab file
+        @returns Vocab object loaded from JSON dump
+        """
+        entry = json.load(open(file_path, 'r'))
+        src_word2id = entry['src_word2id']
+        tgt_word2id = entry['tgt_word2id']
+
+        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))
+
+    def __repr__(self):
+        """ Representation of Vocab to be used
+        when printing the object.
+        """
+        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))
+
+
+
+if __name__ == '__main__':
+    args = docopt(__doc__)
+
+    # print(args)
+    print('read in source sentences: %s' % args['--train-src'])
+    print('read in target sentences: %s' % args['--train-tgt'])
+
+    src_sents = read_corpus(args['--train-src'], source='src')
+    tgt_sents = read_corpus(args['--train-tgt'], source='tgt')
+
+    vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--freq-cutoff']))
+    print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))
+
+    vocab.save(args['VOCAB_FILE'])
+    print('vocabulary saved to %s' % args['VOCAB_FILE'])
diff --git a/Assignments/assignment4/MakiNaruto/word_nmt.jpg b/Assignments/assignment4/MakiNaruto/word_nmt.jpg
new file mode 100644
index 0000000..89af8b5
Binary files /dev/null and b/Assignments/assignment4/MakiNaruto/word_nmt.jpg differ