diff --git a/README.md b/README.md index 21a04b8..a2d3fbf 100644 --- a/README.md +++ b/README.md @@ -26,11 +26,11 @@ Options: --model Whether to use LSTM or GRU units gru --n_epochs Number of epochs to train 2000 --print_every Log learning rate at this interval 100 ---hidden_size Hidden size of GRU 50 +--hidden_size Hidden size of GRU 128 --n_layers Number of GRU layers 2 --learning_rate Learning rate 0.01 --chunk_len Length of training chunks 200 ---batch_size Number of examples per batch 100 +--batch_size Number of examples per batch 128 --cuda Use CUDA ``` diff --git a/generate.py b/generate.py index 0fdf414..40fb9e7 100755 --- a/generate.py +++ b/generate.py @@ -8,21 +8,18 @@ from helpers import * from model import * -def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False): - hidden = decoder.init_hidden(1) - prime_input = Variable(char_tensor(prime_str).unsqueeze(0)) +def generate(decoder, all_characters, prime_str='A', predict_len=100, temperature=0.8, cuda=False): + hidden = decoder.init_hidden(1, cuda) + prime_input = Variable(char_tensor(prime_str, all_characters).unsqueeze(0)) if cuda: - hidden = hidden.cuda() prime_input = prime_input.cuda() predicted = prime_str # Use priming string to "build up" hidden state - for p in range(len(prime_str) - 1): - _, hidden = decoder(prime_input[:,p], hidden) - - inp = prime_input[:,-1] - + _, hidden = decoder(prime_input, hidden) + inp = prime_input[0,-1].view(1, -1) + for p in range(predict_len): output, hidden = decoder(inp, hidden) @@ -33,7 +30,7 @@ def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=Fals # Add predicted character to string and use as next input predicted_char = all_characters[top_i] predicted += predicted_char - inp = Variable(char_tensor(predicted_char).unsqueeze(0)) + inp = Variable(char_tensor(predicted_char, all_characters).unsqueeze(0)) if cuda: inp = inp.cuda() @@ -51,7 +48,8 @@ def generate(decoder, prime_str='A', predict_len=100, 
temperature=0.8, cuda=Fals argparser.add_argument('--cuda', action='store_true') args = argparser.parse_args() - decoder = torch.load(args.filename) + all_characters, decoder = torch.load(args.filename, map_location = 'cuda' if args.cuda else 'cpu') del args.filename - print(generate(decoder, **vars(args))) + with torch.no_grad(): + print(generate(decoder, all_characters, **vars(args))) diff --git a/helpers.py b/helpers.py index abbd56a..575dbe4 100644 --- a/helpers.py +++ b/helpers.py @@ -1,24 +1,20 @@ # https://github.com/spro/char-rnn.pytorch -import unidecode import string import random import time import math import torch -# Reading and un-unicode-encoding data - -all_characters = string.printable -n_characters = len(all_characters) def read_file(filename): - file = unidecode.unidecode(open(filename).read()) - return file, len(file) + file = open(filename).read() + all_characters = list(set(file)) + return file, len(file), all_characters, len(all_characters) # Turning a string into a tensor -def char_tensor(string): +def char_tensor(string, all_characters): tensor = torch.zeros(len(string)).long() for c in range(len(string)): try: diff --git a/model.py b/model.py index b619634..94a6387 100644 --- a/model.py +++ b/model.py @@ -15,27 +15,25 @@ def __init__(self, input_size, hidden_size, output_size, model="gru", n_layers=1 self.encoder = nn.Embedding(input_size, hidden_size) if self.model == "gru": - self.rnn = nn.GRU(hidden_size, hidden_size, n_layers) + self.rnn = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True) elif self.model == "lstm": - self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers) + self.rnn = nn.LSTM(hidden_size, hidden_size, n_layers, batch_first=True) self.decoder = nn.Linear(hidden_size, output_size) def forward(self, input, hidden): - batch_size = input.size(0) + """ + input: shape=(batch_size, seq_size) + output: shape=(batch_size, seq_size, output_size) + """ encoded = self.encoder(input) - output, hidden = 
self.rnn(encoded.view(1, batch_size, -1), hidden) - output = self.decoder(output.view(batch_size, -1)) + output, hidden = self.rnn(encoded, hidden) + output = self.decoder(output) return output, hidden - def forward2(self, input, hidden): - encoded = self.encoder(input.view(1, -1)) - output, hidden = self.rnn(encoded.view(1, 1, -1), hidden) - output = self.decoder(output.view(1, -1)) - return output, hidden - - def init_hidden(self, batch_size): + def init_hidden(self, batch_size, cuda): + cuda_wrapper = lambda x: x.cuda() if cuda else x if self.model == "lstm": - return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)), - Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))) - return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)) + return (cuda_wrapper(Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))), + cuda_wrapper(Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size)))) + return cuda_wrapper(Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))) diff --git a/train.py b/train.py index 4d47bba..25714d3 100755 --- a/train.py +++ b/train.py @@ -19,11 +19,11 @@ argparser.add_argument('--model', type=str, default="gru") argparser.add_argument('--n_epochs', type=int, default=2000) argparser.add_argument('--print_every', type=int, default=100) -argparser.add_argument('--hidden_size', type=int, default=100) +argparser.add_argument('--hidden_size', type=int, default=128) argparser.add_argument('--n_layers', type=int, default=2) argparser.add_argument('--learning_rate', type=float, default=0.01) argparser.add_argument('--chunk_len', type=int, default=200) -argparser.add_argument('--batch_size', type=int, default=100) +argparser.add_argument('--batch_size', type=int, default=128) argparser.add_argument('--shuffle', action='store_true') argparser.add_argument('--cuda', action='store_true') args = argparser.parse_args() @@ -31,17 +31,17 @@ if args.cuda: print("Using CUDA") -file, 
file_len = read_file(args.filename) +file, file_len, all_characters, n_characters = read_file(args.filename) def random_training_set(chunk_len, batch_size): inp = torch.LongTensor(batch_size, chunk_len) target = torch.LongTensor(batch_size, chunk_len) for bi in range(batch_size): - start_index = random.randint(0, file_len - chunk_len) + start_index = random.randint(0, file_len - chunk_len - 1) end_index = start_index + chunk_len + 1 chunk = file[start_index:end_index] - inp[bi] = char_tensor(chunk[:-1]) - target[bi] = char_tensor(chunk[1:]) + inp[bi] = char_tensor(chunk[:-1], all_characters) + target[bi] = char_tensor(chunk[1:], all_characters) inp = Variable(inp) target = Variable(target) if args.cuda: @@ -50,24 +50,24 @@ def random_training_set(chunk_len, batch_size): return inp, target def train(inp, target): - hidden = decoder.init_hidden(args.batch_size) - if args.cuda: - hidden = hidden.cuda() + """ + inp: (batch_size, seq_size) + target: (batch_size, seq_size) + """ + hidden = decoder.init_hidden(args.batch_size, args.cuda) decoder.zero_grad() - loss = 0 - for c in range(args.chunk_len): - output, hidden = decoder(inp[:,c], hidden) - loss += criterion(output.view(args.batch_size, -1), target[:,c]) + output, hidden = decoder(inp, hidden) + loss = criterion(output.view(-1, output.size(-1)), target.view(-1)) loss.backward() decoder_optimizer.step() - return loss.data[0] / args.chunk_len + return loss.item() def save(): save_filename = os.path.splitext(os.path.basename(args.filename))[0] + '.pt' - torch.save(decoder, save_filename) + torch.save((all_characters, decoder), save_filename) print('Saved as %s' % save_filename) # Initialize models and start training @@ -97,7 +97,7 @@ def save(): if epoch % args.print_every == 0: print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / args.n_epochs * 100, loss)) - print(generate(decoder, 'Wh', 100, cuda=args.cuda), '\n') + print(generate(decoder, all_characters, 'Wh', 100, cuda=args.cuda), '\n') 
print("Saving...") save()