ERROR while pulling python execute.py #95

Open · wants to merge 14 commits into base: master

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,2 +1,5 @@
data/
working_dir/
*.pyc
.venv/
.idea/
73 changes: 66 additions & 7 deletions README.md
@@ -20,22 +20,81 @@ Use [pip](https://pypi.python.org/pypi/pip) to install any missing dependencies
Usage
===========

Create a venv and install dependencies:

> Using virtualenv here to be compatible with python2; install it with `pip install virtualenv`

```
# create venv
python -m virtualenv .venv

# enter venv (assuming macos/linux)
source .venv/bin/activate

# install requirements
pip install -r requirements.txt
```
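
Optionally, a quick sanity check (a suggestion, not part of this PR): start `python` from inside the activated venv and confirm the venv's interpreter is the one in use.

```
# run inside the activated venv's python prompt
import sys

print(sys.prefix)   # should point at the .venv directory created above
print(sys.version)  # should report a 2.x interpreter if virtualenv wrapped Python 2
```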

Training
--------------------

0. Create directories
```
mkdir working_dir
mkdir data
```

1. Download the [Cornell Movie Dialogue dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html) and place the unzipped content in the `data/` directory.
```
cd data
wget http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip
unzip cornell_movie_dialogs_corpus.zip

# move to `data/` root
mv "corenell movie-dialogs corpus"/* .
```

2. Prepare the data for training by running the `prepare_data.py` script from the project root
```
# move to project root
cd ..
python prepare_data.py
```

3. To train the bot, edit the `seq2seq.ini` file so that mode is set to train like so

`mode = train`

4. Start training by running:

``python execute.py``

> There is no mechanism to stop training; press `ctrl-c` to stop it after it has trained for a while.
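
For context on step 3 above, here is a minimal sketch of how a `mode` flag like this can be read from an ini file with the standard library; the `[strings]` section name and the exact parsing done by `execute.py` are assumptions for illustration, not taken from this PR.

```
# Illustrative only: reading a mode flag from an ini file (Python 2 or 3).
try:
    from configparser import ConfigParser                       # Python 3
except ImportError:
    from ConfigParser import SafeConfigParser as ConfigParser   # Python 2

parser = ConfigParser()
parser.read('seq2seq.ini')

# '[strings]' is an assumed section name; adjust to match the real file.
mode = parser.get('strings', 'mode')
if mode == 'train':
    print('would start training')
elif mode == 'test':
    print('would start an interactive test session')
```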


Test
-------------

1. To test the bot during or after training, edit the `seq2seq.ini` file so that mode is set to test like so

`mode = test`

2. Then run the code like so:

```
python execute.py

>> Mode : test

Reading model parameters from working_dir/seq2seq.ckpt-10200
>
```

3. Confirm...
- What does "Test" do?
- How to use it? (see the sketch below)
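
As a rough answer to the open questions above: in test mode the script typically loads the most recent checkpoint from `working_dir/` and enters an interactive loop that reads a line from stdin, converts it to token ids, runs the seq2seq model, and prints the decoded reply; you type at the `>` prompt and exit with `ctrl-c`. The sketch below is schematic only: `load_model` and `decode_sentence` are hypothetical placeholders, not functions from this repository.

```
# Schematic of an interactive test loop; load_model and decode_sentence are
# hypothetical placeholders, not functions defined in this repository.
import sys


def chat_loop(load_model, decode_sentence):
    session, model, enc_vocab, rev_dec_vocab = load_model('working_dir/')
    sys.stdout.write('> ')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
        # tokenize the input, run the model, print the decoded reply
        print(decode_sentence(session, model, enc_vocab, rev_dec_vocab, sentence))
        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
```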

Challenge
===========
154 changes: 79 additions & 75 deletions data_utils.py
@@ -20,16 +20,16 @@

import os
import re

from six.moves import urllib
from io import open
from collections import Counter

from tensorflow.python.platform import gfile

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
@@ -38,93 +38,97 @@
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(r"\d")

CORNELL_MOVIE_CORPUS_ENCODING = 'ISO-8859-2'


def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    all_words = []
    for space_separated_fragment in sentence.strip().split():
        words = re.split(_WORD_SPLIT, space_separated_fragment)
        for word in words:
            if word:
                all_words.append(word)
    return all_words


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
    if not tokenizer:
        tokenizer = basic_tokenizer

    if not os.path.exists(vocabulary_path):
        print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
        vocab = Counter()
        with open(data_path, 'rt', encoding='utf8') as f:
            for counter, sentence in enumerate(f, 1):
                if counter % 100000 == 0:
                    print("  processing line %d" % counter)
                tokens = tokenizer(sentence)
                for w in tokens:
                    if normalize_digits:
                        word = re.sub(_DIGIT_RE, '0', w)
                    else:
                        word = w
                    vocab[word] += 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        print('>> Full Vocabulary Size :', len(vocab_list))
        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
            print('>>>> Vocab Truncated to: {}'.format(max_vocabulary_size))
        with open(vocabulary_path, 'wt', encoding='utf8') as vocab_file:
            for w in vocab_list:
                vocab_file.write(w + '\n')


def initialize_vocabulary(vocabulary_path, encoding=CORNELL_MOVIE_CORPUS_ENCODING):
    vocab = {}
    rev_vocab = []
    if gfile.Exists(vocabulary_path):
        with open(vocabulary_path, 'rt', encoding=encoding) as f:
            # enumerate from 0 so ids line up with PAD_ID/GO_ID/EOS_ID/UNK_ID above
            for index, line in enumerate(f):
                element = line.strip()
                rev_vocab.append(element)
                vocab[element] = index
        assert len(vocab) == len(rev_vocab)
        if not (vocab and rev_vocab):
            raise ValueError('File empty: {}'.format(vocabulary_path))
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)


def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True):
    if not tokenizer:
        tokenizer = basic_tokenizer
    words = tokenizer(sentence)
    if not normalize_digits:
        return [vocabulary.get(w, UNK_ID) for w in words]
    # Normalize digits by 0 before looking words up in the vocabulary.
    return [vocabulary.get(re.sub(_DIGIT_RE, '0', w), UNK_ID) for w in words]


def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                for counter, line in enumerate(data_file, 1):
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)
                    token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                                      normalize_digits)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


def prepare_custom_data(working_directory, train_enc, train_dec, test_enc, test_dec, enc_vocabulary_size,
                        dec_vocabulary_size, tokenizer=None):
    # Create vocabularies of the appropriate sizes.
    enc_vocab_path = os.path.join(working_directory, "vocab%d.enc" % enc_vocabulary_size)
    dec_vocab_path = os.path.join(working_directory, "vocab%d.dec" % dec_vocabulary_size)
@@ -143,4 +147,4 @@ def prepare_custom_data(working_directory, train_enc, train_dec, test_enc, test_
    data_to_token_ids(test_enc, enc_dev_ids_path, enc_vocab_path, tokenizer)
    data_to_token_ids(test_dec, dec_dev_ids_path, dec_vocab_path, tokenizer)

    return enc_train_ids_path, dec_train_ids_path, enc_dev_ids_path, dec_dev_ids_path, enc_vocab_path, dec_vocab_path
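
Not part of the diff above, but as a quick illustration of how these helpers fit together after this PR, here is a small round-trip sketch; the file names `sample.txt` and `vocab_sample.txt` are made up for the example, and the printed ids are only indicative.

```
# Illustrative usage of data_utils.py; file names below are made up.
from data_utils import (basic_tokenizer, create_vocabulary,
                        initialize_vocabulary, sentence_to_token_ids)

print(basic_tokenizer("Where do you live?"))
# ['Where', 'do', 'you', 'live', '?']

# Build a small vocabulary from a text file (one sentence per line),
# then map a sentence to token ids with it.
create_vocabulary('vocab_sample.txt', 'sample.txt', max_vocabulary_size=1000)
vocab, rev_vocab = initialize_vocabulary('vocab_sample.txt')
ids = sentence_to_token_ids("Where do you live?", vocab)
print(ids)                          # e.g. [12, 31, 8, 456, 5]
print([rev_vocab[i] for i in ids])  # tokens recovered from ids
```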