ERROR while pulling python execute.py #95

Open · wants to merge 14 commits into base: master

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,2 +1,5 @@
data/
working_dir/
*.pyc
.venv/
.idea/
73 changes: 66 additions & 7 deletions README.md
@@ -20,22 +20,81 @@ Use [pip](https://pypi.python.org/pypi/pip) to install any missing dependencies
Usage
===========

Create a venv and install dependencies:

> Using virtualenv here to be compatible with python2; install it with `pip install virtualenv`

```
# create venv
python -m virtualenv .venv

# enter venv (assuming macos/linux)
source .venv/bin/activate

# install requirements
pip install -r requirements.txt
```
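
Optionally, a quick sanity check (a suggestion, not part of this PR): start `python` from inside the activated venv and confirm the venv's interpreter is the one in use.

```
# run inside the activated venv's python prompt
import sys

print(sys.prefix)   # should point at the .venv directory created above
print(sys.version)  # should report a 2.x interpreter if virtualenv wrapped Python 2
```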

Training
--------------------

0. Create directories
```
mkdir working_dir
mkdir data
```

1. Download the [Cornell Movie Dialogue dataset](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html) and place the unzipped content in the `data/` directory.
```
cd data
wget http://www.mpi-sws.org/~cristian/data/cornell_movie_dialogs_corpus.zip
unzip cornell_movie_dialogs_corpus.zip

# move to `data/` root
mv "corenell movie-dialogs corpus"/* .
```

2. Prepare the data for training by running the `prepare_data.py` script from the project root
```
# move to project root
cd ..
python prepare_data.py
```

3. To train the bot, edit the `seq2seq.ini` file so that mode is set to train like so

`mode = train`

4. Start training by running:

``python execute.py``

> There is no mechanism to stop training; press `ctrl-c` to stop it after it has trained for a while.
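
For context on step 3 above, here is a minimal sketch of how a `mode` flag like this can be read from an ini file with the standard library; the `[strings]` section name and the exact parsing done by `execute.py` are assumptions for illustration, not taken from this PR.

```
# Illustrative only: reading a mode flag from an ini file (Python 2 or 3).
try:
    from configparser import ConfigParser                       # Python 3
except ImportError:
    from ConfigParser import SafeConfigParser as ConfigParser   # Python 2

parser = ConfigParser()
parser.read('seq2seq.ini')

# '[strings]' is an assumed section name; adjust to match the real file.
mode = parser.get('strings', 'mode')
if mode == 'train':
    print('would start training')
elif mode == 'test':
    print('would start an interactive test session')
```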


Test
-------------

1. To test the bot during or after training, edit the `seq2seq.ini` file so that mode is set to test like so

`mode = test`

2. Then run the code like so:

```
python execute.py

>> Mode : test

Reading model parameters from working_dir/seq2seq.ckpt-10200
>
```

3. Confirm...
- What does "Test" do?
- How to use it? (see the sketch below)
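
As a rough answer to the open questions above: in test mode the script typically loads the most recent checkpoint from `working_dir/` and enters an interactive loop that reads a line from stdin, converts it to token ids, runs the seq2seq model, and prints the decoded reply; you type at the `>` prompt and exit with `ctrl-c`. The sketch below is schematic only: `load_model` and `decode_sentence` are hypothetical placeholders, not functions from this repository.

```
# Schematic of an interactive test loop; load_model and decode_sentence are
# hypothetical placeholders, not functions defined in this repository.
import sys


def chat_loop(load_model, decode_sentence):
    session, model, enc_vocab, rev_dec_vocab = load_model('working_dir/')
    sys.stdout.write('> ')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
        # tokenize the input, run the model, print the decoded reply
        print(decode_sentence(session, model, enc_vocab, rev_dec_vocab, sentence))
        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
```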

Challenge
===========
154 changes: 79 additions & 75 deletions data_utils.py
@@ -20,16 +20,16 @@

import os
import re

from six.moves import urllib
from io import open
from collections import Counter

from tensorflow.python.platform import gfile

# Special vocabulary symbols - we always put them at the start.
_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
@@ -38,93 +38,97 @@
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
_DIGIT_RE = re.compile(br"\d")
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(r"\d")

CORNELL_MOVIE_CORPUS_ENCODING = 'ISO-8859-2'


def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    all_words = []
    for space_separated_fragment in sentence.strip().split():
        words = re.split(_WORD_SPLIT, space_separated_fragment)
        for word in words:
            if word:
                all_words.append(word)
    return all_words


def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
    if not tokenizer:
        tokenizer = basic_tokenizer

    if not os.path.exists(vocabulary_path):
        print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
        vocab = Counter()
        with open(data_path, 'rt', encoding='utf8') as f:
            for counter, sentence in enumerate(f, 1):
                if counter % 100000 == 0:
                    print("  processing line %d" % counter)
                tokens = tokenizer(sentence)
                for w in tokens:
                    if normalize_digits:
                        word = re.sub(_DIGIT_RE, '0', w)
                    else:
                        word = w
                    vocab[word] += 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        print('>> Full Vocabulary Size :', len(vocab_list))
        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
            print('>>>> Vocab Truncated to: {}'.format(max_vocabulary_size))
        with open(vocabulary_path, 'wt', encoding='utf8') as vocab_file:
            for w in vocab_list:
                vocab_file.write(w + '\n')


def initialize_vocabulary(vocabulary_path, encoding=CORNELL_MOVIE_CORPUS_ENCODING):
    vocab = {}
    rev_vocab = []
    if gfile.Exists(vocabulary_path):
        with open(vocabulary_path, 'rt', encoding=encoding) as f:
            # enumerate from 0 so ids line up with PAD_ID/GO_ID/EOS_ID/UNK_ID above
            for index, line in enumerate(f):
                element = line.strip()
                rev_vocab.append(element)
                vocab[element] = index
        assert len(vocab) == len(rev_vocab)
        if not (vocab and rev_vocab):
            raise ValueError('File empty: {}'.format(vocabulary_path))
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)


def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True):
    if not tokenizer:
        tokenizer = basic_tokenizer
    words = tokenizer(sentence)
    if not normalize_digits:
        return [vocabulary.get(w, UNK_ID) for w in words]
    # Normalize digits by 0 before looking words up in the vocabulary.
    return [vocabulary.get(re.sub(_DIGIT_RE, '0', w), UNK_ID) for w in words]


def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                for counter, line in enumerate(data_file, 1):
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)
                    token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                                      normalize_digits)
                    tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


def prepare_custom_data(working_directory, train_enc, train_dec, test_enc, test_dec, enc_vocabulary_size,
                        dec_vocabulary_size, tokenizer=None):
    # Create vocabularies of the appropriate sizes.
    enc_vocab_path = os.path.join(working_directory, "vocab%d.enc" % enc_vocabulary_size)
    dec_vocab_path = os.path.join(working_directory, "vocab%d.dec" % dec_vocabulary_size)
@@ -143,4 +147,4 @@ def prepare_custom_data(working_directory, train_enc, train_dec, test_enc, test_
    data_to_token_ids(test_enc, enc_dev_ids_path, enc_vocab_path, tokenizer)
    data_to_token_ids(test_dec, dec_dev_ids_path, dec_vocab_path, tokenizer)

    return enc_train_ids_path, dec_train_ids_path, enc_dev_ids_path, dec_dev_ids_path, enc_vocab_path, dec_vocab_path
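
Not part of the diff above, but as a quick illustration of how these helpers fit together after this PR, here is a small round-trip sketch; the file names `sample.txt` and `vocab_sample.txt` are made up for the example, and the printed ids are only indicative.

```
# Illustrative usage of data_utils.py; file names below are made up.
from data_utils import (basic_tokenizer, create_vocabulary,
                        initialize_vocabulary, sentence_to_token_ids)

print(basic_tokenizer("Where do you live?"))
# ['Where', 'do', 'you', 'live', '?']

# Build a small vocabulary from a text file (one sentence per line),
# then map a sentence to token ids with it.
create_vocabulary('vocab_sample.txt', 'sample.txt', max_vocabulary_size=1000)
vocab, rev_vocab = initialize_vocabulary('vocab_sample.txt')
ids = sentence_to_token_ids("Where do you live?", vocab)
print(ids)                          # e.g. [12, 31, 8, 456, 5]
print([rev_vocab[i] for i in ids])  # tokens recovered from ids
```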