This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Add BPE encoder #100

Open · wants to merge 16 commits into base: master
19 changes: 19 additions & 0 deletions bpe.vocab
@@ -0,0 +1,19 @@
#version: 0.2
d e
c o
e n
co de
b u
w i
w h
u b
r o
o k
ok en
o f</w>
l a
i s</w>
en code
e x
c h</w>
T h
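
For context, this codes file uses subword_nmt's merge-list format: a #version header followed by the learned merge operations, highest priority first. A minimal sketch of how such a file is consumed, assuming it has been saved as bpe.vocab in the working directory (the sample word is illustrative):

import codecs
from subword_nmt import apply_bpe

# Load the learned merges and segment a word with the '@@' separator,
# the same way BPETextTokenizer wires up apply_bpe.BPE later in this diff.
with codecs.open('bpe.vocab', encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes, separator='@@')

print(bpe.segment('encoder'))  # 'encode@@ r' once the merges above have been applied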
1 change: 1 addition & 0 deletions build_tools/travis/install.sh
@@ -27,6 +27,7 @@ pip install -U -r requirements.txt --progress-bar off
pip install spacy --progress-bar off
pip install nltk --progress-bar off
pip install sacremoses --progress-bar off
pip install subword_nmt --progress-bar off
pip install pandas --progress-bar off
pip install requests --progress-bar off

1 change: 1 addition & 0 deletions requirements.txt
@@ -23,6 +23,7 @@ mock
# nltk
# spacy
# sacremoses
# subword-nmt

# Optional CUDA Utilties
# pynvrtc
23 changes: 23 additions & 0 deletions test_bpe.vocab
@@ -0,0 +1,23 @@
#version: 0.2
h a
s ;
s; t</w>
p o
po s;t</w>
p a
pa j
paj a
paja m
pajam a
pajama s</w>
o w</w>
o u
o t</w>
n g</w>
m y</w>
l e
le p
i n</w>
b e
a pos;t</w>
& apos;t</w>
63 changes: 63 additions & 0 deletions tests/encoders/text/test_bytepair_encoder.py
@@ -0,0 +1,63 @@
import unittest
import torch
import sys
from torchnlp.encoders.text import BPEEncoder


class TestBPEEncoder(unittest.TestCase):

def setUp(self):
self.corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

def test_vocab(self):
encoder = BPEEncoder(self.corpus, from_filenames=False)

        # Test that the reserved tokens were added to index_to_token.
self.assertEqual('<pad>', encoder.vocab[0])
self.assertEqual('<unk>', encoder.vocab[1])
self.assertEqual('</s>', encoder.vocab[2])
self.assertEqual('<s>', encoder.vocab[3])
self.assertEqual('<copy>', encoder.vocab[4])

        # Test that some high-occurrence subwords are in the vocabulary.
self.assertIn('oken@@', encoder.index_to_token)
self.assertIn('encode@@', encoder.index_to_token)

expect_vocab_size = 57
self.assertEqual(expect_vocab_size, encoder.vocab_size)

def test_encode(self):
        # The expected indices rely on dict ordering, deterministic only on Python 3.6+.
        if sys.version_info.minor > 5:
original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
encoder = BPEEncoder(self.corpus, from_filenames=False)

            # Expected encoding.
expect = [5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32, 41, 36, 14, 17,
42, 49, 50, 51, 33, 9, 52, 53, 15, 14, 53, 26, 21, 54, 44, 55, 37]

encode_lst = encoder.encode(original).numpy().tolist()

self.assertListEqual(expect, encode_lst)

def test_decoder(self):
if sys.version_info.minor > 5:
encoded = torch.tensor([5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32,
41, 36, 14, 17, 42, 49, 50, 51, 33, 9, 52, 53, 15,
14, 53, 26, 21, 54, 44, 55, 37])

encoder = BPEEncoder(self.corpus, from_filenames=False)

expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

self.assertEqual(expect, encoder.decode(encoded))

def test_encode_decode(self):
original = "This is a coded sentence encoded by the SubwordTextTokenizer."
expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

encoder = BPEEncoder(self.corpus, from_filenames=False)

decode_encode_str = encoder.decode(encoder.encode(original))
self.assertEqual(expect, decode_encode_str)
101 changes: 101 additions & 0 deletions tests/encoders/text/test_bytepair_tokenizer.py
@@ -0,0 +1,101 @@
import unittest
import pickle

from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer


class TestBPETextTokenizer(unittest.TestCase):

def setUp(self):
self.corpus = [
"One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
'know.', 'Groucho Marx',
"I haven't slept for 10 days... because that would be too long.", 'Mitch Hedberg'
]

def test_pre_tokenizer(self):
expected = ['One morning I shot an elephant in my pajamas . How he got in my pajamas ,'
' I don &apos;t',
'know .',
'Groucho Marx',
'I haven &apos;t slept for 10 days ... because that would be too long .',
'Mitch Hedberg']

self.assertListEqual(expected, [BPETextTokenizer.pre_tokenize(sen) for sen in self.corpus])

def test_get_vocabulary(self):
# tokenizer = BPETextTokenizer('test_bpe', use_moses=True)
def segment_words(line):
return BPETextTokenizer._segment_words(line, BPETextTokenizer.pre_tokenize)
token_counts = BPETextTokenizer.get_vocabulary(self.corpus,
segment_words, from_filenames=False)
expected = {
"&apos;t": 2,
".": 3,
"...": 1,
"Groucho": 1,
"Marx": 1,
"Mitch": 1,
"Hedberg": 1,
"I": 3,
"in": 2,
"my": 2,
"know": 1,
"because": 1,
"pajamas": 2,
}
self.assertDictContainsSubset(expected, token_counts)

def test_learn_bpe(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)
expected = {('&', 'apos;t</w>'): 21, ('a', 'pos;t</w>'): 20, ('b', 'e'): 19,
('i', 'n</w>'): 18, ('le', 'p'): 17, ('l', 'e'): 16, ('m', 'y</w>'): 15,
('n', 'g</w>'): 14, ('o', 't</w>'): 13, ('o', 'u'): 12, ('o', 'w</w>'): 11,
('pajama', 's</w>'): 10, ('pajam', 'a'): 9, ('paja', 'm'): 8, ('paj', 'a'): 7,
('pa', 'j'): 6, ('p', 'a'): 5, ('po', 's;t</w>'): 4, ('p', 'o'): 3,
('s;', 't</w>'): 2, ('s', ';'): 1, ('h', 'a'): 0}
self.assertDictEqual(expected, tokenizer.bpe.bpe_codes)

def test_encode_decode(self):
corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(corpus, from_filenames=False)

# Encoding should be reversible.
encoded = tokenizer.encode(original)
decoded = tokenizer.decode(encoded)
self.assertEqual(original, decoded)

        # The substrings en@@ and code@@ are frequent enough in the corpus that
        # they should appear in the vocabulary even though they are substrings
        # of other included strings.
subtoken_strings = encoded
self.assertIn('en@@', subtoken_strings)
self.assertIn('code@@', subtoken_strings)

def test_build_vocab(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)

        # Check that the vocabulary contains exactly the expected items and counts.
expect = {'O@@': 1, 'n@@': 4, 'e': 4, 'm@@': 1, 'o@@': 5, 'r@@': 4, 'i@@': 2,
'ng': 2, 'I': 3, 's@@': 3, 'h@@': 3, 'ot': 2, 'a@@': 4, 'n': 3,
'e@@': 3, 'lep@@': 2, 'ha@@': 3, 't': 3, 'in': 2, 'my': 2,
'pajamas': 2, '.': 4, 'H@@': 2, 'ow': 2, 'g@@': 1, ',': 1, 'd@@': 3,
'&apos;t': 2, 'k@@': 1, 'G@@': 1, 'ou@@': 2, 'c@@': 3, 'o': 2,
'M@@': 2, 'x': 1, 'v@@': 1, 'f@@': 1, 'r': 1, '1@@': 1, '0': 1,
'y@@': 1, 's': 1, '.@@': 2, 'be@@': 2, 'u@@': 1, 't@@': 3,
'w@@': 1, 'l@@': 2, 'd': 1, 'b@@': 1, 'h': 1, 'g': 1}

self.assertDictEqual(expect, tokenizer.vocab)


def test_is_pickleable():
tokenizer = BPETextTokenizer('test_bpe')
pickle.dumps(tokenizer)
4 changes: 3 additions & 1 deletion torchnlp/encoders/text/__init__.py
@@ -21,12 +21,14 @@
from torchnlp.encoders.text.text_encoder import TextEncoder
from torchnlp.encoders.text.treebank_encoder import TreebankEncoder
from torchnlp.encoders.text.whitespace_encoder import WhitespaceEncoder
from torchnlp.encoders.text.bytepair_encoder import BPEEncoder

__all__ = [
'CharacterEncoder', 'DEFAULT_COPY_INDEX', 'DEFAULT_COPY_TOKEN', 'DEFAULT_EOS_INDEX',
'DEFAULT_EOS_TOKEN', 'DEFAULT_PADDING_INDEX', 'DEFAULT_PADDING_TOKEN',
'DEFAULT_RESERVED_TOKENS', 'DEFAULT_SOS_INDEX', 'DEFAULT_SOS_TOKEN', 'DEFAULT_UNKNOWN_INDEX',
'DEFAULT_UNKNOWN_TOKEN', 'DelimiterEncoder', 'MosesEncoder', 'pad_tensor',
'stack_and_pad_tensors', 'TextEncoder', 'SpacyEncoder', 'StaticTokenizerEncoder',
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences'
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences',
'BPEEncoder'
]
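
For reference, a minimal usage sketch of the newly exported encoder, mirroring tests/encoders/text/test_bytepair_encoder.py; the corpus and sentence below are only examples:

from torchnlp.encoders.text import BPEEncoder

corpus = ['to build a vocabulary. It will be used when strings are encoded ',
          'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

# Build the BPE merges directly from in-memory strings instead of files on disk.
encoder = BPEEncoder(corpus, from_filenames=False)

tensor = encoder.encode('The encoder was coded by a coder.')  # 1-D tensor of subword indices
print(encoder.decode(tensor))  # round-trips back to text, with <unk> for unseen subwords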
86 changes: 86 additions & 0 deletions torchnlp/encoders/text/bpe_text_tokenizer.py
@@ -0,0 +1,86 @@
import codecs
from subword_nmt import learn_bpe, apply_bpe
from collections import Counter
from sacremoses import MosesTokenizer, MosesDetokenizer


class BPETextTokenizer(object):
_moses_tok = MosesTokenizer(lang='en')
_moses_detok = MosesDetokenizer(lang='en')

def __init__(self, file_prefix=None, separator='@@'):
if file_prefix is not None:
self.codes_file = '{}.vocab'.format(file_prefix)

self.separator = separator
self.bpe = None
self.vocab = None

@staticmethod
def pre_tokenize(line):
return BPETextTokenizer._moses_tok.tokenize(line, return_str=True)

@staticmethod
def _segment_words(line, pre_apply=None):
if pre_apply is not None:
line = pre_apply(line)
line = str(line)
return line.strip('\r\n ').split()

@staticmethod
def get_vocabulary(item_list, segment=_segment_words, from_filenames=True):
vocab = Counter()
if from_filenames:
for fname in item_list:
with codecs.open(fname, encoding='UTF-8') as f:
for line in f:
for word in segment(line):
vocab[word] += 1
else:
for line in item_list:
for word in segment(line):
vocab[word] += 1
return vocab

def build_from_corpus(self, item_list, min_count=2, num_symbols=10000,
total_symbols=False, from_filenames=True):
def segment_words(line):
return self._segment_words(line, self.pre_tokenize)

vocab_words = self.get_vocabulary(item_list, segment_words, from_filenames=from_filenames)

vocab_list = ['{0} {1}'.format(key, freq)
for (key, freq) in vocab_words.items()]

with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
learn_bpe.learn_bpe(vocab_list, output, num_symbols=num_symbols,
min_frequency=min_count, verbose=False,
is_dict=True, total_symbols=total_symbols)

with codecs.open(self.codes_file, encoding='UTF-8') as codes:
self.bpe = apply_bpe.BPE(codes, separator=self.separator)

self.vocab = dict(self.get_vocabulary(item_list=item_list, segment=self.segment,
from_filenames=from_filenames))

def segment(self, line):
        if self.bpe is None:
raise NameError('Learn bpe first!')
line = self.pre_tokenize(line)
return self.bpe.segment(line.strip('\r\n ')).split(' ')

def encode(self, raw_text):
return self.segment(raw_text)

def decode(self, bpe_text, delimiter=' '):
decode_string = delimiter.join(bpe_text)
        # decode() only exists on Python 2 strings; on Python 3 the AttributeError is swallowed.
        try:
decode_string = decode_string.decode('utf-8')
except Exception:
pass
decode_string = decode_string \
.replace(self.separator + ' ', '') \
.replace(self.separator, '')
decode_string = str(decode_string).strip('\r\n ').split()
decode_string = self._moses_detok.tokenize(decode_string)
return decode_string
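
And a minimal usage sketch of the tokenizer on its own, following the pattern of test_encode_decode in the tests above; the 'example_bpe' file prefix is illustrative, and build_from_corpus writes the learned merges to '<prefix>.vocab':

from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer

corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
          'to build a vocabulary. It will be used when strings are encoded ',
          'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

tokenizer = BPETextTokenizer('example_bpe')  # merges are written to example_bpe.vocab
tokenizer.build_from_corpus(corpus, from_filenames=False)

pieces = tokenizer.encode('The encoder was coded by a coder.')  # list of subword strings
print(pieces)                    # e.g. contains 'en@@' and 'code@@'
print(tokenizer.decode(pieces))  # detokenizes back to the original sentence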