
class ByteTextTokenizer[source]

ByteTextTokenizer(is_lm=True, add_bos=False, add_eos=False) :: Transform

Encodes each byte to an id. For 8-bit strings only. Credit to the Tensor2Tensor library

Implementation taken from the ByteTextEncoder in the tensor2tensor library.

wonder = "I wonder how the moon got it's shine?"
tok = ByteTextTokenizer()
tok_wonder = tok(wonder)

# test string vs list
assert (tok(wonder) == tok([wonder])).sum() == len(tok(wonder))
# assert (tok.decode(tok_wonder) == tok.decode([tok_wonder])).sum() == len(wonder)
assert type(tok_wonder) == LMTensorText
assert len(tok_wonder) == 37
assert tok.decode(tok_wonder) == wonder
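
The sketch below shows the core of what the tokenizer above does, assuming the tensor2tensor convention of two reserved ids (PAD = 0, EOS = 1) so that each byte maps to its value plus 2; the real ByteTextTokenizer wraps this logic in a fastai Transform and returns an LMTensorText/TensorText rather than a plain list.

# Minimal sketch of the byte <-> id mapping (assumed reserved ids: PAD=0, EOS=1)
NUM_RESERVED = 2

def byte_encode(s):
  # shift utf-8 bytes past the reserved ids
  return [b + NUM_RESERVED for b in s.encode('utf-8')]

def byte_decode(ids):
  # drop reserved ids, shift back and decode
  return bytes(i - NUM_RESERVED for i in ids if i >= NUM_RESERVED).decode('utf-8')

assert byte_decode(byte_encode(wonder)) == wonder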

class SubwordTextEncoder[source]

SubwordTextEncoder(filename=None, seq_len=256, is_lm=True, add_bos=False, BOS_ID=None) :: Transform

Class for invertibly encoding text using a limited vocabulary.

A tokenizer class for invertibly encoding text using a limited vocabulary, taken from the Tensor2Tensor library's SubwordTextEncoder.

Arguments

Initialize and read from a file, if provided.

filename : filename from which to read the vocab. If None, no vocab is loaded

is_lm : whether to return an LMTensorText or a TensorText; this can matter for fastai type dispatching

add_bos : whether to add a BOS token at the beginning of the encoded tokens. If BOS_ID is not set, the PAD token is used

seq_len : maximum sequence length of the encoded tokens, including the BOS token

Description

Invertibly encodes a native string as a sequence of subtokens from a limited vocabulary.

A SubwordTextEncoder is built from a corpus (so it is tailored to the text in the corpus), and stored to a file. See text_encoder_build_subword.py.

It can then be loaded and used to encode/decode any text.

Encoding has four phases:

  1. Tokenize into a list of tokens. Each token is a unicode string of either all alphanumeric characters or all non-alphanumeric characters. We drop tokens consisting of a single space that are between two alphanumeric tokens.

  2. Escape each token. This escapes away special and out-of-vocabulary characters, and makes sure that each token ends with an underscore, and has no other underscores.

  3. Represent each escaped token as the concatenation of a list of subtokens from the limited vocabulary. Subtoken selection is done greedily from beginning to end. That is, we construct the list in order, always picking the longest subtoken in our vocabulary that matches a prefix of the remaining portion of the encoded token (see the sketch after this list).

  4. Concatenate these lists. This concatenation is invertible due to the fact that the trailing underscores indicate when one list is finished.
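
As an illustration of phase 3, here is a minimal sketch of greedy longest-prefix matching over a toy vocabulary; the real encoder applies this to escaped tokens ending in an underscore and uses the subword vocab file loaded below.

# Sketch of the greedy subtoken selection in phase 3, using a toy vocabulary
def greedy_subtokens(token, vocab):
  # always pick the longest vocab entry that is a prefix of the remaining text
  out, start = [], 0
  while start < len(token):
    for end in range(len(token), start, -1):
      if token[start:end] in vocab:
        out.append(token[start:end])
        start = end
        break
    else:
      raise ValueError(f'no subtoken matches {token[start:]!r}')
  return out

toy_vocab = {'pedestrian', 'ped', 'estrians_', 's_'}
# greedy matching picks 'pedestrian' + 's_' rather than 'ped' + 'estrians_'
assert greedy_subtokens('pedestrians_', toy_vocab) == ['pedestrian', 's_']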

# Test for SubwordTokenizer

!pip install -Uqq sacrebleu
!wget https://raw.githubusercontent.com/tensorflow/tensor2tensor/master/tensor2tensor/test_data/vocab.translate_ende_wmt32k.32768.subwords

import sacrebleu
!sacrebleu -t wmt14/full -l en-de --echo src > wmt14-en-de.src
!sacrebleu -t wmt14/full -l en-de --echo ref > wmt14-en-de.ref

# Load the source text and reference translations into Python
refs = []
for lineno, line in enumerate(sacrebleu.smart_open('wmt14-en-de.ref'), 1):
  if line.endswith('\n'):
    line = line[:-1]
  refs.append(line)
srcs = []
for lineno, line in enumerate(sacrebleu.smart_open('wmt14-en-de.src'), 1):
  if line.endswith('\n'):
    line = line[:-1]
  srcs.append(line)

# Encode source sentences using the tokenizer
tok = SubwordTextEncoder(filename='./vocab.translate_ende_wmt32k.32768.subwords')
srcs = srcs[:5]
input_ids = np.zeros((len(srcs), 128), dtype=np.int64)
for i, x in enumerate(srcs):
  print(x)
  x = tok.encodes(x)
  assert len(x) <= 127
  input_ids[i, :len(x)] = x
  input_ids[i, len(x)] = 1  # append the EOS token (id 1)
Gutach: Increased safety for pedestrians
They are not even 100 metres apart: On Tuesday, the new B 33 pedestrian lights in Dorfparkplatz in Gutach became operational - within view of the existing Town Hall traffic lights.
Two sets of lights so close to one another: intentional or just a silly error?
Yesterday, Gutacht's Mayor gave a clear answer to this question.
"At the time, the Town Hall traffic lights were installed because this was a school route," explained Eckert yesterday.
# Test BOS and seq_len
tok = SubwordTextEncoder(filename='./vocab.translate_ende_wmt32k.32768.subwords', seq_len=10, add_bos=True, BOS_ID=999)
x = "I was walking down the street, when I saw a TIGER! A huuuuge tiger. What do I do?"
toks = tok(x)
assert toks[0] == 999
assert len(toks) == 10
# Decode sample output
tok.decodes(input_ids[0])
'Gutach: Increased safety for pedestrians<EOS><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'