def read_lines(path):
    """
    Reads a text file and returns its lines as a list.
    """
    assert os.path.exists(path)
    lines = []
    with open(path, 'r') as f:
        for line in f:
            lines.append(line)  # + ['<eos>'])
    return lines
def convert_data_to_seq_length(df, seq_length=2**16):
    """
    Take a dataframe of text data and convert it to a dataframe with the same columns,
    where every data sample has a numericalized token length of seq_length, except for
    the last sample, which holds the remainder (fewer than seq_length tokens).
    :param df: a pandas dataframe with columns [tokenized, lens] holding the numericalized tokens of the text and their respective lengths
    :param seq_length: the numericalized token sequence length to split the data into
    :return: the new dataframe with the split data samples
    """
    concat_data = to_concat(list(df['tokenized']))
    n_seqs = len(concat_data)//seq_length
    rows = []
    for i in tqdm(range(n_seqs), desc="Splitting data", total=n_seqs):
        sample = concat_data[i*seq_length:(i+1)*seq_length]
        rows.append({'tokenized': sample, 'lens': len(sample)})
    # Add the last data sample, which holds the remainder
    sample = concat_data[n_seqs*seq_length:]
    if len(sample) > 0:
        rows.append({'tokenized': sample, 'lens': len(sample)})
    return pd.DataFrame(rows, columns=['tokenized', 'lens'])
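A quick toy example (illustrative only; it assumes `to_concat` simply concatenates the per-row token tensors into one long tensor):
import torch
# Two rows of already-numericalized tokens, 8 tokens in total, split into chunks of 4.
toy_df = pd.DataFrame({'tokenized': [torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6, 7, 8])],
                       'lens': [3, 5]})
split_df = convert_data_to_seq_length(toy_df, seq_length=4)
# Expected: two rows, tensor([1, 2, 3, 4]) and tensor([5, 6, 7, 8]), and no remainder row.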
def read_and_prepare_data(data_path, seq_length=0):
    """
    Read the data from file and prepare the dataframe.
    This does not include splitting into train and validation sets.
    :param data_path: relative path to the raw data
    :param seq_length: sequence length to split the data into; the default of 0 leaves the sample lengths unchanged
    :return: the prepared dataframe
    """
    print("Reading data from path...")
    # Read the data from file
    enwik8 = read_lines(data_path)
    df = pd.DataFrame({'text': enwik8})
    print("Done!")
    time.sleep(0.5)  # so the progress bar printing does not interleave with the prints above
    # Initialize the byte-level tokenizer (BTT)
    btt = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
    # Tokenize the dataset for training
    tqdm.pandas(desc="Tokenizing data")
    df['tokenized'] = df['text'].progress_map(lambda x: btt(x))
    # By default we don't change the data sample length
    if seq_length != 0:
        print(f"Sequence length given, splitting data into samples of sequence length {seq_length}")
        # Split data samples according to the sequence length
        df = convert_data_to_seq_length(df, seq_length)
        print("Done!")
    else:
        df['lens'] = df['text'].map(lambda x: len(x))
    df['lens_cum_sum'] = df.lens.cumsum()
    return df
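A minimal usage sketch (the file path below is a hypothetical placeholder; any plain-text file will do):
# Tokenize a text file and split it into samples of 4096 tokens each.
df = read_and_prepare_data('data/sample.txt', seq_length=4096)  # hypothetical path
df.head()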
test_text = 'hello world!'
test_df = pd.DataFrame({'text': [test_text]})
btt = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
tokenized_test_text = btt(test_text)
assert len(test_df) == 1
assert len(test_df['text'][0]) == len(test_text)
tqdm.pandas(desc="tokenizing data")
test_df['tokenized'] = test_df['text'].progress_map(lambda x: btt(x))
# Split the df into a divisible length (2)
converted_test_df = convert_data_to_seq_length(test_df, 2)
assert len(converted_test_df) == len(tokenized_test_text)//2
# Split the df into a non-divisible length (5)
converted_test_df = convert_data_to_seq_length(test_df, 5)
assert len(converted_test_df) != len(tokenized_test_text)//5
test_df['tokenized'][0]
Data utils for the synthetic task of the Reformer paper. We want to create sequences of the form 0w0w, where w is a sequence of integers between 1 and 127 of some length, e.g. 08470847. We create items on the fly instead of all items up front.
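As a rough illustration, a single item of this form could be generated on the fly as sketched below (a minimal sketch; the function name and exact item layout are assumptions, not the TwinSequence implementation):
import torch

def twin_sequence_item(sl=10):
    "Sketch: build one 0w0w sequence of length `sl` and return an (input, target) pair."
    half = sl // 2
    w = torch.randint(1, 128, (half - 1,))  # random integers in 1..127
    zero = torch.zeros(1, dtype=torch.long)
    seq = torch.cat([zero, w, zero, w])     # 0w0w, total length sl
    return seq[:-1], seq[1:]                # language-model style shift by one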
dls=DataLoaders.from_dsets(TwinSequence(10, 50, seed=42), bs=6, shuffle=False, device='cpu')
xb, yb = dls.one_batch()
inp1, targ1 = xb[0].tolist(), yb[0].tolist()
inp1, targ1
dls2=DataLoaders.from_dsets(TwinSequence(10, 50, seed=42), bs=6, shuffle=False, device='cpu')
xb, yb = dls2.one_batch()
inp2, targ2 = xb[0].tolist(), yb[0].tolist()
inp2, targ2
assert all_equal(inp1[1:], targ1[:-1])
assert all_equal(inp1, inp2)
assert all_equal(targ1, targ2)
For the synthetic task we also have to mask the first half of the targets. The first part is just random integers, so it's impossible to learn anything from it. We set the tokens in the first part to a special index, -100, and later tell our loss function to ignore items with this value. This means that the only task the model can learn is to copy the first part of the input sequence into the second. If we didn't mask the first part, the model would be penalized for its unavoidably poor performance there and would try to find a compromise.
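A minimal sketch of this masking with dummy data (an assumed setup, not the actual training code): positions set to -100 contribute nothing to the cross-entropy loss.
import torch
import torch.nn.functional as F

sl = 10
targ = torch.randint(1, 128, (sl,))  # stand-in targets for one item
targ[:sl // 2] = -100                # mask the random first half
logits = torch.randn(sl, 128)        # dummy model output over the 128 possible token ids
loss = F.cross_entropy(logits, targ, ignore_index=-100)  # masked positions are ignored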
Below is a TwinSequence dataset that is deterministic and loads all items into memory:
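A hedged sketch of what such a deterministic, all-in-memory dataset could look like (the class name and details are assumptions):
import torch

class DeterministicTwinSequence:
    "Sketch: precompute `n_items` 0w0w sequences of length `sl` with a fixed seed."
    def __init__(self, sl, n_items, seed=42):
        g = torch.Generator().manual_seed(seed)  # fixed generator -> identical items on every run
        self.items = []
        for _ in range(n_items):
            half = sl // 2
            w = torch.randint(1, 128, (half - 1,), generator=g)
            zero = torch.zeros(1, dtype=torch.long)
            seq = torch.cat([zero, w, zero, w])      # 0w0w, total length sl
            self.items.append((seq[:-1], seq[1:]))   # precompute every (input, target) pair up front
    def __getitem__(self, i): return self.items[i]
    def __len__(self): return len(self.items)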
dls=DataLoaders.from_dsets(TwinSequence(20, 100, seed=42), bs=16, shuffle=False, device='cpu')
xb, yb = dls.one_batch()
inp3, targ3 = xb[0].tolist(), yb[0].tolist()
assert all_equal(inp3[1:], targ3[:-1])