def read_lines(path):
    """
    Reads a text file and returns its lines as a list.
    """
    assert os.path.exists(path)
    lines = []
    with open(path, 'r') as f:
        for line in f:
            lines.append(line)  # + ['<eos>'])
    return lines
def convert_data_to_seq_length(df, seq_length=2**16):
    """
    Take a dataframe of text data and convert it to a dataframe with the same columns,
    where every data sample has a numericalized token length of seq_length, except for
    the last sample, which holds the remainder (fewer than seq_length tokens).
    :param df: a pandas dataframe with columns [tokenized, lens] holding the numericalized tokens of the text and their respective lengths
    :param seq_length: the numericalized token sequence length to split the data into
    :return: the new dataframe with the split data samples
    """
    concat_data = to_concat(list(df['tokenized']))
    n_seqs = len(concat_data)//seq_length
    rows = []
    for i in tqdm(range(n_seqs), desc="Splitting data", total=n_seqs):
        sample = concat_data[i*seq_length:(i+1)*seq_length]
        rows.append({'tokenized': sample, 'lens': len(sample)})
    # Add the last data sample, which holds the remainder
    sample = concat_data[n_seqs*seq_length:]
    if len(sample) > 0:
        rows.append({'tokenized': sample, 'lens': len(sample)})
    return pd.DataFrame(rows, columns=['tokenized', 'lens'])
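A quick toy example (illustrative only; it assumes `to_concat` simply concatenates the per-row token tensors into one long tensor):
import torch
# Two rows of already-numericalized tokens, 8 tokens in total, split into chunks of 4.
toy_df = pd.DataFrame({'tokenized': [torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6, 7, 8])],
                       'lens': [3, 5]})
split_df = convert_data_to_seq_length(toy_df, seq_length=4)
# Expected: two rows, tensor([1, 2, 3, 4]) and tensor([5, 6, 7, 8]), and no remainder row.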
def read_and_prepare_data(data_path, seq_length=0):
    """
    Read the data from file and prepare the dataframe.
    This does not include splitting into train and validation sets.
    :param data_path: relative path to the raw data
    :param seq_length: sequence length to split the data into; the default of 0 leaves the sample lengths unchanged
    :return: the prepared dataframe
    """
    print("Reading data from path...")
    # Read the data from file
    enwik8 = read_lines(data_path)
    df = pd.DataFrame({'text': enwik8})
    print("Done!")
    time.sleep(0.5)  # so the progress bar printing does not interleave with the prints above
    # Initialize the byte-level tokenizer (BTT)
    btt = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
    # Tokenize the dataset for training
    tqdm.pandas(desc="Tokenizing data")
    df['tokenized'] = df['text'].progress_map(lambda x: btt(x))
    # By default we don't change the data sample length
    if seq_length != 0:
        print(f"Sequence length given, splitting data into samples of sequence length {seq_length}")
        # Split data samples according to the sequence length
        df = convert_data_to_seq_length(df, seq_length)
        print("Done!")
    else:
        df['lens'] = df['text'].map(lambda x: len(x))
    df['lens_cum_sum'] = df.lens.cumsum()
    return df
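A minimal usage sketch (the file path below is a hypothetical placeholder; any plain-text file will do):
# Tokenize a text file and split it into samples of 4096 tokens each.
df = read_and_prepare_data('data/sample.txt', seq_length=4096)  # hypothetical path
df.head()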
test_text = 'hello world!'
test_df = pd.DataFrame({'text': [test_text]})
btt = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
tokenized_test_text = btt(test_text)
assert len(test_df) == 1
assert len(test_df['text'][0]) == len(test_text)
tqdm.pandas(desc="tokenizing data")
test_df['tokenized'] = test_df['text'].progress_map(lambda x: btt(x))
# Split the df into a divisible length (2)
converted_test_df = convert_data_to_seq_length(test_df, 2)
assert len(converted_test_df) == len(tokenized_test_text)//2
# Split the df into a non-divisible length (5)
converted_test_df = convert_data_to_seq_length(test_df, 5)
assert len(converted_test_df) != len(tokenized_test_text)//5
test_df['tokenized'][0]
Data utils for the synthetic task of the Reformer paper. We want to create sequences of the form 0w0w, where w is a sequence of integers between 1 and 127 of some length, e.g. 08470847. We create items on the fly instead of all items up front.
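As a rough illustration, a single item of this form could be generated on the fly as sketched below (a minimal sketch; the function name and exact item layout are assumptions, not the TwinSequence implementation):
import torch

def twin_sequence_item(sl=10):
    "Sketch: build one 0w0w sequence of length `sl` and return an (input, target) pair."
    half = sl // 2
    w = torch.randint(1, 128, (half - 1,))  # random integers in 1..127
    zero = torch.zeros(1, dtype=torch.long)
    seq = torch.cat([zero, w, zero, w])     # 0w0w, total length sl
    return seq[:-1], seq[1:]                # language-model style shift by one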
dls=DataLoaders.from_dsets(TwinSequence(10, 50, seed=42), bs=6, shuffle=False, device='cpu')
xb, yb = dls.one_batch()
inp1, targ1 = xb[0].tolist(), yb[0].tolist()
inp1, targ1
dls2=DataLoaders.from_dsets(TwinSequence(10, 50, seed=42), bs=6, shuffle=False, device='cpu')
xb, yb = dls2.one_batch()
inp2, targ2 = xb[0].tolist(), yb[0].tolist()
inp2, targ2
assert all_equal(inp1[1:], targ1[:-1])
assert all_equal(inp1, inp2)
assert all_equal(targ1, targ2)
For the synthetic task we also have to mask the first half of the targets. The first part is just random integers, so it's impossible to learn anything from it. We set the tokens in the first part to a special index, -100, and later tell our loss function to ignore items with this value. This means that the only task the model can learn is to copy the first part of the input sequence into the second. If we didn't mask the first part, the model would be penalized for its unavoidably poor performance there and would try to find a compromise.
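A minimal sketch of this masking with dummy data (an assumed setup, not the actual training code): positions set to -100 contribute nothing to the cross-entropy loss.
import torch
import torch.nn.functional as F

sl = 10
targ = torch.randint(1, 128, (sl,))  # stand-in targets for one item
targ[:sl // 2] = -100                # mask the random first half
logits = torch.randn(sl, 128)        # dummy model output over the 128 possible token ids
loss = F.cross_entropy(logits, targ, ignore_index=-100)  # masked positions are ignored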
Below is a TwinSequence dataset that is deterministic and loads all items into memory:
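A hedged sketch of what such a deterministic, all-in-memory dataset could look like (the class name and details are assumptions):
import torch

class DeterministicTwinSequence:
    "Sketch: precompute `n_items` 0w0w sequences of length `sl` with a fixed seed."
    def __init__(self, sl, n_items, seed=42):
        g = torch.Generator().manual_seed(seed)  # fixed generator -> identical items on every run
        self.items = []
        for _ in range(n_items):
            half = sl // 2
            w = torch.randint(1, 128, (half - 1,), generator=g)
            zero = torch.zeros(1, dtype=torch.long)
            seq = torch.cat([zero, w, zero, w])      # 0w0w, total length sl
            self.items.append((seq[:-1], seq[1:]))   # precompute every (input, target) pair up front
    def __getitem__(self, i): return self.items[i]
    def __len__(self): return len(self.items)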
dls=DataLoaders.from_dsets(TwinSequence(20, 100, seed=42), bs=16, shuffle=False, device='cpu')
xb, yb = dls.one_batch()
inp3, targ3 = xb[0].tolist(), yb[0].tolist()
assert all_equal(inp3[1:], targ3[:-1])