from fastai.text.all import *
from reformer_fastai.all import *

Experiment Tracking

Make sure you have wandb and are logged in:

Load Experiment Tracking with Weights & Biases:

import wandb
from reformer_fastai.tracking import WandbCallback

# Run metadata for Weights & Biases experiment tracking
WANDB_NAME = 'lm_enwik8_shared_qk_af'  # run name shown in the wandb UI
GROUP = 'TEST'                          # wandb group used to bucket related runs
NOTES = 'Baseline TransformerLM with shared QK on enwik8, sl 4096'
CONFIG = {}                             # extra hyperparameters to log (none for this run)
TAGS =['lm','test','enwik8']            # searchable labels for the run

Download and Unpack enwik8 Data

Download and unzip enwik8 data

path = untar_data('http://mattmahoney.net/dc/enwik8.zip', dest='/data')

Prepare Data

# One row per line of the raw enwik8 file
df = pd.DataFrame({'text':read_lines(path)})
df.head()
text
0 <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">\n
1 <siteinfo>\n
2 <sitename>Wikipedia</sitename>\n
3 <base>http://en.wikipedia.org/wiki/Main_Page</base>\n
4 <generator>MediaWiki 1.6alpha</generator>\n
# Byte-level tokenizer; no BOS/EOS markers are added, is_lm=True configures it
# for language-modelling use (see reformer_fastai ByteTextTokenizer)
btt = ByteTextTokenizer(is_lm=True, add_bos=False, add_eos=False)
%%time
df['toks'] = df['text'].apply(btt)     # tokenise every line
df['lens'] = df['toks'].apply(len)     # token count per line
df['lens_cum_sum'] = df.lens.cumsum()  # running total of lengths, used below to locate the train cutoff
CPU times: user 2min 48s, sys: 3 s, total: 2min 51s
Wall time: 2min 50s
# Hold out the final ~10M characters for validation and test (~5M each);
# everything before the cutoff is training data.
train_cutoff = df.lens.sum() - 10_000_000  # keep all but 10M characters for val and test

# Index of the last row that still ends before the cutoff.
last_train_idx = max(df.loc[df['lens_cum_sum'] < train_cutoff].index.values)
# NOTE(fix): previously `range(0, max(train_idxs))` dropped the last training row,
# and validation/test each re-used the preceding split's final index, so one row
# appeared in both train+validation and one in both validation+test.
train_idxs = list(range(0, last_train_idx + 1))

remaining_idxs = len(df) - len(train_idxs)  # rows left over for validation + test
val_start = len(train_idxs)
# Split the remainder in half; each range starts one past the previous range's
# end so train/validation/test partition the rows with no overlap.
validation_idxs = list(range(val_start, val_start + remaining_idxs // 2))
test_idxs = list(range(val_start + remaining_idxs // 2, len(df)))

splits = [train_idxs, validation_idxs]
%%time
# Pipeline per sample: pull the raw text off the row, then byte-tokenise it
tfms = [attrgetter("text"), btt]
# LMDataLoader chunks the concatenated token stream into (input, shifted-target) pairs
dsets = Datasets(df, [tfms], splits=splits, dl_type=LMDataLoader)
CPU times: user 1.2 s, sys: 52.1 ms, total: 1.25 s
Wall time: 1.25 s
%%time
bs, sl = 2, 4096  # batch size and sequence length
# pad_seq2seq = partial(pad_input, pad_idx=bte.pad_token_id, pad_fields=[0,1])
# Pass the precomputed per-line lengths to each dataloader (train, then valid)
# — presumably so LMDataLoader can size its chunks without re-tokenising; TODO confirm
dl_kwargs = [{'lens':df['lens'].values[train_idxs]},
             {'val_lens':df['lens'].values[validation_idxs]}]
dls = dsets.dataloaders(bs=bs, val_bs=2*bs, seq_len=sl, dl_kwargs=dl_kwargs, shuffle_train=True, n_workers=2)
CPU times: user 36.6 s, sys: 1.1 s, total: 37.7 s
Wall time: 37.4 s
dls.show_batch(max_n=4)
text text_
0 The famous [[tennis]] players [[Ivan Lendl]] and [[Martina Navrátilová]] were born in Czechoslovakia.\n'''[[Ugric languages|Ugric]]''' (Ugrian)\n*[[Roger Sandall]]\nIn 1996, Flockhart appeared as the daughter of [[Dianne Wiest]] and [[Gene Hackman]]'s characters, in ''[[The Birdcage]]'', her last film role before securing the starring role in the hit show ''[[Ally McBeal]]''. Throughout this year she continued to work on Broadway, playing the role of Natasha in [[Anton Chekhov]]'s ''[[Three Sisters (play)|Three Sisters]]''.\n* [[City College of San Francisco]]\n\n <text xml:space="preserve">#REDIRECT[[football (soccer)]]</text>\nCuarón's next project found him making a severe left turn; shot in [[Mexico]] with a Spanish-speaking cast, ''[[Y tu mamá también]]'' was a funny, provocative, and controversial road comedy about two sexually obsessed teenagers who take an extended road trip with an attractive woman in her thirties. The film's open portrayal of sexuality and frequent rude humor, as well as the politically and socially relevant asides, made the film an he famous [[tennis]] players [[Ivan Lendl]] and [[Martina Navrátilová]] were born in Czechoslovakia.\n'''[[Ugric languages|Ugric]]''' (Ugrian)\n*[[Roger Sandall]]\nIn 1996, Flockhart appeared as the daughter of [[Dianne Wiest]] and [[Gene Hackman]]'s characters, in ''[[The Birdcage]]'', her last film role before securing the starring role in the hit show ''[[Ally McBeal]]''. 
Throughout this year she continued to work on Broadway, playing the role of Natasha in [[Anton Chekhov]]'s ''[[Three Sisters (play)|Three Sisters]]''.\n* [[City College of San Francisco]]\n\n <text xml:space="preserve">#REDIRECT[[football (soccer)]]</text>\nCuarón's next project found him making a severe left turn; shot in [[Mexico]] with a Spanish-speaking cast, ''[[Y tu mamá también]]'' was a funny, provocative, and controversial road comedy about two sexually obsessed teenagers who take an extended road trip with an attractive woman in her thirties. The film's open portrayal of sexuality and frequent rude humor, as well as the politically and socially relevant asides, made the film an
1 (in contradiction to the characters of his contemporaries, who were more concerned with the [[Medieval]] virtues of [[Chivalry]], [[Piety]] and [[Humility]]).\n==External links==\n\n[[Category:Christian writers|Fox, George]]\n\n||? || z || z \nColumn chromatography utilizes a vertical glass column filled with some form of solid support with the sample to be separated placed on top of this support. The rest of the column is filled with a solvent which, under the influence of gravity, moves the sample through the column. Similarly to other forms of chromatography, differences in rates of movement through the solid medium are translated to different exit times from the bottom of the column for the various elements of the original sample.\n\n </contributor>\n[[no:Delaware]]\n &lt;td&gt;Wurm, Worm&lt;/td&gt;\n <page>\n <username>Romanc19s</username>\nThe brand name ''Aspirin'' was coined by the [[Bayer]] company of [[Germany]]. In some countries the name (in contradiction to the characters of his contemporaries, who were more concerned with the [[Medieval]] virtues of [[Chivalry]], [[Piety]] and [[Humility]]).\n==External links==\n\n[[Category:Christian writers|Fox, George]]\n\n||? || z || z \nColumn chromatography utilizes a vertical glass column filled with some form of solid support with the sample to be separated placed on top of this support. The rest of the column is filled with a solvent which, under the influence of gravity, moves the sample through the column. Similarly to other forms of chromatography, differences in rates of movement through the solid medium are translated to different exit times from the bottom of the column for the various elements of the original sample.\n\n </contributor>\n[[no:Delaware]]\n &lt;td&gt;Wurm, Worm&lt;/td&gt;\n <page>\n <username>Romanc19s</username>\nThe brand name ''Aspirin'' was coined by the [[Bayer]] company of [[Germany]]. In some countries the name is
vocab_sz = btt.vocab_size  # byte-level vocabulary size from the tokenizer
# Sanity check: one batch should give x and y both of shape (bs, sl) = (2, 4096)
xb, yb = dls.one_batch()
xb.shape, yb.shape
(torch.Size([2, 4096]), torch.Size([2, 4096]))

Training

Initialise wandb logging. Please do not change the project or entity (so that everything gets logged to the same place):

# Start the wandb run — project/entity are fixed so all community runs land in one place
wandb.init(reinit=True, project="reformer-fastai", entity="fastai_community",
           name=WANDB_NAME, group=GROUP, notes=NOTES, tags=TAGS, config=CONFIG)

# Baseline enwik8 LM config with a 4096-token context, axial positional shape
# (64,64), and shared query/key attention enabled
model_config = TransformerLMConfigEnwik8(max_seq_len=4096, axial_shape=(64,64), shared_qk=True)
model = TransformerLM.from_config(model_config)

# Accumulate gradients over 8 steps (effective batch 16) and clip them
training_cbs = [GradientAccumulation(n_acc=8), GradientClip()]
learn = Learner(dls, model,
                loss_func=CrossEntropyLossFlat(), opt_func=adafactor,
                cbs=training_cbs,
                metrics=[accuracy, perplexity, bpc]).to_fp16()

# One epoch; log metrics to wandb but skip model checkpoints and prediction samples
learn.fit(1, cbs=WandbCallback(log_model=False, log_preds=False))
0.00% [0/1 00:00<00:00]
epoch train_loss valid_loss accuracy perplexity bpc time

34.07% [3743/10987 1:28:17<2:50:53 2.3217]
learn.recorder.plot_loss()