Memory efficient transformer

Helper classes

class Chunk[source]

Chunk(n_chunks:int, fn:Module, dim:int=-1) :: Module

Applies fn to input chunked along dim
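
A minimal usage sketch, assuming (as the signature suggests) that Chunk splits the input into n_chunks along dim, applies fn to each chunk and concatenates the results:

import torch
import torch.nn as nn
lin = nn.Linear(128, 128)
chunked = Chunk(4, lin, dim=1)        # split along the sequence dimension
x = torch.randn(4, 64, 128)
out = chunked(x)
assert out.size() == (4, 64, 128)
# a positionwise fn gives the same result whether or not the input is chunked
assert torch.allclose(out, lin(x), atol=1e-5)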

class ChunkedFeedForward[source]

ChunkedFeedForward(d:int, d_ff:int=None, n_chunks:int=1, dropout:float=0.0, dim:int=-1) :: Module

Applies a positionwise feed-forward layer to input chunked along dim

bs = 4
sl = 64
d = 128
x = torch.randn(bs, sl, d)
ff  = ChunkedFeedForward(d, n_chunks=8, dim=1)
out = ff(x)
assert out.size() == (bs, sl, d)

class Deterministic[source]

Deterministic(net:Module) :: Module

Wrapper module to ensure determinism in the backward pass, following the example for saving and restoring the RNG state here: https://pytorch.org/docs/stable/_modules/torch/utils/checkpoint.html
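
A minimal sketch, assuming a plain call simply forwards to the wrapped net (the RNG saving and restoring only applies when requested by the reversible blocks):

drop = Deterministic(nn.Dropout(0.5))
x = torch.randn(4, 64, 128)
out = drop(x)                          # plain call behaves like the wrapped module
assert out.size() == (4, 64, 128)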

 

class ReversibleBlock[source]

ReversibleBlock(f:Module, g:Module, depth=None, send_signal=False) :: Module

Applies f and g in a reversible manner. Avoids storing outputs for backpropagation

bs = 4
sl = 64
d = 128
x = torch.randn(bs, sl, d)
# revblock is called on a twinned input (x duplicated along the last dim)
x2 = torch.cat([x, x], dim=-1)
attn = Attention(d)
ff = ChunkedFeedForward(d, n_chunks=8, dim=-2)
revblock = ReversibleBlock(attn, ff)
out = revblock(x2)
assert out.size() == (bs, sl, d*2)
# no grads are stored
out = torch.stack(out.chunk(2, dim=-1)).mean(dim=0)
try: out.mean().backward()
except RuntimeError as e: print(e)
element 0 of tensors does not require grad and does not have a grad_fn

class IrreversibleBlock[source]

IrreversibleBlock(f, g) :: Module

Mimics ReversibleBlock computation, but gradients are computed as usual

attn = Attention(d)
ff = ChunkedFeedForward(d, n_chunks=8, dim=-2)
irrevblock = IrreversibleBlock(attn, ff)
out = irrevblock(x2)
assert out.size() == (bs, sl, d*2)

class ReversibleSequence[source]

ReversibleSequence(blocks, rev_thres=0, send_signal=False) :: Module

Stack of ReversibleBlocks constructed from blocks. Applies ReversibleBlocks if the sequence length is > rev_thres, otherwise IrreversibleBlocks.

bs = 4
sl = 64
d = 128
x = torch.randn(bs, sl, d)
x2 = torch.cat([x, x], dim=-1)
blocks = []
for i in range(2):
    f = PreNorm(d, Attention(d))
    g = PreNorm(d, FeedForward(d))
    blocks.append(nn.ModuleList([f, g]))
layers = ReversibleSequence(nn.ModuleList(blocks))
out = layers(x2)
assert out.size() == (bs, sl, 2*d)
bs = 4
sl = 64
d = 128
x = torch.randn(bs, sl, d)
x2 = torch.cat([x, x], dim=-1)
blocks = []
for i in range(2):
    f = PreNorm(d, LSHSelfAttention(d, bucket_size=16))
    g = PreNorm(d, FeedForward(d))
    blocks.append(nn.ModuleList([f, g]))
layers = ReversibleSequence(nn.ModuleList(blocks))
out = layers(x2, arg_route=(True, False), _reverse=False, _depth=1)
assert out.size() == (bs, sl, 2*d)
try: out.mean().backward()
except RuntimeError as e: print(e)
element 0 of tensors does not require grad and does not have a grad_fn

ReversibleTransformer

class ReversibleEncoder[source]

ReversibleEncoder(d_model:int, n_layers:int=6, n_heads:int=8, max_seq_len:int=512, ff_chunks:int=1, causal:bool=False, attn_dropout:float=0.0, post_attn_dropout:float=None, attn_bias:bool=False, ff_dropout:float=0.0, d_ff:int=None, prenorm:bool=True, final_norm:Module=None, rev_thres:int=0) :: Module

Stack of ReversibleBlocks

x = torch.randn(bs, sl, d)
m = ReversibleEncoder(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 64, 128])
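
A sketch of the rev_thres fallback: for sequences not longer than the threshold the stack switches to IrreversibleBlocks, so activations are stored and backward works as usual (rev_thres=128 is a hypothetical choice above sl=64):

m = ReversibleEncoder(d, rev_thres=128)
out = m(x)                             # sl=64 is below rev_thres, so irreversible blocks are used
assert out.size() == (bs, sl, d)
out.mean().backward()                  # gradients flow as in a regular stack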

class ReversibleDecoder[source]

ReversibleDecoder(d_model, n_layers=6, heads=8, max_seq_len=512, d_head=None, bucket_size=64, n_hashes=8, ff_chunks=1, attn_chunks=None, attn_dropout=0.0, post_attn_dropout=None, attn_bias:bool=False, ff_dropout=0.0, d_ff=None, prenorm=True, final_norm:Module=None, rev_thres=0) :: Module

Stack of ReversibleBlocks. Uses AdditiveAttention.

x = torch.randn(bs, sl, d)
m = ReversibleDecoder(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 64, 128])

class ReversibleLM[source]

ReversibleLM(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, ff_chunks:int=1, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape=None, axial_emb_dims=None, pad_idx:int=None, prenorm:bool=True, attn_bias:bool=False, rev_thres:int=0) :: Module

Reversible Transformer for language modelling

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* ff_chunks: int - number of chunks for FeedForward layer computation
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True, the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - if True, the projection layers of attention modules will have bias
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - required if 'axial' positional encoding is used; elements should be factors of
        max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
* rev_thres: int - if (seq_len < rev_thres) applies irreversible blocks

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = ReversibleLM(vocab_sz, d, n_layers=2, causal=False)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))
out.shape
torch.Size([4, 128, 256])
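
A sketch of axial positional encoding, following the parameter notes above; (8, 16) is a hypothetical axial_shape whose elements are factors of max_seq_len=128:

model = ReversibleLM(vocab_sz, d, n_layers=2, causal=False,
                     max_seq_len=sl, pos_enc='axial', axial_shape=(8, 16))
out = model(x)
assert out.size() == (bs, sl, vocab_sz)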

class ReversibleTransformer[source]

ReversibleTransformer(enc_vocab_sz, dec_vocab_sz, d_model, n_layers:int=6, n_enc_layers=None, n_dec_layers=None, n_heads=8, d_ff=None, ff_chunks:int=1, pad_idx=None, tie_weights=True, shared_emb=False, attn_dropout=0.1, ff_dropout=0.1, emb_dropout=0.1, prenorm=True, attn_bias=False, comb_attn=False, pos_enc='absolute', max_seq_len=512, axial_shape=None, axial_emb_dims=None) :: Module

Basic Transformer Encoder-Decoder model.

Parameters:

* enc_vocab_sz: int - source vocab size
* dec_vocab_sz: int - target vocab size
* d_model: int - inner dimension of the model
* n_enc_layers: int (default: 6)
* n_dec_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* ff_chunks: int - number of chunks for FeedForward layer computation
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* max_seq_len: int (default: 512)
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id; if provided and no mask/context_mask is passed to the
        forward method, it will be used to generate padding masks
* tie_weights: bool - if True, the target embedding weights are used for the output projection
* shared_emb: bool - if True encoder and decoder will use shared embedding layer
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - required if 'axial' positional encoding is used; elements should be factors of
        max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* src - source input ids, shape [bs, src_sl]
* tgt - target input ids, shape [bs, tgt_sl]
* src_mask - optional boolean source mask, shape [bs, src_sl]
* tgt_mask - optional boolean target mask, shape [bs, tgt_sl]

Returns:

* logits - target token logits, shape [bs, tgt_sl, tgt_vocab_sz]
bs = 4
src_sl = 70
tgt_sl = 80
d = 64
src_vocab_sz = 256
tgt_vocab_sz = 256
src = torch.randint(src_vocab_sz, (bs, src_sl))
tgt = torch.randint(tgt_vocab_sz, (bs, tgt_sl))
model = ReversibleTransformer(src_vocab_sz, tgt_vocab_sz, d, n_enc_layers=2, n_dec_layers=2)
out = model(src, tgt)
assert (out.size() == (bs, tgt_sl, tgt_vocab_sz))
out.shape
torch.Size([4, 80, 256])
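
A sketch of shared_emb: with matching source and target vocab sizes (256 each here), a single embedding table can be shared by encoder and decoder:

model = ReversibleTransformer(src_vocab_sz, tgt_vocab_sz, d, n_enc_layers=2,
                              n_dec_layers=2, shared_emb=True)
out = model(src, tgt)
assert out.size() == (bs, tgt_sl, tgt_vocab_sz)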

Transformer with LSH attention

class LSHEncoderBlock[source]

LSHEncoderBlock(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, use_lsh:bool=True, n_hashes:int=8, bucket_size:int=64, seed:int=None) :: Module

Encoder block using ReformerAttention

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = LSHEncoderBlock(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])
m = LSHEncoderBlock(d, use_lsh=False)
out = m(x)
assert (out.size() == (bs, sl, d))

class LSHEncoder[source]

LSHEncoder(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, use_lsh:bool=True, final_norm=None, n_hashes:int=8, bucket_size:int=64, seed:int=None) :: Module

Stack of LSHEncoderBlocks

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = LSHEncoder(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])
m = LSHEncoder(d, n_layers=2, n_heads=4, use_lsh=False)
out = m(x)
assert (out.size() == (bs, sl, d))

class LSHLM[source]

LSHLM(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, use_lsh:bool=True, n_hashes:int=8, bucket_size:int=64, seed:int=None) :: Module

Transformer for language modelling with LSH attention

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True, the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - required if 'axial' positional encoding is used; elements should be factors of
        max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
* use_lsh: bool - parameter to switch between LSH and full attention
* n_hashes: int - number of hashing rounds for LSH
* bucket_size: int - input sequence length should be divisible by 2*bucket_size
* seed: int - for LSHAttention module

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = LSHLM(vocab_sz, d, n_layers=2, causal=False)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))
out.shape
torch.Size([4, 128, 256])
model.use_lsh = True
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))
%timeit model(x)
304 ms ± 22.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
model.use_lsh = False
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))
%timeit model(x)
8.6 ms ± 325 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
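
A sketch of the bucket_size constraint noted above: the sequence length must be divisible by 2*bucket_size, so for sl=128 a bucket_size of 32 (a hypothetical choice) is also valid:

model = LSHLM(vocab_sz, d, n_layers=2, causal=False, bucket_size=32, n_hashes=2)
out = model(x)
assert out.size() == (bs, sl, vocab_sz)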

Reformer

class ReformerEncoder[source]

ReformerEncoder(d_model:int, n_layers:int=6, n_heads:int=8, max_seq_len:int=512, ff_chunks:int=1, causal:bool=False, attn_dropout:float=0.0, post_attn_dropout:float=None, attn_bias:bool=False, ff_dropout:float=0.0, d_ff:int=None, prenorm:bool=True, final_norm:Module=None, rev_thres:int=0, use_lsh:bool=True, n_hashes:int=8, bucket_size:int=64, seed:int=None) :: Module

Stack of ReversibleBlocks

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = ReformerEncoder(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class ReformerLM[source]

ReformerLM(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, ff_chunks:int=1, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='axial', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=True, attn_bias:bool=False, use_lsh:bool=True, n_hashes:int=8, bucket_size:int=64, rev_thres:int=0, seed:int=None) :: Module

Reformer for language modelling. Uses LSH or full shared-QK attention

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* ff_chunks: int - number of chunks for FeedForward layer computation
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True, the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - required if 'axial' positional encoding is used; elements should be factors of
        max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model
* rev_thres: int - if (seq_len < rev_thres) applies irreversible blocks
* use_lsh: bool - parameter to switch between LSH and full attention
* n_hashes: int - number of hashing rounds for LSH
* bucket_size: int - input sequence length should be divisible by 2*bucket_size
* seed: int - for LSHAttention module

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = ReformerLM(vocab_sz, d, n_layers=2, causal=False)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))
out.shape
torch.Size([4, 128, 256])

Check cached buckets:

{'buckets:0': tensor([[ 0,  0,  1,  ..., 15, 15, 14],
        [ 0,  0,  0,  ..., 14, 15, 15],
        [ 0,  0,  0,  ..., 14, 15, 14],
        [ 0,  0,  1,  ..., 14, 15, 14]])}
torch.Size([4, 1024])
{'buckets:1': tensor([[ 0,  0,  0,  ..., 15, 15, 14],
        [ 0,  0,  0,  ..., 15, 14, 15],
        [ 1,  1,  1,  ..., 14, 14, 14],
        [ 0,  0,  0,  ..., 14, 15, 15]])}
torch.Size([4, 1024])
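
A minimal sketch for inspecting the cached buckets shown above, assuming each LSH attention module keeps them in a cache dict; the _cache attribute name is an assumption, not the library's documented API:

# hypothetical inspection; `_cache` is an assumed attribute name
for mod in model.modules():
    cache = getattr(mod, '_cache', None)
    if cache:
        for name, buckets in cache.items():
            if 'buckets' in name:
                print({name: buckets})
                print(buckets.shape)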

LSHAttention execution time depends on the number of hashing rounds

print(f'Number of hashing rounds {model._n_hashes}')
%timeit model(x)
Number of hashing rounds 8
304 ms ± 19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
model.n_hashes = 1
print(f'Number of hashing rounds {model.n_hashes}')
%timeit model(x)
Number of hashing rounds 1
74.7 ms ± 2.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

reformer_lm_splits[source]

reformer_lm_splits(model)

Splits a ReformerLM model into parameter groups for differential learning rates.
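
A usage sketch, assuming the function returns a list of parameter groups that can be used as a splitter when building a fastai Learner:

model = ReformerLM(vocab_sz, d, n_layers=2)
groups = reformer_lm_splits(model)
print(len(groups))                     # number of parameter groups
# assumed usage: Learner(dls, model, splitter=reformer_lm_splits)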
