Sequence Classification ✅
1 Review of Python's reduce
from functools import reduce
#
# Aggregating sequences of numbers by summation
#
xs = [1, 2, 3, 4, 5]
def update(total: int, x: int) -> int:
    return total + x

reduce(update, xs, 0)
15
#
# Count total characters of a sequence of words
#
xs = ["hello", "how", "are", "u", "doing?"]
def update(total: int, word: str) -> int:
    return total + len(word)

reduce(update, xs, 0)
18
#
# Aggregate a sequence of words to their average number of characters per word
#
xs = ["hello", "how", "are", "u", "doing?"]
def update(stat: tuple[int, int], word: str) -> tuple[int, int]:
    count, total = stat
    return (count + 1, total + len(word))

(total_words, total_chars) = reduce(update, xs, (0, 0))
total_chars / total_words
3.6
2 Processing unbounded sequences of vectors
Suppose we have a sequence of vectors:
\[ x = (x_1, x_2, \dots, x_L) \] where each \(x_i\in\mathbb{R}^m\). Thus, \(x\in(\mathbb{R}^m)^*\).
We want to describe an aggregation function \(\mathbf{agg}:(\mathbb{R}^m)^*\to\mathbb{R}^n\).
The challenge is that \(\mathbf{agg}\) must be able to handle sequences of any length \(L\).
Auxiliary functions
Consider a state space \(\mathbb{R}^k\) that will be used to aggregate the sequence to a hidden state.
This can be done by introducing:
- an initial state \(s_0\in\mathbb{R}^k\).
- an update function \(h:\mathbb{R}^k \times \mathbb{R}^m\to\mathbb{R}^k\).
Note that the update function \(h\) takes two inputs: the current state and the current input. \[ s_i = h(s_{i-1}, x_i) \]
Using the update function \(h\), we can reduce \(x\) to a final state:
\(s^* = \mathrm{reduce}(h, x, s_0)\)
Introduce an output function \(f:\mathbb{R}^k\to\mathbb{R}^n\) which maps the final state to the aggregation output.
\(\mathbf{agg}(x) = f(s^*)\)
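For example, with a sequence of length \(L = 3\) the recurrence unrolls as
\[ s_1 = h(s_0, x_1), \quad s_2 = h(s_1, x_2), \quad s_3 = h(s_2, x_3), \]
and the aggregation output is \(\mathbf{agg}(x) = f(s^*) = f(s_3)\).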
3 Revisiting Python reduce
In the case of:
xs = ["hello", "how", "are", "u", "doing?"]
def update(stat: tuple[int, int], word: str) -> tuple[int, int]:
    count, total = stat
    return (count + 1, total + len(word))

(total_words, total_chars) = reduce(update, xs, (0, 0))
total_chars / total_words

We have the following:
- State space: \[\mathbb{N}^2\]
- Initial state: \[(0, 0)\]
- Update function: \[h(s, w) = (s[0]+1, s[1]+\mathrm{len}(w))\]
- Output function: \[f(s) = \frac{s[1]}{s[0]}\]
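Written directly in terms of these four components (the names s0, h, f, and agg below are introduced purely for illustration; this is just a restatement of the code above using the reduce imported earlier):

s0 = (0, 0)                                  # initial state in N^2

def h(s: tuple[int, int], w: str) -> tuple[int, int]:
    # update function: increment the word count and add the word's length
    return (s[0] + 1, s[1] + len(w))

def f(s: tuple[int, int]) -> float:
    # output function: average number of characters per word
    return s[1] / s[0]

def agg(xs):
    return f(reduce(h, xs, s0))

agg(["hello", "how", "are", "u", "doing?"])
3.6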
4 Learning sequence aggregations
In sequence learning, we need to model the aggregation as:
- \(h\): update function
- \(f\): output function
Both are just normal vector functions, and thus can be learned using MLPs.
The equations that describe the sequence function are given as:
\[ s_i = h(s_{i-1}, x_i) \mathrm{\ for\ } i\in 1\dots L \]
\[ y = f(x_L, s_L) \]
where:
- \(s_0\) is the initial state.
- \(x = (x_1, x_2, \dots, x_L) \in(\mathbb{R}^m)^*\) is the input sequence.
- \(y\in\mathbb{R}^n\) is the output vector.
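As a bridge to the implementation below, here is a minimal sketch of a single recurrence step with \(h\) and \(f\) taken to be single linear layers acting on the concatenation of input and state (the dimensions 32, 64, 2 are arbitrary choices for this sketch; deeper MLPs would slot in the same way):

import torch
from torch import nn

m, k, n = 32, 64, 2              # input, state, and output dimensions (arbitrary here)
h = nn.Linear(m + k, k)          # update function: (input, state) -> next state
f = nn.Linear(m + k, n)          # output function: (last input, final state) -> output

s = torch.zeros(1, k)            # initial state s_0, batch of one
x = torch.randn(1, m)            # a single input vector x_i
s = h(torch.cat([x, s], dim=1))  # s_i = h(s_{i-1}, x_i)
y = f(torch.cat([x, s], dim=1))  # y = f(x_L, s_L) once the last input has been consumed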
5 Preprocessing of IMDB Reviews
from importlib import reload
import mytext
import torch
from torchtext.data import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader
reload(mytext);
tokenizer = get_tokenizer('basic_english')
reviews, targets = mytext.imdb_reviews(tokenizer)
voc = mytext.build_vocab(reviews)
reviews_tensor = mytext.build_tensor(reviews, voc)
reviews_tensor.shape
torch.Size([25000, 2633])
inputs_tensor = reviews_tensor[:, :100]
inputs_tensor.shape
torch.Size([25000, 100])
targets_tensor = torch.tensor(targets, dtype=torch.long)
targets_tensor.shape
torch.Size([25000])
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
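The mytext module is a small local helper that is not listed here. As a rough sketch of what its vocabulary and tensor builders presumably do (the _sketch suffix marks these as illustrative stand-ins, not the actual mytext API), the preprocessing amounts to mapping tokens to integer ids and padding every review to a common length:

def build_vocab_sketch(token_lists, pad="<pad>", unk="<unk>"):
    # assign an integer id to every token seen in the corpus
    voc = {pad: 0, unk: 1}
    for tokens in token_lists:
        for t in tokens:
            voc.setdefault(t, len(voc))
    return voc

def build_tensor_sketch(token_lists, voc):
    # map tokens to ids and right-pad every review to the length of the longest one
    max_len = max(len(tokens) for tokens in token_lists)
    ids = [
        [voc.get(t, voc["<unk>"]) for t in tokens] + [voc["<pad>"]] * (max_len - len(tokens))
        for tokens in token_lists
    ]
    return torch.tensor(ids, dtype=torch.long)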
6 Recurrent Neural Network From Scratch
We model the update function \(h\) and the output function \(f\) as fully connected layers.
from torch import nn
from torch.optim import Adam
import numpy as np

class MyRNN(nn.Module):
    def __init__(self, voc_size, input_dim, state_dim, output_dim):
        super().__init__()
        self.voc_size = voc_size
        self.input_dim = input_dim
        self.state_dim = state_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(voc_size, input_dim)
        self.h = nn.Linear(input_dim + state_dim, state_dim)
        self.f = nn.Linear(input_dim + state_dim, output_dim)

    def forward(self, tokens):
        # tokens: (-1, L)
        (batch_size, L) = tokens.shape
        x_seq = self.embedding(tokens)  # (-1, L, input_dim)
        s = self.init_state(batch_size)  # (-1, state_dim)
        for i in range(L):
            x = x_seq[:, i, :]  # (-1, input_dim)
            combined = torch.cat([x, s], axis=1)  # (-1, input_dim + state_dim)
            s = self.h(combined)  # (-1, state_dim)
        y = self.f(
            torch.cat((x, s), 1)
        )  # (-1, output_dim)
        return y, s

    def init_state(self, batch_size):
        return torch.zeros(batch_size, self.state_dim)

Let’s try the module on a single batch.
(tokens, targets) = next(iter(dataloader))
print(tokens.shape)
print(targets.shape)
torch.Size([64, 100])
torch.Size([64])
rnn = MyRNN(
    voc_size=len(voc),
    input_dim=32,
    state_dim=64,
    output_dim=2,
)

(outputs, states) = rnn(tokens)
outputs.shape
torch.Size([64, 2])
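As an aside, PyTorch's built-in nn.RNN layer implements essentially the same recurrence, with a tanh nonlinearity applied to the state. A rough sketch of an equivalent module, assuming the same embedding and a linear read-out on the final state (the name BuiltinRNN is introduced here for illustration):

class BuiltinRNN(nn.Module):
    def __init__(self, voc_size, input_dim, state_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(voc_size, input_dim)
        # nn.RNN computes s_i = tanh(W_x x_i + W_s s_{i-1} + b) over the whole sequence
        self.rnn = nn.RNN(input_dim, state_dim, batch_first=True)
        self.f = nn.Linear(state_dim, output_dim)

    def forward(self, tokens):
        x_seq = self.embedding(tokens)  # (-1, L, input_dim)
        _, s_final = self.rnn(x_seq)    # s_final: (1, -1, state_dim)
        return self.f(s_final.squeeze(0)), s_final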
7 Training
epochs = 5
input_dim = 32
state_dim = 64
model = MyRNN(len(voc), input_dim, state_dim, 2)
loss = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters())
for epoch in range(epochs):
    losses = []
    for (x, target) in dataloader:
        y, _ = model(x)
        l = loss(y, target)
        l.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(l.item())
    l = np.mean(losses)
    print("{}: loss={:.4f}".format(epoch, l))
0: loss=0.6980
1: loss=0.6911
2: loss=0.6856
3: loss=0.6875
4: loss=0.6696
#
# Evaluate
#
from torchmetrics import Accuracy
with torch.no_grad():
    success = 0
    total = 0
    for x, target in dataloader:
        y, _ = model(x)
        pred = y.argmax(axis=1)
        success += (pred == target).sum()
        total += target.shape[0]
    print("Accuracy = {:.2f}".format(success/total))
Accuracy = 0.60