Sequence Classification

1 Review of Python’s reduce

from functools import reduce
#
# Aggregating sequences of numbers by summation
#

xs = [1, 2, 3, 4, 5]

def update(total: int, x: int) -> int:
    return total + x

reduce(update, xs, 0)
15
#
# Count total characters of a sequence of words
#
xs = ["hello", "how", "are", "u", "doing?"]

def update(total: int, word: str) -> int:
    return total + len(word)

reduce(update, xs, 0)
18
#
# Aggregate a sequence of words to their average number of characters per word
#
xs = ["hello", "how", "are", "u", "doing?"]

def update(stat: tuple[int, int], word: str) -> tuple[int, int]:
    count, total = stat
    return (count + 1, total + len(word))

(total_words, total_chars) = reduce(update, xs, (0, 0))
total_chars / total_words
3.6

2 Processing unbounded sequences of vectors

Suppose we have a sequence of vectors:

\[ x = (x_1, x_2, \dots, x_L) \] where each \(x_i\in\mathbb{R}^m\). Thus, \(x\in(\mathbb{R}^m)^*\).

We want to describe an aggregation function \(\mathbf{agg}:(\mathbb{R}^m)^*\to\mathbb{R}^n\).

The challenge is that \(\mathbf{agg}\) must be able to handle sequences of any length \(L\).

Auxiliary functions

  • Consider a state space \(\mathbb{R}^k\) that will be used to aggregate the sequence to a hidden state.

    This can be done by introducing:
    • an initial state \(s_0\in\mathbb{R}^k\).
    • an update function \(h:\mathbb{R}^k \times \mathbb{R}^m\to\mathbb{R}^k\).

    Note that the update function \(h\) takes two inputs: the current state and the current input. \[ s_i = h(s_{i-1}, x_i) \mathrm{\ for\ } i = 1,\dots,L \]

  • Using the update function \(h\), we can reduce \(x\) to a final state:

    \(s^* = \mathrm{reduce}(h, x, s_0)\)

  • Introduce an output function \(f:\mathbb{R}^k\to\mathbb{R}^n\) which maps the final state to the aggregation output.

    \(\mathbf{agg}(x) = f(s^*)\)
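
Putting the pieces together in plain Python (a minimal sketch, assuming \(h\), \(f\), and \(s_0\) are supplied as ordinary callables and an initial value):

from functools import reduce

def agg(x, h, f, s0):
    # Fold the whole sequence into a final state with the update function h,
    # then map that final state to the output with f.
    s_star = reduce(h, x, s0)
    return f(s_star)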

3 Revisiting Python reduce

In the case of:

xs = ["hello", "how", "are", "u", "doing?"]

def update(stat: tuple[int, int], word: str) -> tuple[int, int]:
    count, total = stat
    return (count + 1, total + len(word))

(total_words, total_chars) = reduce(update, xs, (0, 0))
total_chars / total_words

We have the following:

  • State space: \[\mathbb{N}^2\]
  • Initial state: \[(0, 0)\]
  • Update function: \[h(s, w) = (s[0]+1, s[1]+\mathrm{len}(w))\]
  • Output function: \[f(s) = \frac{s[1]}{s[0]}\]
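
Written directly with the names used in the math (this merely restates the code above as separate \(h\) and \(f\) functions):

def h(s, w):
    # Update function: increment the word count, accumulate the character count.
    return (s[0] + 1, s[1] + len(w))

def f(s):
    # Output function: average number of characters per word.
    return s[1] / s[0]

f(reduce(h, xs, (0, 0)))
3.6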

4 Learning sequence aggregations

In sequence learning, we need to model the aggregation as:

  • \(h\): update function
  • \(f\): output function

Both are ordinary vector-valued functions, and thus can be learned using MLPs.

The equations that describe the sequence function are given as:

\[ s_i = h(x_i, s_{i-1}) \mathrm{\ for\ } i = 1,\dots,L \]

\[ y = f(x_L, s_L) \]

where:

  • \(s_0\) is the initial state.
  • \(x = (x_1, x_2, \dots, x_L) \in(\mathbb{R}^m)^*\) is the input sequence.
  • \(y\in\mathbb{R}^n\) is the output vector.
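
A minimal sketch of this recurrence with learnable \(h\) and \(f\) in PyTorch (the dimensions are arbitrary, and single linear layers stand in for MLPs; the full token-level model follows in the next sections):

import torch
from torch import nn

m, k, n = 8, 16, 2              # input, state, and output dimensions (arbitrary)
h = nn.Linear(m + k, k)         # learnable update function
f = nn.Linear(m + k, n)         # learnable output function

x = torch.randn(5, m)           # a sequence of L = 5 input vectors
s = torch.zeros(k)              # initial state s_0
for x_i in x:
    s = h(torch.cat([x_i, s]))  # s_i = h(x_i, s_{i-1})
y = f(torch.cat([x[-1], s]))    # y = f(x_L, s_L)
y.shape
torch.Size([2])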

5 Preprocessing of IMDB Reviews

from importlib import reload
import mytext

import torch
from torchtext.data import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader

reload(mytext);
tokenizer = get_tokenizer('basic_english')
reviews, targets = mytext.imdb_reviews(tokenizer)
voc = mytext.build_vocab(reviews)
reviews_tensor = mytext.build_tensor(reviews, voc)
reviews_tensor.shape
torch.Size([25000, 2633])
inputs_tensor = reviews_tensor[:, :100]
inputs_tensor.shape
torch.Size([25000, 100])
targets_tensor = torch.tensor(targets, dtype=torch.long)
targets_tensor.shape
torch.Size([25000])
dataset = TensorDataset(inputs_tensor, targets_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
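
The mytext helpers are not shown here. As a rough sketch, build_vocab and build_tensor might look like the following (the use of torchtext’s build_vocab_from_iterator and the padding convention are assumptions about the module, not its actual source):

from torchtext.vocab import build_vocab_from_iterator

def build_vocab(token_lists):
    # Assumed behavior: build a vocabulary over all tokens, reserving index 0
    # for padding and unknown tokens.
    voc = build_vocab_from_iterator(token_lists, specials=["<pad>"])
    voc.set_default_index(0)
    return voc

def build_tensor(token_lists, voc):
    # Assumed behavior: map each token to its index and pad every review to
    # the length of the longest one (2633 above).
    max_len = max(len(tokens) for tokens in token_lists)
    rows = [[voc[t] for t in tokens] + [0] * (max_len - len(tokens))
            for tokens in token_lists]
    return torch.tensor(rows, dtype=torch.long)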

6 Recurrent Neural Network From Scratch

We model both the update function \(h\) and the output function \(f\) as fully connected layers, and use an embedding layer to map token indices to input vectors.

from torch import nn
from torch.optim import Adam
import numpy as np
class MyRNN(nn.Module):
    def __init__(self, voc_size, input_dim, state_dim, output_dim):
        super().__init__()
        self.voc_size = voc_size
        self.input_dim = input_dim
        self.state_dim = state_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(voc_size, input_dim)
        self.h = nn.Linear(input_dim + state_dim, state_dim)
        self.f = nn.Linear(input_dim + state_dim, output_dim)
    
    def forward(self, tokens):
        # tokens: (-1, L)
        (batch_size, L) = tokens.shape
        x_seq = self.embedding(tokens) # (-1, L, input_dim)
        s = self.init_state(batch_size) # (-1, state_dim)
        for i in range(L):
            x = x_seq[:, i, :] # (-1, input_dim)
            combined = torch.cat([x, s], dim=1) # (-1, input_dim + state_dim)
            s = self.h(combined) # (-1, state_dim)

        y = self.f(
                torch.cat((x, s), 1)
            ) # (-1, output_dim)
        return y, s
    
    def init_state(self, batch_size):
        return torch.zeros(batch_size, self.state_dim)

Let’s try the module on a single batch.

(tokens, targets) = next(iter(dataloader))

print(tokens.shape)
print(targets.shape)
torch.Size([64, 100])
torch.Size([64])
rnn = MyRNN(
    voc_size=len(voc),
    input_dim=32,
    state_dim=64,
    output_dim=2,
)
(outputs, states) = rnn(tokens)
outputs.shape
torch.Size([64, 2])

7 Training

epochs = 5
input_dim = 32
state_dim = 64
model = MyRNN(len(voc), input_dim, state_dim, 2)
loss = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters())

for epoch in range(epochs):
    losses = []
    for (x, target) in dataloader:
        y, _ = model(x)
        l = loss(y, target)
        l.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(l.item())
    l = np.mean(losses)
    print("{}: loss={:.4f}".format(epoch, l))
0: loss=0.6980
1: loss=0.6911
2: loss=0.6856
3: loss=0.6875
4: loss=0.6696
#
# Evaluate
#

with torch.no_grad():
    success = 0
    total = 0
    for x, target in dataloader:
        y, _ = model(x)
        pred = y.argmax(axis=1)
        success += (pred == target).sum()
        total += target.shape[0]
    print("Accuracy = {:.2f}".format(success/total))
Accuracy = 0.60
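
As a usage sketch, a hypothetical helper for classifying a new review with the trained model (the helper name, the padding index 0, and the assumption that voc maps unseen tokens to a default index are illustrative, not part of the code above):

def predict_sentiment(text: str) -> int:
    # Tokenize, map tokens to vocabulary indices, truncate or pad to the same
    # 100-token window used in training, then take the argmax class.
    ids = [voc[t] for t in tokenizer(text)][:100]
    ids = ids + [0] * (100 - len(ids))
    with torch.no_grad():
        y, _ = model(torch.tensor([ids], dtype=torch.long))
    return y.argmax(dim=1).item()

predict_sentiment("one of the best films i have seen in years !")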