import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchsummary import summary
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
import my
Text Encoding
1 Loading training data
We will work with the IMDB Review dataset.
- Each input is a string consisting of words forming the review of a movie.
- Each output is a categorical value in the domain of {positive, negative}.
The function we want to learn is:
\[ f_0 : \mathrm{Text} \to \{0, 1\} \]
with open("/workspace/datasets/IMDB_Review/labels.txt", 'r') as f:
= [x.strip() for x in f.readlines()]
lines = pd.Series(lines)
labels with open('/workspace/datasets/IMDB_Review/reviews.txt', 'r') as f:
= [x.strip() for x in f.readlines()]
lines = pd.Series(lines)
reviews
= pd.DataFrame({"review": reviews, "label": labels})
train_df train_df.head()
|   | review | label |
|---|--------|-------|
| 0 | bromwell high is a cartoon comedy . it ran at ... | positive |
| 1 | story of a man who has unnatural feelings for ... | negative |
| 2 | homelessness or houselessness as george carli... | positive |
| 3 | airport starts as a brand new luxury pla... | negative |
| 4 | brilliant over acting by lesley ann warren . ... | positive |
2 Convert target to integer values
train_df['target'] = train_df.label.apply({
    'negative': 0,
    'positive': 1,
}.get)
train_df.head()
|   | review | label | target |
|---|--------|-------|--------|
| 0 | bromwell high is a cartoon comedy . it ran at ... | positive | 1 |
| 1 | story of a man who has unnatural feelings for ... | negative | 0 |
| 2 | homelessness or houselessness as george carli... | positive | 1 |
| 3 | airport starts as a brand new luxury pla... | negative | 0 |
| 4 | brilliant over acting by lesley ann warren . ... | positive | 1 |
3 Tokenizer and vocabulary
def tokenizer(review: str) -> List[str]:
    import re
    review = re.sub(r'[^a-zA-Z0-9_]', ' ', review)
    return review.split()

tokenizer(train_df.iloc[0].review)[:10]
['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']
- A vocabulary is the collection of known tokens.
- Each token is mapped to an integer.
- We have some reserved tokens:
  - <PAD> is used for padding, indicating that there is no token at this position.
  - <OOV> stands for out-of-vocabulary, used for any unknown token.
  - <START> indicates the start of a token sequence.
- We also build a lookup table that maps the integer value to the corresponding token.
OOV = 1

def build_vocabulary(data: List[str]) \
        -> Tuple[Dict[str, int], Dict[int, str]]:
    voc = {
        '<PAD>': 0,
        '<OOV>': 1,
        '<START>': 2,
    }
    for text in data:
        for token in tokenizer(text):
            if token not in voc:
                voc[token] = len(voc)
    lookup = dict()
    for (token, index) in voc.items():
        lookup[index] = token
    return voc, lookup

(voc, lookup) = build_vocabulary(train_df.review)
list(voc.items())[:5]
[('<PAD>', 0), ('<OOV>', 1), ('<START>', 2), ('bromwell', 3), ('high', 4)]
#
# Vocabulary size
#
len(voc)
74075
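As a quick sanity check (a hypothetical cell, not part of the original notes), we can confirm that an unseen word falls back to the <OOV> index and that the lookup table inverts the encoding. With the vocabulary built above, this should give (1, 'bromwell', 'high').

#
# A word not in the vocabulary falls back to <OOV> (index 1),
# and lookup maps indices back to their tokens.
#
voc.get('zzz_not_a_real_word', OOV), lookup[3], lookup[4]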
Summary:
- \(\mathrm{tokenizer} : \mathrm{Text}\to\mathbb{V}^*\) where \(\mathbb{V}\) is the set of tokens.
- \(\mathrm{voc}: \mathbb{V}\to\mathbb{N}\) where \(\mathbb{N}\) is the set of integers representing the tokens.
Using the tokenizer, we can transform the function \(f_0\) into the following form:
\[ f_1 : \mathbb{V}^* \to \{0, 1\} \]
Furthermore, using the vocabulary encoding, we can transform the learning task to the following form:
\[ f_2 : \mathbb{N}^* \to \{0, 1\} \]
def encode(text: str) -> List[int]:
    return list(map(lambda x: voc.get(x, OOV), tokenizer(text)))

train_df.review[:5].apply(encode)
0 [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
1 [94, 54, 6, 95, 45, 96, 97, 98, 99, 6, 100, 10...
2 [167, 168, 169, 15, 170, 171, 172, 96, 173, 11...
3 [349, 101, 15, 6, 350, 351, 352, 353, 5, 354, ...
4 [653, 431, 619, 118, 277, 278, 279, 470, 654, ...
Name: review, dtype: object
4 Vector embedding
#
# Each categorical value (as an integer) is encoded into a 5-dim vector.
#
embedding = nn.Embedding(len(voc), 5)
input = torch.tensor([[1, 2, 3]])
output = embedding(input)

input.shape, "==>", output.shape
(torch.Size([1, 3]), '==>', torch.Size([1, 3, 5]))
Note
The embedding is a mapping from integer indices to \(d\)-dimensional vectors. It is implemented as a matrix of shape \(N\times d\), where \(N\) is the number of possible indices (here, the vocabulary size).
Conceptually, the embedding layer is a function: \[ \mathrm{emb} : [0\dots N-1] \to \mathbb{R}^d \]
This means that we can now convert the original task to learning a function of the form:
\[ f_3 : (\mathbb{R}^d)^* \to \{0, 1\} \]
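To make the note above concrete, here is a small check (a hypothetical cell, not in the original notes) that looking up index \(i\) simply returns row \(i\) of the embedding's \(N\times d\) weight matrix, reusing the embedding layer defined above.

#
# emb(i) is just row i of the N x d weight matrix.
#
i = torch.tensor([3])
torch.allclose(embedding(i), embedding.weight[i])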
5 Getting ready for text learning
Let’s take a quick glance at how text learning can be done.
from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
    def __init__(self, df: pd.DataFrame, sequence_length):
        self.df = df
        self.sequence_length = sequence_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        review, target = row.review, row.target
        tokens = tokenizer(review)
        integers = list(map(lambda tok: voc.get(tok, OOV), tokens))

        if len(integers) >= self.sequence_length:
            integers = integers[:self.sequence_length]
        else:
            # pad short reviews with the <PAD> token up to the fixed length
            integers = integers \
                + [voc['<PAD>']] * (self.sequence_length - len(integers))
        x = torch.tensor(integers, dtype=torch.int32)
        return (x, target)

dataset = MyDataset(train_df, 10)
dataloader = DataLoader(dataset, shuffle=True, batch_size=100)
#
# Examining the dataset content
#
dataset[0]
(tensor([ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=torch.int32), 1)
#
# Examining the dataloader content
#
xs = iter(dataloader)
batch = next(xs)
batch[0].shape, batch[1].shape
(torch.Size([100, 10]), torch.Size([100]))
6 Simple text classifier
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(len(voc), 10)
        self.flatten = nn.Flatten()
        self.linear = nn.LazyLinear(32)
        self.output = nn.Linear(32, 2)

    def forward(self, x):
        x = self.emb(x)      # (batch, length, emb_dim)
        x = self.flatten(x)  # (batch, length * emb_dim)
        x = self.linear(x)
        x = torch.nn.functional.relu(x)
        x = self.output(x)
        return x

model = MyModel()
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()
from tqdm.notebook import tqdm
epochs = 5

for epoch in range(epochs):
    total_loss = 0
    total_acc = 0
    total_count = 0
    for (x, target) in tqdm(dataloader):
        optimizer.zero_grad()
        y = model(x)
        loss = loss_fn(y, target)

        loss.backward()
        optimizer.step()
        with torch.no_grad():
            acc = (y.argmax(axis=-1) == target).sum() / len(target)
        total_loss += loss.item()
        total_acc += acc.item()
        total_count += 1

    print("Epoch {}: mean loss = {:.2f}, mean acc = {:.2f}".format(
        epoch,
        total_loss / total_count,
        total_acc / total_count))
Epoch 0: mean loss = 0.69, mean acc = 0.52
Epoch 1: mean loss = 0.68, mean acc = 0.56
Epoch 2: mean loss = 0.67, mean acc = 0.59
Epoch 3: mean loss = 0.65, mean acc = 0.62
Epoch 4: mean loss = 0.62, mean acc = 0.65
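To see the trained classifier in action, here is a minimal inference sketch (a hypothetical helper, not part of the original notes): it reuses the same preprocessing as MyDataset, truncating or padding to 10 tokens, and maps the winning logit back to a label.

def predict(review: str) -> str:
    # Same preprocessing as MyDataset: encode, then truncate/pad to 10 tokens.
    integers = [voc.get(tok, OOV) for tok in tokenizer(review)]
    integers = integers[:10] + [voc['<PAD>']] * max(0, 10 - len(integers))
    x = torch.tensor([integers], dtype=torch.int32)  # batch of one
    with torch.no_grad():
        logits = model(x)
    return 'positive' if logits.argmax(axis=-1).item() == 1 else 'negative'

predict("brilliant acting and a truly wonderful story")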
7 Summary
Text needs to be broken down into tokens. Depending on the situation, tokens can be words, parts of words, or even smaller units such as individual characters.
The collection of all tokens is the vocabulary. Each token is encoded by an integer index between 0 and the vocabulary size minus one.
An embedding neural network layer is used to map each token index to a \(d\)-dimensional vector for downstream processing.
Text is encoded as a sequence of \(d\)-dimensional vectors.
8 Limitations of MLP
- Each input can have a variable length because it is a sequence of vectors (after embedding).
- However, an MLP only accepts inputs of some fixed length \(L\), and the size of the network must grow as \(L\) increases (see the sketch below).
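A rough illustration of the second point (a hypothetical cell, using the dimensions assumed above: embedding size 10 and hidden size 32): the first linear layer of MyModel takes \(L \times d\) inputs, so its weight count grows linearly with the sequence length \(L\).

#
# Weight count of the first linear layer for different sequence lengths L,
# assuming emb_dim = 10 and hidden size 32 as in MyModel above.
#
for L in (10, 50, 200):
    print(L, "tokens ->", L * 10 * 32, "weights in the first linear layer")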
We will discuss network architectures that can process arbitrary sequence lengths with a fixed network size.
- Recurrent neural network
- 1D convolutional networks