import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets as datasets
import tensorflow.keras.layers as layers
import numpy as np
#
# Load the IMDB movie review dataset
#
data = datasets.imdb.load_data()
#
# Like the MNIST dataset, it comes pre-split into training and testing sets
#
(x_train, y_train), (x_test, y_test) = data
#
# Load the word_index table
#
index = datasets.imdb.get_word_index()
#
# Build the lookup table. load_data offsets each word index by 3 (its
# index_from default), reserving ids 0-3 for the special markers below
#
id_to_word = dict((i+3, w) for (w,i) in index.items())
id_to_word.update({
0: '<PAD>',
1: '<START>',
2: '<UNKNOWN>',
3: '<UNUSED>',
})
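#
# Peek at the first review: the first ten ordinals, then the decoded words
#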
np.array(x_train[0])[:10]
" ".join([id_to_word[i] for i in x_train[0][:10]])
#
# The vocabulary size is
#
len(index)
def decode(ordinals):
return " ".join(id_to_word.get(i) for i in ordinals)
decode(x_train[2]), y_train[2]
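#
# Reload with a restricted vocabulary: num_words=1000 keeps only the 1000
# most frequent words, and skip_top=5 drops the 5 most frequent ones
# (mostly stop words). Words outside this range are replaced by the
# out-of-vocabulary index 2.
#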
data = datasets.imdb.load_data(
num_words=1000,
skip_top=5
)
(x_train, y_train), (x_test, y_test) = data
decode(x_train[0])
decode(x_train[1])
#
# To encode text to ordinals (at the application level), we compute the reverse lookup table
#
word_to_id = {
w:i for (i,w) in id_to_word.items()
}
def encode(text):
return [word_to_id.get(w, 2) for w in text.split()]
encode("this film was just brilliant casting")
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.datasets as datasets
import tensorflow.keras.models as models
import tensorflow.keras.layers as layers
import tensorflow.keras.preprocessing.sequence as sequence
import numpy as np
Load the IMDB Movie Review dataset
data = datasets.imdb.load_data(num_words=10000, skip_top=10)
word_index = datasets.imdb.get_word_index()
i2w = dict((i+3, w) for (w,i) in word_index.items())
i2w.update({
0: '<PAD>',
1: '<START>',
2: '<OOV>',
3: '<?>',
})
(x_train, y_train), (x_test, y_test) = data
The ordinal2text function decodes the ordinal numbers back to text.
def ordinal2text(seq):
return " ".join([i2w[i] for i in seq])
ordinal2text(x_train[0])
The embedding layer maps ordinal indices to embedding vectors. Its input shape is (batch_size, sequence_length), and its output shape is (batch_size, sequence_length, dimension).
embedding = layers.Embedding(10000, 20)
input_seq = np.array([[1, 2, 3, 3, 2], [1, 2, 0, 0, 0]])
embedding_vectors = embedding(input_seq)
print(input_seq.shape, "=>", embedding_vectors.shape)
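By default the embedding treats the padding index 0 like any other token. If we want downstream layers to skip padded positions, the Embedding layer supports masking (a quick sketch, not used in the model below):
masked_embedding = layers.Embedding(10000, 20, mask_zero=True)
print(masked_embedding.compute_mask(input_seq))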
The SimpleRNN layer uses a state vector to perform a reduction over the sequence of input vectors. By default it outputs only the final output vector. It's possible to obtain the final state vector as well using return_state=True, and the sequence of output vectors produced during the reduction using return_sequences=True.
rnn = layers.SimpleRNN(7)
output_vector = rnn(embedding_vectors)
print(embedding_vectors.shape, "=>", output_vector.shape)
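For example, a quick sketch of both options at once:
rnn_all = layers.SimpleRNN(7, return_sequences=True, return_state=True)
seq_outputs, final_state = rnn_all(embedding_vectors)
print(seq_outputs.shape, final_state.shape)
For a SimpleRNN the final state is the same as the final output vector, so seq_outputs[:, -1] equals final_state.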
We can use a dense layer to convert the RNN output into the probability that a review is positive.
dense = layers.Dense(1, activation='sigmoid')
output = dense(output_vector)
print(output_vector.shape, "=>", output.shape)
Padding and truncation
All input sequences in a batch must have the same length for the embedding layer to work. This requires us to pad sequences that are too short and truncate sequences that are too long. The keras.preprocessing.sequence module provides the pad_sequences function to make this task easier.
batch = x_train[:5]
[len(x) for x in batch]
padded_batch = sequence.pad_sequences(batch, maxlen=200)
[len(x) for x in padded_batch]
padded_batch
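By default pad_sequences pads and truncates at the front ('pre') of each sequence; both can be switched to the back if needed (a sketch, not used below):
sequence.pad_sequences(batch, maxlen=200, padding='post', truncating='post')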
maxlen = 200
inputs = layers.Input(shape=(maxlen,))
x = embedding(inputs)
x = rnn(x)
sentiment_output = dense(x)
model = models.Model(inputs=inputs, outputs=sentiment_output)
keras.utils.plot_model(model)
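Note that plot_model requires the pydot and graphviz packages; model.summary() is a text-only alternative:
model.summary()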
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
x_train_padded = sequence.pad_sequences(x_train, maxlen=maxlen)
model.fit(x_train_padded, y_train, epochs=5, validation_split=0.2)
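A natural follow-up is to evaluate the trained model on the held-out test set, using the same padding:
x_test_padded = sequence.pad_sequences(x_test, maxlen=maxlen)
model.evaluate(x_test_padded, y_test)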
We can use the model to score the sentiment of new text.
w2i = {w:i for (i,w) in i2w.items()}
def text2ordinal(text):
return [w2i.get(w, 2) for w in text.split()]
text_ordinals = text2ordinal("this film was just brilliant")
model.predict(sequence.pad_sequences([text_ordinals], maxlen=maxlen))
text_ordinals = text2ordinal("rendered terrible flat flat flat performances")
model.predict(sequence.pad_sequences([text_ordinals], maxlen=maxlen))