This is a self-correcting activity generated by nbgrader. Fill in any place that says YOUR CODE HERE or YOUR ANSWER HERE, then run the subsequent cells to check your code.


Reuters news

The Reuters dataset is a set of short newswires and their topics, published by Reuters in 1987 and widely used for text classification. There are 46 different topics, some more represented than others. The topics are mutually exclusive: each newswire belongs to exactly one topic.

The goal is to classify news articles by their topic.


Environment setup

# Import base packages
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Setup plots
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 8
%config InlineBackend.figure_format = 'retina'
sns.set()
# Import ML packages
import tensorflow as tf
print(f'TensorFlow version: {tf.__version__}')
print(f'Keras version: {tf.keras.__version__}')

from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical

Utility functions

def plot_loss_acc(history):
    """Plot training and (optionally) validation loss and accuracy"""

    loss = history.history['loss']
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(10, 10))

    plt.subplot(2, 1, 1)
    plt.plot(epochs, loss, '.--', label='Training loss')
    final_loss = loss[-1]
    title = 'Training loss: {:.4f}'.format(final_loss)
    plt.ylabel('Loss')
    if 'val_loss' in history.history:
        val_loss = history.history['val_loss']
        plt.plot(epochs, val_loss, 'o-', label='Validation loss')
        final_val_loss = val_loss[-1]
        title += ', Validation loss: {:.4f}'.format(final_val_loss)
    plt.title(title)
    plt.legend()

    acc = history.history['accuracy']

    plt.subplot(2, 1, 2)
    plt.plot(epochs, acc, '.--', label='Training acc')
    final_acc = acc[-1]
    title = 'Training accuracy: {:.2f}%'.format(final_acc * 100)
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    if 'val_accuracy' in history.history:
        val_acc = history.history['val_accuracy']
        plt.plot(epochs, val_acc, 'o-', label='Validation acc')
        final_val_acc = val_acc[-1]
        title += ', Validation accuracy: {:.2f}%'.format(final_val_acc * 100)
    plt.title(title)
    plt.legend()

Step 1: Loading the data

Question

  • Load the Reuters dataset included with Keras. Limit yourself to the 10,000 most frequent words.

  • Print shapes of training data and labels.

  • Print the first training sample.

  • Print the first 20 labels.

# The following code used to raise a loading error due to an API change in NumPy
# https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa
# This seems to be fixed in recent versions of TF/Keras

# YOUR CODE HERE
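
A minimal sketch of one possible answer for the cell above, assuming the standard reuters.load_data API. The variable names train_data, train_labels, test_data and test_labels are choices of this sketch, reused by the decoding cell below.

# Sketch: load the dataset, keeping only the 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# Print shapes of training data and labels
print(f'train_data: {train_data.shape}. train_labels: {train_labels.shape}')
# Print the first training sample (a list of word indices)
print(train_data[0])
# Print the first 20 labels (integers between 0 and 45)
print(train_labels[:20])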
# Decoding the first sample back to readable text

# word_index is a dictionary mapping words to an integer index
word_index = reuters.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the news; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_news = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_news)

Step 2: Preparing the data

Question

Prepare the data for training. Set apart the first 1,000 training examples for validation. Store the data subsets in variables named x_train/y_train, x_val/y_val and x_test/y_test.

def vectorize_sequences(sequences, dimension=10000):
    """One-hot encode a vector of sequences into a binary matrix (number of sequences, dimension)"""

    # Example : [[3, 5]] -> [[0. 0. 0. 1. 0. 1. 0...]]

    results = np.zeros((len(sequences), dimension))
    # set specific indices of results[i] to 1s
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
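
A quick sanity check of the encoding (illustrative only, using a small dimension so the whole matrix is visible):

print(vectorize_sequences([[3, 5]], dimension=8))
# Expected output: [[0. 0. 0. 1. 0. 1. 0. 0.]]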

# Turn news into vectors of 0s and 1s (one-hot encoding)
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# YOUR CODE HERE
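
A possible sketch for the cell above: one-hot encode the labels with to_categorical, then slice off the first 1,000 examples for validation. The intermediate name y_train_full is an assumption of this sketch; the final names match the assertions below.

# Sketch: one-hot encode the labels (46 mutually exclusive topics)
y_train_full = to_categorical(train_labels)
y_test = to_categorical(test_labels)

# Set apart the first 1,000 examples for validation
x_val = x_train[:1000]
y_val = y_train_full[:1000]
x_train = x_train[1000:]
y_train = y_train_full[1000:]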
# Show a sample of encoded input
df_x_train = pd.DataFrame(x_train)
df_x_train.sample(n=10)
print(f'x_train: {x_train.shape}. y_train: {y_train.shape}')
print(f'x_val: {x_val.shape}. y_val: {y_val.shape}')
print(f'x_test: {x_test.shape}. y_test: {y_test.shape}')

# Assert shapes of prepared data
assert x_train.shape == (7982, 10000)
assert y_train.shape == (7982, 46)
assert x_val.shape == (1000, 10000)
assert y_val.shape == (1000, 46)
assert x_test.shape == (2246, 10000)
assert y_test.shape == (2246, 46)

Step 3: Training a model

Question

Train a model on the data to obtain a training accuracy > 95%. Store the training history in a variable named history.

# YOUR CODE HERE
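
One possible sketch for the training cell, assuming a simple stack of Dense layers ending in a 46-way softmax; the layer sizes, optimizer and epoch count are illustrative choices, not the required solution.

# Sketch: define, compile and train a simple dense classifier
model = Sequential([
    Dense(64, activation='relu', input_shape=(10000,)),
    Dense(64, activation='relu'),
    Dense(46, activation='softmax')
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))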
# Show training history
plot_loss_acc(history)
# Retrieve final training accuracy
train_acc = history.history['accuracy'][-1]

# Assert final accuracy
assert train_acc > 0.95
# Evaluate model performance on test data
_, test_acc = model.evaluate(x_test, y_test)

print(f'Test accuracy: {test_acc * 100:.2f}%')

Step 4: Tuning the model

Question

If necessary, tune your model to obtain a validation accuracy > 82%.

# YOUR CODE HERE
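
A possible tuning sketch: the same architecture with Dropout layers and fewer epochs to limit overfitting. The dropout rate and epoch count are assumptions; adjust them based on the validation curves.

# Sketch: regularize with Dropout and stop training before overfitting sets in
model = Sequential([
    Dense(64, activation='relu', input_shape=(10000,)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(46, activation='softmax')
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=512,
                    validation_data=(x_val, y_val))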
# Show training history
plot_loss_acc(history)
# Retrieve final validation accuracy
val_acc = history.history['val_accuracy'][-1]

# Assert final accuracy
assert val_acc > 0.82
# Evaluate model performance on test data
_, test_acc = model.evaluate(x_test, y_test)

print(f'Test accuracy: {test_acc * 100:.2f}%')