Jena weather¶

The goal of this activity is to analyze a time series with a Recurrent Neural Network (RNN), in order to forecast the weather based on past observations.

It uses a weather dataset recorded since 2003 by the Max Planck Institute for Biogeochemistry in Jena, Germany; the file used here (`jena_climate_2009_2016.csv`) covers the years 2009 to 2016. This dataset contains 14 different features such as air temperature, atmospheric pressure, and humidity. These were collected every 10 minutes.

Environment setup¶

import platform

print(f"Python version: {platform.python_version()}")
# python_version_tuple() returns strings; comparing them directly is
# lexicographic, so ("3", "10") would wrongly rank below ("3", "6").
# Convert major/minor to integers before checking the minimum version.
assert tuple(map(int, platform.python_version_tuple()[:2])) >= (3, 6)

import os  # To access locally extracted file
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Setup plots
# Render figures inline in the notebook output (IPython magic, not plain Python)
%matplotlib inline
# Default figure size given as (width, height)
plt.rcParams["figure.figsize"] = 10, 8
# Request high-resolution ("retina") output from the inline backend
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plot theme
sns.set()

import tensorflow as tf

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")

from tensorflow.keras.utils import get_file

# You may add other imports here as needed
# YOUR CODE HERE

# Utility functions


def plot_series(series, y_true, y_pred=None, x_label="$t$", y_label="$temp(t)$"):
    """Draw a time series followed by its actual (and optionally predicted) future values
    series: vector of shape (time steps, )
    y_true: scalar (if only 1 ahead step) or vector of shape (ahead steps,)
    y_pred: scalar (if only 1 ahead step) or vector of shape (ahead steps,)
    x_label, y_label: axis labels; a falsy value leaves the axis unlabelled"""

    # Past observations
    plt.plot(series, ".-")
    history_len = series.shape[0]

    # A scalar target means a single step ahead, otherwise one step per element
    horizon = 1 if np.isscalar(y_true) else y_true.shape[0]

    # Future values are drawn right after the last observed time step
    future_steps = np.arange(history_len, history_len + horizon)
    plt.plot(future_steps, y_true, "ro-", label="Actual")

    if y_pred is not None:
        plt.plot(future_steps, y_pred, "bx-", label="Predicted", markersize=10)

    if x_label:
        plt.xlabel(x_label, fontsize=16)
    if y_label:
        plt.ylabel(y_label, fontsize=16, rotation=90)

    plt.legend(fontsize=14)


def plot_loss(history):
    """Plot the loss curves recorded while training a Keras model
    history: Keras History object; its "loss" entry is always plotted,
    and "val_loss" is added when present"""

    recorded = history.history
    train_loss = recorded["loss"]
    # Epochs are numbered from 1 on the x axis
    epoch_numbers = range(1, len(train_loss) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(epoch_numbers, train_loss, ".--", label="Training loss")
    # The title reports the loss value reached at the last epoch
    title = "Training loss: {:.4f}".format(train_loss[-1])
    plt.ylabel("Loss")

    if "val_loss" in recorded:
        validation_loss = recorded["val_loss"]
        plt.plot(epoch_numbers, validation_loss, "o-", label="Validation loss")
        title = title + ", Validation loss: {:.4f}".format(validation_loss[-1])

    plt.title(title)
    plt.legend()

Step 1: loading the data¶

# Download and extract the dataset
zip_path = get_file(
    origin="https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip",
    fname="jena_climate_2009_2016.csv.zip",
    extract=True,
)
# NOTE(review): depending on the installed Keras version, get_file(extract=True)
# appears to return either the path of the downloaded archive or the directory
# the archive was extracted into - handle both instead of assuming the former.
if os.path.isdir(zip_path):
    csv_path = os.path.join(zip_path, "jena_climate_2009_2016.csv")
else:
    # Archive path: dropping the ".zip" suffix gives the extracted CSV file
    csv_path, _ = os.path.splitext(zip_path)
print(f"Dataset extracted at {csv_path}")

# Load the dataset into a DataFrame
df_weather = pd.read_csv(csv_path)
print(f"Dataset: {df_weather.shape}")

Question¶

Show the first 10 data samples.

# YOUR CODE HERE

Step 2: preparing the data¶

You’ll try to predict the temperature by using only the past temperatures, and not the other features of the dataset.

# Select only the temperature feature, indexed by time to ease visualization.
# set_index() returns a new frame, so df_weather is left untouched (assigning
# .index on df_weather["T (degC)"] would modify a Series still tied to df_weather).
df_univariate = df_weather.set_index("Date Time")["T (degC)"]

df_univariate.head()

# Plot the temperatures over time
df_univariate.plot()
plt.show()

Question¶

It is important to scale features before training a neural network. Standardization is a common way of doing this scaling by subtracting the mean and dividing by the standard deviation of each feature.

Standardize the data using values computed on the training set.

# First 300,000 samples for training
# Next 60,000 samples for validation
# Remaining samples for test
# TRAIN_SPLIT and VAL_SPLIT are the (exclusive) end indices of the training
# and validation ranges respectively.
TRAIN_SPLIT = 300000
VAL_SPLIT = 360000

# Compute the mean and standard deviation on training set only
# (the slices stop at TRAIN_SPLIT, so validation and test samples do not
# influence the scaling statistics)
x_train_mean = df_univariate[:TRAIN_SPLIT].mean()
x_train_std = df_univariate[:TRAIN_SPLIT].std()

# Standardize the dataset
# YOUR CODE HERE

df_univariate.head()

Creating inputs and targets¶

It’s time to split the dataset as usual and create training, validation and test sets.

def prepare_univariate_data(dataset, start_index, end_index, history_size, target_size):
    """Create inputs and targets from sliding windows over a 1D series

    Each input is a window of history_size consecutive values. Its target is the
    value found target_size steps after the first value following the window, so
    target_size=0 selects the value that immediately follows the window.

    dataset: 1D sequence of values (NumPy array or anything np.asarray accepts)
    start_index: position of the first value that may start a window
    end_index: exclusive upper bound for the position right after a window,
        or None to use all remaining data
    history_size: number of time steps in each input window
    target_size: offset of the target, counted from the first step after the window

    Returns (inputs, targets); when at least one window exists their shapes are
    (samples, history_size, 1) and (samples, 1)."""

    # Accept plain sequences as well as arrays
    values = np.asarray(dataset)

    first_window_end = start_index + history_size
    if end_index is None:
        # Leave room so that the last target still falls inside the data
        end_index = len(values) - target_size

    window_ends = range(first_window_end, end_index)

    # Contiguous slices instead of per-window index arrays;
    # each window is reshaped from (history_size,) to (history_size, 1)
    data = [values[i - history_size : i].reshape(history_size, 1) for i in window_ends]
    labels = [values[i + target_size] for i in window_ends]
    return np.array(data), np.array(labels).reshape(len(labels), 1)

# We're using the 20 last temperature observations to predict the next one.
# With future_target = 0, the label of a window is the value that directly
# follows it (prepare_univariate_data reads dataset[i + target_size], where i
# is the first index after the window).
past_history = 20
future_target = 0

# Training windows come from the first TRAIN_SPLIT samples
x_train, y_train = prepare_univariate_data(
    df_univariate.values, 0, TRAIN_SPLIT, past_history, future_target,
)
# Validation windows come from samples between TRAIN_SPLIT and VAL_SPLIT
x_val, y_val = prepare_univariate_data(
    df_univariate.values, TRAIN_SPLIT, VAL_SPLIT, past_history, future_target,
)
# Test windows use everything after VAL_SPLIT (end_index=None)
x_test, y_test = prepare_univariate_data(
    df_univariate.values, VAL_SPLIT, None, past_history, future_target,
)

print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_val: {x_val.shape}, y_val: {y_val.shape}")
print(f"x_test: {x_test.shape}, y_test: {y_test.shape}")

# Plot the first validation series with ground truth
plot_series(series=x_val[0, :, 0], y_true=y_val[0, 0])

Step 3: training models¶

We start by defining a baseline model using a naïve approach, then try to beat it using an RNN.

# Baseline prediction = input for last time step
y_pred_baseline = x_val[:, -1]
print(f"y_pred_baseline: {y_pred_baseline.shape}")

# NOTE(review): mean_squared_error is not imported by the setup code visible in
# this notebook; it has to be imported first (e.g. under the imports
# "YOUR CODE HERE" placeholder) or this cell raises a NameError.
baseline_mse = np.mean(mean_squared_error(y_val, y_pred_baseline))

# Print MSE
print(f"Baseline MSE: {baseline_mse:0.05f}")

# Plot the first validation series with ground truth and prediction
plot_series(series=x_val[0, :, 0], y_true=y_val[0, 0], y_pred=y_pred_baseline[0, 0])

Question¶

Define a Recurrent Neural Network model able to learn from the prepared data and predict the temperature one step ahead. Store it in the univariate_model variable.

# YOUR CODE HERE

univariate_model.summary()

# Compile and train the model

univariate_model.compile(optimizer="adam", loss="mse")

# verbose=0 silences the training log; the loss curves are plotted below instead
history = univariate_model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=256,
    verbose=0,
    validation_data=(x_val, y_val),
)

plot_loss(history)

Question¶

Compute the model’s prediction on validation data. Store the result in the y_pred_univariate variable.

The model’s MSE must be lower than the baseline model’s.

# YOUR CODE HERE

# NOTE(review): mean_squared_error is not imported by the setup code visible in
# this notebook; make sure it has been imported before running this cell.
univariate_mse = np.mean(mean_squared_error(y_val, y_pred_univariate))

# Print MSE
print(f"Univariate MSE: {univariate_mse:0.05f}")

# The exercise requires the trained model to beat the naive baseline
assert univariate_mse < baseline_mse

# Plot first validation series with ground truth and prediction
plot_series(series=x_val[0, :, 0], y_true=y_val[0, 0], y_pred=y_pred_univariate[0, 0])
plt.show()

Machine Learning Katas

Jena weather

Contents

Jena weather¶

Environment setup¶

Step 1: loading the data¶

Question¶

Step 2: preparing the data¶

Question¶

Creating inputs and targets¶

Step 3: training models¶

Question¶

Question¶