This is a self-correcting activity generated by nbgrader. Fill in any place that says YOUR CODE HERE or YOUR ANSWER HERE, then run the subsequent cells to check your code.


Household power consumption

The goal of this activity is to analyze a time series in order to predict the electricity consumption of a home.

It uses a dataset of measurements recorded for a house located in Sceaux (France) between December 2006 and November 2010.

Attribute description is as follows:

  1. date: date in format dd/mm/yyyy

  2. time: time in format hh:mm:ss

  3. global_active_power: household global minute-averaged active power (in kilowatt)

  4. global_reactive_power: household global minute-averaged reactive power (in kilowatt)

  5. voltage: minute-averaged voltage (in volt)

  6. global_intensity: household global minute-averaged current intensity (in ampere)

  7. sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).

  8. sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.

  9. sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.

The active energy consumed every minute (in watt-hour) in the household by electrical equipment not measured in sub-meterings 1, 2 and 3 is given by the following formula:

global_active_power*1000/60 - sub_metering_1 - sub_metering_2 - sub_metering_3
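
For instance, once the DataFrame is loaded in Step 1 below, this remainder could be computed as follows (a sketch assuming the UCI column names, e.g. Global_active_power and Sub_metering_1):

# Sketch: per-minute active energy (in watt-hour) not covered by the three sub-meters
remainder = (
    df_power["Global_active_power"] * 1000 / 60
    - df_power["Sub_metering_1"]
    - df_power["Sub_metering_2"]
    - df_power["Sub_metering_3"]
)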

Environment setup

import platform

print(f"Python version: {platform.python_version()}")
# Compare version numbers as integers: as strings, "3.10" would sort before "3.6"
assert tuple(int(n) for n in platform.python_version_tuple()[:2]) >= (3, 6)

import os  # To access locally extracted file
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

print(f"NumPy version: {np.__version__}")
# Setup plots
%matplotlib inline
plt.rcParams["figure.figsize"] = 10, 8
%config InlineBackend.figure_format = 'retina'
sns.set()
import sklearn

print(f"scikit-learn version: {sklearn.__version__}")
assert sklearn.__version__ >= "0.20"

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")

from tensorflow.keras.utils import get_file
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Reshape, Lambda, LSTM
def plot_series(series, y_true, y_pred=None, x_label="$t$", y_label="$x(t)$"):
    """Plot a time series with actual and predicted future values
    series: vector of shape (time steps, )
    y_true: scalar (if only 1 ahead step) or vector of shape (ahead steps,)
    y_pred: scalar (if only 1 ahead step) or vector of shape (ahead steps,)"""

    plt.plot(series, ".-", label="Inputs")
    n_steps = series.shape[0]

    # Calculate the number of steps ahead (= number of future values)
    n_steps_ahead = 1
    if not np.isscalar(y_true):
        n_steps_ahead = y_true.shape[0]

    # Plot actual future values
    plt.plot(np.arange(n_steps, n_steps + n_steps_ahead), y_true, "ro-", label="Labels")

    if y_pred is not None:
        # Plot predicted future values
        plt.plot(
            np.arange(n_steps, n_steps + n_steps_ahead),
            y_pred,
            "bx-",
            label="Predicted",
            markersize=10,
        )
    if x_label:
        plt.xlabel(x_label, fontsize=16)
    if y_label:
        plt.ylabel(y_label, fontsize=16)

    plt.legend(fontsize=14)


def plot_loss(history):
    """Plot training loss for a Keras model
    Takes a Keras History object as parameter"""

    loss = history.history["loss"]
    epochs = range(1, len(loss) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(epochs, loss, ".--", label="Training loss")
    final_loss = loss[-1]
    title = "Training loss: {:.4f}".format(final_loss)
    plt.ylabel("Loss")
    if "val_loss" in history.history:
        val_loss = history.history["val_loss"]
        plt.plot(epochs, val_loss, "o-", label="Validation loss")
        final_val_loss = val_loss[-1]
        title += ", Validation loss: {:.4f}".format(final_val_loss)
    plt.title(title)
    plt.legend()

Step 1: loading the data

# Download and extract the dataset
zip_path = get_file(
    origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip",
    fname="household_power_consumption.zip",
    extract=True,
)
file_path, _ = os.path.splitext(zip_path)
file_path += ".txt"
print(f"Dataset extracted at {file_path}")
# Load the dataset into a DataFrame
# - "Date" and "Time" columns are merged into a "Date_time" attribute, which is used as index column
# - Missing values ("nan" and "?") are converted into NumPy NaNs
df_power = pd.read_csv(
    file_path,
    sep=";",
    parse_dates={"Date_time": ["Date", "Time"]},
    infer_datetime_format=True,
    low_memory=False,
    na_values=["?"],
    index_col="Date_time",
)
print(f"df_power: {df_power.shape}")

Step 2: exploring the data

Use pandas to gain insights about the dataset.

# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
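
For example, the following calls summarize the structure and statistics of the DataFrame (a sketch, not the only valid answer):

# Column types and non-null counts
df_power.info()
# Basic statistics for each feature
print(df_power.describe())
# First rows of the dataset
df_power.head(n=10)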

Step 3: preparing the data

# Compute number and percent of missing values among features
def find_missing_values(df):
    total_missing = df.isnull().sum()
    percent_missing = (total_missing * 100 / df.isnull().count()).sort_values(
        ascending=False
    )
    return pd.concat(
        [total_missing, percent_missing], axis=1, keys=["Total", "Percent"]
    )


find_missing_values(df_power).head(n=10)
# Show the first samples with missing values
df_power[df_power.isnull().any(axis=1)].head(n=10)
# Fill missing values with mean for all features
def fill_na(df):
    n_features = df.shape[1]
    for j in range(0, n_features):
        df.iloc[:, j] = df.iloc[:, j].fillna(df.iloc[:, j].mean())
    return df


df_power = fill_na(df_power)

# Check that there are no remaining missing values
df_power.isnull().sum()
# Resample dataset over hours rather than minutes, to speed up computations
df_power = df_power.resample("h").mean()
print(f"df_power: {df_power.shape}")
df_power.head(n=10)
plot_cols = ["Global_active_power", "Voltage", "Global_intensity"]

# Plot several hourly-resampled features for the whole dataset
df_plotted_cols = df_power[plot_cols]
_ = df_plotted_cols.plot(subplots=True)
# Plot the same features for the first 20 days (480 hours) of the dataset
_ = df_plotted_cols[:480].plot(subplots=True)

Question

Split the dataset into training, validation and test sets, using (70%, 20%, 10%) ratios.

# Split dataset between training, validation and test sets
# No shuffling to preserve time dependencies
n_samples = len(df_power)

# YOUR CODE HERE

print(f"df_train: {df_train.shape}")
print(f"df_val: {df_val.shape}")
print(f"df_test: {df_test.shape}")

Question

Standardize the split sets.

# Standardize the sets using statistics computed on the training set
train_mean = df_train.mean()
train_std = df_train.std()

# YOUR CODE HERE
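
One way to complete this, as a sketch (all three sets are scaled with the training-set statistics, to avoid leakage from the validation and test periods):

# Sketch: standardize every set with the training-set mean and standard deviation
df_train = (df_train - train_mean) / train_std
df_val = (df_val - train_mean) / train_std
df_test = (df_test - train_mean) / train_std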
# Look at the distribution of the features
df_std = (df_power - train_mean) / train_std
df_std = df_std.melt(var_name="Features", value_name="Normalized_values")
plt.figure(figsize=(12, 6))
ax = sns.violinplot(x="Features", y="Normalized_values", data=df_std)
_ = ax.set_xticklabels(df_power.keys(), rotation=90)
# Split a dataset into time windows
# input_width is the number of input time steps
# label_width is the number of predicted time steps
def split_into_windows(dataset, input_width, label_width):
    inputs = []
    labels = []

    start_index = input_width
    end_index = len(dataset) - label_width
    for i in range(start_index, end_index):
        input_indices = range(i - input_width, i)
        inputs.append(dataset[input_indices])
        label_indices = range(i, i + label_width)
        labels.append(dataset[label_indices])

    return np.array(inputs), np.array(labels)
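
As a quick sanity check, here is a hypothetical toy run of this function (the values are illustrative only):

# Toy example: 10 time steps of a single feature
toy_series = np.arange(10).reshape(-1, 1)
toy_x, toy_y = split_into_windows(toy_series, input_width=3, label_width=2)
print(toy_x.shape, toy_y.shape)  # (5, 3, 1) (5, 2, 1)
print(toy_x[0].ravel(), toy_y[0].ravel())  # [0 1 2] [3 4]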


def plot_features(series, y_true, y_pred=None, title=None):
    plot_cols = [0, 2, 3]

    fig, axes = plt.subplots(
        nrows=len(plot_cols), ncols=1, sharey=True, figsize=(12, 8)
    )
    if title:
        fig.suptitle(title, fontsize=18)
    for i, col in enumerate(plot_cols):
        plt.sca(axes[i])
        plot_series(
            series=series[:, col],
            y_true=y_true[:, col],
            y_pred=y_pred[:, col] if y_pred is not None else None,
            x_label="$Time (h)$",
            y_label=df_train.columns[col],
        )

Question

Complete the definition of the train() function.

def train(model, x_train, y_train, x_val, y_val):
    # Train the model using the Adam optimizer, mean squared error as loss and MAE as metric
    # Return the Keras History object, which is used by plot_loss() below
    
    # YOUR CODE HERE
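
A minimal sketch of one way to complete it (note that n_epochs is defined in Step 4, before train() is first called):

def train(model, x_train, y_train, x_val, y_val):
    # Compile with the Adam optimizer, MSE loss and MAE metric
    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
    # n_epochs is a global hyperparameter, defined in Step 4
    history = model.fit(
        x_train, y_train, epochs=n_epochs, validation_data=(x_val, y_val), verbose=0
    )
    # Return the History object expected by plot_loss()
    return history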

Step 4: training models

n_features = df_power.shape[1]

# Hyperparameters
n_steps_before = 24
n_steps_ahead = 5
n_epochs = 20
x_train, y_train = split_into_windows(df_train.values, n_steps_before, n_steps_ahead)
x_val, y_val = split_into_windows(df_val.values, n_steps_before, n_steps_ahead)
x_test, y_test = split_into_windows(df_test.values, n_steps_before, n_steps_ahead)

print(f"x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"x_val: {x_val.shape}, y_val: {y_val.shape}")
print(f"x_test: {x_test.shape}, y_test: {y_test.shape}")
# Plot last validation series
plot_features(x_val[-1], y_val[-1])

Naïve forecasting

# Naïve forecast: repeat the last time step of each input series n_steps_ahead times
y_pred_naive = np.tile(x_val[:, -1:, :], (n_steps_ahead, 1))
print(f"y_pred_naive: {y_pred_naive.shape}")

print(f"Naïve predictor MSE: {np.mean(mean_squared_error(y_val, y_pred_naive)):0.05f}")
# Plot forecasting for last validation series

# YOUR CODE HERE
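
One possible call, sketched with the plot_features() helper defined above:

# Compare the last validation series with its naïve forecast
plot_features(x_val[-1], y_val[-1], y_pred_naive[-1], title="Naïve forecasting")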

Dense network

dense_model = Sequential(
    [
        # Take the last time-step.
        # Shape [batch, time, features] => [batch, 1, features]
        Lambda(lambda x: x[:, -1:, :]),
        # Shape => [batch, 1, dense_units]
        Dense(units=512, activation="relu"),
        # Shape => [batch, n_steps_ahead*n_features]
        Dense(
            units=n_steps_ahead * n_features, kernel_initializer=tf.initializers.zeros()
        ),
        # Shape => [batch, n_steps_ahead, n_features]
        Reshape([n_steps_ahead, n_features]),
    ]
)
history = train(dense_model, x_train, y_train, x_val, y_val)
plot_loss(history)
y_pred_dense = dense_model.predict(x_val)

print(f"Dense network MSE: {np.mean(mean_squared_error(y_val, y_pred_dense)):0.05f}")
# Plot forecasting for last validation series

# YOUR CODE HERE
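
The same kind of sketch works for the dense model:

# Compare the last validation series with the dense network forecast
plot_features(x_val[-1], y_val[-1], y_pred_dense[-1], title="Dense network")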

Recurrent network

Using the architecture of your choice, define a recurrent neural network able to beat the dense model.

# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
# YOUR CODE HERE
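
One possible architecture, given here as a sketch (the layer size units=32 is an illustrative assumption, not the required answer):

# Sketch: a single-layer LSTM mapping the input window to the forecast
lstm_model = Sequential(
    [
        # Shape [batch, time, features] => [batch, lstm_units]
        LSTM(units=32, input_shape=(n_steps_before, n_features)),
        # Shape => [batch, n_steps_ahead*n_features]
        Dense(units=n_steps_ahead * n_features),
        # Shape => [batch, n_steps_ahead, n_features]
        Reshape([n_steps_ahead, n_features]),
    ]
)

history = train(lstm_model, x_train, y_train, x_val, y_val)
plot_loss(history)

y_pred_lstm = lstm_model.predict(x_val)
print(f"Recurrent network MSE: {np.mean(mean_squared_error(y_val, y_pred_lstm)):0.05f}")
plot_features(x_val[-1], y_val[-1], y_pred_lstm[-1], title="Recurrent network")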