PyTorch

Environment setup

import platform
import sys

print(f"Python version: {platform.python_version()}")
# Compare version numbers numerically (string comparison would break for Python 3.10+)
assert sys.version_info >= (3, 6)

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
Python version: 3.7.5
# Setup plots
%matplotlib inline
plt.rcParams["figure.figsize"] = 10, 8
%config InlineBackend.figure_format = 'retina'
sns.set()
%load_ext tensorboard
import sklearn

print(f"scikit-learn version: {sklearn.__version__}")

from sklearn.datasets import make_moons

import torch

print(f"PyTorch version: {torch.__version__}")

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
scikit-learn version: 0.22.1
PyTorch version: 1.3.1
def plot_planar_data(X, y):
    """Plot some 2D data"""

    plt.figure()
    plt.plot(X[y == 0, 0], X[y == 0, 1], 'or', alpha=0.5, label=0)
    plt.plot(X[y == 1, 0], X[y == 1, 1], 'ob', alpha=0.5, label=1)
    plt.legend()

Tensor API

Tensor creation

# Create 1D tensor with predefined values
t = torch.tensor([5.5, 3])

print(t)
print(t.shape)
tensor([5.5000, 3.0000])
torch.Size([2])
# Create 2D tensor filled with random numbers from a uniform distribution
x = torch.rand(5, 3)

print(x)
print(x.shape)
tensor([[0.3746, 0.1669, 0.0174],
        [0.9889, 0.9538, 0.0463],
        [0.1561, 0.4398, 0.5971],
        [0.9370, 0.8256, 0.6580],
        [0.2451, 0.8639, 0.5963]])
torch.Size([5, 3])
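
Beyond torch.tensor() and torch.rand(), several other factory functions are commonly used. A quick illustrative sketch (variable names are arbitrary):

# Other common creation functions
z = torch.zeros(2, 3)  # 2x3 tensor filled with zeros
o = torch.ones(2, 3)   # 2x3 tensor filled with ones
r = torch.arange(0, 10, 2)  # 1D tensor: [0, 2, 4, 6, 8]
f = torch.tensor([1, 2, 3], dtype=torch.float32)  # Explicit data type

print(z.shape, o.shape, r, f.dtype)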

Operations

# Addition operator
y = x + 2

print(y)
tensor([[2.3746, 2.1669, 2.0174],
        [2.9889, 2.9538, 2.0463],
        [2.1561, 2.4398, 2.5971],
        [2.9370, 2.8256, 2.6580],
        [2.2451, 2.8639, 2.5963]])
# Addition method
y = torch.add(x, 2)

print(y)
tensor([[2.3746, 2.1669, 2.0174],
        [2.9889, 2.9538, 2.0463],
        [2.1561, 2.4398, 2.5971],
        [2.9370, 2.8256, 2.6580],
        [2.2451, 2.8639, 2.5963]])
y = torch.zeros(5, 3)

# In-place addition: tensor is mutated
y.add_(x)
y.add_(2)

print(y)
tensor([[2.3746, 2.1669, 2.0174],
        [2.9889, 2.9538, 2.0463],
        [2.1561, 2.4398, 2.5971],
        [2.9370, 2.8256, 2.6580],
        [2.2451, 2.8639, 2.5963]])
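
Addition is not the only operation following this pattern. As a brief sketch, element-wise and matrix multiplication work the same way:

# Element-wise multiplication of two (5, 3) tensors
z = x * y
print(z.shape)  # torch.Size([5, 3])

# Matrix multiplication: (5, 3) x (3, 5) -> (5, 5)
m = torch.matmul(x, y.t())
print(m.shape)  # torch.Size([5, 5])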

Indexing

print(x)

# Print second column of tensor
print(x[:, 1])
tensor([[0.3746, 0.1669, 0.0174],
        [0.9889, 0.9538, 0.0463],
        [0.1561, 0.4398, 0.5971],
        [0.9370, 0.8256, 0.6580],
        [0.2451, 0.8639, 0.5963]])
tensor([0.1669, 0.9538, 0.4398, 0.8256, 0.8639])
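
Slicing and boolean masks also work as they do in NumPy. A short sketch:

# First row of the tensor
print(x[0])
# Rows 1 to 2 (upper bound excluded)
print(x[1:3])
# Boolean mask: elements greater than 0.5 (returned as a 1D tensor)
print(x[x > 0.5])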

Reshaping with view()

PyTorch allows a tensor to be a view of an existing tensor. For memory efficiency reasons, view tensors share the same underlying data with their base tensor.
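
A quick sketch of the consequence of this sharing: since the data is not copied, mutating a view also mutates its base tensor.

base = torch.zeros(2, 2)
v = base.view(4)
# Mutating the view also mutates the base tensor
v[0] = 42
print(base)  # tensor([[42.,  0.], [ 0.,  0.]])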

# Reshape into a (15,) vector
x.view(15)
tensor([0.3746, 0.1669, 0.0174, 0.9889, 0.9538, 0.0463, 0.1561, 0.4398, 0.5971,
        0.9370, 0.8256, 0.6580, 0.2451, 0.8639, 0.5963])
# The dimension identified by -1 is inferred from other dimensions
print(x.view(-1, 5))  # Shape: (3,5)
print(x.view(5, -1))  # Shape: (5, 3)
print(x.view(-1,))  # Shape: (15,)

# Error: a tensor of size 15 can't be reshaped into a (?, 4) tensor
# print(x.view(-1, 4))
tensor([[0.3746, 0.1669, 0.0174, 0.9889, 0.9538],
        [0.0463, 0.1561, 0.4398, 0.5971, 0.9370],
        [0.8256, 0.6580, 0.2451, 0.8639, 0.5963]])
tensor([[0.3746, 0.1669, 0.0174],
        [0.9889, 0.9538, 0.0463],
        [0.1561, 0.4398, 0.5971],
        [0.9370, 0.8256, 0.6580],
        [0.2451, 0.8639, 0.5963]])
tensor([0.3746, 0.1669, 0.0174, 0.9889, 0.9538, 0.0463, 0.1561, 0.4398, 0.5971,
        0.9370, 0.8256, 0.6580, 0.2451, 0.8639, 0.5963])

Reshaping à la NumPy

# Reshape into a (3,5) tensor, creating a view if possible
x.reshape(3, 5)
tensor([[0.3746, 0.1669, 0.0174, 0.9889, 0.9538],
        [0.0463, 0.1561, 0.4398, 0.5971, 0.9370],
        [0.8256, 0.6580, 0.2451, 0.8639, 0.5963]])
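
Unlike view(), reshape() also accepts tensors whose memory layout is incompatible with the requested shape, falling back to a copy in that case. For example (a sketch), transposing x makes it non-contiguous:

xt = x.t()  # Shape: (3, 5), non-contiguous memory layout

# xt.view(15) would raise a RuntimeError here
# reshape() falls back to copying the data
print(xt.reshape(15).shape)  # torch.Size([15])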

From NumPy to PyTorch

# Create a NumPy array
a = np.random.rand(2, 2)
# Convert it into a PyTorch tensor
b = torch.from_numpy(a)

print(b)

# a and b share memory
a *= 2
print(b)
b += 1
print(a)
tensor([[0.6285, 0.2705],
        [0.8091, 0.1353]], dtype=torch.float64)
tensor([[1.2571, 0.5411],
        [1.6182, 0.2706]], dtype=torch.float64)
[[2.25707965 1.54108591]
 [2.61824455 1.27061508]]
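
If sharing memory is not desired, torch.tensor() can be used instead of torch.from_numpy(): it copies the NumPy data. A quick sketch:

a = np.random.rand(2, 2)
c = torch.tensor(a)  # Copy: a and c do not share memory

a *= 2
print(c)  # Unchanged by the modification of a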

From PyTorch to NumPy

# Create a PyTorch tensor
a = torch.rand(2, 2)
# Convert it into a NumPy array
b = a.numpy()

print(b)

# a and b share memory
a *= 2
print(b)
b += 1
print(a)
[[0.05700839 0.8589342 ]
 [0.8565902  0.6768685 ]]
[[0.11401677 1.7178684 ]
 [1.7131804  1.353737  ]]
tensor([[1.1140, 2.7179],
        [2.7132, 2.3537]])

GPU-based tensors

# Look for an available CUDA device
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Move an existing tensor to GPU
    x_gpu = x.to(device)
    print(x_gpu)
    # Directly create a tensor on GPU
    t_gpu = torch.ones(3, 3, device=device)
    print(t_gpu)
else:
    print("No CUDA device available :(")
No CUDA device available :(
# Try to copy the tensor to the GPU, falling back to the CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_device = x.to(device)

print(x_device)
tensor([[0.3746, 0.1669, 0.0174],
        [0.9889, 0.9538, 0.0463],
        [0.1561, 0.4398, 0.5971],
        [0.9370, 0.8256, 0.6580],
        [0.2451, 0.8639, 0.5963]])
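
Conversely, a tensor must live in CPU memory before being converted to a NumPy array; a GPU tensor is first moved back with .cpu() or .to("cpu"). A brief sketch (the conversion step only matters when a CUDA device is available):

# Move the tensor back to CPU memory, then convert it
x_cpu = x_device.to("cpu")  # Equivalent to x_device.cpu()
print(x_cpu.numpy())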

Neural networks API

Building models with PyTorch

The torch.nn package provides the basic building blocks for assembling models. Other packages like torch.optim and torchvision define training utilities and specialized tools.

PyTorch offers a great deal of flexibility for creating custom architectures and training loops, hence its popularity among researchers.
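
Besides nn.Sequential (used below), custom architectures are typically defined by subclassing nn.Module and implementing forward(), as done in Example 2. A minimal sketch with a hypothetical two-layer network:

class TinyModel(nn.Module):
    """Hypothetical two-layer network, equivalent to a small nn.Sequential"""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(in_features=2, out_features=3)
        self.output = nn.Linear(in_features=3, out_features=2)

    def forward(self, x):
        # Arbitrary Python code can run here, hence the flexibility
        x = torch.tanh(self.hidden(x))
        return self.output(x)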

Example 1: training a dense network on planar data

# Generate moon-shaped, non-linearly separable data
x, y = make_moons(n_samples=1000, noise=0.10, random_state=0)

print(f'x: {x.shape}. y: {y.shape}')
plot_planar_data(x, y)
x: (1000, 2). y: (1000,)
[Scatter plot of the two moon-shaped classes produced by plot_planar_data]
# Create PyTorch tensors from NumPy data, with appropriate types
x_train = torch.from_numpy(x).float()
y_train = torch.from_numpy(y).long()

Model definition

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
dense_model = nn.Sequential(
    nn.Linear(in_features=2, out_features=3),
    nn.Tanh(),
    nn.Linear(in_features=3, out_features=2)
)

print(dense_model)
Sequential(
  (0): Linear(in_features=2, out_features=3, bias=True)
  (1): Tanh()
  (2): Linear(in_features=3, out_features=2, bias=True)
)
# The nn package also contains definitions of popular loss functions; in this
# case we will use Cross Entropy as our loss function.
loss_fn = nn.CrossEntropyLoss()

# Used to enable training analysis through TensorBoard
# Writer will output to ./runs/ directory by default
writer = SummaryWriter()

Model training

learning_rate = 1.0
num_epochs = 2000

for epoch in range(num_epochs):
    # Forward pass: compute model prediction
    y_pred = dense_model(x_train)

    # Compute and print loss
    loss = loss_fn(y_pred, y_train)
    if epoch % 100 == 0:
        print(f"Epoch [{epoch+1:4}/{num_epochs}], loss: {loss:.6f}")
        # Write epoch loss for TensorBoard
        writer.add_scalar("Loss/train", loss.item(), epoch)

    # Zero the gradients before running the backward pass
    # Avoids accumulating gradients erroneously
    dense_model.zero_grad()

    # Backward pass: compute gradient of the loss w.r.t all the learnable parameters of the model
    loss.backward()

    # Update the weights using gradient descent
    # no_grad() avoids tracking operations history here
    with torch.no_grad():
        for param in dense_model.parameters():
            param -= learning_rate * param.grad


print(f"Training finished. Final loss: {loss:.6f}")
Epoch [   1/2000], loss: 0.615728
Epoch [ 101/2000], loss: 0.255993
Epoch [ 201/2000], loss: 0.254656
Epoch [ 301/2000], loss: 0.253930
Epoch [ 401/2000], loss: 0.253383
Epoch [ 501/2000], loss: 0.252850
Epoch [ 601/2000], loss: 0.252219
Epoch [ 701/2000], loss: 0.251364
Epoch [ 801/2000], loss: 0.250020
Epoch [ 901/2000], loss: 0.165510
Epoch [1001/2000], loss: 0.034935
Epoch [1101/2000], loss: 0.018792
Epoch [1201/2000], loss: 0.013152
Epoch [1301/2000], loss: 0.010341
Epoch [1401/2000], loss: 0.008661
Epoch [1501/2000], loss: 0.007542
Epoch [1601/2000], loss: 0.006741
Epoch [1701/2000], loss: 0.006137
Epoch [1801/2000], loss: 0.005665
Epoch [1901/2000], loss: 0.005284
Training finished. Final loss: 0.004974
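
The manual parameter update above can equivalently be delegated to the torch.optim package, as done in Example 2 below. A sketch of a single training step with an SGD optimizer:

optimizer = optim.SGD(dense_model.parameters(), lr=learning_rate)

# One training step
y_pred = dense_model(x_train)
loss = loss_fn(y_pred, y_train)
optimizer.zero_grad()  # Reset gradients
loss.backward()        # Backward pass
optimizer.step()       # Parameter update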

Example 2: training a convnet on CIFAR10

Data loading and preparation

# Transform images of range [0, 1] into tensors of normalized range [-1, 1]
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Load training set
trainset = torchvision.datasets.CIFAR10(
    root="./data", train=True, download=True, transform=transform
)
# Get an iterable from training set
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2
)

testset = torchvision.datasets.CIFAR10(
    root="./data", train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2
)
Files already downloaded and verified
Files already downloaded and verified
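
A quick sketch for inspecting the shape of one batch produced by the DataLoader (batches of 4 channel-first images):

images, labels = next(iter(trainloader))

print(images.shape)  # torch.Size([4, 3, 32, 32])
print(labels.shape)  # torch.Size([4])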

Expected network architecture

Example CNN architecture

# Define a CNN that takes (3, 32, 32) tensors as input (channel-first)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # The convolution output is 16 5x5 feature maps, flattened into a 400-element vector
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
cnn_model = Net()
print(cnn_model)
Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=10, bias=True)
)
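
As a sanity check (a sketch), feeding a dummy batch through the untrained network confirms the expected (batch size, 10) output shape. The spatial dimensions evolve as 32 → 28 (conv1) → 14 (pool) → 10 (conv2) → 5 (pool), hence the 16 * 5 * 5 = 400 flattened features.

dummy_input = torch.randn(1, 3, 32, 32)
print(cnn_model(dummy_input).shape)  # torch.Size([1, 10])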

Model training

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn_model.parameters(), lr=0.001, momentum=0.9)

num_epochs = 2

# Loop over the dataset multiple times
for epoch in range(num_epochs):
    running_loss = 0.0

    for i, data in enumerate(trainloader, 0):
        # Get the inputs; data is a list of [inputs, labels]
        # inputs is a 4D tensor of shape (batch size, channels, rows, cols)
        # labels is a 1D tensor of shape (batch size,)
        inputs, labels = data

        # Reset the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = cnn_model(inputs)

        # Loss computation
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Gradient descent step
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print(
                f"Epoch [{epoch+1}/{num_epochs}], batch {i+1:5}, loss: {running_loss / 2000:.6f}"
            )
            running_loss = 0.0


print(f"Training finished")
Epoch [1/2], batch  2000, loss: 2.108662
Epoch [1/2], batch  4000, loss: 1.737864
Epoch [1/2], batch  6000, loss: 1.592003
Epoch [1/2], batch  8000, loss: 1.507958
Epoch [1/2], batch 10000, loss: 1.445331
Epoch [1/2], batch 12000, loss: 1.393309
Epoch [2/2], batch  2000, loss: 1.327055
Epoch [2/2], batch  4000, loss: 1.302520
Epoch [2/2], batch  6000, loss: 1.286105
Epoch [2/2], batch  8000, loss: 1.265079
Epoch [2/2], batch 10000, loss: 1.240521
Epoch [2/2], batch 12000, loss: 1.270833
Training finished

Model evaluation

correct = 0
total = 0

with torch.no_grad():
    for data in testloader:
        # Load inputs and labels
        images, labels = data
        # Compute model predictions for the batch
        # Output shape is (batch size, number of classes), so (4, 10) here
        outputs = cnn_model(images)
        # Get the indexes of maximum values along the second axis
        # This gives us the predicted classes (those with the highest prediction value)
        _, predicted = torch.max(outputs.data, dim=1)
        total += labels.size(0)
        # Add the number of correct predictions for the batch to the total count
        correct += (predicted == labels).sum().item()

print(f"Test acccuracy: {(100 * correct / total)}%")
Test acccuracy: 56.16%

Training analysis with TensorBoard

More info on PyTorch/TensorBoard integration here.
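
Since the SummaryWriter logs to the ./runs/ directory by default and the tensorboard extension was loaded at the top of the notebook, the dashboard can be displayed directly in the notebook:

# Launch TensorBoard inside the notebook (reads ./runs/ by default)
%tensorboard --logdir runs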

PyTorch + TensorBoard