Tensors can be moved to the GPU for faster computation.
import torch

a = torch.tensor([5.5, 3])
b = torch.rand(5, 3)

b.reshape(3, 5)  # Shape: (3, 5)
b.view(5, -1)    # Shape: (5, 3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
b = b.to(device)
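A quick sanity check of where the tensor now lives (a minimal sketch; the printed device depends on whether CUDA is available on your machine):

print(b.shape)   # torch.Size([5, 3])
print(b.device)  # cuda:0 if a GPU is available, otherwise cpu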
$$f(x_1, x_2) = \ln(x_1) + x_1 x_2 - \sin(x_2)$$
PyTorch (like TensorFlow) implements reverse-mode automatic differentiation (AD).
# By default, user-created tensors do not track operations on them
x1 = torch.tensor([2.0], requires_grad=True)
x2 = torch.tensor([5.0], requires_grad=True)

# Compute some operations on x1 and x2
y = torch.log(x1) + x1*x2 - torch.sin(x2)

# Let the magic happen
y.backward()

print(x1.grad)  # dy/dx1
print(x2.grad)  # dy/dx2
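As a check on the printed gradients, the partial derivatives can be worked out by hand and evaluated at the point $(x_1, x_2) = (2, 5)$ used above:

$$\frac{\partial f}{\partial x_1} = \frac{1}{x_1} + x_2 = \frac{1}{2} + 5 = 5.5$$
$$\frac{\partial f}{\partial x_2} = x_1 - \cos(x_2) = 2 - \cos(5) \approx 1.7163$$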
“People are now building a new kind of software by assembling networks of parameterized functional blocks and by training them from examples using some form of gradient-based optimization…. It’s really very much like a regular program, except it’s parameterized, automatically differentiated, and trainable/optimizable” (Y. LeCun).
import torch.nn as nn

# Define a (2, 3, 2) neural network with one hidden layer
model = nn.Sequential(
    nn.Linear(2, 3),
    nn.Tanh(),
    nn.Linear(3, 2)
)
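A quick way to sanity-check the architecture is to push a dummy batch through it (a sketch; the batch size of 4 is arbitrary):

import torch

x = torch.rand(4, 2)  # a batch of 4 two-dimensional inputs
y = model(x)
print(y.shape)        # torch.Size([4, 2])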
learning_rate = 1.0
for t in range(2000):
    # Compute model prediction
    y_pred = model(x_train)

    # Compute loss
    loss = loss_fn(y_pred, y_train)

    # Zero the gradients before running the backward pass
    model.zero_grad()

    # Compute gradient of the loss wrt all the parameters
    loss.backward()

    # Update the weights using gradient descent
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
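The loop assumes that x_train, y_train, and loss_fn already exist. A minimal, hypothetical setup under which it runs end to end (the data here is random and purely illustrative):

import torch
import torch.nn as nn

x_train = torch.rand(100, 2)  # hypothetical inputs: 100 two-dimensional samples
y_train = torch.rand(100, 2)  # hypothetical targets matching the model's output size
loss_fn = nn.MSELoss()        # one reasonable choice of loss for this sketch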
import torch.nn as nn
import torch.nn.functional as F

# Define a CNN that takes (3, 32, 32) tensors as input
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Convolution output is 16 5x5 feature maps:
        # 32x32 -> conv(5x5) -> 28x28 -> pool -> 14x14 -> conv(5x5) -> 10x10 -> pool -> 5x5
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

net = Net()
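As before, a dummy input confirms the expected shapes (a sketch; the single random image stands in for real data):

import torch

x = torch.rand(1, 3, 32, 32)  # one dummy 3-channel 32x32 image
print(net(x).shape)           # torch.Size([1, 10])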
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

for t in range(2000):
    # Compute model prediction
    outputs = net(inputs)

    # Compute loss
    loss = criterion(outputs, labels)

    # Zero the gradients before the backward pass
    # (otherwise they accumulate across iterations)
    optimizer.zero_grad()

    # Compute gradients of the loss wrt all parameters
    loss.backward()

    # Perform one step of gradient descent
    optimizer.step()
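Here inputs and labels are assumed to come from a dataset, typically batch by batch via a DataLoader. A hypothetical stand-in that makes the loop runnable:

import torch

inputs = torch.rand(8, 3, 32, 32)    # hypothetical batch of 8 images
labels = torch.randint(0, 10, (8,))  # hypothetical class labels in [0, 10)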