1, Classification
1.1 data
Create some fake data to simulate a real situation: for example, two clusters drawn from two-dimensional normal distributions with different means.
import torch
import matplotlib.pyplot as plt

# Fake data
n_data = torch.ones(100, 2)         # basic form of the data
x0 = torch.normal(2*n_data, 1)      # class 0 x data (tensor), shape=(100, 2)
y0 = torch.zeros(100)               # class 0 y data (tensor), shape=(100,)
x1 = torch.normal(-2*n_data, 1)     # class 1 x data (tensor), shape=(100, 2)
y1 = torch.ones(100)                # class 1 y data (tensor), shape=(100,)

# Note that x and y must have the data types below (torch.cat concatenates the data)
x = torch.cat((x0, x1), 0).type(torch.FloatTensor)  # FloatTensor = 32-bit floating point
y = torch.cat((y0, y1), ).type(torch.LongTensor)    # LongTensor = 64-bit integer

# Draw the two clusters, coloured by label
plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=y.data.numpy(), s=100, lw=0, cmap='RdYlGn')
plt.show()
1.2 building the network
To build a neural network we can use torch's module system directly: first define all the layer attributes in __init__(), then link them together layer by layer in forward(x).
import torch.nn.functional as F     # needed for F.relu below

class Net(torch.nn.Module):     # inherit torch's Module
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()                           # inherit __init__ from Module
        self.hidden = torch.nn.Linear(n_feature, n_hidden)    # hidden layer, linear output
        self.out = torch.nn.Linear(n_hidden, n_output)        # output layer, linear output

    def forward(self, x):
        # forward-propagate the input and return the network's raw output
        x = F.relu(self.hidden(x))  # activation function on the hidden layer's linear output
        x = self.out(x)             # raw output; not yet a prediction, that is computed separately
        return x

net = Net(n_feature=2, n_hidden=10, n_output=2)     # one output per class
print(net)  # net structure
Output result:
"""
Net (
(hidden): Linear (2 -> 10)
(out): Linear (10 -> 2)
)
"""
1.3 training
# The optimizer is the training tool
optimizer = torch.optim.SGD(net.parameters(), lr=0.02)  # pass in all of net's parameters and the learning rate

# When computing the error, note that the targets are NOT one-hot:
# they are a 1D LongTensor of shape (batch,),
# while the predictions are a 2D tensor of shape (batch, n_classes)
loss_func = torch.nn.CrossEntropyLoss()

for t in range(100):
    out = net(x)                # feed the training data x to net and get its output
    loss = loss_func(out, y)    # compute the error between output and target

    optimizer.zero_grad()       # clear the gradients left over from the previous step
    loss.backward()             # back-propagate the error and compute the parameter updates
    optimizer.step()            # apply the updates to net's parameters
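To make that shape convention concrete, here is a small standalone sketch (the tensors below are made-up examples, not the training data): CrossEntropyLoss takes raw logits of shape (batch, n_classes) and integer class indices of shape (batch,), with no one-hot encoding and no softmax applied beforehand.

import torch

logits = torch.randn(4, 2)            # hypothetical raw network outputs: (batch=4, n_classes=2)
targets = torch.tensor([0, 1, 1, 0])  # class indices, NOT one-hot: shape (4,)
print(torch.nn.CrossEntropyLoss()(logits, targets))   # a single scalar loss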
1.4 visualizing the training
plt.ion()   # interactive plotting on (paired with plt.ioff() below)

for t in range(100):
    ...                     # the training steps from section 1.3
    loss.backward()
    optimizer.step()

    # continuing inside the loop: redraw every 2 steps
    if t % 2 == 0:
        plt.cla()
        # the index of the maximum probability after a softmax is the predicted class
        prediction = torch.max(F.softmax(out, dim=1), 1)[1]
        pred_y = prediction.data.numpy().squeeze()
        target_y = y.data.numpy()
        plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=pred_y, s=100, lw=0, cmap='RdYlGn')
        accuracy = sum(pred_y == target_y)/200.    # fraction of predictions that match the targets
        plt.text(1.5, -4, 'Accuracy=%.2f' % accuracy, fontdict={'size': 20, 'color': 'red'})
        plt.pause(0.1)

plt.ioff()  # stop interactive plotting
plt.show()
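A small aside, shown with made-up logits rather than the variables above: because softmax is monotonic, taking the argmax of the raw outputs gives the same predicted classes as taking it after softmax, so the softmax call only matters if you want the probabilities themselves.

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, -1.0], [0.5, 3.0]])   # hypothetical raw outputs
print(torch.max(F.softmax(logits, dim=1), 1)[1])   # tensor([0, 1])
print(torch.max(logits, 1)[1])                     # tensor([0, 1]), same classes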
2, Fast way to build a neural network in PyTorch
Torch provides a more convenient way to build the same neural network with much less code, using torch.nn.Sequential.
import torch
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Same classification data as before: two 2-D clusters with means 2 and -2
n_data = torch.ones(100, 2)
x0 = torch.normal(2*n_data, 1)
y0 = torch.zeros(100)               # label 0
x1 = torch.normal(-2*n_data, 1)
y1 = torch.ones(100)                # label 1
x = torch.cat((x0, x1), 0).type(torch.FloatTensor)
y = torch.cat((y0, y1), ).type(torch.LongTensor)

# Scatter plot
x, y = Variable(x), Variable(y)
plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=np.squeeze(y.data.numpy()), s=200, lw=0, cmap='RdYlGn')
plt.show()

# Method 1: build the neural network as a class
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # one hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # prediction layer

    def forward(self, x):
        x = torch.relu(self.hidden(x))
        x = self.predict(x)
        return x

net1 = Net(2, 10, 2)    # network structure 2-10-2

# Method 2: build the same network with torch.nn.Sequential
net2 = torch.nn.Sequential(
    torch.nn.Linear(2, 10),     # input -> hidden
    torch.nn.ReLU(),            # activation function
    torch.nn.Linear(10, 2)      # hidden -> output
)

print(net1)     # print the structure of net1
print(net2)     # print the structure of net2
You will notice that net2 prints more information. Why? In net2 the activation function is included as a layer, whereas in net1 the activation is only applied inside the forward() function. The advantage of net1 over net2 is that you can customize the forward pass to your own needs (for example, for an RNN). If you don't need that kind of customization, net2 is simpler and probably the better fit.
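To illustrate that flexibility, here is a minimal sketch (the class name and layer choices are made up for this example, not part of the tutorial code) of a forward() shaped however you like, e.g. swapping the activation and inserting an extra step mid-pass:

import torch
import torch.nn.functional as F

class CustomNet(torch.nn.Module):
    def __init__(self):
        super(CustomNet, self).__init__()
        self.hidden = torch.nn.Linear(2, 10)
        self.out = torch.nn.Linear(10, 2)

    def forward(self, x):
        h = torch.tanh(self.hidden(x))                    # swap in a different activation
        h = F.dropout(h, p=0.5, training=self.training)   # insert an extra step in the pass
        return self.out(h)

print(CustomNet())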
3, Saving and extracting a network
3.1 build data and network
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt

torch.manual_seed(1)

x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)   # from one dimension to two dimensions
y = x.pow(2) + 0.2*torch.rand(x.size())
x, y = Variable(x, requires_grad=False), Variable(y, requires_grad=False)  # requires_grad=False: no gradient needed

def save():
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1),
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.05)
    loss_function = torch.nn.MSELoss()

    for t in range(1000):   # training steps
        prediction = net1(x)
        loss = loss_function(prediction, y)   # prediction first, target second
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
3.2 save network
Next, there are two ways to save the trained network; both calls go at the end of the save() function above:
    torch.save(net1, 'net.pkl')                      # save the entire network
    torch.save(net1.state_dict(), 'net_params.pkl')  # save only the network's parameters (faster, less disk space)
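To see why the parameter-only file is smaller, here is a standalone sketch (using a fresh throwaway network of the same shape, not net1 itself): state_dict() is simply an ordered mapping from parameter names to tensors.

import torch

net = torch.nn.Sequential(torch.nn.Linear(1, 10), torch.nn.ReLU(), torch.nn.Linear(10, 1))
for name, param in net.state_dict().items():
    print(name, tuple(param.shape))   # e.g. '0.weight' (10, 1), '0.bias' (10,), '2.weight' (1, 10), '2.bias' (1,)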
3.3 extracting the network
This approach extracts (loads) the entire neural network, which can be slow when the network is large:
def restore_net():
    # restore the entire net1 into net2
    net2 = torch.load('net.pkl')
    prediction = net2(x)
Extract network parameters:
def restore_params():
    # build net3 with the same architecture as net1
    net3 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )
    # copy the saved parameters into net3
    net3.load_state_dict(torch.load('net_params.pkl'))
    prediction = net3(x)
3.4 results
# Save net1 (1. the whole network, 2. only the parameters)
save()

# Extract the entire network
restore_net()

# Extract the parameters and copy them into a new network
restore_params()
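As an optional sanity check (a sketch of my own, assuming save() above has already written both .pkl files), the parameters restored by the two methods should match exactly:

net2 = torch.load('net.pkl')            # the whole network
params = torch.load('net_params.pkl')   # just the parameter dict
for name, tensor in params.items():
    assert torch.equal(tensor, net2.state_dict()[name])
print('parameters match')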
4, Optimizer
4.1 data
import torch
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

LR = 0.01
BATCH_SIZE = 32
EPOCH = 12

# fake dataset
x = torch.unsqueeze(torch.linspace(-1, 1, 1000), dim=1)
y = x.pow(2) + 0.1*torch.normal(torch.zeros(*x.size()))

# plot dataset
plt.scatter(x.numpy(), y.numpy())
plt.show()

# Use the data loader mentioned in the previous section
torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(dataset=torch_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,)
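To see what the loader actually yields, here is a small illustrative loop (an aside of my own, not part of the training code below): each step produces one mini-batch, so with 1000 samples and BATCH_SIZE = 32 there are 32 steps per epoch, the last batch holding the remaining 8 samples.

for step, (b_x, b_y) in enumerate(loader):
    print(step, b_x.shape, b_y.shape)   # e.g. 0 torch.Size([32, 1]) torch.Size([32, 1])
    break                               # just peek at the first batch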
4.2 each optimizer optimizes a neural network
To compare the optimizers, we create a separate neural network for each one, all built from the same Net class.
# Default network form
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(1, 20)    # hidden layer
        self.predict = torch.nn.Linear(20, 1)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))  # activation function for the hidden layer
        x = self.predict(x)         # linear output
        return x

# Create a net for each optimizer
net_SGD      = Net()
net_Momentum = Net()
net_RMSprop  = Net()
net_Adam     = Net()
nets = [net_SGD, net_Momentum, net_RMSprop, net_Adam]
4.3 Optimizer
Next, create a different optimizer to train each network, plus one loss_func to compute the error. We use several common optimizers: SGD, Momentum, RMSprop, and Adam.
# different optimizers
opt_SGD      = torch.optim.SGD(net_SGD.parameters(), lr=LR)
opt_Momentum = torch.optim.SGD(net_Momentum.parameters(), lr=LR, momentum=0.8)
opt_RMSprop  = torch.optim.RMSprop(net_RMSprop.parameters(), lr=LR, alpha=0.9)
opt_Adam     = torch.optim.Adam(net_Adam.parameters(), lr=LR, betas=(0.9, 0.99))
optimizers = [opt_SGD, opt_Momentum, opt_RMSprop, opt_Adam]

loss_func = torch.nn.MSELoss()
losses_his = [[], [], [], []]   # record the loss of each network during training
4.4 training and plotting
for epoch in range(EPOCH):
    print('Epoch: ', epoch)
    for step, (b_x, b_y) in enumerate(loader):
        # train each network with its own optimizer
        for net, opt, l_his in zip(nets, optimizers, losses_his):
            output = net(b_x)               # get output for every net
            loss = loss_func(output, b_y)   # compute loss for every net
            opt.zero_grad()                 # clear gradients for the next step
            loss.backward()                 # backpropagation, compute gradients
            opt.step()                      # apply gradients
            l_his.append(loss.data.numpy()) # loss recorder
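The recorded losses can then be plotted to compare the optimizers. A minimal plotting sketch (the labels and axis limits below are my own choices) might look like this:

labels = ['SGD', 'Momentum', 'RMSprop', 'Adam']
for i, l_his in enumerate(losses_his):
    plt.plot(l_his, label=labels[i])
plt.legend(loc='best')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.ylim((0, 0.2))
plt.show()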
SGD is the most common optimizer and has no acceleration tricks of its own. Momentum is an improved version of SGD that adds the momentum principle, RMSprop builds on Momentum, and Adam builds on RMSprop. Yet in this result Adam seems to do slightly worse than RMSprop, so a more advanced optimizer does not automatically give a better result. Try different optimizers in your own experiments and pick the one that best suits your data and network.
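For reference, the update rules behind these four optimizers can be written in simplified textbook form (PyTorch's implementations differ in small details such as bias correction and where the learning rate enters), with $g$ the gradient and $\eta$ the learning rate:

\begin{aligned}
\text{SGD:}\quad & \theta \leftarrow \theta - \eta\, g \\
\text{Momentum:}\quad & v \leftarrow \mu v + g, \qquad \theta \leftarrow \theta - \eta\, v \\
\text{RMSprop:}\quad & s \leftarrow \alpha s + (1-\alpha)\, g^{2}, \qquad \theta \leftarrow \theta - \frac{\eta\, g}{\sqrt{s} + \epsilon} \\
\text{Adam:}\quad & m \leftarrow \beta_1 m + (1-\beta_1)\, g, \quad v \leftarrow \beta_2 v + (1-\beta_2)\, g^{2}, \qquad \theta \leftarrow \theta - \frac{\eta\, m}{\sqrt{v} + \epsilon}
\end{aligned}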