Implementing Neural Network Classification with PyTorch

1, Classification

1.1 Data

Create some fake data to simulate a real situation: for example, two clusters of points drawn from normal distributions with different means.

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# False data
n_data = torch.ones(100, 2)         # Basic form of data
x0 = torch.normal(2*n_data, 1)      # Type 0 x data (tensor), shape=(100, 2)
y0 = torch.zeros(100)               # Type 0 y data (tensor), shape=(100,)
x1 = torch.normal(-2*n_data, 1)     # Type 1 x data (tensor), shape=(100, 2)
y1 = torch.ones(100)                # Type 1 y data (tensor), shape=(100,)

# Note: x and y must have the tensor types shown below (torch.cat concatenates the data)
x = torch.cat((x0, x1), 0).type(torch.FloatTensor)  # FloatTensor = 32-bit floating
y = torch.cat((y0, y1), ).type(torch.LongTensor)    # LongTensor = 64-bit integer

# Draw the two clusters, coloured by label
plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=y.data.numpy(), s=100, lw=0, cmap='RdYlGn')
plt.show()

1.2 Building the network

To build the neural network we can use torch's modules directly: first define all the layer attributes in __init__(), then link the layers together in forward(x).

class Net(torch.nn.Module):     # Module inheriting torch
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()     # Call the parent class __init__
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # Hidden layer linear output
        self.out = torch.nn.Linear(n_hidden, n_output)       # Output layer linear output

    def forward(self, x):
        # The input value is propagated forward, and the output value is analyzed by neural network
        x = F.relu(self.hidden(x))      # Excitation function (linear value of hidden layer)
        x = self.out(x)                 # Raw output (logits); the predicted class is computed from this separately
        return x

net = Net(n_feature=2, n_hidden=10, n_output=2)  # One output unit per class

print(net)  # net structure

Output result (older PyTorch print format; newer versions also show in_features, out_features and bias):
"""
Net (
(hidden): Linear (2 -> 10)
(out): Linear (10 -> 2)
)
"""

1.3 Training

# optimizer is a training tool
optimizer = torch.optim.SGD(net.parameters(), lr=0.02)  # All parameters passed into net, learning rate
# When computing the loss, note: the target y is not one-hot but a 1D LongTensor of class indices, shape (batch,)
# while the prediction is a 2D tensor of shape (batch, n_classes)
loss_func = torch.nn.CrossEntropyLoss()

for t in range(100):
    out = net(x)     # Feed the net training data x and output the analysis value

    loss = loss_func(out, y)     # Calculate the error between the two

    optimizer.zero_grad()   # Clear the residual update parameter value of the previous step
    loss.backward()         # Error back propagation, calculate parameter update value
    optimizer.step()        # Apply the parameter update value to the parameters of net

1.4 Visualizing training

plt.ion()   # turn on interactive plotting

for t in range(100):

    ...
    loss.backward()
    optimizer.step()

    # Plot the learning process every 2 steps
    if t % 2 == 0:
        plt.cla()
        # The class with the highest softmax probability is the prediction
        prediction = torch.max(F.softmax(out, dim=1), 1)[1]
        pred_y = prediction.data.numpy().squeeze()
        target_y = y.data.numpy()
        plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=pred_y, s=100, lw=0, cmap='RdYlGn')
        accuracy = sum(pred_y == target_y)/200.  # Fraction of predictions that match the labels
        plt.text(1.5, -4, 'Accuracy=%.2f' % accuracy, fontdict={'size': 20, 'color':  'red'})
        plt.pause(0.1)

plt.ioff()  # Stop drawing
plt.show()

2, Fast network building with PyTorch

Torch provides a more convenient way to build the same kind of neural network quickly: torch.nn.Sequential.

import torch
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Two clusters of 2-D points with different means, labelled 0 and 1 (same fake data as in section 1)
n_data = torch.ones(100, 2)
x0 =torch.normal(2*n_data,1)
y0 = torch.zeros(100)  # Label is 0
x1 = torch.normal(-2*n_data,1)
y1 = torch.ones(100)  # Label is 1
x = torch.cat((x0, x1),0).type(torch.FloatTensor)
y = torch.cat((y0, y1),).type(torch.LongTensor)


# Scatter plot
x,y = Variable(x),Variable(y)
plt.scatter(x.data.numpy()[:,0], x.data.numpy()[:,1],c=np.squeeze(y.data.numpy()), s=200, lw=0, cmap='RdYlGn')
plt.show()

# Build neural network
class Net(torch.nn.Module):
    def __init__(self,n_feature,n_hidden,n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature,n_hidden)  # Hidden layer
        self.predict = torch.nn.Linear(n_hidden,n_output)  # Output layer (class scores)
    def forward(self,x):
        x = torch.relu(self.hidden(x))
        x = self.predict(x)
        return x

net1 = Net(2,10,2)  # Structure of neural network 2-10-2

net2 = torch.nn.Sequential(
    torch.nn.Linear(2,10), # Input - hide
    torch.nn.ReLU(), # Excitation function
    torch.nn.Linear(10,2) # Hidden layer - output layer
)

print(net1)  # Print information of neural network
print(net2)  # Print information of neural network

We will find that net2 displays more content. Why? Because net2 includes the activation function as a layer, whereas in net1 the activation function is called inside forward(). The advantage of net1 is that you can customize the forward pass to your own needs (for example, for an RNN). If you don't need anything special, net2 is the simpler choice.
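
For reference, the two printouts look roughly like this (the exact formatting depends on the PyTorch version; newer versions also show in_features, out_features and bias):

"""
Net (
  (hidden): Linear (2 -> 10)
  (predict): Linear (10 -> 2)
)
Sequential (
  (0): Linear (2 -> 10)
  (1): ReLU ()
  (2): Linear (10 -> 2)
)
"""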

3, Saving and loading the network

3.1 Building the data and network

import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt

torch.manual_seed(1)

x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)  # One dimensional to two dimensional
y = x.pow(2) + 0.2*torch.rand(x.size())

x, y = Variable(x, requires_grad=False), Variable(y, requires_grad=False)  # requires_grad=False means no gradients are needed for the data

def save():
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1),
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.05)
    loss_function = torch.nn.MSELoss()

    for t in range(1000):  # Training steps
        prediction = net1(x)
        loss = loss_function(prediction, y)  # The predicted value is first, and the real value is last
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

3.2 Saving the network

Next, there are two ways to save the trained net1 (these lines go at the end of save(), after the training loop):

torch.save(net1, 'net.pkl')  # Save entire network
torch.save(net1.state_dict(), 'net_params.pkl')   # Save only the parameters in the network (fast speed and less memory)

3.3 Loading the network

The first method loads the entire network, which may be slow when the network is large:

def restore_net():
    # restore entire net1 to net2
    net2 = torch.load('net.pkl')
    prediction = net2(x)

Loading only the network parameters into a new network of the same structure:

def restore_params():
    # New net3
    net3 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )

    # Copy saved parameters to net3
    net3.load_state_dict(torch.load('net_params.pkl'))
    prediction = net3(x)

3.4 Results

# Save net1 (1. The whole network, 2. Only parameters)
save()

# Extract the entire network
restore_net()

# Extract network parameters and copy them to the new network
restore_params()
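
To check that both restored networks reproduce the same fit, one can plot their predictions against the data. A minimal sketch (it reloads the saved files directly rather than reusing the functions above):

# Compare the two restored networks on the training data
net2 = torch.load('net.pkl')                        # whole network
net3 = torch.nn.Sequential(
    torch.nn.Linear(1, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1)
)
net3.load_state_dict(torch.load('net_params.pkl'))  # parameters only

plt.figure(figsize=(10, 4))
for i, net in enumerate((net2, net3)):
    prediction = net(x)
    plt.subplot(1, 2, i + 1)
    plt.title('net%d' % (i + 2))
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=3)
plt.show()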

4, Optimizers

4.1 Data

import torch
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt

torch.manual_seed(1)    # reproducible

LR = 0.01
BATCH_SIZE = 32
EPOCH = 12

# fake dataset
x = torch.unsqueeze(torch.linspace(-1, 1, 1000), dim=1)
y = x.pow(2) + 0.1*torch.normal(torch.zeros(*x.size()))

# plot dataset
plt.scatter(x.numpy(), y.numpy())
plt.show()

# Wrap the data in a DataLoader so it can be fed in shuffled minibatches
torch_dataset = Data.TensorDataset(x, y)
# Note: num_workers > 0 needs an if __name__ == '__main__' guard on Windows/macOS
loader = Data.DataLoader(dataset=torch_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2,)

4.2 One network per optimizer

To compare the optimizers fairly, we create a separate network for each one, all built from the same Net class.

# Default network form
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(1, 20)   # hidden layer
        self.predict = torch.nn.Linear(20, 1)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

# Create a net for each optimizer
net_SGD         = Net()
net_Momentum    = Net()
net_RMSprop     = Net()
net_Adam        = Net()
nets = [net_SGD, net_Momentum, net_RMSprop, net_Adam]

4.3 Optimizers

Next, create a different optimizer to train each network, and a single loss_func to compute the error. We use several common optimizers: SGD, Momentum, RMSprop and Adam.

# different optimizers
opt_SGD         = torch.optim.SGD(net_SGD.parameters(), lr=LR)
opt_Momentum    = torch.optim.SGD(net_Momentum.parameters(), lr=LR, momentum=0.8)
opt_RMSprop     = torch.optim.RMSprop(net_RMSprop.parameters(), lr=LR, alpha=0.9)
opt_Adam        = torch.optim.Adam(net_Adam.parameters(), lr=LR, betas=(0.9, 0.99))
optimizers = [opt_SGD, opt_Momentum, opt_RMSprop, opt_Adam]

loss_func = torch.nn.MSELoss()
losses_his = [[], [], [], []]   # Record the loss of different neural networks during training

4.4 Training and plotting the losses

for epoch in range(EPOCH):
    print('Epoch: ', epoch)
    for step, (b_x, b_y) in enumerate(loader):

        # Train each network with its own optimizer
        for net, opt, l_his in zip(nets, optimizers, losses_his):
            output = net(b_x)              # get output for every net
            loss = loss_func(output, b_y)  # compute loss for every net
            opt.zero_grad()                # clear gradients for next train
            loss.backward()                # backpropagation, compute gradients
            opt.step()                     # apply gradients
            l_his.append(loss.data.numpy())     # record the loss

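
After training, the recorded loss histories can be compared visually. A minimal plotting sketch using the losses_his list filled in above (label order matches the nets/optimizers lists):

# Plot one loss curve per optimizer
labels = ['SGD', 'Momentum', 'RMSprop', 'Adam']
for i, l_his in enumerate(losses_his):
    plt.plot(l_his, label=labels[i])
plt.legend(loc='best')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.ylim((0, 0.2))
plt.show()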

SGD is the most basic optimizer and has no acceleration. Momentum is an improved version of SGD that adds the momentum principle. RMSprop is an upgrade of Momentum, and Adam is an upgrade of RMSprop. However, in this result Adam appears to do slightly worse than RMSprop, so a more advanced optimizer does not always give a better result. Try different optimizers in your own experiments and pick the one that suits your data and network best.
