PyTorch for Seq2Seq machine translation

This post introduces a simple Seq2Seq machine translation task using PyTorch. The dataset comes from Assignment 8 of Hung-yi Lee (Li Hongyi)'s deep learning course; the video lectures can be watched on Bilibili (https://www.bilibili.com/video/BV1JE411g7XF?p=53). For the theory behind the algorithm, see the paper "Sequence to Sequence Learning with Neural Networks" or my reading notes on that paper.

Import the required modules

import torch
import torch.nn as nn
import torch.utils.data as data
import numpy as np
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Set required configuration parameters

data_path = "./cmn-eng"          # location of the dataset
store_model_path = "./ckpt"      # directory where model checkpoints are stored
max_output_len = 45              # maximum length of an output sentence
batch_size = 64                  # batch size
emb_dim = 256                    # dimension of the word embedding vectors
hid_dim = 512                    # dimension of the RNN hidden state
n_layers = 4                     # number of RNN layers
dropout = 0.5                    # dropout probability p
learning_rate = 0.0001           # initial learning rate
teacher_forcing_ratio = 0.5      # probability of feeding the ground-truth token (teacher forcing)
summary_steps = 6000             # total number of training batches (steps)

Load the datasets and the Chinese/English dictionaries from the downloaded data files

# load the dictionaries
def get_dictionary(root, language):
    with open(os.path.join(root, 'word2int_{}.json'.format(language)), "r") as f:
        word2int = json.load(f)
    with open(os.path.join(root, 'int2word_{}.json'.format(language)), "r") as f:
        int2word = json.load(f)
    print('{} vocab size: {}'.format(language, len(word2int)))
    return word2int, int2word, len(word2int)

word2int_cn, int2word_cn, cn_vocab_size = get_dictionary(data_path, 'cn') # Chinese dictionary
word2int_en, int2word_en, en_vocab_size = get_dictionary(data_path, 'en') # English dictionary
vocab = [word2int_cn, int2word_cn, word2int_en, int2word_en]

# load data (training / validation / testing)
def load_data(root, set_name):
    data = []
    with open(os.path.join(root, '{}.txt'.format(set_name)), "r") as f:
        for line in f:
            data.append(line)
    print('{} dataset size: {}'.format(set_name, len(data)))
    return data

training_data = load_data(data_path, 'training')
val_data = load_data(data_path, 'validation')
testing_data = load_data(data_path, 'testing')

'''
Printed output:
cn vocab size: 3805
en vocab size: 3922
training dataset size: 18000
validation dataset size: 500
testing dataset size: 2636
'''

From the printed output, the Chinese and English vocabularies contain 3805 and 3922 words respectively, and the training, validation, and test sets contain 18000, 500, and 2636 sentence pairs respectively.
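
Each line of the raw text files holds an English sentence and its Chinese translation separated by a tab, with the tokens inside each sentence separated by spaces; that is the format the Dataset class below relies on. A quick, data-dependent peek (a sketch; the actual sentence shown depends on the downloaded files):

print(repr(training_data[0]))   # something like 'english tokens ...\tchinese tokens ...\n'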

Do some processing on the dataset by subclassing torch.utils.data.Dataset and implementing the __len__ and __getitem__ methods inside it.

class EN2CNDataset(data.Dataset):
    def __init__(self, data, max_output_len, vocab):
        self.max_output_len = max_output_len
        self.word2int_cn, self.int2word_cn = vocab[0], vocab[1] # Chinese dictionary
        self.word2int_en, self.int2word_en = vocab[2], vocab[3] # English dictionary
        self.data = data

        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)

    def seq_pad(self, label, pad_token):
        # pad sentences of different lengths to the same length for training
        label = np.pad(label, (0, (self.max_output_len - label.shape[0])), mode='constant', constant_values=pad_token)
        return label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # split the line into an English sentence and a Chinese sentence
        sentences = self.data[index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        assert len(sentences) == 2

        # special tokens
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # add '<BOS>' at the beginning and '<EOS>' at the end of each sentence;
        # words not in the dictionary are mapped to '<UNK>'
        en, cn = [BOS], [BOS]
        # index vector of the tokenized English sentence
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)

        # index vector of the tokenized Chinese sentence
        # e.g. <BOS>, we, are, friends, <EOS> --> 1, 28, 29, 205, 2
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)

        en, cn = np.asarray(en), np.asarray(cn)

        # pad every sentence to the same length with '<PAD>'
        en = self.seq_pad(en, self.word2int_en['<PAD>'])
        cn = self.seq_pad(cn, self.word2int_cn['<PAD>'])
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)

        # return the index vectors of the English and Chinese sentences
        return en, cn

train_dataset = EN2CNDataset(training_data, max_output_len, vocab)
val_dataset = EN2CNDataset(val_data, max_output_len, vocab)
test_dataset = EN2CNDataset(testing_data, max_output_len, vocab)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=1)
test_loader = data.DataLoader(test_dataset, batch_size=1)

The main processing: add start and end symbols ('<BOS>' and '<EOS>') to each sentence, replace words that do not appear in the dictionary with the '<UNK>' symbol, pad every sentence with the '<PAD>' symbol to the same length max_output_len, and convert the words of each sentence into their dictionary indices, stored as an index sequence. The English and Chinese sequences are returned separately.
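
As a quick sanity check (a small sketch, assuming the files above loaded correctly), fetching a single item should return two index tensors, each of length max_output_len:

en, cn = train_dataset[0]
print(en.shape, cn.shape)   # torch.Size([45]) torch.Size([45]) with max_output_len = 45
print(en[:8])               # index sequence beginning with the '<BOS>' index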

Define Encoder and Decoder classes

class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len] (token indices)
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden  = [n_layers * directions, batch size, hid_dim]
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2  # doubled because the Encoder is bidirectional
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input  = [batch size] (token index of the previous word)
        # hidden = [n_layers, batch size, hid_dim * 2]
        # the Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid_dim * 2]
        # hidden = [n_layers, batch size, hid_dim * 2]

        # project the RNN output to the size of the target-language vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden

As you can see, the Encoder is a bidirectional four-layer GRU and the Decoder is a unidirectional four-layer GRU. The final hidden state of the Encoder RNN is used to initialize the Decoder RNN, and the Decoder outputs are used to compute the loss.
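
A minimal shape check (a sketch with throwaway Encoder/Decoder instances and an arbitrary dummy batch of 2 sentences of length 10) shows how the bidirectional hidden state is reshaped before it is handed to the Decoder, mirroring what the Seq2Seq class below does:

enc = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
dec = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)

dummy = torch.randint(0, en_vocab_size, (2, 10))      # [batch size, sequence len]
outputs, hidden = enc(dummy)
print(outputs.shape)   # torch.Size([2, 10, 1024]) = [batch, seq len, hid_dim * 2]
print(hidden.shape)    # torch.Size([8, 2, 512])   = [n_layers * 2, batch, hid_dim]

# concatenate the two directions of each layer
hidden = hidden.view(n_layers, 2, 2, -1)
hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
prediction, hidden = dec(torch.randint(0, cn_vocab_size, (2,)), hidden)
print(prediction.shape)  # torch.Size([2, 3805]) = [batch, cn_vocab_size]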

Define the Seq2Seq class, then create the model, optimizer, and loss function

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        # teacher_forcing_ratio is the probability of feeding the ground-truth token
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        # tensor to store the Decoder outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(input)
        # the final hidden state of the Encoder is used to initialize the Decoder;
        # since the Encoder is a bidirectional RNN, the hidden states of the two
        # directions of each layer have to be concatenated
        # hidden = [n_layers * directions, batch size, hid dim] --> [n_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # hidden = [n_layers, batch size, hid dim * 2]

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # decide whether to use teacher forcing for the next step
            teacher_force = random.random() <= teacher_forcing_ratio
            # take the word with the highest output probability
            top1 = output.argmax(1)
            # if teacher_force is True, feed the ground-truth token; otherwise feed the predicted token
            dec_input = target[:, t] if teacher_force else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # evaluation: no teacher forcing
        # input  = [batch size, input len]
        # target = [batch size, target len]
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        # the final hidden state of the Encoder is used to initialize the Decoder;
        # since the Encoder is a bidirectional RNN, the hidden states of the two
        # directions of each layer have to be concatenated
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # take the word with the highest output probability
            top1 = output.argmax(1)
            # always feed the predicted token back in
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))

The Decoder predicts word by word; the teacher forcing mechanism is used during training but not during validation or testing. The loss function is created with ignore_index=0, which means positions whose target class is 0 do not contribute to the loss, because the '<PAD>' token is mapped to index 0 here.
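
A one-line check (a sketch; it only reads the dictionaries loaded earlier) makes this assumption explicit:

# if this fails, ignore_index should be set to word2int_cn['<PAD>'] instead of 0
assert word2int_cn['<PAD>'] == 0   # the loss targets are Chinese indices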

Define utility functions to save and load the network parameters, compute BLEU scores, convert predictions to text, and provide an infinite iterator over training batches

def save_model(model, optimizer, store_model_path, step):
    # make sure the checkpoint directory exists
    os.makedirs(store_model_path, exist_ok=True)
    torch.save(model.state_dict(), '{}/model_{}.ckpt'.format(store_model_path, step))
    return

def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model

def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        # keep '<UNK>', digits, and ASCII tokens whole; split Chinese tokens into characters
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score

def tokens2sentence(outputs, int2word):
    # map index sequences back to words, stopping at '<EOS>'
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences

def infinite_iter(data_loader):
    # yield batches forever, restarting the DataLoader when it is exhausted
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)
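
For example (a sketch with a made-up toy dictionary; the training and validation loops below use the real int2word_cn), tokens2sentence truncates at '<EOS>', and computebleu returns a per-sentence, character-level unigram BLEU score:

toy_int2word = {'0': '<PAD>', '1': '<BOS>', '2': '<EOS>', '3': '我', '4': '们', '5': '是', '6': '朋', '7': '友'}
toy_preds = torch.LongTensor([[3, 4, 5, 6, 7, 2, 0, 0]])   # '我 们 是 朋 友 <EOS> <PAD> <PAD>'
toy_sentences = tokens2sentence(toy_preds, toy_int2word)
print(toy_sentences)                                       # [['我', '们', '是', '朋', '友']]
print(computebleu(toy_sentences, toy_sentences))           # 1.0 for a perfect match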

Training and validation: validation runs every 300 training batches, and the model is saved at each validation

model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, teacher_forcing_ratio)
    # the first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step + 1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

        if (step + 1) % 300 == 0:
            # validate and save the model every 300 training batches
            model.eval()
            loss_val, bleu_val = 0.0, 0.0
            n = 0
            for sources_val, targets_val in val_loader:
                sources_val, targets_val = sources_val.to(device), targets_val.to(device)
                batch_size = sources_val.size(0)
                outputs_val, preds_val = model.inference(sources_val, targets_val)
                # the first token of targets is '<BOS>', so it is ignored
                outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
                targets_val = targets_val[:, 1:].reshape(-1)
                loss = loss_function(outputs_val, targets_val)
                loss_val += loss.item()

                # convert predictions to text
                targets_val = targets_val.view(sources_val.size(0), -1)
                preds_val = tokens2sentence(preds_val, int2word_cn)
                sources_val = tokens2sentence(sources_val, int2word_en)
                targets_val = tokens2sentence(targets_val, int2word_cn)
                # compute the BLEU score
                bleu_val += computebleu(preds_val, targets_val)
                n += batch_size
            loss_val = loss_val / len(val_loader)
            bleu_val = bleu_val / n
            val_losses.append(loss_val)
            val_bleu_scores.append(bleu_val)
            print("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f}".format(step + 1, loss_val, np.exp(loss_val), bleu_val))

            # save the model
            save_model(model, optimizer, store_model_path, step + 1)
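
To visualize how training went, the recorded train_losses, val_losses, and val_bleu_scores can be plotted (a sketch, assuming matplotlib is available; note that train_losses is logged every 10 batches and the validation metrics every 300):

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(np.arange(1, len(train_losses) + 1) * 10, train_losses, label='train loss')
plt.plot(np.arange(1, len(val_losses) + 1) * 300, val_losses, label='val loss')
plt.xlabel('batch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(np.arange(1, len(val_bleu_scores) + 1) * 300, val_bleu_scores, label='val BLEU')
plt.xlabel('batch')
plt.legend()
plt.savefig('./training_curves.png')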

Test and save results

load_model_path = "./ckpt/model_6000"      # location of the model to load

model = load_model(model, load_model_path) # load the model
model.to(device)
model.eval()
# test the model
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # the first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()

    # convert predictions to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))
# store the results
with open('./test_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)

 
