This post introduces a simple Seq2Seq machine translation task implemented in PyTorch. The dataset comes from Assignment 8 of Teacher Li Hongyi's deep learning course; the video lectures can be watched on Bilibili (https://www.bilibili.com/video/BV1JE411g7XF?p=53). For the algorithmic theory, see the paper "Sequence to Sequence Learning with Neural Networks", or refer to my reading notes on that paper.
Import the required modules
import torch
import torch.nn as nn
import torch.utils.data as data
import torchsummary
from torchvision import datasets
import numpy as np
import sys
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Set required configuration parameters
data_path = "./cmn-eng"        # location of the dataset
store_model_path = "./ckpt"    # where model checkpoints are stored
max_output_len = 45            # maximum length of an output sentence
batch_size = 64                # batch size
emb_dim = 256                  # dimension of the word embedding vectors
hid_dim = 512                  # dimension of the RNN hidden state
n_layers = 4                   # number of RNN layers
dropout = 0.5                  # dropout probability p
learning_rate = 0.0001         # initial learning rate
teacher_forcing_ratio = 0.5    # probability of feeding the ground-truth token (teacher forcing) during training
summary_steps = 6000           # total number of training batches
Load the datasets and the Chinese and English dictionaries from the downloaded data files
# Load the dictionaries
def get_dictionary(root, language):
    with open(os.path.join(root, 'word2int_{}.json'.format(language)), "r") as f:
        word2int = json.load(f)
    with open(os.path.join(root, 'int2word_{}.json'.format(language)), "r") as f:
        int2word = json.load(f)
    print('{} vocab size: {}'.format(language, len(word2int)))
    return word2int, int2word, len(word2int)

word2int_cn, int2word_cn, cn_vocab_size = get_dictionary(data_path, 'cn')  # Chinese dictionary
word2int_en, int2word_en, en_vocab_size = get_dictionary(data_path, 'en')  # English dictionary
vocab = [word2int_cn, int2word_cn, word2int_en, int2word_en]

# Load the data (training / validation / testing)
def load_data(root, set_name):
    data = []
    with open(os.path.join(root, '{}.txt'.format(set_name)), "r") as f:
        for line in f:
            data.append(line)
    print('{} dataset size: {}'.format(set_name, len(data)))

    return data

training_data = load_data(data_path, 'training')
val_data = load_data(data_path, 'validation')
testing_data = load_data(data_path, 'testing')

'''
Printed output:
cn vocab size: 3805
en vocab size: 3922
training dataset size: 18000
validation dataset size: 500
testing dataset size: 2636
'''
From the output we can see that the Chinese and English vocabularies contain 3805 and 3922 words respectively, and that the training, validation, and test sets contain 18000, 500, and 2636 sentences respectively.
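As an optional sanity check (my own addition, not part of the assignment code), you can confirm that the special tokens are in the English dictionary and peek at one raw training line; each line should contain the English and the Chinese sentence separated by a tab, with tokens separated by spaces, which is the format the dataset class below parses:

# optional sanity check: special tokens and one raw training line
print(word2int_en['<PAD>'], word2int_en['<BOS>'], word2int_en['<EOS>'], word2int_en['<UNK>'])
print(repr(training_data[0]))  # English and Chinese sentences separated by a tab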
Next, preprocess the dataset by subclassing torch.utils.data.Dataset and implementing the __len__ and __getitem__ methods.
class EN2CNDataset(data.Dataset):
    def __init__(self, data, max_output_len, vocab):
        self.max_output_len = max_output_len
        self.word2int_cn, self.int2word_cn = vocab[0], vocab[1]  # Chinese dictionary
        self.word2int_en, self.int2word_en = vocab[2], vocab[3]  # English dictionary
        self.data = data

        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)

    def seq_pad(self, label, pad_token):
        # pad sentences of different lengths to the same length for training
        label = np.pad(label, (0, (self.max_output_len - label.shape[0])), mode='constant', constant_values=pad_token)
        return label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, Index):
        # separate the English and Chinese sentences
        sentences = self.data[Index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        #print(sentences)
        assert len(sentences) == 2

        # special tokens
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # add '<BOS>' at the beginning and '<EOS>' at the end of each sentence;
        # words that are not in the dictionary are marked as '<UNK>'
        en, cn = [BOS], [BOS]
        # index vector of the tokenized English sentence
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)

        # index vector of the tokenized Chinese sentence
        # e.g. <BOS>, we, are, friends, <EOS> --> 1, 28, 29, 205, 2
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)

        en, cn = np.asarray(en), np.asarray(cn)
        #if len(en) > 30 or len(cn) > 30:
        #    print(len(en), len(cn))

        # use '<PAD>' to pad each sentence to the same length
        en = self.seq_pad(en, self.word2int_en['<PAD>'])
        cn = self.seq_pad(cn, self.word2int_cn['<PAD>'])
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)

        # return the index vectors of the English and Chinese sentences
        return en, cn

train_dataset = EN2CNDataset(training_data, max_output_len, vocab)
val_dataset = EN2CNDataset(val_data, max_output_len, vocab)
test_dataset = EN2CNDataset(testing_data, max_output_len, vocab)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=1)
test_loader = data.DataLoader(test_dataset, batch_size=1)
The main processing steps are: add start and end symbols ('<BOS>' and '<EOS>') to the beginning and end of each sentence; replace words that do not appear in the dictionary with the '<UNK>' symbol; pad each sentence with the '<PAD>' symbol to the same length max_output_len; and convert the words in each sentence to their dictionary indexes, storing each sentence as an index sequence. The English and Chinese sequences are stored separately.
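To verify that the preprocessing behaves as described, a quick check like the following can be run (my own addition): it fetches one processed sample and decodes the index vectors back into tokens with the int2word dictionaries, which are keyed by string indices because they were loaded from JSON.

# inspect one processed training sample
en_sample, cn_sample = train_dataset[0]
print(en_sample.shape, cn_sample.shape)  # both should be torch.Size([45]), i.e. max_output_len

# decode the first few indexes back into words
print([int2word_en[str(int(i))] for i in en_sample[:10]])
print([int2word_cn[str(int(i))] for i in cn_sample[:10]])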
Define Encoder and Decoder classes
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len]
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden  = [n_layers * directions, batch size, hid_dim]

        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2  # because the Encoder is bidirectional
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input  = [batch size]
        # hidden = [n_layers, batch size, hid_dim]
        # the Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid_dim]
        # hidden = [n_layers, batch size, hid_dim]

        # project the RNN output vector to the size of the target-language vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden
As you can see, the Encoder is a bidirectional, four-layer GRU, and the Decoder is a unidirectional, four-layer GRU. The final hidden state of the Encoder's RNN is used to initialize the Decoder's RNN, and the Decoder's outputs are used to compute the loss.
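To make the shapes concrete, here is a small, self-contained shape check (the toy sizes 100, 120, 16, 32, the batch of 3 random index sequences, and the all-zero decoder input are arbitrary choices for illustration only). It also mirrors the reshape-and-concatenate step that the Seq2Seq class below uses to turn the bidirectional Encoder hidden state into the Decoder's initial hidden state.

# toy shape check for the Encoder / Decoder defined above
toy_enc = Encoder(en_vocab_size=100, emb_dim=16, hid_dim=32, n_layers=2, dropout=0.5)
toy_dec = Decoder(cn_vocab_size=120, emb_dim=16, hid_dim=32, n_layers=2, dropout=0.5)

src = torch.randint(0, 100, (3, 10))          # [batch size, sequence len]
enc_outputs, enc_hidden = toy_enc(src)
print(enc_outputs.shape)                      # [3, 10, 64] = [batch, seq len, hid_dim * 2]
print(enc_hidden.shape)                       # [4, 3, 32]  = [n_layers * 2, batch, hid_dim]

# merge the two directions of each layer, as done later in Seq2Seq.forward
hidden = enc_hidden.view(2, 2, 3, -1)                          # [n_layers, directions, batch, hid_dim]
hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)  # [2, 3, 64]

dec_input = torch.zeros(3, dtype=torch.long)  # one token index per batch element
prediction, hidden = toy_dec(dec_input, hidden)
print(prediction.shape)                       # [3, 120] = [batch, cn_vocab_size]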
Define the Seq2Seq class, then build the model, optimizer, and loss function
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        # teacher_forcing_ratio is the probability of feeding the ground-truth token
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        # prepare a tensor to store the outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(input)
        # the Encoder's last hidden state is used to initialize the Decoder;
        # because the Encoder is a bidirectional RNN, the hidden states of the two
        # directions of the same layer have to be concatenated
        # hidden = [n_layers * directions, batch size, hid_dim] --> [n_layers, directions, batch size, hid_dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # hidden = [n_layers, batch size, hid_dim * 2]

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # decide whether to use the ground-truth token (teacher forcing)
            teacher_force = random.random() <= teacher_forcing_ratio
            # take the word with the highest predicted probability
            top1 = output.argmax(1)
            # if teacher_force is True, feed the ground-truth token; otherwise feed the predicted word
            dec_input = target[:, t] if teacher_force and t < target_len else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # used at validation / test time (no teacher forcing)
        # input  = [batch size, input len]
        # target = [batch size, target len]
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        # the Encoder's last hidden state is used to initialize the Decoder;
        # because the Encoder is a bidirectional RNN, the hidden states of the two
        # directions of the same layer have to be concatenated
        # hidden = [n_layers * directions, batch size, hid_dim] --> [n_layers, directions, batch size, hid_dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # take the word with the highest predicted probability
            top1 = output.argmax(1)
            # always feed the predicted word at the next step
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
As you can see, the Decoder predicts word by word. The teacher forcing mechanism is used during training but not during validation or testing. The loss function takes the parameter ignore_index=0, which means positions whose target class is 0 are excluded from the loss, because class 0 corresponds to the '<PAD>' token here.
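The effect of ignore_index can be seen on a tiny example (the numbers below are made up purely for illustration): with ignore_index=0 the loss is averaged only over the positions whose target class is not 0, so padded positions do not contribute to training.

# toy illustration of ignore_index: 4 positions, 3 classes, class 0 acts as '<PAD>'
logits = torch.tensor([[2.0, 0.5, 0.1],
                       [0.1, 2.0, 0.3],
                       [0.2, 0.1, 2.0],
                       [1.0, 1.0, 1.0]])
labels = torch.tensor([1, 1, 2, 0])  # the last position is padding

print(nn.CrossEntropyLoss()(logits, labels))                # averages over all 4 positions
print(nn.CrossEntropyLoss(ignore_index=0)(logits, labels))  # averages over the 3 non-pad positions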
Define utility functions to save and load network parameters, compute BLEU scores, convert predictions to text, and create an infinite iterator over training batches
def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), '{}/model_{}.ckpt'.format(store_model_path, step))
    return

def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model

def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        # keep '<UNK>', pure digits, and tokens starting with a single-byte (ASCII) character as they are;
        # split everything else (Chinese tokens) into individual characters
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score

def tokens2sentence(outputs, int2word):
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences

def infinite_iter(data_loader):
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)
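Here is a small example of how tokens2sentence and computebleu fit together (the tiny vocabulary and index tensors below are made up for illustration): tokens2sentence cuts each sequence at '<EOS>' and maps indices back to words, and computebleu splits Chinese tokens into characters before scoring with unigram BLEU.

# hypothetical tiny vocabulary, keyed by string indices like the JSON dictionaries
toy_int2word = {'0': '<PAD>', '1': '<BOS>', '2': '<EOS>', '3': '我', '4': '爱', '5': '你'}

toy_preds = torch.LongTensor([[3, 4, 5, 2, 0, 0]])   # indices up to '<EOS>', then padding
toy_targets = torch.LongTensor([[3, 4, 5, 2, 0, 0]])

pred_sents = tokens2sentence(toy_preds, toy_int2word)    # [['我', '爱', '你']]
target_sents = tokens2sentence(toy_targets, toy_int2word)
print(pred_sents)
print(computebleu(pred_sents, target_sents))             # 1.0 for an exact match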
Training and validation: every 300 training batches, the model is validated and saved
model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, teacher_forcing_ratio)
    # the first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step + 1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

    if (step + 1) % 300 == 0:
        # every 300 training batches, run validation and save the model
        model.eval()
        loss_val, bleu_val = 0.0, 0.0
        n = 0
        for sources_val, targets_val in val_loader:
            sources_val, targets_val = sources_val.to(device), targets_val.to(device)
            batch_size = sources_val.size(0)
            #print(batch_size)
            outputs_val, preds_val = model.inference(sources_val, targets_val)
            # the first token of targets is '<BOS>', so it is ignored
            outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
            targets_val = targets_val[:, 1:].reshape(-1)
            loss = loss_function(outputs_val, targets_val)
            loss_val += loss.item()

            # convert the predictions to text
            targets_val = targets_val.view(sources_val.size(0), -1)
            preds_val = tokens2sentence(preds_val, int2word_cn)
            sources_val = tokens2sentence(sources_val, int2word_en)
            targets_val = tokens2sentence(targets_val, int2word_cn)
            # compute the BLEU score
            bleu_val += computebleu(preds_val, targets_val)
            n += batch_size
        loss_val = loss_val / len(val_loader)
        bleu_val = bleu_val / n
        val_losses.append(loss_val)
        val_bleu_scores.append(bleu_val)
        print("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f}".format(step + 1, loss_val, np.exp(loss_val), bleu_val))

        # save the model
        save_model(model, optimizer, store_model_path, step + 1)
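After training, the collected curves can be visualized. This is a minimal sketch of my own (it assumes matplotlib is installed; the output file names are arbitrary): train_losses holds one point every 10 steps and val_bleu_scores one point every 300 steps.

# plot the training loss and the validation BLEU curves collected above
import matplotlib.pyplot as plt

plt.figure()
plt.plot(train_losses)
plt.xlabel('logging step (x10 batches)')
plt.ylabel('train loss')
plt.savefig('./train_loss.png')

plt.figure()
plt.plot(val_bleu_scores)
plt.xlabel('validation step (x300 batches)')
plt.ylabel('validation BLEU')
plt.savefig('./val_bleu.png')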
Test and save results
load_model_path = "./ckpt/model_6000"  # location of the model to load

model = load_model(model, load_model_path)  # load the model
model.to(device)
model.eval()
# test the model
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    # print(batch_size)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # the first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()

    # convert the predictions to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))
# store the results
with open('./test_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)
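Finally, the trained model can also be used to translate a single new English sentence. The helper below is my own sketch, not part of the assignment code: it assumes the input is already tokenized in the same way as the dataset (lower-cased, space-separated tokens), and it relies on the fact that model.inference only reads target[:, 0] (the '<BOS>' token), so a dummy Chinese target is enough.

# minimal sketch: translate one new sentence with the trained model
def translate_sentence(sentence, model, max_output_len=max_output_len):
    tokens = sentence.lower().split()
    en = [word2int_en['<BOS>']] + [word2int_en.get(w, word2int_en['<UNK>']) for w in tokens] + [word2int_en['<EOS>']]
    en = en + [word2int_en['<PAD>']] * (max_output_len - len(en))   # pad to max_output_len
    src = torch.LongTensor(en).unsqueeze(0).to(device)              # [1, max_output_len]

    # inference() only uses target[:, 0] as the first decoder input, so a dummy target suffices
    dummy_tgt = torch.LongTensor([[word2int_cn['<BOS>']] + [word2int_cn['<PAD>']] * (max_output_len - 1)]).to(device)

    model.eval()
    with torch.no_grad():
        _, preds = model.inference(src, dummy_tgt)
    return ''.join(tokens2sentence(preds, int2word_cn)[0])

print(translate_sentence('how are you ?', model))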