import sys
import os
import argparse
import time
import random
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import cuda_functional as MF


def read_corpus(path, eos="</s>"):
    data = []
    with open(path) as fin:
        for line in fin:
            data += line.split() + [eos]
    return data


def create_batches(data_text, map_to_ids, batch_size, cuda=True):
    data_ids = map_to_ids(data_text)
    N = len(data_ids)
    L = ((N-1) // batch_size) * batch_size
    x = np.copy(data_ids[:L].reshape(batch_size, -1).T)
    # y is x shifted by one token: the next-word prediction targets
    y = np.copy(data_ids[1:L+1].reshape(batch_size, -1).T)
    x, y = torch.from_numpy(x), torch.from_numpy(y)
    x, y = x.contiguous(), y.contiguous()
    if cuda:
        x, y = x.cuda(), y.cuda()
    return x, y


class EmbeddingLayer(nn.Module):
    # Maps each word in the corpus to a learned embedding vector.
    def __init__(self, n_d, words, fix_emb=False):
        super(EmbeddingLayer, self).__init__()
        word2id = {}
        for w in words:
            if w not in word2id:
                word2id[w] = len(word2id)  # assign each word type an integer id
        self.word2id = word2id
        # n_V is the vocabulary size; n_d is the embedding / hidden state size
        self.n_V, self.n_d = len(word2id), n_d
        self.embedding = nn.Embedding(self.n_V, n_d)  # one vector per word

    def forward(self, x):
        return self.embedding(x)

    def map_to_ids(self, text):
        # map a token sequence to an array of integer ids
        return np.asarray([self.word2id[x] for x in text], dtype='int64')
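# A minimal sketch (not part of the original script) of what create_batches
# produces: with batch_size columns, column j of x reads the corpus as one
# contiguous stream, and y[t, j] is always the token after x[t, j]. The token
# ids below are made up for illustration; the function is never called.
def _batching_sketch():
    ids = np.arange(13)                                # pretend corpus of 13 token ids
    batch_size = 3
    L = ((len(ids) - 1) // batch_size) * batch_size    # 12: drop the remainder
    x = ids[:L].reshape(batch_size, -1).T              # shape (4, 3)
    y = ids[1:L+1].reshape(batch_size, -1).T           # x shifted by one token
    # x[:, 0] == [0, 1, 2, 3] and y[:, 0] == [1, 2, 3, 4]
    return x, y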
class Model(nn.Module):
    def __init__(self, words, args):
        super(Model, self).__init__()
        self.args = args
        self.n_d = args.d
        self.depth = args.depth
        # dropout on word embeddings and on the softmax input, against overfitting
        self.drop = nn.Dropout(args.dropout)
        self.embedding_layer = EmbeddingLayer(self.n_d, words)
        self.n_V = self.embedding_layer.n_V
        if args.lstm:
            # a plain nn.RNN hardly learns on this task, hence LSTM when --lstm is set
            self.rnn = nn.LSTM(self.n_d, self.n_d,
                self.depth,
                dropout=args.rnn_dropout
            )
        else:
            self.rnn = MF.SRU(self.n_d, self.n_d, self.depth,
                dropout=args.rnn_dropout,
                rnn_dropout=args.rnn_dropout,
                use_tanh=0
            )
        self.output_layer = nn.Linear(self.n_d, self.n_V)
        # tie weights: share the embedding matrix with the output projection
        self.output_layer.weight = self.embedding_layer.embedding.weight

        self.init_weights()
        if not args.lstm:
            self.rnn.set_bias(args.bias)

    def init_weights(self):
        # uniform init for weight matrices, zeros for bias vectors
        val_range = (3.0/self.n_d)**0.5
        for p in self.parameters():
            if p.dim() > 1:  # matrix
                p.data.uniform_(-val_range, val_range)
            else:
                p.data.zero_()

    def forward(self, x, hidden):
        emb = self.drop(self.embedding_layer(x))
        # the RNN takes (input, hidden) and returns (output, hidden)
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # flatten (unroll_size, batch, n_d) to (unroll_size*batch, n_d);
        # size(2) is the third dimension, here n_d
        output = output.view(-1, output.size(2))
        output = self.output_layer(output)
        return output, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        zeros = Variable(weight.new(self.depth, batch_size, self.n_d).zero_())
        if self.args.lstm:
            return (zeros, zeros)  # LSTM needs both hidden and cell states
        else:
            return zeros

    def print_pnorm(self):
        # print the norm of each parameter tensor, a rough measure of its magnitude
        norms = ["{:.0f}".format(x.norm().data[0]) for x in self.parameters()]
        sys.stdout.write("\tp_norm: {}\n".format(norms))
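# The training loop below applies a hand-rolled update rather than torch.optim:
# multiplicative weight decay followed by a vanilla SGD step. A minimal sketch of
# that rule in isolation (illustrative only; the function name is not from the
# script and it is never called):
def _sgd_step_sketch(params, lr, weight_decay):
    for p in params:
        if p.requires_grad:
            if weight_decay > 0:
                p.data.mul_(1.0 - weight_decay)  # shrink weights directly
            p.data.add_(-lr, p.grad.data)        # plain gradient step
# Note this differs from torch.optim.SGD(weight_decay=...), which adds
# weight_decay * p to the gradient, so its decay is additionally scaled by lr.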
def train_model(epoch, model, train):
    model.train()
    args = model.args

    unroll_size = args.unroll_size
    batch_size = args.batch_size
    N = (len(train[0])-1)//unroll_size + 1
    lr = args.lr

    total_loss = 0.0
    # sum (rather than average) the per-token losses within a batch
    criterion = nn.CrossEntropyLoss(size_average=False)
    hidden = model.init_hidden(batch_size)
    for i in range(N):
        x = train[0][i*unroll_size:(i+1)*unroll_size]
        y = train[1][i*unroll_size:(i+1)*unroll_size].view(-1)  # flatten targets to 1-D
        x, y = Variable(x), Variable(y)
        # detach the hidden state so gradients do not flow across windows (truncated BPTT)
        hidden = (Variable(hidden[0].data), Variable(hidden[1].data)) if args.lstm \
                else Variable(hidden.data)

        model.zero_grad()
        output, hidden = model(x, hidden)
        assert x.size(1) == batch_size
        # CrossEntropyLoss expects (logits, class indices); targets are NOT one-hot.
        # Normalize the summed loss by the batch size, x.size(1).
        loss = criterion(output, y) / x.size(1)
        loss.backward()
        # clip gradients to guard against the exploding gradients common in RNNs
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)
        # hand-rolled SGD step with multiplicative weight decay
        for p in model.parameters():
            if p.requires_grad:
                if args.weight_decay > 0:
                    p.data.mul_(1.0-args.weight_decay)
                p.data.add_(-lr, p.grad.data)

        # abort if the loss has diverged to NaN or +/-inf
        if math.isnan(loss.data[0]) or math.isinf(loss.data[0]):
            sys.exit(0)

        total_loss += loss.data[0] / x.size(0)
        if i % 10 == 0:
            sys.stdout.write("\r{}".format(i))
            sys.stdout.flush()

    return np.exp(total_loss/N)


def eval_model(model, valid):
    model.eval()
    args = model.args
    total_loss = 0.0
    unroll_size = model.args.unroll_size
    criterion = nn.CrossEntropyLoss(size_average=False)
    hidden = model.init_hidden(1)
    N = (len(valid[0])-1)//unroll_size + 1
    for i in range(N):
        x = valid[0][i*unroll_size:(i+1)*unroll_size]
        y = valid[1][i*unroll_size:(i+1)*unroll_size].view(-1)
        x, y = Variable(x, volatile=True), Variable(y)
        hidden = (Variable(hidden[0].data), Variable(hidden[1].data)) if args.lstm \
                else Variable(hidden.data)
        output, hidden = model(x, hidden)
        loss = criterion(output, y)
        total_loss += loss.data[0]
    # numel() is the total number of target tokens
    avg_loss = total_loss / valid[1].numel()
    ppl = np.exp(avg_loss)
    return ppl


def main(args):
    train = read_corpus(args.train)
    dev = read_corpus(args.dev)
    test = read_corpus(args.test)

    model = Model(train, args)
    model.cuda()
    sys.stdout.write("vocab size: {}\n".format(
        model.embedding_layer.n_V
    ))
    sys.stdout.write("num of parameters: {}\n".format(
        sum(x.numel() for x in model.parameters() if x.requires_grad)
    ))
    model.print_pnorm()
    sys.stdout.write("\n")

    map_to_ids = model.embedding_layer.map_to_ids
    train = create_batches(train, map_to_ids, args.batch_size)
    dev = create_batches(dev, map_to_ids, 1)
    test = create_batches(test, map_to_ids, 1)

    unchanged = 0
    best_dev = 1e+8
    for epoch in range(args.max_epoch):
        start_time = time.time()  # wall-clock timestamp (seconds since the epoch)
        if args.lr_decay_epoch > 0 and epoch >= args.lr_decay_epoch:
            args.lr *= args.lr_decay
        train_ppl = train_model(epoch, model, train)
        dev_ppl = eval_model(model, dev)
        sys.stdout.write("\rEpoch={} lr={:.4f} train_ppl={:.2f} dev_ppl={:.2f}"
            "\t[{:.2f}m]\n".format(
                epoch, args.lr, train_ppl, dev_ppl,
                (time.time()-start_time)/60.0
        ))
        model.print_pnorm()
        sys.stdout.flush()

        if dev_ppl < best_dev:
            unchanged = 0
            best_dev = dev_ppl
            start_time = time.time()
            test_ppl = eval_model(model, test)
            sys.stdout.write("\t[eval] test_ppl={:.2f}\t[{:.2f}m]\n".format(
                test_ppl, (time.time()-start_time)/60.0
            ))
            sys.stdout.flush()
        else:
            unchanged += 1
        # early stopping: quit after 30 epochs with no dev improvement
        if unchanged >= 30: break
        sys.stdout.write("\n")
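# Perplexity, as computed in eval_model above, is the exponential of the average
# per-token negative log-likelihood. A self-contained numeric sketch (the values
# are made up for illustration; the function is never called):
def _perplexity_sketch():
    nll = [2.3, 4.1, 3.7, 2.9]          # per-token negative log-likelihoods
    return np.exp(sum(nll) / len(nll))  # exp(mean NLL) ~= 25.8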
argparser.add_argument("--bias", type=float, default=-3, help="intial bias of highway gates", ) argparser.add_argument("--depth", type=int, default=6) argparser.add_argument("--lr", type=float, default=1.0) argparser.add_argument("--lr_decay", type=float, default=0.98) argparser.add_argument("--lr_decay_epoch", type=int, default=175) argparser.add_argument("--weight_decay", type=float, default=1e-5) argparser.add_argument("--clip_grad", type=float, default=5) args = argparser.parse_args() print (args)