I. PGN Architecture
- The PGN architecture is essentially still a seq2seq architecture; what sets it apart from a traditional seq2seq is that it
- includes an attention mechanism
- computes a generation probability p_gen from the context vector, the decoder state and the decoder input; p_gen is the probability of generating a word from the vocabulary distribution, while (1 - p_gen) is the probability of copying a word from the attention distribution. The two distributions are combined by this weighted sum to produce the final word distribution (see the formula below)
- Improvements:
- Words carrying source-text details can be copied directly into the summary, which largely alleviates the OOV problem;
- A coverage mechanism is introduced to track which parts of the source text have already been attended to, reducing repetition in the generated summary
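The weighted combination described above corresponds to equations 8 and 9 of the pointer-generator paper (See et al., 2017), restated compactly:

$$p_{gen} = \sigma(w_{h^*}^T h_t^* + w_s^T s_t + w_x^T x_t + b_{ptr})$$

$$P(w) = p_{gen} \, P_{vocab}(w) + (1 - p_{gen}) \sum_{i: w_i = w} a_i^t$$

Here $h_t^*$ is the context vector, $s_t$ the decoder state, $x_t$ the decoder input and $a^t$ the attention distribution. For a word outside the fixed vocabulary $P_{vocab}(w) = 0$, while for a word that never appears in the source document the copy term is 0; this is exactly how the mixture can emit source-only (OOV) words.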
II. The PGN Paper
Get To The Point: Summarization with Pointer-Generator Networks (See et al., 2017)
III. Training & Prediction Screenshots
IV. Code Implementation
1. utils
1.1 config
import torch
import os
import sys
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# General neural network hyperparameters
hidden_size = 512
dec_hidden_size = 512
embed_size = 512
pointer = True
# Model-related configuration
max_vocab_size = 20000
model_name = 'pgn_model'
embed_file = root_path + '/data/wv/word2vec.model'
source = 'train'
train_data_path = root_path + '/data/train.txt'
val_data_path = root_path + '/data/dev.txt'
test_data_path = root_path + '/data/test.txt'
stop_word_file = root_path + '/data/stopwords.txt'
losses_path = root_path + '/data/loss.txt'
log_path = root_path + '/data/log_train.txt'
word_vector_model_path = root_path + '/data/wv/word2vec.model'
encoder_save_name = root_path + '/saved_model/model_encoder.pt'
decoder_save_name = root_path + '/saved_model/model_decoder.pt'
attention_save_name = root_path + '/saved_model/model_attention.pt'
reduce_state_save_name = root_path + '/saved_model/model_reduce_state.pt'
model_save_path = root_path + '/saved_model/pgn_model.pt'
max_enc_len = 300
max_dec_len = 100
truncate_enc = True
truncate_dec = True
# The two parameters below affect the quality of the displayed results at prediction time and should be tuned for the business scenario
min_dec_steps = 30
# Set to 50 when using greedy decoding
# max_dec_steps = 50
# Set to 30 when using beam-search decoding
max_dec_steps = 30
enc_rnn_dropout = 0.5
enc_attn = True
dec_attn = True
dec_in_dropout = 0
dec_rnn_dropout = 0
dec_out_dropout = 0
# Training hyperparameters
trunc_norm_init_std = 1e-4
eps = 1e-31
learning_rate = 0.001
lr_decay = 0.0
initial_accumulator_value = 0.1
epochs = 5
batch_size = 32
is_cuda = True
# The following four flags correspond to the optimization strategies in Chapter 6
coverage = False
fine_tune = False
scheduled_sampling = False
weight_tying = False
max_grad_norm = 2.0
DEVICE = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
LAMBDA = 1
1.2 config1
import os
# Set the project root directory so that later code files can be imported easily
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Paths to the raw data files
train_raw_data_path = os.path.join(root_path, 'data', 'train.csv')
test_raw_data_path = os.path.join(root_path, 'data', 'test.csv')
# Paths to the stop-word list and the user-defined dictionary
stop_words_path = os.path.join(root_path, 'data', 'stopwords.txt')
user_dict_path = os.path.join(root_path, 'data', 'user_dict.txt')
# Paths to the preprocessed and tokenized train/test data
train_seg_path = os.path.join(root_path, 'data', 'train_seg_data.csv')
test_seg_path = os.path.join(root_path, 'data', 'test_seg_data.csv')
# Final training and test data after the first round of processing
train_data_path = os.path.join(root_path, 'data', 'train.txt')
test_data_path = os.path.join(root_path, 'data', 'test.txt')
# Path to the word-vector model
word_vector_path = os.path.join(root_path, 'data', 'wv', 'word2vec.model')
1.3 dataset
# Import required packages
import sys
import os
from collections import Counter
import torch
from torch.utils.data import Dataset
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import project-specific modules
from utils.func_utils import simple_tokenizer, count_words, sort_batch_by_len, source2ids, abstract2ids
from utils.vocab import Vocab
from utils import config
# Class that builds the (article, abstract) data pairs
class PairDataset(object):
def __init__(self, filename, tokenize=simple_tokenizer, max_enc_len=None, max_dec_len=None,
truncate_enc=False, truncate_dec=False):
print("Reading dataset %s..." % filename, end=' ', flush=True)
self.filename = filename
self.pairs = []
# Read the data file, split each line into encoder and decoder parts, and unify their lengths
with open(filename, 'r', encoding='utf-8') as f:
next(f)
for i, line in enumerate(f):
# The preprocessing stage guarantees that x and y are separated by <SEP>
pair = line.strip().split('<SEP>')
if len(pair) != 2:
print("Line %d of %s is malformed." % (i, filename))
print(line)
continue
# The first half is the encoder input, i.e. the original article text.
enc = tokenize(pair[0])
if max_enc_len and len(enc) > max_enc_len:
if truncate_enc:
enc = enc[:max_enc_len]
else:
continue
# The second half is the decoder target, i.e. the abstract (summary) text
dec = tokenize(pair[1])
if max_dec_len and len(dec) > max_dec_len:
if truncate_dec:
dec = dec[:max_dec_len]
else:
continue
# Store each pair as a tuple in the result list
self.pairs.append((enc, dec))
print("%d pairs." % len(self.pairs))
# Build the vocabulary required by the model
def build_vocab(self, embed_file=None):
# Count word frequencies over all loaded pairs
word_counts = Counter()
count_words(word_counts, [enc + dec for enc, dec in self.pairs])
# Initialize the vocabulary object
vocab = Vocab()
# If pre-trained word vectors are available, load them; otherwise the embeddings are trained together with the model
vocab.load_embeddings(embed_file)
# Write the most frequent words into the vocabulary
for word, count in word_counts.most_common(config.max_vocab_size):
vocab.add_words([word])
# Return the Vocab instance defined in vocab.py
return vocab
# Dataset class that directly serves the later DataLoader
class SampleDataset(Dataset):
def __init__(self, data_pair, vocab):
self.src_sents = [x[0] for x in data_pair]
self.trg_sents = [x[1] for x in data_pair]
self.vocab = vocab
self._len = len(data_pair)
# A custom __getitem__() is required to fetch samples
def __getitem__(self, index):
# Call the utility function to get the input ids x and the OOV list
x, oov = source2ids(self.src_sents[index], self.vocab)
# Return exactly the six fields the model needs; each field is tailored to this model
return {'x': [self.vocab.SOS] + x + [self.vocab.EOS],
'OOV': oov,
'len_OOV': len(oov),
'y': [self.vocab.SOS] + abstract2ids(self.trg_sents[index], self.vocab, oov) + [self.vocab.EOS],
'x_len': len(self.src_sents[index]),
'y_len': len(self.trg_sents[index])
}
def __len__(self):
return self._len
# Custom collate function used when building the DataLoader
def collate_fn(batch):
# Pad each sequence with zeros up to the maximum length
def padding(indice, max_length, pad_idx=0):
pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
return torch.tensor(pad_indice)
# Sort the samples of the batch by the x_len field
data_batch = sort_batch_by_len(batch)
# Extract the six fields required by the model as the DataLoader output
x = data_batch['x']
x_max_length = max([len(t) for t in x])
y = data_batch['y']
y_max_length = max([len(t) for t in y])
OOV = data_batch['OOV']
len_OOV = torch.tensor(data_batch['len_OOV'])
x_padded = padding(x, x_max_length)
y_padded = padding(y, y_max_length)
x_len = torch.tensor(data_batch['x_len'])
y_len = torch.tensor(data_batch['y_len'])
return x_padded, y_padded, x_len, y_len, OOV, len_OOV
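For reference, a minimal sketch of how the pieces above fit together; it mirrors the wiring in src/train.py shown later, and assumes the paths configured in utils/config.py already exist:
# Minimal usage sketch: PairDataset -> Vocab -> SampleDataset -> DataLoader
from torch.utils.data import DataLoader
from utils.dataset import PairDataset, SampleDataset, collate_fn
from utils import config
pairs = PairDataset(config.train_data_path, max_enc_len=config.max_enc_len, max_dec_len=config.max_dec_len, truncate_enc=config.truncate_enc, truncate_dec=config.truncate_dec)
vocab = pairs.build_vocab(embed_file=config.embed_file)
train_data = SampleDataset(pairs.pairs, vocab)
loader = DataLoader(dataset=train_data, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
x, y, x_len, y_len, oov, len_oovs = next(iter(loader))
print(x.shape, y.shape, len_oovs.shape)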
1.4 func_utils
# Import system packages
import os
import sys
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import third-party and project utilities
import numpy as np
import time
import heapq
import random
import pathlib
from utils import config
import torch
# Decorator that measures how long a function takes to run
def timer(module):
def wrapper(func):
# func: the decorated function; cal_time below measures its execution time.
def cal_time( *args, **kwargs):
t1 = time.time()
res = func( *args, **kwargs)
t2 = time.time()
cost_time = t2 - t1
print(f'{cost_time} secs used for ', module)
return res
return cal_time
return wrapper
# Split a text on whitespace and return the token list
def simple_tokenizer(text):
return text.split()
# Count the occurrences of each word in the text with a Counter
def count_words(counter, text):
for sentence in text:
for word in sentence:
counter[word] += 1
# Sort the batch_size samples of a batch by x_len (longest first) and return the sorted result
def sort_batch_by_len(data_batch):
# Initialize a result dict whose six fields match the six fields of the data iterator
res = {'x': [],
'y': [],
'x_len': [],
'y_len': [],
'OOV': [],
'len_OOV': []}
# Iterate over the batch and append each of the six fields to its own list
for i in range(len(data_batch)):
res['x'].append(data_batch[i]['x'])
res['y'].append(data_batch[i]['y'])
res['x_len'].append(len(data_batch[i]['x']))
res['y_len'].append(len(data_batch[i]['y']))
res['OOV'].append(data_batch[i]['OOV'])
res['len_OOV'].append(data_batch[i]['len_OOV'])
# Sort by x_len in descending order and obtain the list of indices
sorted_indices = np.array(res['x_len']).argsort()[::-1].tolist()
# The returned data_batch is still a dict
data_batch = {name: [_tensor[i] for i in sorted_indices] for name, _tensor in res.items()}
return data_batch
# Map the source text to an id sequence
def source2ids(source_words, vocab):
ids = []
oovs = []
unk_id = vocab.UNK
for w in source_words:
i = vocab[w]
if i == unk_id: # w is an OOV word
if w not in oovs: # add w to the OOV list
oovs.append(w)
# Index 0 corresponds to the first source-document OOV, index 1 to the second, and so on...
oov_num = oovs.index(w)
# In this project, id vocab_size maps to the first source-document OOV, vocab_size + 1 to the second, etc.
ids.append(vocab.size() + oov_num)
else:
ids.append(i)
return ids, oovs
# Map the abstract text to an id sequence
def abstract2ids(abstract_words, vocab, source_oovs):
ids = []
unk_id = vocab.UNK
for w in abstract_words:
i = vocab[w]
if i == unk_id: # w is an OOV word
if w in source_oovs: # w is a source-document OOV
# compute the extended id for such a word
vocab_idx = vocab.size() + source_oovs.index(w)
ids.append(vocab_idx)
else: # w is not a source-document OOV
ids.append(unk_id) # such a word can only be replaced by the UNK id
else:
ids.append(i) # w is an in-vocabulary word; use its id directly
return ids
# Map the output id sequence back to natural-language text
def outputids2words(id_list, source_oovs, vocab):
words = []
for i in id_list:
try:
# w may be <UNK>
w = vocab.index2word[i]
# w is an OOV word
except IndexError:
assert_msg = "Error: cannot find this id in the vocabulary."
assert source_oovs is not None, assert_msg
# hope that index i refers to a source-document OOV word
source_oov_idx = i - vocab.size()
try:
# if the lookup succeeds, w is a source-document OOV word
w = source_oovs[source_oov_idx]
# i is neither an in-vocabulary id nor a valid source-document OOV index
# (out-of-range list indexing raises IndexError, so catch IndexError rather than ValueError)
except IndexError:
raise ValueError('Error: model produced id %i, source-document OOV index %i, but this sample only has %i OOVs' % (i, source_oov_idx, len(source_oovs)))
# append the recovered word to the result list
words.append(w)
# join with spaces and return the string
return ' '.join(words)
# Maintain a min-heap of at most k items, so the smallest value always sits at the root
def add2heap(heap, item, k):
if len(heap) < k:
heapq.heappush(heap, item)
else:
heapq.heappushpop(heap, item)
# Replace the id of every OOV word in the tensor with the id of <UNK>
def replace_oovs(in_tensor, vocab):
# oov_token = torch.full(in_tensor.shape, vocab.UNK).long().to(config.DEVICE)
# In PyTorch 1.5.0 and earlier, torch.full() returns a float tensor by default.
# In PyTorch 1.7.0, torch.full() returns torch.bool for bool fill values and torch.long for integer fill values.
# The commented-out line above fails in PyTorch 1.6.0, where the dtype must be given explicitly as below.
oov_token = torch.full(in_tensor.shape, vocab.UNK, dtype=torch.long).to(config.DEVICE)
out_tensor = torch.where(in_tensor > len(vocab) - 1, oov_token, in_tensor)
return out_tensor
# Collect several hyperparameter settings of the training run for logging
def config_info(config):
info = 'model_name = {}, pointer = {}, coverage = {}, fine_tune = {}, scheduled_sampling = {}, weight_tying = {}, source = {}'
return (info.format(config.model_name, config.pointer, config.coverage, config.fine_tune, config.scheduled_sampling, config.weight_tying, config.source))
@timer(module='test a demo program')
def test():
s = 0
for i in range(100000000):
s += i
print('s = ', s)
if __name__ == '__main__':
test()
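For reference, a minimal sketch of the OOV id round trip implemented by source2ids, abstract2ids and outputids2words (the toy vocabulary and words below are invented purely for illustration):
# Toy round trip: source text -> extended ids -> text
from utils.vocab import Vocab
from utils.func_utils import source2ids, abstract2ids, outputids2words
toy_vocab = Vocab()
toy_vocab.add_words(['发动机', '异响', '更换', '机油'])
source = ['发动机', '异响', '建议', '更换', '机油']   # '建议' is not in the toy vocabulary
src_ids, oovs = source2ids(source, toy_vocab)
print(src_ids, oovs)   # '建议' is assigned the extended id toy_vocab.size() + 0
abs_ids = abstract2ids(['建议', '更换', '机油'], toy_vocab, oovs)
print(outputids2words(abs_ids, oovs, toy_vocab))   # the original words, including '建议', are recovered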
1.5 multi_proc_utils
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool
# number of CPU cores
cores = cpu_count()
# number of partitions
partitions = cores
def parallelize(df, func):
# split the DataFrame into partitions
data_split = np.array_split(df, partitions)
# create a process pool
pool = Pool(cores)
# distribute the partitions to the workers and concatenate the results
data = pd.concat(pool.map(func, data_split))
# close the process pool
pool.close()
# after close(), no new tasks can be submitted to the pool; join() waits for all worker processes to finish
pool.join()
return data
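A minimal usage sketch of parallelize (the DataFrame and helper function below are invented for illustration; preprocess.py calls it in the same way with sentences_proc):
import pandas as pd
from utils.multi_proc_utils import parallelize

def add_token_count(df):
    # count the whitespace-separated tokens of each Question
    df['TokenCount'] = df['Question'].str.split().str.len()
    return df

if __name__ == '__main__':
    demo_df = pd.DataFrame({'Question': ['方向盘 抖动', '发动机 异响 启动 困难']})
    print(parallelize(demo_df, add_token_count))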
1.6 preprocess
# Import required packages
import re
import jieba
import pandas as pd
import numpy as np
import os
import sys
# Set the project root directory so that project packages can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import the text-preprocessing configuration (config1)
from utils.config1 import *
# Import the multi-process data-processing helper
from utils.multi_proc_utils import *
# Load the user-defined dictionary into jieba
jieba.load_userdict(user_dict_path)
# Pad a sentence with <START>, <STOP>, <PAD> and <UNK> according to max_len and the vocabulary
def pad_proc(sentence, max_len, word_to_id):
# 1. Split the sentence on spaces
words = sentence.strip().split(' ')
# 2. Truncate to at most max_len words
words = words[:max_len]
# 3. Replace out-of-vocabulary words with <UNK>
sentence = [w if w in word_to_id else '<UNK>' for w in words]
# 4. Add <START> and <STOP>
sentence = ['<START>'] + sentence + ['<STOP>']
# 5. Pad with <PAD> up to max_len
sentence = sentence + ['<PAD>'] * (max_len - len(words))
return ' '.join(sentence)
# Load the stop words
def load_stop_words(stop_word_path):
# stop_word_path: path to the stop-word file
# open the file
f = open(stop_word_path, 'r', encoding='utf-8')
# read all lines
stop_words = f.readlines()
# strip whitespace and newline characters around each stop word
stop_words = [stop_word.strip() for stop_word in stop_words]
return stop_words
# Load the stop words once at import time
stop_words = load_stop_words(stop_words_path)
# Clean a piece of text and remove special symbols (called by sentence_proc)
def clean_sentence(sentence):
# sentence: the string to clean
if isinstance(sentence, str):
# Remove numbered headings such as "1." "2." "3."
r = re.compile("\D(\d\.)\D")
sentence = r.sub("", sentence)
# Remove the parenthesized tokens 进口 (imported) and 海外 (overseas)
r = re.compile(r"[((]进口[))]|\(海外\)")
sentence = r.sub("", sentence)
# Remove every character except Chinese characters, digits, letters and ,!?。.-
r = re.compile("[^,!?。\.\-\u4e00-\u9fa5_a-zA-Z0-9]")
# Replace the half-width , ! ? with their full-width Chinese counterparts
sentence = sentence.replace(",", ",")
sentence = sentence.replace("!", "!")
sentence = sentence.replace("?", "?")
sentence = r.sub("", sentence)
# Remove boilerplate tokens: 车主说 (car owner says), 技师说 (technician says), 语音 (voice), 图片 (image), 你好/您好 (hello)
r = re.compile(r"车主说|技师说|语音|图片|你好|您好")
sentence = r.sub("", sentence)
return sentence
else:
return ''
# Filter stop words out of a tokenized sentence (called by sentence_proc)
def filter_stopwords(seg_list):
# seg_list: list of tokens [word1, word2, ...]
# first remove empty tokens
words = [word for word in seg_list if word]
# then remove stop words
return [word for word in words if word not in stop_words]
# Preprocess a single sentence (called by sentences_proc)
def sentence_proc(sentence):
# sentence: the string to process
# clean the sentence
sentence = clean_sentence(sentence)
# tokenize with jieba; the default is accurate mode (pass cut_all=True for full mode)
words = jieba.cut(sentence)
# filter stop words
words = filter_stopwords(words)
# join the tokens into a single space-separated string
return ' '.join(words)
# Preprocess a DataFrame by applying sentence_proc to every sentence, column by column
def sentences_proc(df):
# df: the dataset
# batch-preprocess the train and test sets
for col_name in ['Brand', 'Model', 'Question', 'Dialogue']:
df[col_name] = df[col_name].apply(sentence_proc)
if 'Report' in df.columns:
# preprocess the Report column of the training set
df['Report'] = df['Report'].apply(sentence_proc)
return df
# Load and preprocess the data (only needs to run once)
def build_dataset(train_raw_data_path, test_raw_data_path):
# 1. Load the raw data
print('1. Loading the raw data.')
print(train_raw_data_path)
# the encoding must be specified as utf-8
train_df = pd.read_csv(train_raw_data_path, engine='python', encoding='utf-8')
test_df = pd.read_csv(test_raw_data_path, engine='python', encoding='utf-8')
print('Raw training set rows: {}, test set rows: {}'.format(len(train_df), len(test_df)))
print('\n')
# 2. Drop rows with null values (a row is removed if any of the listed columns is null)
print('2. Dropping rows with null values (a row is removed if any of the listed columns is null).')
train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
print('After dropping nulls, training set rows: {}, test set rows: {}'.format(len(train_df), len(test_df)))
print('\n')
# 3. Multi-process batch preprocessing (apply sentence_proc to every sentence: clean, tokenize, filter stop words, then join with spaces)
print('3. Multi-process batch preprocessing (apply sentence_proc to every sentence: clean, tokenize, filter stop words, then join with spaces).')
train_df = parallelize(train_df, sentences_proc)
test_df = parallelize(test_df, sentences_proc)
print('\n')
print('sentences_proc is done!')
# 4. Build the merged text columns used for training word vectors
print('4. Building the merged text columns used for training word vectors.')
# build a new column X by joining Question and Dialogue row by row
train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
train_df['Y'] = train_df[['Report']]
# build a new column X for the test set as well
test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
# 5. Save the processed train_seg_data.csv and test_seg_data.csv
print('5. Saving the processed train_seg_data.csv and test_seg_data.csv.')
# Drop the columns the neural network does not need; they were only used to build the merged text for word-vector training
train_df = train_df.drop(['Question'], axis=1)
train_df = train_df.drop(['Dialogue'], axis=1)
train_df = train_df.drop(['Brand'], axis=1)
train_df = train_df.drop(['Model'], axis=1)
train_df = train_df.drop(['Report'], axis=1)
train_df = train_df.drop(['QID'], axis=1)
test_df = test_df.drop(['Question'], axis=1)
test_df = test_df.drop(['Dialogue'], axis=1)
test_df = test_df.drop(['Brand'], axis=1)
test_df = test_df.drop(['Model'], axis=1)
test_df = test_df.drop(['QID'], axis=1)
# Persist the processed data to files
# train_df.to_csv(train_seg_path, index=None, header=True)
test_df.to_csv(test_seg_path, index=None, header=True)
train_df['data'] = train_df[['X', 'Y']].apply(lambda x: '<sep>'.join(x), axis=1)
train_df = train_df.drop(['X'], axis=1)
train_df = train_df.drop(['Y'], axis=1)
train_df.to_csv(train_seg_path, index=None, header=True)
print('The csv files have been saved!')
print('\n')
print('6. The next step is to post-process the files from step 5 and save them as .txt files.')
print('All work in this script is finished!')
if __name__ == '__main__':
build_dataset(train_raw_data_path, test_raw_data_path)
1.7 vocab
# Import system packages
import os
import sys
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import related packages
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
# If the pre-trained word-vector strategy is used, import the related configuration and packages
from utils.config1 import word_vector_path
from gensim.models import word2vec
# Vocabulary class
class Vocab(object):
PAD = 0
SOS = 1
EOS = 2
UNK = 3
def __init__(self):
self.word2index = {}
self.word2count = Counter()
self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
self.index2word = self.reserved[:]
# If pre-trained word vectors are used they are loaded later; otherwise this stays None
self.embedding_matrix = None
# Add words to the vocabulary
def add_words(self, words):
for word in words:
if word not in self.word2index:
self.word2index[word] = len(self.index2word)
self.index2word.append(word)
# Since word2count is a Counter, a single update() call is enough.
self.word2count.update(words)
# If word vectors have been pre-trained, this method assigns them to embedding_matrix
def load_embeddings(self, word_vector_model_path):
# load the pre-trained word2vec model
wv_model = word2vec.Word2Vec.load(word_vector_model_path)
# extract the embedding matrix from the model
self.embedding_matrix = wv_model.wv.vectors
# If item is an int, return the word at that index; otherwise return the id of the word (UNK if absent)
def __getitem__(self, item):
if type(item) is int:
return self.index2word[item]
return self.word2index.get(item, self.UNK)
# Current length of the vocabulary (i.e. the total number of words)
def __len__(self):
return len(self.index2word)
# Total number of words in the vocabulary (same as __len__)
def size(self):
return len(self.index2word)
if __name__ == '__main__':
vocab = Vocab()
print(vocab)
print('***')
print(vocab.size())
print('***')
print(vocab.embedding_matrix)
2. src
2.1 evaluate
# Import packages
import os
import sys
import torch
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import project modules
from utils.dataset import collate_fn
from utils import config
# Evaluation function
def evaluate(model, val_data, epoch):
print('validating')
val_loss = []
# No gradients are needed during evaluation, so the parameters stay unchanged
with torch.no_grad():
DEVICE = config.DEVICE
# Build the data iterator; pin_memory=True is an optimization for GPU machines
# Because of the PGN-specific data format, the custom collate_fn is passed in
val_dataloader = DataLoader(dataset=val_data,
batch_size=config.batch_size,
shuffle=True,
pin_memory=True,
drop_last=True,
collate_fn=collate_fn)
# Iterate over the validation data for evaluation
for batch, data in enumerate(tqdm(val_dataloader)):
x, y, x_len, y_len, oov, len_oovs = data
if config.is_cuda:
x = x.to(DEVICE)
y = y.to(DEVICE)
x_len = x_len.to(DEVICE)
len_oovs = len_oovs.to(DEVICE)
total_num = len(val_dataloader)
loss = model(x, x_len, y, len_oovs, batch=batch, num_batches=total_num, teacher_forcing=True)
val_loss.append(loss.item())
# Return the average loss over the whole validation set
return np.mean(val_loss)
2.2 model
import torch.nn as nn
# Encoder class
class Encoder(nn.Module):
def __init__(self, vocab_size, embed_size, hidden_size, rnn_drop=0):
super(Encoder, self).__init__()
# The embedding layer is trained together with the rest of the model
self.embedding = nn.Embedding(vocab_size, embed_size)
self.hidden_size = hidden_size
# The encoder body is a single-layer bidirectional LSTM
self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, dropout=rnn_drop, batch_first=True)
def forward(self, x):
embedded = self.embedding(x)
output, hidden = self.lstm(embedded)
return output, hidden
# Attention class
class Attention(nn.Module):
def __init__(self, hidden_units):
super(Attention, self).__init__()
# Linear layers corresponding to Wh and Ws in equation 1 of the paper
self.Wh = nn.Linear(2 * hidden_units, 2 * hidden_units, bias=False)
self.Ws = nn.Linear(2 * hidden_units, 2 * hidden_units)
# Linear layer corresponding to the outermost v in equation 1 of the paper
self.v = nn.Linear(2 * hidden_units, 1, bias=False)
def forward(self, decoder_states, encoder_output, x_padding_masks):
h_dec, c_dec = decoder_states
# Concatenate the two tensors along the last dimension to get the decoder state s_t: (1, batch_size, 2*hidden_units)
s_t = torch.cat([h_dec, c_dec], dim=2)
# Move batch_size to the first dimension: (batch_size, 1, 2*hidden_units)
s_t = s_t.transpose(0, 1)
# Expand s_t to match the shape of h_i: (batch_size, seq_length, 2*hidden_units)
s_t = s_t.expand_as(encoder_output).contiguous()
# Compute e_t according to equation 1 of the paper, in three steps
# Step 1: apply the respective linear transformations
# Wh * h_i: (batch_size, seq_length, 2*hidden_units)
encoder_features = self.Wh(encoder_output.contiguous())
# Ws * s_t: (batch_size, seq_length, 2*hidden_units)
decoder_features = self.Ws(s_t)
# Step 2: add the two parts
# (batch_size, seq_length, 2*hidden_units)
attn_inputs = encoder_features + decoder_features
# Step 3: apply tanh followed by the linear layer v
# (batch_size, seq_length, 1)
score = self.v(torch.tanh(attn_inputs))
# With the scores computed, apply equation 2 of the paper (softmax)
# (batch_size, seq_length)
attention_weights = F.softmax(score, dim=1).squeeze(2)
# Apply the padding mask so the invalid PAD positions of the encoder input are masked out
attention_weights = attention_weights * x_padding_masks
# Re-normalize the attention weights after masking
normalization_factor = attention_weights.sum(1, keepdim=True)
attention_weights = attention_weights / normalization_factor
# Apply equation 3 of the paper: weight the encoder hidden states by the attention distribution to get the context vector
# (batch_size, 1, 2*hidden_units)
context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_output)
# (batch_size, 2*hidden_units)
context_vector = context_vector.squeeze(1)
return context_vector, attention_weights
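# Decoder class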
class Decoder(nn.Module):
def __init__(self, vocab_size, embed_size, hidden_size, enc_hidden_size=None):
super(Decoder, self).__init__()
# The decoder embedding layer is also trained together with the model
self.embedding = nn.Embedding(vocab_size, embed_size)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
# The decoder body is a unidirectional LSTM, unlike the bidirectional LSTM on the encoder side
self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
# The decoder hidden state is concatenated with the context vector, so the input dimension is 3 * hidden_size
self.W1 = nn.Linear(self.hidden_size * 3, self.hidden_size)
self.W2 = nn.Linear(self.hidden_size, vocab_size)
if config.pointer:
# To compute equation 8 of the paper, the input dimension here is 4 * hidden_size + embed_size
self.w_gen = nn.Linear(self.hidden_size * 4 + embed_size, 1)
def forward(self, x_t, decoder_states, context_vector):
# First run the decoder forward pass
decoder_emb = self.embedding(x_t)
decoder_output, decoder_states = self.lstm(decoder_emb, decoder_states)
# Next comes the computation of equation 4 of the paper.
# Concatenate the context vector and the decoder state: (batch_size, 3*hidden_units)
decoder_output = decoder_output.view(-1, config.hidden_size)
concat_vector = torch.cat([decoder_output, context_vector], dim=-1)
# Pass through the two linear layers V and V' and apply softmax to obtain the vocabulary distribution
# (batch_size, hidden_units)
FF1_out = self.W1(concat_vector)
# (batch_size, vocab_size)
FF2_out = self.W2(FF1_out)
# (batch_size, vocab_size)
p_vocab = F.softmax(FF2_out, dim=1)
# Build the decoder state s_t.
h_dec, c_dec = decoder_states
# (1, batch_size, 2*hidden_units)
s_t = torch.cat([h_dec, c_dec], dim=2)
# p_gen is computed from three parts: the context vector h_t*, the decoder state s_t and the decoder input x_t.
# The code below computes equation 8 of the paper.
p_gen = None
if config.pointer:
# Here the three input tensors are simply concatenated and passed through a single linear layer w_gen, which differs from the computation in the original paper.
# This is also a hint that you could increase model capacity by implementing the three separate projections exactly as in the original paper.
x_gen = torch.cat([context_vector, s_t.squeeze(0), decoder_emb.squeeze(1)], dim=-1)
p_gen = torch.sigmoid(self.w_gen(x_gen))
return p_vocab, decoder_states, p_gen
# Class that sums the bidirectional encoder states so they match the shape the decoder expects
class ReduceState(nn.Module):
def __init__(self):
super(ReduceState, self).__init__()
def forward(self, hidden):
h, c = hidden
h_reduced = torch.sum(h, dim=0, keepdim=True)
c_reduced = torch.sum(c, dim=0, keepdim=True)
return (h_reduced, c_reduced)
# Import system packages
import os
import sys
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import third-party packages
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import project modules
from utils import config
from utils.func_utils import timer, replace_oovs
from utils.vocab import Vocab
# PGN model class
class PGN(nn.Module):
def __init__(self, v):
super(PGN, self).__init__()
# Store the vocabulary object
self.v = v
self.DEVICE = config.DEVICE
# Initialize the four sub-modules
self.attention = Attention(config.hidden_size)
self.encoder = Encoder(len(v), config.embed_size, config.hidden_size)
self.decoder = Decoder(len(v), config.embed_size, config.hidden_size)
self.reduce_state = ReduceState()
# Compute the final extended-vocabulary distribution
def get_final_distribution(self, x, p_gen, p_vocab, attention_weights, max_oov):
if not config.pointer:
return p_vocab
batch_size = x.size()[0]
# Clamp p_gen; the exact range can be tuned
p_gen = torch.clamp(p_gen, 0.001, 0.999)
# The next two lines compute equation 9 of the paper.
p_vocab_weighted = p_gen * p_vocab
# (batch_size, seq_len)
attention_weighted = (1 - p_gen) * attention_weights
# Build the extended-vocabulary probability distribution
# extended_size = len(self.v) + max_oovs
extension = torch.zeros((batch_size, max_oov)).float().to(self.DEVICE)
# (batch_size, extended_vocab_size)
p_vocab_extended = torch.cat([p_vocab_weighted, extension], dim=1)
# Following equation 9, scatter-add the copy probabilities attention_weighted onto the source-id positions given by x
final_distribution = p_vocab_extended.scatter_add_(dim=1, index=x, src=attention_weighted)
return final_distribution
def forward(self, x, x_len, y, len_oovs, batch, num_batches, teacher_forcing):
x_copy = replace_oovs(x, self.v)
x_padding_masks = torch.ne(x, 0).byte().float()
# Step 1: run the encoder
encoder_output, encoder_states = self.encoder(x_copy)
decoder_states = self.reduce_state(encoder_states)
# Collect the loss of every decoding step
step_losses = []
# Step 2: decode step by step; each time step goes through the attention layer and then the decoder layer.
# Initialize the decoder input with the first column of the ground truth, i.e. the first token of the reference summary
x_t = y[:, 0]
for t in range(y.shape[1] - 1):
# With teacher forcing, every time step is fed the ground-truth token
if teacher_forcing:
x_t = y[:, t]
x_t = replace_oovs(x_t, self.v)
y_t = y[:, t + 1]
# Compute the context vector through the attention layer
context_vector, attention_weights = self.attention(decoder_states,encoder_output,x_padding_masks)
# Compute the vocabulary distribution and the new hidden states through the decoder
p_vocab, decoder_states, p_gen = self.decoder(x_t.unsqueeze(1), decoder_states, context_vector)
# Obtain the final distribution
final_dist = self.get_final_distribution(x,p_gen,p_vocab,attention_weights,torch.max(len_oovs))
# The prediction at step t becomes the input of step t + 1 (unless teacher forcing is used).
x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)
# Gather the probabilities the model assigns to the target tokens
if not config.pointer:
y_t = replace_oovs(y_t, self.v)
target_probs = torch.gather(final_dist, 1, y_t.unsqueeze(1))
target_probs = target_probs.squeeze(1)
# Mask out the PAD positions on the decoder side so they do not pollute the loss
mask = torch.ne(y_t, 0).byte()
# Add eps as a smoothing term to avoid computing log(0)
loss = -torch.log(target_probs + config.eps)
# Apply the mask first, then record the loss
mask = mask.float()
loss = loss * mask
step_losses.append(loss)
# Step 3: compute the loss of the whole batch in preparation for backpropagation.
sample_losses = torch.sum(torch.stack(step_losses, 1), 1)
# Count the non-PAD tokens as the effective length of each sequence
seq_len_mask = torch.ne(y, 0).byte().float()
batch_seq_len = torch.sum(seq_len_mask, dim=1)
# Average the per-sample losses over the batch
batch_loss = torch.mean(sample_losses / batch_seq_len)
return batch_loss
if __name__ == '__main__':
v = Vocab()
model = PGN(v)
print(model)
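To make equation 9 concrete, here is a toy numeric check of the mixing step performed by get_final_distribution (all numbers are invented for illustration):
# Toy check of the final-distribution mixing (equation 9)
import torch
p_gen = torch.tensor([[0.7]])                             # generation probability
p_vocab = torch.full((1, 5), 0.2)                         # uniform vocabulary distribution over 5 in-vocab ids
attention_weights = torch.tensor([[0.1, 0.4, 0.3, 0.2]])  # attention over a 4-token source
x = torch.tensor([[2, 5, 2, 6]])                          # source ids; 5 and 6 are source-document OOV ids
p_vocab_weighted = p_gen * p_vocab                        # 0.7 * 0.2 = 0.14 per in-vocab id
attention_weighted = (1 - p_gen) * attention_weights      # 0.3 * attention weights
p_vocab_extended = torch.cat([p_vocab_weighted, torch.zeros((1, 2))], dim=1)
final_dist = p_vocab_extended.scatter_add_(dim=1, index=x, src=attention_weighted)
print(final_dist)        # id 2 collects 0.14 + 0.03 + 0.09; OOV ids 5 and 6 collect 0.12 and 0.06
print(final_dist.sum())  # the extended distribution still sums to 1.0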
2.3 predict
# Import packages
import random
import os
import sys
import torch
import jieba
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import project modules
from utils import config
from src.model import PGN
from utils.dataset import PairDataset
from utils.func_utils import source2ids, outputids2words, timer, add2heap, replace_oovs
# Prediction class
class Predict():
@timer(module='initalize predicter')
def __init__(self):
self.DEVICE = config.DEVICE
dataset = PairDataset(config.train_data_path,
max_enc_len=config.max_enc_len,
max_dec_len=config.max_dec_len,
truncate_enc=config.truncate_enc,
truncate_dec=config.truncate_dec)
self.vocab = dataset.build_vocab(embed_file=config.embed_file)
self.model = PGN(self.vocab)
self.stop_word = list(set([self.vocab[x.strip()] for x in open(config.stop_word_file).readlines()]))
# Load the trained model weights and move the model to the GPU.
self.model.load_state_dict(torch.load(config.model_save_path))
self.model.to(self.DEVICE)
def greedy_search(self, x, max_sum_len, len_oovs, x_padding_masks):
encoder_output, encoder_states = self.model.encoder(replace_oovs(x, self.vocab))
# Initialize the decoder hidden state from the encoder hidden state
decoder_states = self.model.reduce_state(encoder_states)
# Use SOS as the first decoder input token
x_t = torch.ones(1) * self.vocab.SOS
x_t = x_t.to(self.DEVICE, dtype=torch.int64)
summary = [self.vocab.SOS]
# Decode in a loop for at most max_sum_len steps
while int(x_t.item()) != (self.vocab.EOS) and len(summary) < max_sum_len:
context_vector, attention_weights = self.model.attention(decoder_states,
encoder_output,
x_padding_masks)
p_vocab, decoder_states, p_gen = self.model.decoder(x_t.unsqueeze(1),
decoder_states,
context_vector)
final_dist = self.model.get_final_distribution(x, p_gen, p_vocab,
attention_weights,
torch.max(len_oovs))
# Predict the next token with the greedy decoding strategy
x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)
decoder_word_idx = x_t.item()
# Append the predicted token to the summary
summary.append(decoder_word_idx)
x_t = replace_oovs(x_t, self.vocab)
return summary
@timer(module='doing prediction')
def predict(self, text, tokenize=True):
if isinstance(text, str) and tokenize:
text = list(jieba.cut(text))
# Map the source text to an id sequence
x, oov = source2ids(text, self.vocab)
x = torch.tensor(x).to(self.DEVICE)
# Build the OOV-length tensor and the padding-mask tensor
len_oovs = torch.tensor([len(oov)]).to(self.DEVICE)
x_padding_masks = torch.ne(x, 0).byte().float()
# Run greedy decoding to obtain the summary.
summary = self.greedy_search(x.unsqueeze(0),
max_sum_len=config.max_dec_steps,
len_oovs=len_oovs,
x_padding_masks=x_padding_masks)
# Convert the summary id sequence back to natural-language text
summary = outputids2words(summary, oov, self.vocab)
# Remove the special tokens <SOS> and <EOS>
return summary.replace('<SOS>', '').replace('<EOS>', '').strip()
if __name__ == "__main__":
print('Instantiating the Predict object and building the dataset and vocab...')
pred = Predict()
print('vocab_size: ', len(pred.vocab))
# Randomly pick a sample in test set to predict.
with open(config.val_data_path, 'r') as test:
picked = random.choice(list(test))
source, ref = picked.strip().split('<SEP>')
print('source: ', source, '\n')
print('---------------------------------------------------------------')
print('ref: ', ref, '\n')
print('---------------------------------------------------------------')
greedy_prediction = pred.predict(source.split())
print('greedy: ', greedy_prediction, '\n')
2.4 train
# Import system packages
import pickle
import os
import sys
# Set the project root path so that project modules can be imported
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)
# Import third-party packages used in the project
import numpy as np
from torch import optim
from torch.utils.data import DataLoader
import torch
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
from tensorboardX import SummaryWriter
# Import project-specific modules, classes and functions
from src.model import PGN
from utils import config
from src.evaluate import evaluate
from utils.dataset import PairDataset, collate_fn, SampleDataset
from utils.func_utils import config_info
# Main training routine.
def train(dataset, val_dataset, v, start_epoch=0):
DEVICE = config.DEVICE
# Instantiate the PGN model and move it to the GPU (or CPU).
model = PGN(v)
model.to(DEVICE)
print("loading data......")
train_data = SampleDataset(dataset.pairs, v)
val_data = SampleDataset(val_dataset.pairs, v)
print("initializing optimizer......")
# Define the optimizer for training.
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
# Build the training DataLoader (the custom collate_fn serves the PGN-specific data format).
train_dataloader = DataLoader(dataset=train_data,
batch_size=config.batch_size,
shuffle=True,
collate_fn=collate_fn)
# Initialize the best validation loss with a large number.
val_losses = 10000000.0
# SummaryWriter: the TensorboardX logger used for visualization.
writer = SummaryWriter(config.log_path)
num_epochs = len(range(start_epoch, config.epochs))
# The teacher-forcing strategy is used during training
teacher_forcing = True
print('teacher_forcing = {}'.format(teacher_forcing))
# Train over the whole dataset for the number of epochs set in config.py.
with tqdm(total=config.epochs) as epoch_progress:
for epoch in range(start_epoch, config.epochs):
# Print the training configuration before every epoch.
print(config_info(config))
# Initialize the list that stores the loss of every batch
batch_losses = []
num_batches = len(train_dataloader)
# For each epoch, read the data batch by batch and train the model
with tqdm(total=num_batches//100) as batch_progress:
for batch, data in enumerate(tqdm(train_dataloader)):
x, y, x_len, y_len, oov, len_oovs = data
assert not np.any(np.isnan(x.numpy()))
# Move the tensors to the GPU if one is configured
if config.is_cuda:
x = x.to(DEVICE)
y = y.to(DEVICE)
x_len = x_len.to(DEVICE)
len_oovs = len_oovs.to(DEVICE)
# Put the model into training mode (parameters take part in backprop and updates)
model.train()
# "老三样"中的第一步: 梯度清零
optimizer.zero_grad()
# Run the model forward pass and obtain the loss
loss = model(x, x_len, y,
len_oovs, batch=batch,
num_batches=num_batches,
teacher_forcing=teacher_forcing)
batch_losses.append(loss.item())
# "老三样"中的第二步: 反向传播
loss.backward()
# Clip the gradients to prevent gradient explosion.
clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)
# "老三样"中的第三步: 参数更新
optimizer.step()
# Log the loss every 100 batches.
if (batch % 100) == 0:
batch_progress.set_description(f'Epoch {epoch}')
batch_progress.set_postfix(Batch=batch, Loss=loss.item())
batch_progress.update()
# Write the loss to TensorBoard.
writer.add_scalar(f'Average loss for epoch {epoch}',
np.mean(batch_losses),
global_step=batch)
# Use the mean of all batch losses as the loss of this epoch.
epoch_loss = np.mean(batch_losses)
epoch_progress.set_description(f'Epoch {epoch}')
epoch_progress.set_postfix(Loss=epoch_loss)
epoch_progress.update()
# After each epoch, evaluate the model on the validation set
avg_val_loss = evaluate(model, val_data, epoch)
print('training loss:{}'.format(epoch_loss), 'validation loss:{}'.format(avg_val_loss))
# If the validation loss improved, save the model and update the best loss.
if (avg_val_loss < val_losses):
torch.save(model.encoder, config.encoder_save_name)
torch.save(model.decoder, config.decoder_save_name)
torch.save(model.attention, config.attention_save_name)
torch.save(model.reduce_state, config.reduce_state_save_name)
torch.save(model.state_dict(), config.model_save_path)
val_losses = avg_val_loss
# Persist the improved loss value to a file
with open(config.losses_path, 'wb') as f:
pickle.dump(val_losses, f)
writer.close()
if __name__ == '__main__':
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('DEVICE: ', DEVICE)
# Build the training dataset pairs
dataset = PairDataset(config.train_data_path,
max_enc_len=config.max_enc_len,
max_dec_len=config.max_dec_len,
truncate_enc=config.truncate_enc,
truncate_dec=config.truncate_dec)
# Build the validation dataset pairs
val_dataset = PairDataset(config.val_data_path,
max_enc_len=config.max_enc_len,
max_dec_len=config.max_dec_len,
truncate_enc=config.truncate_enc,
truncate_dec=config.truncate_dec)
# Build the model vocabulary
vocab = dataset.build_vocab(embed_file=config.embed_file)
# Run training (with validation after each epoch)
train(dataset, val_dataset, vocab, start_epoch=0)
3. data
3.1 make_data
import os
import sys
# Open the final output files
train_writer = open('train.txt', 'w', encoding='utf-8')
test_writer = open('test.txt', 'w', encoding='utf-8')
# Process the training data: join article and abstract with '<SEP>'
n = 0
with open('train_seg_data.csv', 'r', encoding='utf-8') as f1:
f1.readline()
for line in f1.readlines():
line = line.strip().strip('\n')
article, abstract = line.split('<sep>')
text = article + '<SEP>' + abstract + '\n'
train_writer.write(text)
n += 1
print('train n=', n)
n = 0
# Process the test data: only change the storage format from .csv to .txt
with open('test_seg_data.csv', 'r', encoding='utf-8') as f2:
f2.readline()
for line in f2.readlines():
line = line.strip().strip('\n')
text = line + '\n'
test_writer.write(text)
n += 1
print('test n=', n)
# Close the output files
train_writer.close()
test_writer.close()