NLP

文本摘要(二)

PGN架构

Posted by 新宇 on April 24, 2021

一、PGN架构图

  • PGN架构本质上仍是一个seq2seq架构,与传统seq2seq不同的是
    • 包含注意力机制
    • 通过context-vec、decoder-output、decoder-input生成概率p-gen,p-gen代表了从vocab-dist生成词汇的概率,而(1-p-gen)代表从attention-dist生成词汇的概率,最后对两个分布加权求和,得到最终的词汇分布
    • 改进点:
      • 力求将描述原文细节的单词直接用到摘要中,极大的解决OOV问题;
      • 引入机制来控制和跟踪原始文本的重复范围, 减少生成摘要的重复问题

二、PGN论文

论文连接

三、运行截图&预测截图

四、代码实现

1. utils

1.1 config

import torch
import os
import sys
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 神经网络通用参数

hidden_size = 512
dec_hidden_size = 512
embed_size = 512
pointer = True

# 模型相关配置参数

max_vocab_size = 20000
model_name = 'pgn_model'
embed_file = root_path + '/data/wv/word2vec.model'
source = 'train'
train_data_path = root_path + '/data/train.txt'
val_data_path = root_path + '/data/dev.txt'
test_data_path = root_path + '/data/test.txt'
stop_word_file = root_path + '/data/stopwords.txt'
losses_path = root_path + '/data/loss.txt'
log_path = root_path + '/data/log_train.txt'
word_vector_model_path = root_path + '/data/wv/word2vec.model'
encoder_save_name = root_path + '/saved_model/model_encoder.pt'
decoder_save_name = root_path + '/saved_model/model_decoder.pt'
attention_save_name = root_path + '/saved_model/model_attention.pt'
reduce_state_save_name = root_path + '/saved_model/model_reduce_state.pt'
model_save_path = root_path + '/saved_model/pgn_model.pt'
max_enc_len = 300
max_dec_len = 100
truncate_enc = True
truncate_dec = True
# 下面两个参数关系到predict阶段的展示效果, 需要按业务场景调参

min_dec_steps = 30
# 在Greedy Decode的时候设置为50
# max_dec_steps = 50
# 在Beam-search Decode的时候设置为30

max_dec_steps = 30
enc_rnn_dropout = 0.5
enc_attn = True
dec_attn = True
dec_in_dropout = 0
dec_rnn_dropout = 0
dec_out_dropout = 0

# 训练参数

trunc_norm_init_std = 1e-4
eps = 1e-31
learning_rate = 0.001
lr_decay = 0.0
initial_accumulator_value = 0.1
epochs = 5
batch_size = 32
is_cuda = True

# 下面4个参数都是第六章的优化策略

coverage = False
fine_tune = False
scheduled_sampling = False
weight_tying = False

max_grad_norm = 2.0
DEVICE = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
LAMBDA = 1

1.2 config1

import os

# 设置项目的root目录, 方便后续代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# 原始数据文本存储路径

train_raw_data_path = os.path.join(root_path, 'data', 'train.csv')
test_raw_data_path = os.path.join(root_path, 'data', 'test.csv')

# 停用词表和用户自定义字典的存储路径

stop_words_path = os.path.join(root_path, 'data', 'stopwords.txt')
user_dict_path = os.path.join(root_path, 'data', 'user_dict.txt')

# 预处理+切分后的训练测试数据路径

train_seg_path = os.path.join(root_path, 'data', 'train_seg_data.csv')
test_seg_path = os.path.join(root_path, 'data', 'test_seg_data.csv')

# 经过第一轮处理后的最终训练集数据和测试集数据

train_data_path = os.path.join(root_path, 'data', 'train.txt')
test_data_path = os.path.join(root_path, 'data', 'test.txt')

# 词向量模型路径

word_vector_path = os.path.join(root_path, 'data', 'wv', 'word2vec.model')

1.3 dataset


# 导入相关工具包

import sys
import os
from collections import Counter
import torch
from torch.utils.data import Dataset

# 设置项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入项目中的自定义代码文件

from utils.func_utils import simple_tokenizer, count_words, sort_batch_by_len, source2ids, abstract2ids
from utils.vocab import Vocab
from utils import config


# 创建数据对的类

class PairDataset(object):
    def __init__(self, filename, tokenize=simple_tokenizer, max_enc_len=None, max_dec_len=None,
                 truncate_enc=False, truncate_dec=False):
        print("Reading dataset %s..." % filename, end=' ', flush=True)
        self.filename = filename
        self.pairs = []

        # 直接读取训练集数据文件, 切分成编码器数据, 解码器数据, 并做长度的统一

        with open(filename, 'r', encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                # 在数据预处理阶段已经约定好了x, y之间以<SEP>分隔

                pair = line.strip().split('<SEP>')
                if len(pair) != 2:
                    print("Line %d of %s is error formed." % (i, filename))
                    print(line)
                    continue
                # 前半部分是编码器数据, 即article原始文本.

                enc = tokenize(pair[0])
                if max_enc_len and len(enc) > max_enc_len:
                    if truncate_enc:
                        enc = enc[:max_enc_len]
                    else:
                        continue

                # 后半部分是解码器数据, 即abstract摘要文本

                dec = tokenize(pair[1])
                if max_dec_len and len(dec) > max_dec_len:
                    if truncate_dec:
                        dec = dec[:max_dec_len]
                    else:
                        continue
                # 以元组数据对的格式存储进结果列表中

                self.pairs.append((enc, dec))
        print("%d pairs." % len(self.pairs))

    # 构建模型所需的字典

    def build_vocab(self, embed_file=None):
        # 对读取的文件进行单词计数统计

        word_counts = Counter()
        count_words(word_counts, [enc + dec for enc, dec in self.pairs])
        # 初始化字典类

        vocab = Vocab()
        # 如果有预训练词向量就直接加载, 如果没有则随着模型一起训练获取

        vocab.load_embeddings(embed_file)

        # 将计数得到的结果写入字典类中

        for word, count in word_counts.most_common(config.max_vocab_size):
            vocab.add_words([word])

        # 返回在vocab.py代码文件中定义的字典类结果

        return vocab

# 直接为后续创建DataLoader提供服务的数据集预处理类

class SampleDataset(Dataset):
    def __init__(self, data_pair, vocab):
        self.src_sents = [x[0] for x in data_pair]
        self.trg_sents = [x[1] for x in data_pair]
        self.vocab = vocab
        self._len = len(data_pair)

    # 需要自定义__getitem__()取元素的函数

    def __getitem__(self, index):
        # 调用工具函数获取输入x和oov

        x, oov = source2ids(self.src_sents[index], self.vocab)
        # 完全按照模型需求, 自定义返回格式, 共有6个字段, 每个字段"个性化定制"即可

        return {'x': [self.vocab.SOS] + x + [self.vocab.EOS],
                'OOV': oov,
                'len_OOV': len(oov),
                'y': [self.vocab.SOS] + abstract2ids(self.trg_sents[index], self.vocab, oov) + [self.vocab.EOS],
                'x_len': len(self.src_sents[index]),
                'y_len': len(self.trg_sents[index])
                }

    def __len__(self):
        return self._len

# 创建DataLoader时自定义的数据处理函数

def collate_fn(batch):
    # 按照最大长度的限制, 对张量进行填充0

    def padding(indice, max_length, pad_idx=0):
        pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
        return torch.tensor(pad_indice)

    # 对一个批次中的数据, 按照x_len字段进行排序

    data_batch = sort_batch_by_len(batch)

    # 依次取得所需的字段, 作为构建DataLoader的返回数据, 本模型需要6个字段

    x = data_batch['x']
    x_max_length = max([len(t) for t in x])
    y = data_batch['y']
    y_max_length = max([len(t) for t in y])

    OOV = data_batch['OOV']
    len_OOV = torch.tensor(data_batch['len_OOV'])

    x_padded = padding(x, x_max_length)
    y_padded = padding(y, y_max_length)

    x_len = torch.tensor(data_batch['x_len'])
    y_len = torch.tensor(data_batch['y_len'])

    return x_padded, y_padded, x_len, y_len, OOV, len_OOV

1.4 func_utils

# 导入系统工具包

import os
import sys
# 设置项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入项目中的工具包

import numpy as np
import time
import heapq
import random
import pathlib
from utils import config
import torch


# 函数耗时计量函数

def timer(module):
    def wrapper(func):
        # func: 一个函数名, 下面的计时函数就是计算这个func函数的耗时.

        def cal_time( *args, **kwargs):
            t1 = time.time()
            res = func( *args, **kwargs)
            t2 = time.time()
            cost_time = t2 - t1
            print(f'{cost_time} secs used for ', module)
            return res
        return cal_time
    return wrapper

# 将一段文本按空格切分, 返回结果列表

def simple_tokenizer(text):
    return text.split()

# 以字典计数的方式统计一段文本中不同单词的数量

def count_words(counter, text):
    for sentence in text:
        for word in sentence:
            counter[word] += 1

# 对一个批次batch_size个样本, 按照x_len字段长短进行排序, 并返回排序后的结果

def sort_batch_by_len(data_batch):
    # 初始化一个结果字典, 其中包含的6个字段都是未来数据迭代器中的6个字段

    res = {'x': [],
           'y': [],
           'x_len': [],
           'y_len': [],
           'OOV': [],
           'len_OOV': []}

    # 遍历批次数据, 分别将6个字段数据按照字典key值, 添加进各自的列表中

    for i in range(len(data_batch)):
        res['x'].append(data_batch[i]['x'])
        res['y'].append(data_batch[i]['y'])
        res['x_len'].append(len(data_batch[i]['x']))
        res['y_len'].append(len(data_batch[i]['y']))
        res['OOV'].append(data_batch[i]['OOV'])
        res['len_OOV'].append(data_batch[i]['len_OOV'])

    # 以x_len字段大小进行排序, 并返回下标结果的列表

    sorted_indices = np.array(res['x_len']).argsort()[::-1].tolist()

    # 返回的data_batch依然保持字典类型

    data_batch = {name: [_tensor[i] for i in sorted_indices] for name, _tensor in res.items()}

    return data_batch

# 原始文本映射成ids张量

def source2ids(source_words, vocab):
    ids = []
    oovs = []
    unk_id = vocab.UNK
    for w in source_words:
        i = vocab[w]
        if i == unk_id:  # 如果w是OOV单词

            if w not in oovs:  # 将w添加进OOV列表中

                oovs.append(w)
            # 索引0对应第一个source document OOV, 索引1对应第二个source document OOV, 以此类推......

            oov_num = oovs.index(w)
            # 在本项目中索引vocab_size对应第一个source document OOV, vocab_size+1对应第二个source document OOV

            ids.append(vocab.size() + oov_num)
        else:
            ids.append(i)
    return ids, oovs

# 摘要文本映射成数字化ids张量

def abstract2ids(abstract_words, vocab, source_oovs):
    ids = []
    unk_id = vocab.UNK
    for w in abstract_words:
        i = vocab[w]
        if i == unk_id:  # 如果w是OOV单词

            if w in source_oovs:  # 如果w是source document OOV

                # 对这样的w计算出一个新的映射id值

                vocab_idx = vocab.size() + source_oovs.index(w)
                ids.append(vocab_idx)
            else:  # 如果w不是一个source document OOV

                ids.append(unk_id)  # 对这样的w只能用UNK的id值来代替

        else:
            ids.append(i) # 如果w是词表中的单词, 直接取id值映射

    return ids

# 将输出张量ids结果映射成自然语言文本

def outputids2words(id_list, source_oovs, vocab):
    words = []
    for i in id_list:
        try:
            # w可能是<UNK>

            w = vocab.index2word[i]
        # w是OOV单词

        except IndexError:
            assert_msg = "Error: 无法在词典中找到该ID值."
            assert source_oovs is not None, assert_msg
            # 寄希望索引i是一个source document OOV单词

            source_oov_idx = i - vocab.size()
            try:
                # 如果成功取到, 则w是source document OOV单词

                w = source_oovs[source_oov_idx]
            # i不仅是OOV单词, 也不对应source document中的原文单词

            except ValueError:
                raise ValueError('Error: 模型生成的ID: %i, 原始文本中的OOV ID: %i \
                                  但是当前样本中只有%i个OOVs'
                                  % (i, source_oov_idx, len(source_oovs)))
        # 向结果列表中添加原始字符

        words.append(w)
    # 空格连接成字符串返回

    return ' '.join(words)

# 创建小顶堆, 包含k个点的特殊二叉树, 始终保持二叉树中最小的值在root根节点

def add2heap(heap, item, k):
    if len(heap) < k:
        heapq.heappush(heap, item)
    else:
        heapq.heappushpop(heap, item)

# 将文本张量中所有OOV单词的id, 全部替换成<UNK>对应的id

def replace_oovs(in_tensor, vocab):
    # oov_token = torch.full(in_tensor.shape, vocab.UNK).long().to(config.DEVICE)

    # 在Pytorch1.5.0以及更早的版本中, torch.full()默认返回float类型

    # 在Pytorch1.7.0最新版本中, torch.full()会将bool返回成torch.bool, 会将integer返回成torch.long.

    # 上面一行代码在Pytorch1.6.0版本中会报错, 因为必须指定成long类型, 如下面代码所示

    oov_token = torch.full(in_tensor.shape, vocab.UNK, dtype=torch.long).to(config.DEVICE)

    out_tensor = torch.where(in_tensor > len(vocab) - 1, oov_token, in_tensor)
    return out_tensor

# 获取模型训练中若干超参数信息

def config_info(config):
    info = 'model_name = {}, pointer = {}, coverage = {}, fine_tune = {}, scheduled_sampling = {}, weight_tying = {},' + 'source = {}  '
    return (info.format(config.model_name, config.pointer, config.coverage, config.fine_tune, config.scheduled_sampling, config.weight_tying, config.source))


@timer(module='test a demo program')
def test():
    s = 0
    for i in range(100000000):
        s += i
    print('s = ', s)


if __name__ == '__main__':
    test()

1.5 multi_proc_utils

import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool

# cpu 数量

cores = cpu_count()
# 分块个数

partitions = cores


def parallelize(df, func):
    # 数据切分

    data_split = np.array_split(df, partitions)
    # 线程池

    pool = Pool(cores)
    # 数据分发 合并

    data = pd.concat(pool.map(func, data_split))
    # 关闭线程池

    pool.close()
    # 执行完close后不会有新的进程加入到pool,join函数等待所有子进程结束

    pool.join()
    return data

1.6 preprocess

# 导入若干工具包

import re
import jieba
import pandas as pd
import numpy as np
import os
import sys

# 设置项目的root目录, 方便后续相关代码包的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入文本预处理的配置信息config1

from utils.config1 import *
# 导入多核CPU并行处理数据的函数

from utils.multi_proc_utils import *

# jieba载入自定义切词表

jieba.load_userdict(user_dict_path)


# 根据max_len和vocab填充<START> <STOP> <PAD> <UNK>

def pad_proc(sentence, max_len, word_to_id):
    # 1: 按空格统计切分出词

    words = sentence.strip().split(' ')
    # 2: 截取规定长度的词数

    words = words[:max_len]
    # 3: 填充<UNK>

    sentence = [w if w in word_to_id else '<UNK>' for w in words]
    # 4: 填充<START>, <END>

    sentence = ['<START>'] + sentence + ['<STOP>']
    # 5: 判断长度, 填充<PAD>

    sentence = sentence + ['<PAD>'] * (max_len - len(words))
    return ' '.join(sentence)


# 加载停用词

def load_stop_words(stop_word_path):
    # stop_word_path: 停用词路径

    # 打开文件

    f = open(stop_word_path, 'r', encoding='utf-8')
    # 读取所有行

    stop_words = f.readlines()
    # 去除每一个停用词前后的空格, 换行符

    stop_words = [stop_word.strip() for stop_word in stop_words]
    return stop_words

# 加载停用词

stop_words = load_stop_words(stop_words_path)


# 清洗文本, 删除特殊符号(被sentence_proc调用)

def clean_sentence(sentence):
    # sentence: 待处理的字符串

    if isinstance(sentence, str):
        # 删除1. 2. 3. 这些标题

        r = re.compile("\D(\d\.)\D")
        sentence = r.sub("", sentence)

        # 删除带括号的 进口 海外

        r = re.compile(r"[((]进口[))]|\(海外\)")
        sentence = r.sub("", sentence)
        # 删除除了汉字数字字母和,!?。.- 以外的字符

        r = re.compile("[^,!?。\.\-\u4e00-\u9fa5_a-zA-Z0-9]")
        # 用中文输入法下的,!?来替换英文输入法下的,!?

        sentence = sentence.replace(",", ",")
        sentence = sentence.replace("!", "!")
        sentence = sentence.replace("?", "?")
        sentence = r.sub("", sentence)

        # 删除--- 车主说, 技师说, 语音, 图片, 你好, 您好

        r = re.compile(r"车主说|技师说|语音|图片|你好|您好")
        sentence = r.sub("", sentence)

        return sentence
    else:
        return ''


# 过滤一句切好词的话中的停用词(被sentence_proc调用)

def filter_stopwords(seg_list):
    # seg_list: 切好词的列表 [word1 ,word2 .......]

    # 首先去掉多余空字符

    words = [word for word in seg_list if word]
    # 去掉停用词

    return [word for word in words if word not in stop_words]


# 预处理模块(处理一条句子, 被sentences_proc调用)

def sentence_proc(sentence):
    # sentence:待处理字符串

    # 清除无用词

    sentence = clean_sentence(sentence)
    # 切词, 默认精确模式, 全模式cut参数cut_all=True

    words = jieba.cut(sentence)
    # 过滤停用词

    words = filter_stopwords(words)
    # 拼接成一个字符串, 按空格分隔

    return ' '.join(words)


# 预处理模块(处理一个句子列表, 对每个句子调用sentence_proc操作)

def sentences_proc(df):
    # df: 数据集

    # 批量预处理训练集和测试集

    for col_name in ['Brand', 'Model', 'Question', 'Dialogue']:
        df[col_name] = df[col_name].apply(sentence_proc)

    if 'Report' in df.columns:
        # 训练集 Report 预处理

        df['Report'] = df['Report'].apply(sentence_proc)

    return df


# 用于数据加载+预处理(只需执行一次)

def build_dataset(train_raw_data_path, test_raw_data_path):
    # 1. 加载原始数据

    print('1. 加载原始数据.')
    print(train_raw_data_path)
    # 必须指定解码格式为utf-8

    train_df = pd.read_csv(train_raw_data_path, engine='python', encoding='utf-8')
    test_df = pd.read_csv(test_raw_data_path, engine='python', encoding='utf-8')
    print('原始训练集行数 {}, 测试集行数 {}'.format(len(train_df), len(test_df)))
    print('\n')

    # 2. 空值去除(对于一行数据, 任意列只要有空值就去掉该行)

    print('2. 空值去除(对于一行数据, 任意列只要有空值就去掉该行).')
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how='any', inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    print('空值去除后训练集行数 {}, 测试集行数 {}'.format(len(train_df), len(test_df)))
    print('\n')

    # 3. 多线程, 批量数据预处理(对每个句子执行sentence_proc, 清除无用词, 切词, 过滤停用词, 再用空格拼接为一个>字符串)

    print('3. 多线程, 批量数据预处理(对每个句子执行sentence_proc, 清除无用词, 切词, 过滤停用词, 再用空格拼接为一个字符串).')
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)
    print('\n')
    print('sentences_proc has done!')

    # 4. 合并训练测试集,用于训练词向量

    print('4. 合并训练测试集, 用于训练词向量.')
    # 新建一列,按行堆积

    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    train_df['Y'] = train_df[['Report']]
    # 新建一列,按行堆积

    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # 5. 保存分割处理好的train_seg_data.csv、test_set_data.csv

    print('5. 保存处理好的train_seg_data.csv, test_set_data.csv.')
    # 把建立的列merged去掉,该列对于神经网络无用,只用于训练词向量

    train_df = train_df.drop(['Question'], axis=1)
    train_df = train_df.drop(['Dialogue'], axis=1)
    train_df = train_df.drop(['Brand'], axis=1)
    train_df = train_df.drop(['Model'], axis=1)
    train_df = train_df.drop(['Report'], axis=1)
    train_df = train_df.drop(['QID'], axis=1)
    test_df = test_df.drop(['Question'], axis=1)
    test_df = test_df.drop(['Dialogue'], axis=1)
    test_df = test_df.drop(['Brand'], axis=1)
    test_df = test_df.drop(['Model'], axis=1)
    test_df = test_df.drop(['QID'], axis=1)
    # 将处理后的数据存入持久化文件

    # train_df.to_csv(train_seg_path, index=None, header=True)

    test_df.to_csv(test_seg_path, index=None, header=True)
    train_df['data'] = train_df[['X', 'Y']].apply(lambda x: '<sep>'.join(x), axis=1)
    train_df = train_df.drop(['X'], axis=1)
    train_df = train_df.drop(['Y'], axis=1)
    train_df.to_csv(train_seg_path, index=None, header=True)
    print('The csv_file has saved!')
    print('\n')
    print('6. 后续工作是将第5步的结果文件进行适当处理, 并保存为.txt文件.')
    print('本程序代码所有工作执行完毕!')

if __name__ == '__main__':
    build_dataset(train_raw_data_path, test_raw_data_path)

1.7 vocab

# 导入工具包

import os
import sys
# 设置项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入相关工具包

from collections import Counter
import numpy as np
import torch
import torch.nn as nn

# 如果采用预训练词向量的策略, 则导入相关配置和工具包

from utils.config1 import word_vector_path
from gensim.models import word2vec


# 词典类的创建

class Vocab(object):
    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

    def __init__(self):
        self.word2index = {}
        self.word2count = Counter()
        self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
        self.index2word = self.reserved[:]
        # 如果预训练词向量, 则后续直接载入; 否则置为None即可

        self.embedding_matrix = None

    # 向类词典中增加单词

    def add_words(self, words):
        for word in words:
            if word not in self.word2index:
                self.word2index[word] = len(self.index2word)
                self.index2word.append(word)

        # 因为引入Counter()工具包, 直接执行update()更新即可.

        self.word2count.update(words)

    # 如果已经提前预训练的词向量, 则执行类内函数对embedding_matrix赋值

    def load_embeddings(self, word_vector_model_path):
        # 直接下载预训练词向量模型

        wv_model = word2vec.Word2Vec.load(word_vector_model_path)
        # 从模型中直接提取词嵌入矩阵

        self.embedding_matrix = wv_model.wv.vectors

    # 根据id值item读取字典中的单词

    def __getitem__(self, item):
        if type(item) is int:
            return self.index2word[item]
        return self.word2index.get(item, self.UNK)

    # 获取字典的当前长度(等效于单词总数)

    def __len__(self):
        return len(self.index2word)

    # 获取字典的当前单词总数

    def size(self):
        return len(self.index2word)

if __name__ == '__main__':
    vocab = Vocab()
    print(vocab)
    print('***')
    print(vocab.size())
    print('***')
    print(vocab.embedding_matrix)

2. src

2.1 evaluate

# 导入工具包

import os
import sys
import torch
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader

# 设定项目的root路径, 方便后续代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入项目的相关代码文件

from utils.dataset import collate_fn
from utils import config


# 编写评估函数

def evaluate(model, val_data, epoch):
    print('validating')
    val_loss = []
    # 评估模型需要设定参数不变

    with torch.no_grad():
        DEVICE = config.DEVICE
        # 创建数据迭代器, pin_memory=True是对于GPU机器的优化设置
        # 为了PGN模型数据的特殊性, 传入自定义的collate_fn提供个性化服务

        val_dataloader = DataLoader(dataset=val_data,
                                    batch_size=config.batch_size,
                                    shuffle=True,
                                    pin_memory=True,
                                    drop_last=True,
                                    collate_fn=collate_fn)

        # 遍历测试集数据进行评估

        for batch, data in enumerate(tqdm(val_dataloader)):
            x, y, x_len, y_len, oov, len_oovs = data
            if config.is_cuda:
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                x_len = x_len.to(DEVICE)
                len_oovs = len_oovs.to(DEVICE)
            total_num = len(val_dataloader)

            loss = model(x, x_len, y, len_oovs, batch=batch, num_batches=total_num, teacher_forcing=True)
            val_loss.append(loss.item())
    # 返回整个测试集的平均损失值

    return np.mean(val_loss)

2.2 model

import torch.nn as nn

# 构建编码器类

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, rnn_drop=0):
        super(Encoder, self).__init__()
        # 词嵌入层采用跟随模型一起训练的模式

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.hidden_size = hidden_size
        # 编码器的主体采用单层, 双向LSTM结构

        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, dropout=rnn_drop, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded)
        return output, hidden

# 构建注意力类

class Attention(nn.Module):
    def __init__(self, hidden_units):
        super(Attention, self).__init__()
        # 定义前向传播层, 对应论文中的公式1中的Wh, Ws

        self.Wh = nn.Linear(2 * hidden_units, 2 * hidden_units, bias=False)
        self.Ws = nn.Linear(2 * hidden_units, 2 * hidden_units)
        # 定义全连接层, 对应论文中的公式1中最外层的v

        self.v = nn.Linear(2 * hidden_units, 1, bias=False)

    def forward(self, decoder_states, encoder_output, x_padding_masks):
        h_dec, c_dec = decoder_states
        # 将两个张量在最后一个维度拼接, 得到deocder state St: (1, batch_size, 2*hidden_units)

        s_t = torch.cat([h_dec, c_dec], dim=2)
        # 将batch_size置于第一个维度上: (batch_size, 1, 2*hidden_units)

        s_t = s_t.transpose(0, 1)
        # 按照hi的维度扩展St的维度: (batch_size, seq_length, 2*hidden_units)

        s_t = s_t.expand_as(encoder_output).contiguous()

         # 根据论文中的公式1来计算et, 总共有三步

        # 第一步: 分别经历各自的全连接层矩阵乘法

        # Wh * h_i: (batch_size, seq_length, 2*hidden_units)

        encoder_features = self.Wh(encoder_output.contiguous())
        # Ws * s_t: (batch_size, seq_length, 2*hidden_units)

        decoder_features = self.Ws(s_t)

        # 第二步: 两部分执行加和运算

        # (batch_size, seq_length, 2*hidden_units)

        attn_inputs = encoder_features + decoder_features

        # 第三步: 执行tanh运算和一个全连接层的运算

        # (batch_size, seq_length, 1)

        score = self.v(torch.tanh(attn_inputs))

        # 得到score后, 执行论文中的公式2

        # (batch_size, seq_length)

        attention_weights = F.softmax(score, dim=1).squeeze(2)

        # 添加一步执行padding mask的运算, 将编码器端无效的PAD字符全部遮掩掉

        attention_weights = attention_weights * x_padding_masks

        # 最整个注意力层执行一次正则化操作

        normalization_factor = attention_weights.sum(1, keepdim=True)
        attention_weights = attention_weights / normalization_factor

        # 执行论文中的公式3,将上一步得到的attention distributon应用在encoder hidden states上,得到context_vector

        # (batch_size, 1, 2*hidden_units)

        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_output)
        # (batch_size, 2*hidden_units)

        context_vector = context_vector.squeeze(1)

        return context_vector, attention_weights

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, enc_hidden_size=None):
        super(Decoder, self).__init__()
        # 解码器端也采用跟随模型一起训练的方式, 得到词嵌入层

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # 解码器的主体结构采用单向LSTM, 区别于编码器端的双向LSTM

        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # 因为要将decoder hidden state和context vector进行拼接, 因此需要3倍的hidden_size维度设置

        self.W1 = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.W2 = nn.Linear(self.hidden_size, vocab_size)

        if config.pointer:
            # 因为要根据论文中的公式8进行运算, 所谓输入维度上匹配的是4 * hidden_size + embed_size

            self.w_gen = nn.Linear(self.hidden_size * 4 + embed_size, 1)

    def forward(self, x_t, decoder_states, context_vector):
        # 首先计算Decoder的前向传播输出张量

        decoder_emb = self.embedding(x_t)
        decoder_output, decoder_states = self.lstm(decoder_emb, decoder_states)

        # 接下来就是论文中的公式4的计算.

        # 将context vector和decoder state进行拼接, (batch_size, 3*hidden_units)

        decoder_output = decoder_output.view(-1, config.hidden_size)
        concat_vector = torch.cat([decoder_output, context_vector], dim=-1)

        # 经历两个全连接层V和V1后,再进行softmax运算, 得到vocabulary distribution

        # (batch_size, hidden_units)

        FF1_out = self.W1(concat_vector)
        # (batch_size, vocab_size)

        FF2_out = self.W2(FF1_out)
        # (batch_size, vocab_size)

        p_vocab = F.softmax(FF2_out, dim=1)

        # 构造decoder state s_t.

        h_dec, c_dec = decoder_states
        # (1, batch_size, 2*hidden_units)

        s_t = torch.cat([h_dec, c_dec], dim=2)

        # p_gen是通过context vector h_t, decoder state s_t, decoder input x_t, 三个部分共同计算出来的.

        # 下面的部分是计算论文中的公式8.

        p_gen = None
        if config.pointer:
            # 这里面采用了直接拼接3部分输入张量, 然后经历一个共同的全连接层w_gen, 和原始论文的计算不同.

            # 这也给了大家提示, 可以提高模型的复杂度, 完全模拟原始论文中的3个全连接层来实现代码.

            x_gen = torch.cat([context_vector, s_t.squeeze(0), decoder_emb.squeeze(1)], dim=-1)
            p_gen = torch.sigmoid(self.w_gen(x_gen))

        return p_vocab, decoder_states, p_gen

# 构造加和state的类, 方便模型运算

class ReduceState(nn.Module):
    def __init__(self):
        super(ReduceState, self).__init__()

    def forward(self, hidden):
        h, c = hidden
        h_reduced = torch.sum(h, dim=0, keepdim=True)
        c_reduced = torch.sum(c, dim=0, keepdim=True)
        return (h_reduced, c_reduced)

# 导入系统工具包

import os
import sys
# 设置项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入若干工具包

import torch
import torch.nn as nn
import torch.nn.functional as F
# 导入项目中的相关代码文件

from utils import config
from utils.func_utils import timer, replace_oovs
from utils.vocab import Vocab


# 构建PGN类

class PGN(nn.Module):
    def __init__(self, v):
        super(PGN, self).__init__()
        # 初始化字典对象

        self.v = v
        self.DEVICE = config.DEVICE

        # 依次初始化4个类对象

        self.attention = Attention(config.hidden_size)
        self.encoder = Encoder(len(v), config.embed_size, config.hidden_size)
        self.decoder = Decoder(len(v), config.embed_size, config.hidden_size)
        self.reduce_state = ReduceState()

    # 计算最终分布的函数

    def get_final_distribution(self, x, p_gen, p_vocab, attention_weights, max_oov):
        if not config.pointer:
            return p_vocab

        batch_size = x.size()[0]
        # 进行p_gen概率值的裁剪, 具体取值范围可以调参

        p_gen = torch.clamp(p_gen, 0.001, 0.999)
        # 接下来两行代码是论文中公式9的计算.

        p_vocab_weighted = p_gen * p_vocab
        # (batch_size, seq_len)

        attention_weighted = (1 - p_gen) * attention_weights

        # 得到扩展后的单词概率分布(extended-vocab probability distribution)

        # extended_size = len(self.v) + max_oovs

        extension = torch.zeros((batch_size, max_oov)).float().to(self.DEVICE)
        # (batch_size, extended_vocab_size)

        p_vocab_extended = torch.cat([p_vocab_weighted, extension], dim=1)

        # 根据论文中的公式9, 累加注意力值attention_weighted到对应的单词位置x

        final_distribution = p_vocab_extended.scatter_add_(dim=1, index=x, src=attention_weighted)

        return final_distribution

    def forward(self, x, x_len, y, len_oovs, batch, num_batches, teacher_forcing):
        x_copy = replace_oovs(x, self.v)
        x_padding_masks = torch.ne(x, 0).byte().float()
        # 第一步: 进行Encoder的编码计算

        encoder_output, encoder_states = self.encoder(x_copy)
        decoder_states = self.reduce_state(encoder_states)
        # 初始化每一步的损失值

        step_losses = []

        # 第二步: 循环解码, 每一个时间步都经历注意力的计算, 解码器层的计算.

        # 初始化解码器的输入, 是ground truth中的第一列, 即真实摘要的第一个字符

        x_t = y[:, 0]
        for t in range(y.shape[1] - 1):
            # 如果使用Teacher_forcing, 则每一个时间步用真实标签来指导训练

            if teacher_forcing:
                x_t = y[:, t]

            x_t = replace_oovs(x_t, self.v)
            y_t = y[:, t + 1]
            # 通过注意力层计算context vector

            context_vector, attention_weights = self.attention(decoder_states,encoder_output,x_padding_masks)

            # 通过解码器层计算得到vocab distribution和hidden states

            p_vocab, decoder_states, p_gen = self.decoder(x_t.unsqueeze(1), decoder_states, context_vector)

            # 得到最终的概率分布

            final_dist = self.get_final_distribution(x,p_gen,p_vocab,attention_weights,torch.max(len_oovs))

            # 第t个时间步的预测结果, 将作为第t + 1个时间步的输入(如果采用Teacher-forcing则不同).

            x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)

            # 根据模型对target tokens的预测, 来获取到预测的概率

            if not config.pointer:
                y_t = replace_oovs(y_t, self.v)
            target_probs = torch.gather(final_dist, 1, y_t.unsqueeze(1))
            target_probs = target_probs.squeeze(1)

            # 将解码器端的PAD用padding mask遮掩掉, 防止计算loss时的干扰

            mask = torch.ne(y_t, 0).byte()
            # 为防止计算log(0)而做的数学上的平滑处理

            loss = -torch.log(target_probs + config.eps)

            # 先遮掩, 再添加损失值

            mask = mask.float()
            loss = loss * mask
            step_losses.append(loss)

        # 第三步: 计算一个批次样本的损失值, 为反向传播做准备.

        sample_losses = torch.sum(torch.stack(step_losses, 1), 1)
        # 统计非PAD的字符个数, 作为当前批次序列的有效长度

        seq_len_mask = torch.ne(y, 0).byte().float()
        batch_seq_len = torch.sum(seq_len_mask, dim=1)

        # 计算批次样本的平均损失值

        batch_loss = torch.mean(sample_losses / batch_seq_len)
        return batch_loss

if __name__ == '__main__':
    v = Vocab()
    model = PGN(v)
    print(model)

2.3 predict

# 导入工具包

import random
import os
import sys
import torch
import jieba

# 设定项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入项目的相关代码文件

from utils import config
from src.model import PGN
from utils.dataset import PairDataset
from utils.func_utils import source2ids, outputids2words, timer, add2heap, replace_oovs


# 构建预测类

class Predict():
    @timer(module='initalize predicter')
    def __init__(self):
        self.DEVICE = config.DEVICE

        dataset = PairDataset(config.train_data_path,
                              max_enc_len=config.max_enc_len,
                              max_dec_len=config.max_dec_len,
                              truncate_enc=config.truncate_enc,
                              truncate_dec=config.truncate_dec)

        self.vocab = dataset.build_vocab(embed_file=config.embed_file)

        self.model = PGN(self.vocab)
        self.stop_word = list(set([self.vocab[x.strip()] for x in open(config.stop_word_file).readlines()]))

        # 导入已经训练好的模型, 并转移到GPU上.

        self.model.load_state_dict(torch.load(config.model_save_path))
        self.model.to(self.DEVICE)

    def greedy_search(self, x, max_sum_len, len_oovs, x_padding_masks):
        encoder_output, encoder_states = self.model.encoder(replace_oovs(x, self.vocab))

        # 用encoder的hidden state初始化decoder的hidden state

        decoder_states = self.model.reduce_state(encoder_states)

        # 利用SOS作为解码器的初始化输入字符

        x_t = torch.ones(1) * self.vocab.SOS
        x_t = x_t.to(self.DEVICE, dtype=torch.int64)
        summary = [self.vocab.SOS]

        # 循环解码, 最多解码max_sum_len步

        while int(x_t.item()) != (self.vocab.EOS) and len(summary) < max_sum_len:
            context_vector, attention_weights = self.model.attention(decoder_states,
                                                                     encoder_output,
                                                                     x_padding_masks)

            p_vocab, decoder_states, p_gen = self.model.decoder(x_t.unsqueeze(1),
                                                                decoder_states,
                                                                context_vector)

            final_dist = self.model.get_final_distribution(x, p_gen, p_vocab,
                                                           attention_weights,
                                                           torch.max(len_oovs))

            # 以贪心解码策略预测字符

            x_t = torch.argmax(final_dist, dim=1).to(self.DEVICE)
            decoder_word_idx = x_t.item()

            # 将预测的字符添加进结果摘要中

            summary.append(decoder_word_idx)
            x_t = replace_oovs(x_t, self.vocab)

        return summary

    @timer(module='doing prediction')
    def predict(self, text, tokenize=True):
        if isinstance(text, str) and tokenize:
            text = list(jieba.cut(text))

        # 将原始文本映射成数字化张量

        x, oov = source2ids(text, self.vocab)
        x = torch.tensor(x).to(self.DEVICE)

        # 获取OOV的长度和padding mask张量

        len_oovs = torch.tensor([len(oov)]).to(self.DEVICE)
        x_padding_masks = torch.ne(x, 0).byte().float()

        # 利用贪心解码函数得到摘要结果.

        summary = self.greedy_search(x.unsqueeze(0),
                                     max_sum_len=config.max_dec_steps,
                                     len_oovs=len_oovs,
                                     x_padding_masks=x_padding_masks)

        # 将得到的摘要数字化张量转换成自然语言文本

        summary = outputids2words(summary, oov, self.vocab)

        # 删除掉特殊字符<SOS>和<EOS>

        return summary.replace('<SOS>', '').replace('<EOS>', '').strip()

if __name__ == "__main__":
    print('实例化Predict对象, 构建dataset和vocab......')
    pred = Predict()
    print('vocab_size: ', len(pred.vocab))
    # Randomly pick a sample in test set to predict.

    with open(config.val_data_path, 'r') as test:
        picked = random.choice(list(test))
        source, ref = picked.strip().split('<SEP>')

    print('source: ', source, '\n')
    print('---------------------------------------------------------------')
    print('ref: ', ref, '\n')
    print('---------------------------------------------------------------')
    greedy_prediction = pred.predict(source.split())
    print('greedy: ', greedy_prediction, '\n')

2.4 train

# 导入系统工具包

import pickle
import os
import sys
# 设置项目的root路径, 方便后续相关代码文件的导入

root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_path)

# 导入项目中用到的工具包

import numpy as np
from torch import optim
from torch.utils.data import DataLoader
import torch
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm
from tensorboardX import SummaryWriter

# 导入项目中自定义的代码文件, 类, 函数等

from src.model import PGN
from utils import config
from src.evaluate import evaluate
from utils.dataset import PairDataset, collate_fn, SampleDataset
from utils.func_utils import config_info


# 编写训练模型的主逻辑函数.

def train(dataset, val_dataset, v, start_epoch=0):
    DEVICE = config.DEVICE

    # 实例化PGN类对象并移动到GPU上(CPU).

    model = PGN(v)
    model.to(DEVICE)

    print("loading data......")
    train_data = SampleDataset(dataset.pairs, v)
    val_data = SampleDataset(val_dataset.pairs, v)

    print("initializing optimizer......")

    # 定义模型训练的优化器.

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # 定义训练集的数据迭代器(这里用到了自定义的collate_fn以服务于PGN特殊的数据结构).

    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)

    # 验证集上的损失值初始化为一个大整数.

    val_losses = 10000000.0

    # SummaryWriter: 为服务于TensorboardX写日志的可视化工具.

    writer = SummaryWriter(config.log_path)

    num_epochs =  len(range(start_epoch, config.epochs))

    # 训练阶段采用Teacher-forcing的策略

    teacher_forcing = True
    print('teacher_forcing = {}'.format(teacher_forcing))

    # 根据配置文件config.py中的设置, 对整个数据集进行一定轮次的迭代训练.

    with tqdm(total=config.epochs) as epoch_progress:
        for epoch in range(start_epoch, config.epochs):
            # 每一个epoch之前打印模型训练的相关配置信息.

            print(config_info(config))

            # 初始化每一个batch损失值的存放列表

            batch_losses = []
            num_batches = len(train_dataloader)

            # 针对每一个epoch, 按batch读取数据迭代训练模型

            with tqdm(total=num_batches//100) as batch_progress:
                for batch, data in enumerate(tqdm(train_dataloader)):
                    x, y, x_len, y_len, oov, len_oovs = data
                    assert not np.any(np.isnan(x.numpy()))

                    # 如果配置有GPU, 则加速训练

                    if config.is_cuda:
                        x = x.to(DEVICE)
                        y = y.to(DEVICE)
                        x_len = x_len.to(DEVICE)
                        len_oovs = len_oovs.to(DEVICE)

                    # 设置模型进入训练模式(参数参与反向传播和更新)

                    model.train()

                    # "老三样"中的第一步: 梯度清零

                    optimizer.zero_grad()
                    # 调用模型进行训练并返回损失值

                    loss = model(x, x_len, y,
                                 len_oovs, batch=batch,
                                 num_batches=num_batches,
                                 teacher_forcing=teacher_forcing)

                    batch_losses.append(loss.item())

                    # "老三样"中的第二步: 反向传播

                    loss.backward()

                    # 为防止梯度爆炸(gradient explosion)而进行梯度裁剪.

                    clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
                    clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
                    clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)

                    # "老三样"中的第三步: 参数更新

                    optimizer.step()

                    # 每隔100个batch记录一下损失值信息.

                    if (batch % 100) == 0:
                        batch_progress.set_description(f'Epoch {epoch}')
                        batch_progress.set_postfix(Batch=batch, Loss=loss.item())
                        batch_progress.update()
                        # 向tensorboard中写入损失值信息.

                        writer.add_scalar(f'Average loss for epoch {epoch}',
                                           np.mean(batch_losses),
                                           global_step=batch)

            # 将一个轮次中所有batch的平均损失值作为这个epoch的损失值.

            epoch_loss = np.mean(batch_losses)

            epoch_progress.set_description(f'Epoch {epoch}')
            epoch_progress.set_postfix(Loss=epoch_loss)
            epoch_progress.update()

            # 结束每一个epoch训练后, 直接在验证集上跑一下模型效果

            avg_val_loss = evaluate(model, val_data, epoch)

            print('training loss:{}'.format(epoch_loss), 'validation loss:{}'.format(avg_val_loss))

            # 更新更小的验证集损失值evaluating loss.

            if (avg_val_loss < val_losses):
                torch.save(model.encoder, config.encoder_save_name)
                torch.save(model.decoder, config.decoder_save_name)
                torch.save(model.attention, config.attention_save_name)
                torch.save(model.reduce_state, config.reduce_state_save_name)
                torch.save(model.state_dict(), config.model_save_path)
                val_losses = avg_val_loss

                # 将更小的损失值写入文件中

                with open(config.losses_path, 'wb') as f:
                    pickle.dump(val_losses, f)

    writer.close()

if __name__ == '__main__':
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('DEVICE: ', DEVICE)

    # 构建训练用的数据集对

    dataset = PairDataset(config.train_data_path,
                          max_enc_len=config.max_enc_len,
                          max_dec_len=config.max_dec_len,
                          truncate_enc=config.truncate_enc,
                          truncate_dec=config.truncate_dec)

    # 构建测试用的数据集对

    val_dataset = PairDataset(config.val_data_path,
                              max_enc_len=config.max_enc_len,
                              max_dec_len=config.max_dec_len,
                              truncate_enc=config.truncate_enc,
                              truncate_dec=config.truncate_dec)

    # 创建模型的单词字典

    vocab = dataset.build_vocab(embed_file=config.embed_file)

    # 调用训练函数进行训练并测试

    train(dataset, val_dataset, vocab, start_epoch=0)

3. data

3.1 make_data

import os
import sys

# 打开最终的结果文件

train_writer = open('train.txt', 'w', encoding='utf-8')
test_writer = open('test.txt', 'w', encoding='utf-8')

# 对训练数据做处理, 将article和abstract中间用'<SEP>'分隔

n = 0
with open('train_seg_data.csv', 'r', encoding='utf-8') as f1:
    f1.readline()
    for line in f1.readlines():
        line = line.strip().strip('\n')
        article, abstract = line.split('<sep>')
        text = article + '<SEP>' + abstract + '\n'
        train_writer.write(text)
        n += 1

print('train n=', n)
n = 0

# 对测试数据做处理, 仅将文件存储格式从.csv换成.txt

with open('test_seg_data.csv', 'r', encoding='utf-8') as f2:
    f2.readline()
    for line in f2.readlines():
        line = line.strip().strip('\n')
        text = line + '\n'
        test_writer.write(text)
        n += 1

print('test n=', n)