
DialoGPT inference on your own corpus


Introduction

Large-scale pretraining for dialogue
DialoGPT is a dialogue-generation model built on GPT-2 and pretrained on Reddit comment data.
It was released in 2019 and already has 400+ stars on GitHub, so it is well worth following.
Original paper

  • This assumes your environment is already set up; if not, follow the referenced blog post to complete the setup first.

Add an inference function to eval_utils.py

def inference_model_results(model, tokenizer, inference_dataloader, args):
    # uses the same signature as eval_model_generation
    logger.info('running model inference in eval mode, '
                'please change it back to train mode after calling this function')
    model.eval()
    tot_sample = []
    with torch.no_grad():
        for step, batch in enumerate(inference_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            input_ids, position_ids, token_ids, label_ids, src_len, _ = batch
            if args.no_token_id:
                token_ids = None
            n_sample = input_ids.shape[0]
            logits = model.inference(input_ids, position_ids, token_ids)

            def decode(batch_data, tokenizer, input_flag):
                # input_flag=True: batch_data already holds token ids (the posts);
                # input_flag=False: batch_data holds logits, so take the greedy argmax per position
                results = []
                batch_data = batch_data.cpu().data.numpy()
                for one_logits in batch_data:  # [sentence_len, vocabulary_size]
                    if not input_flag:
                        word_ids = np.argmax(one_logits, axis=1)
                    else:
                        word_ids = one_logits
                    words = []
                    for word_id in word_ids:
                        if tokenizer.decoder[word_id] != "<|endoftext|>":
                            words.append(tokenizer.decoder[word_id])
                        else:
                            break
                    # GPT-2 BPE prefixes tokens that follow a space with "Ġ"; strip it before joining
                    output_words = [word[1:] if word.startswith("Ġ") else word for word in words]
                    results.append(" ".join(output_words))
                return results

            posts = decode(input_ids, tokenizer, True)
            inferences = decode(logits, tokenizer, False)

            tot_sample.append(n_sample)
            logger.info("model inference results")
            for index in range(len(posts)):
                print("post: ", posts[index])
                print("inference: ", inferences[index])

            # print(inferences)
            # note: only the first batch is decoded here as a sanity check;
            # remove this break to run inference over the whole dataloader
            break
    # todo: collect and return the decoded results
    return None
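
For reference, here is a small illustration (my addition, not from the original post) of why decode() strips the "Ġ" prefix. It assumes enc is the GPT2Tokenizer instance that inference_LSP.py loads below; the example sentence is arbitrary.

# Hedged illustration: GPT-2 BPE marks a token that was preceded by a space with "Ġ".
# `enc` is assumed to be the GPT2Tokenizer loaded in inference_LSP.py below.
ids = enc.encode("hello there friend")
tokens = [enc.decoder[i] for i in ids]
print(tokens)
# e.g. ['hello', 'Ġthere', 'Ġfriend'] -- stripping the leading "Ġ" and joining
# with spaces recovers readable text, which is exactly what decode() does.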

Add an inference function to the class GPT2LMHeadModel(GPT2PreTrainedModel) in modeling_gpt2.py

    def inference(self, input_ids, position_ids=None, token_type_ids=None, past=None):
        # same as forward(), but without the loss branch: always return the raw LM logits
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        lm_logits = self.lm_head(hidden_states)
        return lm_logits
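
To see how this method can be used, here is a minimal greedy-decoding sketch (my addition, not part of the original post). model and enc are assumed to be the GPT2LMHeadModel and GPT2Tokenizer that inference_LSP.py creates below, and greedy_continue is a hypothetical helper name.

# Hedged sketch: greedy continuation built on the inference() method above.
import torch

def greedy_continue(model, enc, text, max_new_tokens=20, device="cuda"):
    # encode the prompt, then keep appending the argmax token until EOS or the length limit
    input_ids = torch.tensor([enc.encode(text)], dtype=torch.long, device=device)
    prompt_len = input_ids.shape[1]
    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model.inference(input_ids)          # [1, seq_len, vocab]
            next_id = int(torch.argmax(logits[0, -1]))   # greedy pick of the next token
            if enc.decoder[next_id] == "<|endoftext|>":
                break
            next_tensor = torch.tensor([[next_id]], dtype=torch.long, device=device)
            input_ids = torch.cat([input_ids, next_tensor], dim=-1)
    new_tokens = [enc.decoder[i] for i in input_ids[0, prompt_len:].tolist()]
    return "".join(t.replace("Ġ", " ") for t in new_tokens).strip()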

Create a custom inference_LSP.py file

File contents:

#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT license.
'''
 * @Desc: run inference with a fine-tuned DialoGPT (GPT-2) model.
          Modified based on Huggingface GPT-2 implementation
'''

import json
import os
import sys
import argparse
import logging
import time
import tqdm
import datetime
import torch

import numpy as np

from os.path import join
from torch.distributed import get_rank, get_world_size

from lsp_model import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, Adam
from gpt2_training.train_utils import load_model, boolean_string, set_lr, get_eval_list_same_length
from gpt2_training.eval_utils import eval_model_loss, inference_model_results

from data_loader import BucketingDataLoader, DynamicBatchingLoader, DistributedBucketingDataLoader

from gpt2_training.distributed import all_reduce_and_rescale_tensors, all_gather_list

os.environ['CUDA_VISIBLE_DEVICES'] = "0"

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
logger = logging.getLogger(__name__)

INF = 100000000
CACHE_EMPTY_STEP = 10000
EVAL_STEP = 10000

#########################################################################
# Prepare Parser
##########################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, required=True,
                    help='pretrained model name or path to local checkpoint')

parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--max_seq_length", type=int, default=128)
parser.add_argument("--init_checkpoint", type=str, required=True)
parser.add_argument("--inference_input_file", type=str, required=True)

parser.add_argument("--inference_batch_size", type=int, default=8)
parser.add_argument("--num_optim_steps", type=int, default=1000000,
                    help="new API specifies num update steps")

parser.add_argument("--fp16", type=boolean_string, default=True)
parser.add_argument("--normalize_data", type=boolean_string, default=True)
parser.add_argument("--loss_scale", type=float, default=0)
parser.add_argument("--no_token_id", type=boolean_string, default=True)

parser.add_argument("--log_dir", type=str, required=True)
# distributed
parser.add_argument('--local_rank', type=int, default=-1,
                    help='for torch.distributed')
parser.add_argument('--config', help='JSON config file')

# do normal parsing
args = parser.parse_args()

if args.config is not None:
    # override argparse defaults by config JSON
    opts = json.load(open(args.config))
    for k, v in opts.items():
        if isinstance(v, str):
            # PHILLY ENV special cases
            if 'PHILLY_JOB_DIRECTORY' in v:
                v = v.replace('PHILLY_JOB_DIRECTORY',
                              os.environ['PHILLY_JOB_DIRECTORY'])
            elif 'PHILLY_LOG_DIRECTORY' in v:
                v = v.replace('PHILLY_LOG_DIRECTORY',
                              os.environ['PHILLY_LOG_DIRECTORY'])
        setattr(args, k, v)

    # command line should override config JSON
    argv = sys.argv[1:]
    overrides, _ = parser.parse_known_args(argv)
    for k, v in vars(overrides).items():
        if f'--{k}' in argv:
            setattr(args, k, v)
    setattr(args, 'local_rank', overrides.local_rank)

if args.local_rank == -1:
    logger.info('CUDA available? {}'.format(str(torch.cuda.is_available())))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu
else:
    # distributed training
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    # Initializes the distributed backend which will take care of
    # sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
    n_gpu = torch.distributed.get_world_size()
    args.device, args.n_gpu = device, 1
    logger.info("device: {} n_gpu: {}, distributed training: {}, "
                "16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
log_dir = args.log_dir

logger.info('Input Argument Information')
args_dict = vars(args)
for a in args_dict:
    logger.info('%-28s  %s' % (a, args_dict[a]))

#########################################################################
# Prepare Data Set
##########################################################################
print("Prepare Data")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)

config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))

inference_dataloader_loss = DynamicBatchingLoader(
    args.inference_input_file, enc, args.normalize_data,
    args.inference_batch_size, args.max_seq_length)

inference_dataloader_gen = get_eval_list_same_length(
    args.inference_input_file, enc, args.inference_batch_size, True)

# eval_dataloader_loss = DynamicBatchingLoader(
#     args.eval_input_file, enc, args.normalize_data,
#     args.eval_batch_size, args.max_seq_length)
#
# eval_dataloader_gen = get_eval_list_same_length(
#     args.eval_input_file, enc, args.eval_batch_size, True)

#########################################################################
# Prepare Model
##########################################################################
print("Prepare Model")
logger.info("Prepare Model")
model = load_model(GPT2LMHeadModel(config), args.init_checkpoint,
                   args, verbose=True)

if args.local_rank != -1:
    # when from scratch make sure initial models are the same
    params = [p.data for p in model.parameters()]
    all_reduce_and_rescale_tensors(params, float(torch.distributed.get_world_size()))

no_decay = ['bias', 'ln']  # no decay for bias and LayerNorm (ln)

#########################################################################
# Inference !
##########################################################################
print("Model inference")
logger.info("Model inference")

inference_logger = open(join(log_dir, 'inference_log.txt'), 'a+', buffering=1)

epoch = 0

if args.local_rank != -1:
    n_gpu = 1
# todo modify loss out.
results = inference_model_results(model, enc, inference_dataloader_loss, args)
# todo output format
# print('{},{},{},{},{}'.format(epoch + 1, global_step + 1, step + 1, eval_loss, eval_ppl), file=inference_logger)
logger.info("inference_final_results:")
if results is None:
    logger.info("current results are None")
else:
    logger.info(results)

inference_logger.close()
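
The script also accepts a --config JSON whose keys override the argparse defaults above. A hedged example of writing such a file (my addition; the values are illustrative only):

# Hedged example: generate a config JSON accepted by the --config flag above.
# Keys must match the argparse argument names; the values here are illustrative.
import json

config = {
    "model_name_or_path": "./models/medium/",
    "init_checkpoint": "./models/medium/medium_ft.pkl",
    "inference_input_file": "./selfdata/validset.tsv",
    "inference_batch_size": 8,
    "fp16": False,
    "log_dir": "inference_logs_dir/",
}
with open("inference_config.json", "w") as f:
    json.dump(config, f, indent=2)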


Inference

python inference_LSP.py --model_name_or_path ./models/medium/ --init_checkpoint ./12_5_self_output/GPT2.1e-05.8.3gpu.2019-12-04225327/GP2-pretrain-step-50000.pkl --inference_input_file ./selfdata/attack_chatbot.tsv --log_dir inference_logs_dir/

To run on validset.tsv instead:
python inference_LSP.py --model_name_or_path ./models/medium/ --init_checkpoint ./12_5_self_output/GPT2.1e-05.8.3gpu.2019-12-04225327/GP2-pretrain-step-50000.pkl --inference_input_file ./selfdata/validset.tsv --log_dir inference_logs_dir/

The released fine-tuned checkpoint ./models/medium/medium_ft.pkl can also be passed as --init_checkpoint.