BERT Chinese Classifier in Practice
Single-Label
tsv

```
text    label
text1   label1
text2   label2
text3   label3
```
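The columns above are spaced for readability only; a real tsv file must be tab-delimited (see the P.S. below). A minimal sketch of writing one with the standard library, assuming the two-column layout shown above:

```python
# Write a tab-separated train.tsv matching the layout the Processor reads:
# text in column 0, label in column 1.
import csv

rows = [
    ("text", "label"),  # header; _create_examples skips the first line (i == 0)
    ("text1", "label1"),
    ("text2", "label2"),
    ("text3", "label3"),
]
with open("data_dir/train.tsv", "w", encoding="utf-8", newline="") as f:
    csv.writer(f, delimiter="\t").writerows(rows)
```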
bert (google-research)

[GitHub]

Folder structure
```
bert
|—— data_dir
|   |—— test.tsv
|   |—— train.tsv
|   |—— val.tsv
|—— chinese_L-12_H-768_A-12
|   |—— bert_config.json
|   |—— bert_model.ckpt.data-00000-of-00001
|   |—— bert_model.ckpt.index
|   |—— bert_model.ckpt.meta
|   |—— vocab.txt
|—— modeling.py
|—— optimization.py
|—— run_classifier.py
|—— tokenization.py
```
run_classifier.py
```python
# Add a custom Processor
class SelfProcessor(DataProcessor):
    """Build your own Processor by imitating the existing ones."""

    def get_train_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "val.tsv")), "val")

    def get_test_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["label1", "label2", "label3"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:  # the first tsv line is skipped, so it can hold a header or comments
                continue
            guid = "%s-%d" % (set_type, i)
            # the index into line must match the column holding the text in your tsv
            text = tokenization.convert_to_unicode(line[0])  # the text to classify
            if set_type == "test":
                label = "label1"  # any valid label will do; test labels are placeholders
            else:
                # the index into line must match the column holding the label in your tsv
                label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(guid=guid, text_a=text, text_b=None, label=label))
        return examples
```
```python
def main(_):
    # Edit the matching spot inside main(); locate it yourself.
    tf.logging.set_verbosity(tf.logging.INFO)
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mrpc": MrpcProcessor,
        # "xnli": XnliProcessor,
        "self": SelfProcessor,  # add this line; the key "self" must match --task_name at run time
        # the other processors can be commented out, and their classes deleted as well
    }
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
```
Run

```
python run_classifier.py \
  --task_name=self \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir=data_dir \
  --vocab_file=chinese_L-12_H-768_A-12/vocab.txt \
  --bert_config_file=chinese_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint=chinese_L-12_H-768_A-12/bert_model.ckpt \
  --max_seq_length=64 \
  --train_batch_size=32 \
  --learning_rate=5e-5 \
  --num_train_epochs=2.0 \
  --output_dir=tmp/self_output/
```
| Flag | Description |
|---|---|
| `--task_name` | must equal the key registered for your custom Processor in `processors` |
| `--data_dir` | data folder containing `train.tsv`, `test.tsv`, and `val.tsv` (names must match those passed to `_read_tsv` in your Processor) |
| `--vocab_file` | path to `vocab.txt` from the Google-released model |
| `--bert_config_file` | path to `bert_config.json` from the Google-released model |
| `--init_checkpoint` | path to `bert_model.ckpt` from the Google-released model |
| `--output_dir` | output folder |
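With `--do_predict=true`, `run_classifier.py` writes the class probabilities for the test set to `test_results.tsv` under `--output_dir`: one tab-separated row per example, columns in the order returned by `get_labels()`. A minimal sketch for mapping them back to label names:

```python
# Read test_results.tsv produced by run_classifier.py and print the
# highest-probability label for each test example.
labels = ["label1", "label2", "label3"]  # must match get_labels()

with open("tmp/self_output/test_results.tsv", encoding="utf-8") as f:
    for line in f:
        probs = [float(p) for p in line.strip().split("\t")]
        print(labels[probs.index(max(probs))])
```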
Versions

python 3.6
tensorflow 1.11.0 / tensorflow-gpu 1.11.0 (must not exceed 1.14.0; later TF versions remove APIs this code relies on and the run will fail)
pytorch-pretrained-bert (PyPI)

[PyPI] [GitHub]

Folder structure
```
torch_pretrained_bert
|—— data_dir
|—— run_classifier.py
```
run_classifier.py
```python
# Add a custom Processor
class SelfProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "val.tsv")), "val")

    def get_labels(self):
        return ["label1", "label2", "label3"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:  # skip the header line
                continue
            guid = "%s-%d" % (set_type, i)
            text = line[0]
            if set_type == "test":
                label = "label1"  # placeholder; test labels are ignored
            else:
                label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text, text_b=None, label=label))
        return examples
```
```python
def compute_metrics(task_name, preds, labels):
    # Edit the matching branch inside compute_metrics; locate it yourself.
    # ... existing branches ...
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "self":  # add this line
        return {"acc": simple_accuracy(preds, labels)}  # add this line
    else:
        raise KeyError(task_name)
```
```python
def main():
    # Edit the matching spots inside main(); locate them yourself.
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        # "rte": RteProcessor,
        # "wnli": WnliProcessor,
        "self": SelfProcessor,  # add this line
    }
    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        # "rte": "classification",
        # "wnli": "classification",
        "self": "classification",  # add this line
    }
```
Run

```
python run_classifier.py \
  --task_name self \
  --do_train \
  --do_eval \
  --data_dir data_dir \
  --bert_model bert-base-chinese \
  --max_seq_length 128 \
  --train_batch_size 16 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --output_dir tmp/self/
```
Versions

python 3.7
pytorch-pretrained-bert 0.6.2
transformers (huggingface)

[PyPI] [GitHub]

Folder structure (note the issue in the P.S.: the `transformers` folder below comes from the GitHub repo, not the PyPI package)
```
transformers_bert
|—— data_dir
|—— transformers
|   |—— data
|   |—— tests
|   |—— ....py
|—— run_glue.py
|—— requirements.txt
```
transformers_bert/transformers/data/metrics/__init__.py
```python
def glue_compute_metrics(task_name, preds, labels):
    # ... existing branches ...
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "self":  # add this line
        return {"acc": simple_accuracy(preds, labels)}  # add this line
    else:
        raise KeyError(task_name)
```
run_glue.py [GitHub]
```python
from transformers import DataProcessor, InputExample  # add this line: import DataProcessor and InputExample

# Add a custom Processor
class SelfProcessor(DataProcessor):
    def get_example_from_tensor_dict(self, tensor_dict):
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_train_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "val.tsv")), "val")

    def get_labels(self):
        return ["label1", "label2", "label3"]

    def _create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:  # skip the header line
                continue
            guid = "%s-%s" % (set_type, i)
            text = line[0]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text, text_b=None, label=label))
        return examples

output_modes["self"] = "classification"  # add this line: register the self task in output_modes
processors["self"] = SelfProcessor       # add this line: register the self task in processors
```
Run

```
python run_glue.py \
  --model_type bert \
  --model_name_or_path bert-base-chinese \
  --task_name self \
  --do_train \
  --do_eval \
  --do_lower_case \
  --data_dir data_dir \
  --max_seq_length 256 \
  --per_gpu_train_batch_size 8 \
  --learning_rate 5e-5 \
  --num_train_epochs 3.0 \
  --output_dir tmp/self/ \
  --save_steps 1000
```
| Flag | Description |
|---|---|
| `--save_steps` | checkpoint interval in steps (default 50; set it high, sized to your dataset, otherwise a large dataset produces so many checkpoints that the disk fills up and the run crashes) |
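After training, `run_glue.py` saves the fine-tuned model and tokenizer to `--output_dir` via `save_pretrained`. A minimal inference sketch (transformers 2.x API; the sentence is a placeholder):

```python
# Load the fine-tuned model from --output_dir and classify one sentence.
# The predicted index maps into get_labels() order: ["label1", "label2", "label3"].
import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("tmp/self/")
model = BertForSequenceClassification.from_pretrained("tmp/self/")
model.eval()

input_ids = torch.tensor([tokenizer.encode("待分类的文本", add_special_tokens=True)])
with torch.no_grad():
    logits = model(input_ids)[0]  # the model returns a tuple; logits come first
print(logits.argmax(dim=-1).item())
```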
Versions

python 3.7
transformers 2.2.0
Multi-Label
tsv

```
text    label
text1   ['label1', 'label2']
text2   ['label2']
text3   ['label1', 'label3']
```
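Each label cell holds the string form of a Python list; the Processor below recovers the individual label names with a regex and expands them into a multi-hot vector. A standalone sketch of that conversion (the cell value is made up):

```python
# Turn a tsv label cell like "['label1', 'label3']" into a multi-hot vector.
import re

labels = ['label1', 'label2', 'label3']
cell = "['label1', 'label3']"  # raw string as read from the tsv

names = re.findall(u"'(.*?)'", cell, re.S)  # -> ['label1', 'label3']
label_ids = [0.0] * len(labels)
for name in names:
    label_ids[labels.index(name)] = 1.0
print(label_ids)  # -> [1.0, 0.0, 1.0]
```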
pytorch-pretrained-bert (PyPI)

Folder structure
```
torch_pretrained_bert_multi_label
|—— multi_data_dir
|—— run_classifier_multi_label.py
```
run_classifier_multi_label.py
```python
import re  # add this import
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss  # add BCEWithLogitsLoss

# Modeled on BertForMultiLabelSequenceClassification in fast-bert/fast_bert/modeling.py
# and on BertForSequenceClassification
class BertForMultiLabelSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, num_labels):
        super().__init__(config, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if labels is not None:
            # BCEWithLogitsLoss treats each label as an independent binary
            # decision, which is what multi-label classification needs
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            return loss
        else:
            return logits
```
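Because each label is an independent binary decision, the logits should be read through a sigmoid rather than a softmax. A minimal usage sketch with dummy tensors, just to show the shapes (run inside the modified script, pytorch-pretrained-bert API):

```python
# Dummy forward pass: no labels passed in, so the model returns raw logits.
import torch

model = BertForMultiLabelSequenceClassification.from_pretrained(
    "bert-base-chinese", num_labels=3)
model.eval()

input_ids = torch.zeros(2, 64, dtype=torch.long)  # batch of 2, seq length 64
with torch.no_grad():
    logits = model(input_ids)      # shape (2, 3)
    probs = torch.sigmoid(logits)  # one independent probability per label
```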
```python
# Add a custom Processor
class SelfProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "val.tsv")), "val")

    def get_labels(self):
        return ['label1', 'label2', 'label3']

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:  # skip the header line
                continue
            guid = "%s-%d" % (set_type, i)
            text = line[0]
            labels = self.get_labels()
            label_ids = [0.0] * len(labels)
            if set_type == "test":
                label_ids[0] = 1.0  # placeholder; test labels are ignored
            else:
                # index 1 matches the label column of the tsv shown above;
                # adjust it to wherever the labels sit in your own file
                label = re.findall(u"'(.*?)'", line[1], re.S)
                label = [labels.index(one_label) for one_label in label]
                for label_id in label:
                    label_ids[label_id] = 1.0
            examples.append(
                InputExample(guid=guid, text_a=text, text_b=None, label=label_ids))
        return examples
```
```python
# Edit the indicated parts of this function
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode):
    # FROM
    if output_mode == "classification":
        # label_id = label_map[example.label]  # comment out: the label is already a multi-hot vector
        label_id = example.label  # add this line
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)
    # END

    # FROM
    logger.info(
        "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    logger.info("label: %s (id = %s)" % (example.label, str(label_id)))  # modify this line

    features.append(
        InputFeatures(input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      label_id=label_id))
    return features
    # END
```
```python
# Rewrite simple_accuracy entirely
def simple_accuracy(preds, labels):
    threshold = 0.9  # TODO tune this threshold
    preds_max = preds.max(axis=1)
    preds_threshold = preds_max * threshold  # per-example cutoff: threshold * row maximum
    del preds_max
    preds_labels = []
    for i, pred in enumerate(preds):
        preds_labels.append([1.0 if p > preds_threshold[i] else 0.0 for p in pred])
    preds_labels = torch.tensor(preds_labels, dtype=torch.float)
    # preds_labels = [[1,0,0,0,1...], ...]
    # labels       = [[1,0,0,1,...], ...]
    # accuracy as intersection-over-union (Jaccard) of predicted vs. gold label sets
    preds_labels = preds_labels.numpy()
    preds_labels_or_labels = np.logical_or(preds_labels, labels)
    preds_labels_and_labels = np.logical_and(preds_labels, labels)
    accuracy = preds_labels_and_labels.sum(axis=1) / preds_labels_or_labels.sum(axis=1)
    return accuracy.mean()
```
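Per example the score is |predicted ∩ gold| / |predicted ∪ gold|, averaged over the batch. A toy check with made-up numbers (run where `numpy` and `torch` are already imported, as in `run_classifier_multi_label.py`):

```python
import numpy as np

preds = np.array([[0.95, 0.10, 0.91],   # cutoff 0.9*0.95=0.855 -> predicts labels 0 and 2
                  [0.20, 0.80, 0.30]])  # cutoff 0.9*0.80=0.72  -> predicts label 1 only
gold = np.array([[1.0, 0.0, 1.0],
                 [0.0, 1.0, 1.0]])

print(simple_accuracy(preds, gold))  # (2/2 + 1/2) / 2 = 0.75
```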
```python
# Edit compute_metrics
def compute_metrics(task_name, preds, labels):
    # FROM
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "self":
        return {"acc": simple_accuracy(preds, labels)}
    else:
        raise KeyError(task_name)
    # END
```
```python
# Edit main(); each FROM..END pair below is one edit site
# (tell them apart by indentation and their order in the file)
def main():
    # FROM
    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        # "rte": RteProcessor,
        # "wnli": WnliProcessor,
        "self": SelfProcessor,
    }
    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        # "rte": "classification",
        # "wnli": "classification",
        "self": "classification",
    }
    # END

    # FROM
    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                   'distributed_{}'.format(args.local_rank))
    model = BertForMultiLabelSequenceClassification.from_pretrained(args.bert_model,
                                                                    cache_dir=cache_dir,
                                                                    num_labels=num_labels)  # change the classifier class name
    if args.fp16:
        model.half()
    model.to(device)
    # END

    # FROM
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)  # change dtype to float
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
    # END

    # FROM
    if output_mode == "classification":
        # loss_fct = CrossEntropyLoss()  # comment out
        loss_fct = BCEWithLogitsLoss()  # change
        loss = loss_fct(logits.view(-1), label_ids.view(-1))  # change
    elif output_mode == "regression":
        loss_fct = MSELoss()
        loss = loss_fct(logits.view(-1), label_ids.view(-1))
    # END

    # FROM
        model = BertForMultiLabelSequenceClassification.from_pretrained(args.output_dir,
                                                                        num_labels=num_labels)  # change the classifier class name
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForMultiLabelSequenceClassification.from_pretrained(args.bert_model,
                                                                        num_labels=num_labels)  # change the classifier class name
    model.to(device)
    # END

    # FROM
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)  # change dtype to float
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
    # END

    # FROM
    if output_mode == "classification":
        # loss_fct = CrossEntropyLoss()  # comment out
        loss_fct = BCEWithLogitsLoss()  # add
        tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))  # change
    elif output_mode == "regression":
        loss_fct = MSELoss()
        tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
    # END

    # FROM
    preds = preds[0]
    if output_mode == "classification":
        # preds = np.argmax(preds, axis=1)  # comment out
        pass  # add: keep the raw scores for the threshold-based metric
    elif output_mode == "regression":
        preds = np.squeeze(preds)
    result = compute_metrics(task_name, preds, all_label_ids.numpy())
    # END
```
Run

```
python run_classifier_multi_label.py \
  --task_name self \
  --do_train \
  --do_eval \
  --data_dir multi_data_dir/ \
  --bert_model bert-base-chinese \
  --max_seq_length 256 \
  --train_batch_size 8 \
  --learning_rate 5e-5 \
  --num_train_epochs 3.0 \
  --output_dir tmp/self/
```
Versions

python 3.7
pytorch-pretrained-bert 0.6.2
ENV

Windows 10
GeForce RTX 2060 SUPER (8 GB)
cuda_10.1.243_426.00_win10
cudnn-10.1-windows10-x64-v7.6.5.32
P.S.

Runtime issues
| Issue | Fix |
|---|---|
| GPU errors | 1. For errors such as OOM, reduce `max_seq_length` and `train_batch_size`. 2. Make sure CUDA and cuDNN are installed correctly. |
| Training on CPU takes too long | Use a GPU if at all possible; the speedup is substantial. |
| The Visual Studio step in the CUDA installer | In the custom install, untick the VS components; skipping them does not affect anything here. |
| `eval_accuracy` and similar results are unsatisfying | Tune hyperparameters: adjust `max_seq_length`, `train_batch_size`, and `num_train_epochs`. |
| Memory | Close memory-hungry applications so the run has enough RAM (especially when training on CPU). |
| `do_lower_case` WARNING while running `pytorch-pretrained-bert` | Safe to ignore; it has no effect. |
| The PyPI `transformers` is missing functions that `run_glue.py` needs | The `transformers` folder inside `transformers_bert` comes from the transformers folder of the GitHub project. |
| Training directly on the example tsv files above may fail | The examples are spaced for readability rather than actually tab-delimited, and the data volume is far too small; they are illustration only. Look up the exact tsv format yourself. |
| Training errors related to data volume | Add more data; too few examples cause errors. |
| The threshold in the `pytorch-pretrained-bert` multi-label classifier | Fine-tune the threshold to improve accuracy (see the sketch below). |
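A minimal sketch of such a sweep, assuming you have collected dev-set `preds` and `labels` as numpy arrays in the layout `simple_accuracy` expects (the helper and candidate values are hypothetical, not part of the original script):

```python
# Hypothetical helper: a parametrized copy of simple_accuracy used to
# sweep candidate thresholds on dev-set predictions.
import numpy as np

def jaccard_acc(preds, labels, threshold):
    cutoff = preds.max(axis=1, keepdims=True) * threshold
    pred_labels = (preds > cutoff).astype(float)
    inter = np.logical_and(pred_labels, labels).sum(axis=1)
    union = np.logical_or(pred_labels, labels).sum(axis=1)
    return (inter / union).mean()

def best_threshold(preds, labels, candidates=(0.5, 0.6, 0.7, 0.8, 0.9, 0.95)):
    return max(candidates, key=lambda t: jaccard_acc(preds, labels, t))
```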
Framework issues

| Framework | Issue |
|---|---|
| `bert-chainer` | My CUDA version is 10.1.*, and chainer needs cupy, which at the time of testing only supported up to 10.0.*, so I could not test this framework myself; apologies. |
| The PyPI `pytorch-pretrained-bert` ships no `run_classifier.py` or other samples | I tracked down the GitHub snapshot matching the package's release date and attached the link next to the corresponding heading. |
| `fast-bert` | Requires installing the `apex` package first, and `apex` is not Windows-friendly. |
| Multi-label for google-research and `transformers` | Trying out frameworks is laborious and every run takes a long time, so I only adapted `pytorch-pretrained-bert`; given how similar the code is, readers can port the changes themselves. Questions are welcome. |
All of the issues above are ones I ran into and worked through personally; if you hit something not covered here, feel free to leave a comment.
Summary

| Framework | Training output | Result |
|---|---|---|
| bert (google-research) | `***** Running training *****` `*** Features ***` `**** Trainable Variables ****` `Saving checkpoints for xxxx into tmp/self_output/model.ckpt.` | `***** Eval results *****` `eval_accuracy = xxxx` `eval_loss = xxxx` `global_step = xxxx` `loss = xxxx` |
| pytorch-pretrained-bert (PyPI) | `Epoch: 0%\| \| 0/x [00:00<?, ?it/s]` `Iteration: 0%\| \| 0/xxxxx [00:00<?, ?it/s]` | `***** Eval results *****` `acc = xxxx` `eval_loss = xxxx` `global_step = xxxx` `loss = xxxx` |
| transformers (huggingface) | `Epoch: 0%\| \| 0/x [00:00<?, ?it/s]` `Iteration: 0%\| \| 0/xxxxx [00:00<?, ?it/s]` | `***** Eval results *****` `acc = xxxxx` |
References

The above is my own implementation on my own dataset, written after studying the blogs and posts below; everything was verified by running it myself.

- Single-label
- Multi-label
- Hyperparameter tuning
- Installation
P.S. This is original work: it took a lot of organizing, revising, and repeated experimentation. Every model above was adapted and tested by me personally, and I only wrote up results once a training/eval run finished with satisfactory numbers; at 7+ hours per training run, it was no small effort. If you found it useful, a like would be appreciated, and you are welcome to join the discussion!