demo.py
import tensorflow as tf
from bert import modeling
import os
import create_input
import tokenization
import numpy as np

# Path to the downloaded BERT config file
bert_config = modeling.BertConfig.from_json_file("chinese_L-12_H-768_A-12/bert_config.json")
vocab_file = "chinese_L-12_H-768_A-12/vocab.txt"
batch_size = 20
num_labels = 2  # number of classes; this example is a binary classification task
is_training = True
max_seq_length = 128
iter_num = 1000
lr = 0.00005

if max_seq_length > bert_config.max_position_embeddings:  # the model has a maximum input length (512)
    raise ValueError("max_seq_length exceeds the model's maximum input length")
# Load the dataset
with open("data/text.txt", "r", encoding="utf-8") as reader:
    data = reader.read().splitlines()

texts = []
labels = []
for line in data:
    line = line.split("\t")
    if len(line) == 2 and int(line[1]) < 2:  # binary classification demo; the training samples were not carefully cleaned, so drop any label greater than 1
        texts.append(line[0])
        labels.append(line[1])

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)  # tokenizer: splits text into characters/word pieces and converts them to IDs; vocab_file is the vocabulary path
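# For reference only (not used by the training flow below): FullTokenizer exposes
# tokenize() and convert_tokens_to_ids(), which is roughly what
# create_input.convert_single_example is assumed to wrap, plus adding the
# [CLS]/[SEP] tokens and padding/truncating to max_seq_length. A quick sanity check:
#   tokens = tokenizer.tokenize("这是一个例子")
#   ids = tokenizer.convert_tokens_to_ids(tokens)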
input_idsList = []
input_masksList = []
segment_idsList = []
for t in texts:
    single_input_id, single_input_mask, single_segment_id = create_input.convert_single_example(max_seq_length, tokenizer, t)
    input_idsList.append(single_input_id)
    input_masksList.append(single_input_mask)
    segment_idsList.append(single_segment_id)

input_idsList = np.asarray(input_idsList, dtype=np.int32)
input_masksList = np.asarray(input_masksList, dtype=np.int32)
segment_idsList = np.asarray(segment_idsList, dtype=np.int32)
labels = np.asarray(labels, dtype=np.int32)
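# At this point (assuming convert_single_example returns fixed-length lists of ids,
# mask, and segment ids) the arrays should have shapes:
#   input_idsList / input_masksList / segment_idsList: (num_examples, max_seq_length)
#   labels: (num_examples,)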
# Create the BERT inputs
input_ids = tf.placeholder(shape=[batch_size, max_seq_length], dtype=tf.int32, name="input_ids")
input_mask = tf.placeholder(shape=[batch_size, max_seq_length], dtype=tf.int32, name="input_mask")
segment_ids = tf.placeholder(shape=[batch_size, max_seq_length], dtype=tf.int32, name="segment_ids")
# Labels for each example in the batch
input_labels = tf.placeholder(shape=batch_size, dtype=tf.int32, name="input_labels")
# Create the BERT model
model = modeling.BertModel(
    config=bert_config,
    is_training=is_training,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=segment_ids,
    use_one_hot_embeddings=False  # set True on TPU for better speed; False is faster on CPU/GPU
)
# get_sequence_output() returns the per-token output with shape [batch_size, seq_length, embedding_size];
# use it for seq2seq or NER. Here it is immediately overwritten by the pooled sentence output below.
output_layer = model.get_sequence_output()
output_layer = model.get_pooled_output()  # sentence-level output
hidden_size = output_layer.shape[-1].value  # output dimension
# The rest is simple: a fully connected layer, equivalent to the commented block below
# logits = tf.layers.dense(output_layer, 2)
# loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=input_labels, name="soft_loss")
# loss = tf.reduce_mean(loss, name="loss")
# predict = tf.argmax(tf.nn.softmax(logits), axis=1, name="predictions")
# acc = tf.reduce_mean(tf.cast(tf.equal(input_labels, tf.cast(predict, dtype=tf.int32)), "float"), name="accuracy")
# The classification head: a single fully connected layer
output_weights = tf.get_variable(
    "output_weights", [num_labels, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable(
    "output_bias", [num_labels], initializer=tf.zeros_initializer())
with tf.variable_scope("loss"):
    if is_training:
        # i.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot_labels = tf.one_hot(input_labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    predict = tf.argmax(tf.nn.softmax(logits), axis=1, name="predictions")
    acc = tf.reduce_mean(tf.cast(tf.equal(input_labels, tf.cast(predict, dtype=tf.int32)), "float"), name="accuracy")

train_op = tf.train.AdamOptimizer(lr).minimize(loss)
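# Note: the original BERT repo fine-tunes with a custom AdamWeightDecayOptimizer and a
# warmup/decay learning-rate schedule (see optimization.py in google-research/bert);
# plain tf.train.AdamOptimizer is used here as a simpler stand-in.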
# Where the BERT parameters are initialized from
init_checkpoint = "chinese_L-12_H-768_A-12/bert_model.ckpt"
use_tpu = False
# Collect all trainable variables in the graph.
tvars = tf.trainable_variables()
# Map graph variables to the pretrained BERT checkpoint
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
                                                                                           init_checkpoint)
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

tf.logging.info("**** Trainable Variables ****")
# Print which variables are loaded from the checkpoint
for var in tvars:
    init_string = ""
    if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                    init_string)
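# Note on ordering: tf.train.init_from_checkpoint rewrites the initializers of the mapped
# variables, so running tf.global_variables_initializer() afterwards (as below) fills the
# BERT weights from the checkpoint and randomly initializes only the new classifier variables.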
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(iter_num):
        # Sample a random mini-batch
        shuffIndex = np.random.permutation(np.arange(len(texts)))[:batch_size]
        batch_labels = labels[shuffIndex]
        batch_input_idsList = input_idsList[shuffIndex]
        batch_input_masksList = input_masksList[shuffIndex]
        batch_segment_idsList = segment_idsList[shuffIndex]
        l, a, _ = sess.run([loss, acc, train_op], feed_dict={
            input_ids: batch_input_idsList, input_mask: batch_input_masksList,
            segment_ids: batch_segment_idsList, input_labels: batch_labels
        })
        print("accuracy: {}, loss: {}".format(a, l))