# https://github.com/huggingface/transformers
# https://huggingface.co/transformers/quickstart.html
# BERT example
# pip install transformers
# (transformers is the successor of the older pytorch_transformers / pytorch-pretrained-bert packages)
import os

import torch
from transformers import BertConfig, BertModel, BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load a locally stored pretrained BERT model.
# The ./bert directory is expected to contain:
#   pytorch_model.bin    (model weights)
#   bert_config.json     (model config, often distributed as config.json)
#   vocab.txt            (vocabulary)
bert_path = './bert'
do_lower_case = True
bert_config_file = os.path.join(bert_path, 'bert_config.json')
vocab_file = os.path.join(bert_path, 'vocab.txt')
init_checkpoint = os.path.join(bert_path, 'pytorch_model.bin')
# Load the configuration
bert_config = BertConfig.from_json_file(bert_config_file)
# Load the vocabulary and build the tokenizer
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
# Load the pretrained weights
model_bert = BertModel.from_pretrained(bert_path, config=bert_config)
model_bert.to(device)
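# As an alternative (a minimal sketch, assuming network access and that the public
# 'bert-base-chinese' checkpoint is the model you want), the tokenizer and weights
# can also be downloaded by name instead of read from a local directory:
#   tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#   model_bert = BertModel.from_pretrained('bert-base-chinese').to(device)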
# Tokenize the input
text = "乌兹别克斯坦议会立法院主席获连任"  # example Chinese news headline
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
# Convert tokens to vocabulary indices.
# input_ids: a torch.LongTensor of shape [batch_size, sequence_length]
# holding the token indices in the vocabulary.
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define the sentence A / B indices associated with the 1st and 2nd sentences (see the BERT paper).
# segment_ids (token_type_ids): an optional torch.LongTensor of shape
# [batch_size, sequence_length] with values in [0, 1]; type 0 marks sentence A, type 1 marks sentence B.
segment_ids = [0] * len(input_ids)
# input_mask (attention_mask): an optional torch.LongTensor of shape
# [batch_size, sequence_length] with values in [0, 1]; 1 for real tokens, 0 for padding.
input_mask = [1] * len(input_ids)
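# The single-sentence example above only ever uses segment id 0. As a minimal sketch
# (text_b below is a hypothetical second sentence added purely for illustration),
# a sentence pair interleaves segment ids 0 and 1 like this:
text_b = "他于周一宣誓就职"
tokens_b = tokenizer.tokenize(text_b)
tokens_a = tokenized_text[1:-1]  # the sentence-A tokens without the [CLS]/[SEP] added above
pair_tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
pair_segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
assert len(pair_tokens) == len(pair_segment_ids)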
# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
print("input_ids", input_ids.size())
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)  # attention_mask; may be omitted
segments_tensors = torch.tensor([segment_ids], dtype=torch.long).to(device)
# Forward pass
all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask, token_type_ids=segments_tensors)
# all_encoder_layer: here, the sequence of hidden states from the last encoder layer, a
# torch.FloatTensor of shape [batch_size, sequence_length, hidden_size]. (In the older
# pytorch_transformers / pytorch-pretrained-bert API this was instead a list of the hidden
# states of every layer, i.e. 12 for BERT-base and 24 for BERT-large.)
# pooled_output: a torch.FloatTensor of shape [batch_size, hidden_size], the output of a
# pooling layer applied to the hidden state of the first token ([CLS]), pretrained on the
# next-sentence prediction task (see the BERT paper).
# To get token-level embeddings, use all_encoder_layer only.
print('all_encoder_layer', all_encoder_layer.shape)
print('pooled_output', pooled_output.size())
# For sentence classification, use pooled_output.
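# A minimal sketch of one common recipe (an addition for illustration, not part of the
# original example): mask-aware mean pooling over the token hidden states to get a single
# fixed-size sentence embedding.
mask = input_mask.unsqueeze(-1).float()                                # [1, seq_len, 1]
sentence_embedding = (all_encoder_layer * mask).sum(dim=1) / mask.sum(dim=1)
print('sentence_embedding', sentence_embedding.size())                # [1, hidden_size]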
# Padding to a fixed sequence length
max_seq_length = 300
text = "乌兹别克斯坦议会立法院主席获连任"
tokenized_text = tokenizer.tokenize(text)
tokenized_text = ['[CLS]'] + tokenized_text + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
input_mask = [1] * len(input_ids)
# Pad input_ids with 0 up to max_seq_length; the attention mask is 0 on the padded positions.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
input_mask = torch.tensor([input_mask], dtype=torch.long).to(device)
print("padding input_ids", input_ids.size())
model_bert.eval()
with torch.no_grad():
    all_encoder_layer, pooled_output = model_bert(input_ids, attention_mask=input_mask)
print('padding all_encoder_layer', all_encoder_layer.shape)
print('padding pooled_output', pooled_output.size())
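# A minimal sketch (an addition for illustration) of batching several sentences at once:
# tokenize each sentence, pad every sequence to the length of the longest one in the batch,
# and stack the ids/masks into [batch_size, seq_len] tensors. The second sentence below is
# a made-up example.
sentences = ["乌兹别克斯坦议会立法院主席获连任", "中国北京举办冬季奥运会"]
batch_ids, batch_mask = [], []
for s in sentences:
    toks = ['[CLS]'] + tokenizer.tokenize(s) + ['[SEP]']
    ids = tokenizer.convert_tokens_to_ids(toks)
    batch_ids.append(ids)
    batch_mask.append([1] * len(ids))
batch_max_len = max(len(ids) for ids in batch_ids)
for ids, mask in zip(batch_ids, batch_mask):
    pad = [0] * (batch_max_len - len(ids))
    ids += pad   # extends the lists stored in batch_ids / batch_mask in place
    mask += pad
batch_ids = torch.tensor(batch_ids, dtype=torch.long).to(device)
batch_mask = torch.tensor(batch_mask, dtype=torch.long).to(device)
with torch.no_grad():
    batch_encoder_layer, batch_pooled = model_bert(batch_ids, attention_mask=batch_mask)
print('batch all_encoder_layer', batch_encoder_layer.shape)   # [2, batch_max_len, hidden_size]
print('batch pooled_output', batch_pooled.size())             # [2, hidden_size]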