import evaluate
from datetime import datetime
import pandas as pd
import numpy as np
import json
from simpletransformers.ner import NERModel, NERArgs
from tqdm.autonotebook import tqdm as notebook_tqdm
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import logging
import sklearn
from numba import cuda
import argparse
import gc
import torch
import time
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# Import the dataset
#if __name__ == '__main__':
# parser = argparse.ArgumentParser()
# parser.add_argument("dataset", help="path to the dataset in JSON format")
# args = parser.parse_args()
# Define the path to the datasets
datasets = [
    "datasets/hr500k.conllup_extracted.json",
    "datasets/reldi-normtagner-hr.conllup_extracted.json",
    "datasets/reldi-normtagner-sr.conllup_extracted.json",
    "datasets/set.sr.plus.conllup_extracted.json",
]
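# hr500k is standard Croatian, the two ReLDI datasets are non-standard
# (internet) Croatian and Serbian, and SETimes.SR is standard Serbian -
# this is the distinction the dataset_type logic below relies on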
#dataset_path = args.dataset
# Do a loop through the datasets
for dataset_path in datasets:
    # Load the JSON file
    with open(dataset_path, "r") as file:
        json_dict = json.load(file)
    # Open the train, eval and test dictionaries as DataFrames
    train_df = pd.DataFrame(json_dict["train"])
    test_df = pd.DataFrame(json_dict["test"])
    dev_df = pd.DataFrame(json_dict["dev"])
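    # The code above assumes a JSON layout roughly like the following
    # (a sketch inferred from the keys this script accesses; "sentence_id",
    # "words" and "labels" are the column names simpletransformers expects):
    #
    # {"labels": ["O", "B-per", ...],
    #  "train": {"sentence_id": [...], "words": [...], "labels": [...]},
    #  "dev": {...}, "test": {...}}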
    # Change the sentence_ids to numbers
    test_df['sentence_id'] = pd.factorize(test_df['sentence_id'])[0]
    train_df['sentence_id'] = pd.factorize(train_df['sentence_id'])[0]
    dev_df['sentence_id'] = pd.factorize(dev_df['sentence_id'])[0]
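    # pd.factorize assigns each distinct sentence_id an integer in order of
    # first appearance, e.g. ["s1", "s1", "s2"] -> [0, 0, 1]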
    # Define the labels
    LABELS = json_dict["labels"]
    print(LABELS)
    print(train_df.shape, test_df.shape, dev_df.shape)
    print(train_df.head())
    def train_and_test(model, train_df, test_df, dataset_path, LABELS):
        # Define the model arguments - these arguments hold for all models;
        # only the number of training epochs is changed per model and dataset,
        # based on the hyperparameter search
        model_args = {
            "overwrite_output_dir": True,
            "num_train_epochs": 5,
            "labels_list": LABELS,
            "learning_rate": 4e-05,
            "train_batch_size": 32,
            # Comment out no_cache and no_save if you want to save the model
            "no_cache": True,
            "no_save": True,
            "max_seq_length": 256,
            "save_steps": -1,
            "silent": True,
        }
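        # NOTE: with max_seq_length=256, longer inputs are truncated;
        # no_cache/no_save keep the run stateless and silent=True
        # suppresses per-step progress output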
        # Define the type of dataset we are using
        # - when we extend the code for SL, change this
        dataset_type = "standard_hr"
        if "reldi" in dataset_path:
            dataset_type = "non_standard"
        elif "set.sr" in dataset_path:
            dataset_type = "standard_sr"
        # Set the number of epochs based on the model and the dataset,
        # as found by the hyperparameter search
        epochs_per_setup = {
            "standard_hr": {"xlm-r-base": 9, "csebert": 8, "xlm-r-large": 5, "bertic": 7},
            "non_standard": {"xlm-r-base": 11, "csebert": 9, "xlm-r-large": 7, "bertic": 14},
            "standard_sr": {"xlm-r-base": 11, "csebert": 9, "xlm-r-large": 11, "bertic": 12},
        }
        model_args["num_train_epochs"] = epochs_per_setup[dataset_type].get(
            model, model_args["num_train_epochs"])
        # Model types - a dictionary mapping each model's shorthand name to
        # its simpletransformers model type and Hugging Face model name
        model_type_dict = {
            #"sloberta": ["camembert", "EMBEDDIA/sloberta"],
            "csebert": ["bert", "EMBEDDIA/crosloengual-bert"],
            "xlm-r-base": ["xlmroberta", "xlm-roberta-base"],
            "xlm-r-large": ["xlmroberta", "xlm-roberta-large"],
            "bertic": ["electra", "classla/bcms-bertic"],
        }
        # Define the model
        current_model = NERModel(
            model_type_dict[model][0],
            model_type_dict[model][1],
            labels=LABELS,
            use_cuda=True,
            args=model_args)
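        # NOTE: use_cuda=True assumes a GPU is available; a defensive
        # variant (an assumption, not part of the original setup) would be
        # use_cuda=torch.cuda.is_available()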
print("Training started. Current model: {}, no. of epochs: {}".format(model, model_args["num_train_epochs"]))
start_time = time.time()
# Fine-tune the model
current_model.train_model(train_df)
print("Training completed.")
training_time = round((time.time() - start_time)/60,2)
print("It took {} minutes for {} instances.".format(training_time, train_df.shape[0]))
# Clean cache
gc.collect()
torch.cuda.empty_cache()
start_evaluation_time = time.time()
# Evaluate the model
results = current_model.eval_model(test_df)
print("Evaluation completed.")
evaluation_time = round((time.time() - start_evaluation_time)/60,2)
print("It took {} minutes for {} instances.".format(evaluation_time, test_df.shape[0]))
        # Get the predictions (the raw model outputs)
        preds = results[1]
        # Create a list with predictions
        preds_list = []
        for sentence in preds:
            for word in sentence:
                current_word = []
                for element in word:
                    # Find the prediction with the highest value
                    highest_index = element.index(max(element))
                    # Transform the index to a label
                    current_pred = current_model.config.id2label[highest_index]
                    # Append to the list
                    current_word.append(current_pred)
                # Segmentation can result in multiple predictions for one word
                # - use the first prediction only
                preds_list.append(current_word[0])
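        # The nesting assumed above: model_outputs is sentence -> word ->
        # per-subtoken score vectors over the label set, so the argmax of
        # each vector is a label id that id2label maps back to a label string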
        # Get y_true
        y_true = list(test_df.labels)
        run_name = "{}-{}".format(dataset_path, model)
        # Evaluate the predictions - `evaluate` here is this project's local
        # evaluation module (it provides testing()), not the Hugging Face
        # evaluate library
        metrics = evaluate.testing(y_true, preds_list, list(test_df.labels.unique()), run_name, show_matrix=True)
        # Add y_pred and y_true to the metrics dict
        metrics["y_true"] = y_true
        metrics["y_pred"] = preds_list
        # Let's also add the entire results
        metrics["results_output"] = results
        # The function returns a dict with accuracy, micro F1, macro F1, y_true and y_pred
        return metrics
    # Here, we only use the bertic model and repeat training and evaluation
    # over several runs with the same configuration
    model = "bertic"
    # Let's do 4 runs for starters
    for run in [0, 1, 2, 3]:
        current_results_dict = train_and_test(model, train_df, test_df, dataset_path, LABELS)
        # Add the model name, dataset name and run to the dict
        current_results_dict["model"] = model
        current_results_dict["run"] = "{}-{}".format(model, run)
        current_results_dict["dataset"] = dataset_path
        # Append all the important information to the results file
        with open("ner-results-bertic-experiments.txt", "a") as file:
            file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(datetime.now().strftime("%d/%m/%Y %H:%M:%S"), current_results_dict["model"], current_results_dict["run"], current_results_dict["dataset"], current_results_dict["micro F1"], current_results_dict["macro F1"], current_results_dict["label-report"]))
        # Add this run's y_pred to the original test_df
        test_df["y_pred_{}_{}".format(model, run)] = current_results_dict["y_pred"]
        # Also save y_pred and y_true (note that dataset_path contains a
        # "datasets/" prefix, so this expects a matching subfolder under logs/)
        with open("logs/{}-{}-{}-true-and-pred-backlog.txt".format(dataset_path, model, run), "w") as backlog:
            backlog.write("y-true\ty-pred\toutputs\n")
            backlog.write("{}\t{}\t{}\n".format(current_results_dict["y_true"], current_results_dict["y_pred"], current_results_dict["results_output"]))
        print("Run {} finished.".format(run))
    # At the end, save the test_df with all predictions
    test_df.to_csv("{}-test_df-with-predictions.csv".format(dataset_path))
# At the end, create a CSV table with a summary of the results.
# The results file is appended to without a header line above, so supply
# the column names explicitly when reading it back
results = pd.read_csv(
    "ner-results-bertic-experiments.txt",
    sep="\t",
    names=["Date", "Model", "Run", "Dataset", "Micro F1", "Macro F1", "Label report"])
results["Macro F1"] = results["Macro F1"].round(2)
# Pivot the DataFrame to rearrange columns into rows
pivot_df = results.pivot(index='Run', columns='Dataset', values='Macro F1')
# Reset the index to have 'Run' as a column
pivot_df.reset_index(inplace=True)
# Save the summary results
pivot_df.to_csv("ner-results-summary-bertic.csv")
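# The summary table then has one row per run and one column per dataset,
# e.g. (values purely illustrative):
#
#   Run       datasets/hr500k...   datasets/reldi-normtagner-hr...   ...
#   bertic-0  0.89                 0.81
#   bertic-1  0.90                 0.80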