# coding: utf-8
"""
The ``fine_tune.py`` file is used to continue training (or `fine-tune`) a model on a `different
dataset` than the one it was originally trained on. It requires a saved model archive file, a path
to the data you will continue training with, and a directory in which to write the results.
. code-block:: bash
$ python fine_tune.py --help
usage: fine_tune.py [-h] -s SERIALIZATION_DIR -c CONFIG_FILE_PATH -p
PRETRAINED_DIR -m PRETRAINED_MODEL_NAME
optional arguments:
-h, --help show this help message and exit
-s SERIALIZATION_DIR, --serialization_dir SERIALIZATION_DIR
Directory in which to save the model and its logs.
-c CONFIG_FILE_PATH, --config_file_path CONFIG_FILE_PATH
Path to parameter file describing the new multi-tasked
model to be fine-tuned.
-p PRETRAINED_DIR, --pretrained_dir PRETRAINED_DIR
Directory in which was saved the pre-trained model.
-m PRETRAINED_MODEL_NAME, --pretrained_model_name PRETRAINED_MODEL_NAME
Name of the weight file for the pretrained model to
fine-tune in the ``pretrained_dir``.
"""
import argparse
import itertools
import os
import json
import re
from copy import deepcopy
import torch
from typing import List, Dict, Any, Tuple

import logging

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

from hmtl.tasks import Task
from hmtl.training.multi_task_trainer import MultiTaskTrainer
from hmtl.common import create_and_set_iterators
from evaluate import evaluate
from train import train_model

from allennlp.models.model import Model
from allennlp.data import Vocabulary
from allennlp.data.iterators import DataIterator
from allennlp.commands.train import create_serialization_dir
from allennlp.common.params import Params
from allennlp.common.checks import ConfigurationError
from allennlp.nn import RegularizerApplicator

logger = logging.getLogger(__name__)
if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--serialization_dir",
        required=True,
        help="Directory in which to save the model and its logs.",
        type=str,
    )
    parser.add_argument(
        "-c",
        "--config_file_path",
        required=True,
        help="Path to parameter file describing the new multi-tasked model to be fine-tuned.",
        type=str,
    )
    parser.add_argument(
        "-p",
        "--pretrained_dir",
        required=True,
        help="Directory in which the pre-trained model was saved.",
        type=str,
    )
    parser.add_argument(
        "-m",
        "--pretrained_model_name",
        required=True,
        help="Name of the weight file for the pretrained model to fine-tune in the ``pretrained_dir``.",
        type=str,
    )
    args = parser.parse_args()

    params = Params.from_file(params_file=args.config_file_path)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, False)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)
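    # The top-level layout of the parameter file mirrors the keys consumed below: one or more
    # "task_*" blocks, a "model" block, an optional "regularizer" block, and a "multi_task_trainer"
    # block. A minimal sketch (the task name and the "..." contents are hypothetical placeholders):
    #
    # {
    #     "task_ner": {
    #         "task_description": {...},
    #         "data_params": {...}
    #     },
    #     "model": {...},
    #     "regularizer": [...],
    #     "multi_task_trainer": {...}
    # }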
    ### Instantiate tasks ###
    task_list = []
    task_keys = [key for key in params.keys() if re.search("^task_", key)]

    for key in task_keys:
        logger.info("Creating %s", key)
        task_params = params.pop(key)
        task_description = task_params.pop("task_description")
        task_data_params = task_params.pop("data_params")

        task = Task.from_params(params=task_description)
        task_list.append(task)

        _, _ = task.load_data_from_params(params=task_data_params)

    ### Load Vocabulary from files and save it to the new serialization_dir ###
    # PLEASE NOTE that we assume here that the vocabulary is the same for the pre-trained model
    # and the model to fine-tune. The most noticeable implication of this assumption is that the
    # label spaces of the two datasets (the one used for pre-training and the one used for
    # fine-tuning) are exactly the same.
    vocab = Vocabulary.from_files(os.path.join(args.pretrained_dir, "vocabulary"))
    logger.info("Vocabulary loaded from %s", os.path.join(args.pretrained_dir, "vocabulary"))

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    logger.info("Vocabulary saved to %s", os.path.join(serialization_dir, "vocabulary"))
    ### Load the data iterators for each task ###
    task_list = create_and_set_iterators(params=params, task_list=task_list, vocab=vocab)

    ### Load Regularizations ###
    regularizer = RegularizerApplicator.from_params(params.pop("regularizer", []))

    ### Create model ###
    model_params = params.pop("model")
    model = Model.from_params(vocab=vocab, params=model_params, regularizer=regularizer)

    logger.info("Loading the pretrained model from %s", os.path.join(args.pretrained_dir, args.pretrained_model_name))
    try:
        pretrained_model_state_path = os.path.join(args.pretrained_dir, args.pretrained_model_name)
        pretrained_model_state = torch.load(pretrained_model_state_path)
        model.load_state_dict(state_dict=pretrained_model_state)
    except Exception:
        raise ConfigurationError(
            "It appears that the configurations of the pretrained model and "
            "the model to fine-tune are not compatible. "
            "Please check the compatibility of the encoders and taggers in the "
            "config files."
        )

    ### Create multi-task trainer ###
    multi_task_trainer_params = params.pop("multi_task_trainer")
    trainer = MultiTaskTrainer.from_params(
        model=model, task_list=task_list, serialization_dir=serialization_dir, params=multi_task_trainer_params
    )

    ### Launch training ###
    metrics = train_model(multi_task_trainer=trainer, recover=False)
    if metrics is not None:
        logger.info("Fine-tuning is finished! Let's have a drink. It's on the house!")