From 96b514afeeb918608815b22f5b4d25d7c2b64d69 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 2 Apr 2021 02:48:14 +0000 Subject: [PATCH 01/23] add adapter --- .../classification/train_classification.py | 75 ++++++++++++++++--- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/scripts/classification/train_classification.py b/scripts/classification/train_classification.py index 0b823cef4f..f91a745d8d 100644 --- a/scripts/classification/train_classification.py +++ b/scripts/classification/train_classification.py @@ -5,7 +5,9 @@ import json import random import pandas as pd +import mxnet.numpy_extension as _mx_npx import os +import json import logging import time import argparse @@ -92,13 +94,27 @@ def parse_args(): help='the path to training dataset') parser.add_argument('--warmup_ratio', type=float, default=0.1, help='Ratio of warmup steps in the learning rate scheduler.') + parser.add_argument('--method', type=str, default='full', choices=['full', 'bias', 'subbias', 'adapter'], + help='different finetune method') args = parser.parse_args() return args + +def change_adapter_cfg(cfg, task): + adapter_config = {'adapter_fusion':False, + 'task_names':[task.task_name], + task.task_name:{'type':'Basic','unit':64}} + cfg.defrost() + cfg.MODEL.use_adapter = True + cfg.MODEL.adapter_config = json.dumps(adapter_config) + cfg.freeze() + return cfg + def get_network(model_name, ctx_l, + method='full', checkpoint_path=None, backbone_path=None, task=None): @@ -109,13 +125,16 @@ def get_network(model_name, use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name Model, cfg, tokenizer, download_params_path, _ = \ get_backbone(model_name, load_backbone=not backbone_path) + + if method == 'adapter': + cfg = change_adapter_cfg(cfg, task) backbone = Model.from_cfg(cfg) # Load local backbone parameters if backbone_path provided. # Otherwise, download backbone parameters from gluon zoo. 
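Note how the adapter settings travel through the configuration: `change_adapter_cfg` above serializes the nested dict with `json.dumps` into `cfg.MODEL.adapter_config`, and the backbone parses it back with `json.loads` (see the bert.py patch later in this series). A minimal sketch of that round trip, reusing the keys from this patch (the task name is only illustrative):

```python
import json

# Same structure as change_adapter_cfg above; 'sst' is a placeholder task name.
adapter_config = {'adapter_fusion': False,
                  'task_names': ['sst'],
                  'sst': {'type': 'Basic', 'unit': 64}}
serialized = json.dumps(adapter_config)   # stored in cfg.MODEL.adapter_config
restored = json.loads(serialized)         # parsed again inside the backbone model
assert restored['sst']['unit'] == 64
```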
backbone_params_path = backbone_path if backbone_path else download_params_path if checkpoint_path is None: - backbone.load_parameters(backbone_params_path, ignore_extra=True, + backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=True, ctx=ctx_l, cast_dtype=True) num_params, num_fixed_params \ = count_parameters(deduplicate_param_dict(backbone.collect_params())) @@ -219,6 +238,8 @@ def train(args): #random seed set_seed(args.seed) level = logging.INFO + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) detail_dir = os.path.join(args.output_dir, args.task_name) if not os.path.exists(detail_dir): os.mkdir(detail_dir) @@ -228,11 +249,12 @@ def train(args): console=(local_rank == 0)) logging.info(args) cfg, tokenizer, classify_net, use_segmentation = \ - get_network(args.model_name, ctx_l, + get_network(args.model_name, ctx_l, args.method, args.param_checkpoint, args.backbone_path, task) + logging.info('Prepare training data') train_data, _ = get_task_data(args, task, tokenizer, segment='train') train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), @@ -253,6 +275,22 @@ def train(args): sampler=sampler) + if args.method == 'full': + target_params_name = classify_net.collect_params().keys() + elif args.method == 'bias': + target_params_name = [key + for key in classify_net.collect_params() if + key.endswith('bias') or key.endswith('beta') or 'out_proj' in key] + elif args.method == 'adapter': + target_params_name = [key + for key in classify_net.collect_params() if + 'adapter' in key or 'out_proj' in key] + for name in classify_net.collect_params(): + if name not in target_params_name: + classify_net.collect_params()[name].grad_req = 'null' + + target_params = {name:classify_net.collect_params()[name] for name in target_params_name} + param_dict = classify_net.collect_params() # Do not apply weight decay to all the LayerNorm and bias @@ -269,7 +307,7 @@ def train(args): if local_rank == 0: writer = SummaryWriter(logdir=os.path.join(args.output_dir, args.task_name + '_tensorboard_' + - str(args.lr) + '_' + str(args.epochs))) + str(args.lr) + '_' + str(args.epochs) + '_' + str(args.method))) if args.comm_backend == 'horovod': # Horovod: fetch and broadcast parameters hvd.broadcast_parameters(param_dict, root_rank=0) @@ -290,10 +328,12 @@ def train(args): optimizer_params = {'learning_rate': args.lr, 'wd': args.wd, 'lr_scheduler': lr_scheduler} + + if args.comm_backend == 'horovod': - trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) + trainer = hvd.DistributedTrainer(target_params, args.optimizer, optimizer_params) else: - trainer = mx.gluon.Trainer(classify_net.collect_params(), + trainer = mx.gluon.Trainer(target_params, 'adamw', optimizer_params) @@ -376,16 +416,22 @@ def train(args): log_gnorm = 0 log_step = 0 if local_rank == 0 and (i == max_update - 1 or i%(max_update//args.epochs) == 0 and i>0): - ckpt_name = '{}_{}_{}.params'.format(args.model_name, - args.task_name, - (i + 1)) + ckpt_name = '{}_{}_{}_{}.params'.format(args.model_name, + args.task_name, + (i + 1), + args.method) + tmp_params = classify_net._collect_params_with_prefix() params_saved = os.path.join(detail_dir, ckpt_name) - classify_net.save_parameters(params_saved) + arg_dict = {key: tmp_params[key]._reduce() for key in target_params} + _mx_npx.savez(params_saved, **arg_dict) logging.info('Params saved in: {}'.format(params_saved)) for metric in metrics: metric.reset() + end_time = time.time() + logging.info('Total 
costs:{}'.format(end_time - start_time)) + def evaluate(args): @@ -410,19 +456,24 @@ def evaluate(args): str(ctx_l))) cfg, tokenizer, classify_net, use_segmentation = \ - get_network(args.model_name, ctx_l, + get_network(args.model_name, ctx_l, args.method, args.param_checkpoint, args.backbone_path, task) + candidate_ckpt = [] detail_dir = os.path.join(args.output_dir, args.task_name) for name in os.listdir(detail_dir): - if name.endswith('.params') and args.task_name in name and args.model_name in name: + if name.endswith(args.method + '.params') and args.task_name in name and args.model_name in name: candidate_ckpt.append(os.path.join(detail_dir, name)) + candidate_ckpt.sort(reverse=False) best_ckpt = {} metrics = task.metric def evaluate_by_ckpt(ckpt_name, best_ckpt): - classify_net.load_parameters(ckpt_name, ctx=ctx_l, cast_dtype=True) + loaded = _mx_npx.load(ckpt_name) + full_dict = {'params': loaded, 'filename': ckpt_name} + classify_net.load_dict(full_dict, ctx_l, allow_missing=True, + ignore_extra=True, cast_dtype=True) logging.info('Prepare dev data') dev_data, label = get_task_data(args, task, tokenizer, segment='eval') From b1a2bed3e02aa0a5e51633e091fede575bc4e8b5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 2 Apr 2021 02:48:37 +0000 Subject: [PATCH 02/23] add adapter --- src/gluonnlp/layers.py | 87 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index 559260c67c..1036e58f11 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -17,7 +17,7 @@ """Layers.""" __all__ = ['PositionalEmbedding', 'SinusoidalPositionalEmbedding', 'LearnedPositionalEmbedding', 'BucketPositionalEmbedding', 'AdaptiveEmbedding', - 'PositionwiseFFN', 'ProjectedAdaptiveLogSoftmaxWithLoss'] + 'PositionwiseFFN', 'ProjectedAdaptiveLogSoftmaxWithLoss', 'AdapterModule'] import math from collections import OrderedDict @@ -28,6 +28,8 @@ import numpy as _np from typing import Union, Optional, List, Dict from .op import relative_position_bucket +#from .attention_cell import MultiHeadAttentionCell + InitializerType = Optional[Union[mx.init.Initializer, str]] @@ -478,6 +480,8 @@ def forward(self, positions): return np.take(self.weight.data(), positions, axis=0, mode=self._mode) + + @use_np class BucketPositionalEmbedding(HybridBlock): """Divide the positional space into buckets and assign the relative positions within each @@ -543,6 +547,8 @@ def __init__(self, layer_norm_eps: float = 1E-5, pre_norm: bool = False, dtype='float32', + use_adapter='False', + adapter_config={}, **kwargs): """ @@ -570,6 +576,7 @@ def __init__(self, self._dtype = dtype self._pre_norm = pre_norm self._use_gated_activation = use_gated_activation + self._use_adapter = use_adapter self._kwargs = OrderedDict([ ('units', units), ('hidden_size', hidden_size), @@ -611,6 +618,8 @@ def __init__(self, normalization=normalization, epsilon=layer_norm_eps, **kwargs) + if self._use_adapter: + self.adapter_layer_ffn = AdapterModule(in_units=units, adapter_config=adapter_config) def forward(self, data): """ @@ -637,6 +646,8 @@ def forward(self, data): out = self.activation_dropout_layer(out) out = self.ffn_2(out) out = self.dropout_layer(out) + if self._use_adapter: + out = self.adapter_layer_ffn(out) out = out + residual if not self._pre_norm: out = self.layer_norm(out) @@ -1007,3 +1018,77 @@ def forward(self, hidden, target): def __repr__(self): return _gen_repr_with_kwargs(self._kwargs, self.__class__.__name__) + +@use_np +class 
AdapterModule(nn.HybridBlock): + def __init__(self, in_units:int, adapter_config:dict): + super().__init__() + self._adapter_config = adapter_config + self.base_adapter_stacks = nn.HybridSequential() + for name in adapter_config['task_names']: + self.base_adapter_stacks.add(get_adapter(adapter_config[name], in_units)) + if adapter_config['adapter_fusion']: + self.adapter_fusion = AdapterFusion(adapter_config['adapter_fusion_config'], in_units) + + def forward(self, data): + output = [] + for base_adapter in self.base_adapter_stacks: + output.append(base_adapter(data)) + if self._adapter_config['adapter_fusion']: + output = np.stack(output, axis=0) + output = self.adapter_fusion(output) + return output + else: + return output[0] + + + + + +@use_np +def get_adapter(base_adapter_config, in_units): + if base_adapter_config['type'] == 'Basic': + return BasicAdapter(units=base_adapter_config['unit'], in_units=in_units) + else: + pass + ##lxy: not finished + + +@use_np +class AdapterFusion(nn.HybridBlock): + def __init__(self, config, in_units): + self._config = config + self.query_proj = nn.Dense(in_units=in_units, units=in_units) + self.key_proj = nn.Dense(in_units=in_units, units=in_units) + self.value_proj = nn.Dense(in_units=in_units, units=in_units) + self.attention_cell = MultiHeadAttentionCell(query_units=in_units, + num_heads=1, + attention_dropout=0, + scaled=True) + + def forward(self, query, key, value): + query = self.query_proj(query) + key = self.key_proj(key) + value = self.value_proj(value) + output = self.attention_cell(query, key, value) + return output + +@use_np +class BasicAdapter(nn.HybridBlock): + def __init__(self, units: int, in_units: int): + super().__init__() + self._units = units + self.down_proj = nn.Dense(in_units=in_units, + units=units, + flatten=False) + self.activate = get_activation('gelu') + self.up_proj = nn.Dense(in_units=units, + units=in_units, + flatten=False) + + def forward(self, data): + out = self.down_proj(data) + out = self.activate(out) + out = self.up_proj(out) + return out + data + From 1e51262413fdadfcf98e6b0a929701c88767d105 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 2 Apr 2021 02:48:57 +0000 Subject: [PATCH 03/23] add adpter --- src/gluonnlp/models/bert.py | 74 ++++++++++++++----------------------- 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/src/gluonnlp/models/bert.py b/src/gluonnlp/models/bert.py index 515ddef162..b2b4188f52 100644 --- a/src/gluonnlp/models/bert.py +++ b/src/gluonnlp/models/bert.py @@ -16,7 +16,6 @@ # under the License. 
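The `BasicAdapter` added to layers.py in the previous patch is the usual bottleneck adapter: a down-projection to a small hidden size, an activation, an up-projection back to the model width, and a residual connection, so a freshly initialized adapter stays close to an identity mapping. A self-contained sketch of the shape flow, assuming a 768-unit backbone and a 64-unit bottleneck:

```python
import mxnet as mx
from mxnet.gluon import nn
from gluonnlp.layers import get_activation

mx.npx.set_np()

in_units, adapter_units = 768, 64      # backbone width / bottleneck width (assumed)
down_proj = nn.Dense(in_units=in_units, units=adapter_units, flatten=False)
up_proj = nn.Dense(in_units=adapter_units, units=in_units, flatten=False)
activate = get_activation('gelu')
down_proj.initialize()
up_proj.initialize()

data = mx.np.ones((2, 8, in_units))                # (batch_size, seq_length, units)
out = up_proj(activate(down_proj(data))) + data    # residual connection
assert out.shape == (2, 8, in_units)
```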
""" Bert Model - @article{devlin2018bert, title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, @@ -30,6 +29,7 @@ import os from typing import Tuple +import json import mxnet as mx from mxnet import use_np, np, npx @@ -69,6 +69,8 @@ def google_en_uncased_bert_base(): cfg.MODEL.dtype = 'float32' cfg.MODEL.layout = 'NT' cfg.MODEL.compute_layout = 'auto' + cfg.MODEL.use_adapter = False + cfg.MODEL.adapter_config = None # Hyper-parameters of the Initializers cfg.INITIALIZER = CN() cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02] @@ -81,6 +83,7 @@ def google_en_uncased_bert_base(): return cfg + @bert_cfg_reg.register() def google_en_uncased_bert_large(): cfg = google_en_uncased_bert_base() @@ -161,6 +164,7 @@ def google_multi_cased_bert_large(): 'mlm_params': 'google_en_cased_bert_large/model_mlm-59ff3f6a.params', 'lowercase': False, }, + 'google_en_uncased_bert_large': { 'cfg': google_en_uncased_bert_large(), 'vocab': 'google_en_uncased_bert_large/vocab-e6d2b21d.json', @@ -224,7 +228,9 @@ def __init__(self, units: int = 512, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', activation='gelu', - layout='NT'): + layout='NT', + use_adapter='False', + adapter_config={}): super().__init__() assert units % num_heads == 0,\ 'In BertTransformer, The units should be divided exactly ' \ @@ -236,8 +242,11 @@ def __init__(self, units: int = 512, self._output_attention = output_attention self._output_all_encodings = output_all_encodings self._layout = layout + self._use_adapter = use_adapter + self._adapter_config = adapter_config self.all_layers = nn.HybridSequential() + for layer_idx in range(num_layers): self.all_layers.add( TransformerEncoderLayer(units=units, @@ -250,7 +259,9 @@ def __init__(self, units: int = 512, bias_initializer=bias_initializer, activation=activation, layout=layout, - dtype=dtype)) + dtype=dtype, + use_adapter=use_adapter, + adapter_config=adapter_config)) @property def layout(self): @@ -259,9 +270,7 @@ def layout(self): def forward(self, data, valid_length): """ Generate the representation given the inputs. - This is used in training or fine-tuning a bert model. 
- Parameters ---------- data @@ -269,10 +278,8 @@ def forward(self, data, valid_length): Shape (batch_size, seq_length, C) - layout = 'TN' Shape (seq_length, batch_size, C) - valid_length Shape (batch_size,) - Returns ------- out @@ -280,7 +287,6 @@ def forward(self, data, valid_length): Shape (batch_size, seq_length, C_out) - layout = 'TN' Shape (seq_length, batch_size, C_out) - """ if self.layout == 'NT': time_axis, batch_axis = 1, 0 @@ -336,7 +342,9 @@ def __init__(self, dtype='float32', use_pooler=True, layout='NT', - compute_layout='auto'): + compute_layout='auto', + use_adapter=False, + adapter_config={}): super().__init__() self._dtype = dtype self.use_pooler = use_pooler @@ -351,6 +359,9 @@ def __init__(self, self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps self._layout = layout + self._use_adapter = use_adapter + if self._use_adapter: + self._adapter_config = json.loads(adapter_config) if compute_layout is None or compute_layout == 'auto': self._compute_layout = layout else: @@ -370,7 +381,9 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=dtype, - layout=self._compute_layout + layout=self._compute_layout, + use_adapter=self._use_adapter, + adapter_config=self._adapter_config ) # Construct word embedding self.word_embed = nn.Embedding(input_dim=vocab_size, @@ -404,9 +417,7 @@ def layout(self): def forward(self, inputs, token_types, valid_length): # pylint: disable=arguments-differ """Generate the representation given the inputs. - This is used in training or fine-tuning a bert model. - Parameters ---------- inputs @@ -414,20 +425,16 @@ def forward(self, inputs, token_types, valid_length): Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - token_types If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - - layout = 'NT' Shape (batch_size, seq_length) - layout = 'TN' Shape (batch_size, seq_length) - valid_length : The valid length of each sequence Shape (batch_size,) - Returns ------- contextual_embedding @@ -435,7 +442,6 @@ def forward(self, inputs, token_types, valid_length): Shape (batch_size, seq_length, units). - layout = 'TN' Shape (seq_length, batch_size, units). - pooled_output This is optional. Shape (batch_size, units) """ @@ -457,7 +463,6 @@ def forward(self, inputs, token_types, valid_length): def get_initial_embedding(self, inputs, token_types=None): """Get the initial token embeddings that considers the token type and positional embeddings - Parameters ---------- inputs @@ -465,25 +470,20 @@ def get_initial_embedding(self, inputs, token_types=None): Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - token_types The type of tokens. If None, it will be initialized as all zero. - - layout = 'NT' Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - Returns ------- embedding The initial embedding that will be fed into the encoder - - layout = 'NT' Shape (batch_size, seq_length, C_emb) - layout = 'TN' Shape (seq_length, batch_size, C_emb) - """ if self.layout == 'NT': time_axis, batch_axis = 1, 0 @@ -505,10 +505,8 @@ def get_initial_embedding(self, inputs, token_types=None): def apply_pooling(self, sequence): """Generate the representation given the inputs. - This is used for pre-training or fine-tuning a bert model. Get the first token of the whole sequence which is [CLS]. 
- Parameters ---------- sequence @@ -516,7 +514,6 @@ def apply_pooling(self, sequence): Shape (batch_size, sequence_length, units) - layout = 'TN' Shape (sequence_length, batch_size, units) - Returns ------- outputs @@ -538,7 +535,6 @@ def get_cfg(key=None): @classmethod def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'BertModel': """ - Parameters ---------- cfg @@ -547,7 +543,6 @@ def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'BertModel': Whether to output the pooled feature dtype data type of the model - Returns ------- ret @@ -578,7 +573,9 @@ def from_cfg(cls, cfg, use_pooler=True, dtype=None) -> 'BertModel': bias_initializer=bias_initializer, use_pooler=use_pooler, layout=cfg.MODEL.layout, - compute_layout=cfg.MODEL.compute_layout) + compute_layout=cfg.MODEL.compute_layout, + use_adapter=cfg.MODEL.use_adapter, + adapter_config=cfg.MODEL.adapter_config) @use_np @@ -587,7 +584,6 @@ def __init__(self, backbone_cfg, weight_initializer=None, bias_initializer=None): """ - Parameters ---------- backbone_cfg @@ -626,7 +622,6 @@ def layout(self): def forward(self, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. - Parameters ---------- inputs @@ -634,23 +629,19 @@ def forward(self, inputs, token_types, valid_length, Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - token_types If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - - layout = 'NT' Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - valid_length : The valid length of each sequence Shape (batch_size,) masked_positions : The masked position of the sequence Shape (batch_size, num_masked_positions). - Returns ------- contextual_embedding @@ -658,7 +649,6 @@ def forward(self, inputs, token_types, valid_length, Shape (batch_size, seq_length, units). - layout = 'TN' Shape (seq_length, batch_size, units) - pooled_out Shape (batch_size, units) mlm_scores : @@ -680,7 +670,6 @@ def __init__(self, backbone_cfg, weight_initializer=None, bias_initializer=None): """ - Parameters ---------- backbone_cfg @@ -724,9 +713,7 @@ def layout(self): def forward(self, inputs, token_types, valid_length, masked_positions): """Generate the representation given the inputs. - This is used in training or fine-tuning a bert model. - Parameters ---------- inputs @@ -734,23 +721,19 @@ def forward(self, inputs, token_types, valid_length, Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - token_types If the inputs contain two sequences, we will set different token types for the first sentence and the second sentence. - - layout = 'NT' Shape (batch_size, seq_length) - layout = 'TN' Shape (seq_length, batch_size) - valid_length The valid length of each sequence Shape (batch_size,) masked_positions The masked position of the sequence Shape (batch_size, num_masked_positions). - Returns ------- contextual_embedding @@ -758,7 +741,6 @@ def forward(self, inputs, token_types, valid_length, Shape (batch_size, seq_length, units). - layout = 'TN' Shape (seq_length, batch_size, units). 
- pooled_out Shape (batch_size, units) nsp_score : @@ -787,7 +769,6 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', load_mlm: str = False)\ -> Tuple[CN, HuggingFaceWordPieceTokenizer, str, str]: """Get the pretrained bert weights - Parameters ---------- model_name @@ -798,7 +779,6 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', Whether to load the weights of the backbone network load_mlm Whether to load the weights of MLM - Returns ------- cfg @@ -857,4 +837,4 @@ def get_pretrained_bert(model_name: str = 'google_en_cased_bert_base', BACKBONE_REGISTRY.register('bert', [BertModel, get_pretrained_bert, - list_pretrained_bert]) + list_pretrained_bert]) \ No newline at end of file From 65c3047e479ca3927452b26536ca77943afe578d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 2 Apr 2021 02:49:15 +0000 Subject: [PATCH 04/23] add adapter --- src/gluonnlp/models/transformer.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 646bcea808..51638cb783 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -16,7 +16,7 @@ from typing import Optional, Tuple, List from ..utils.registry import Registry from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask, gen_mem_attn_mask -from ..layers import PositionalEmbedding, PositionwiseFFN, InitializerType +from ..layers import PositionalEmbedding, PositionwiseFFN, InitializerType, AdapterModule from ..utils.config import CfgNode as CN from ..sequence_sampler import BaseStepDecoder @@ -149,7 +149,9 @@ def __init__(self, bias_initializer: Optional[InitializerType] = 'zeros', activation: str = 'relu', dtype='float32', - layout='NT'): + layout='NT', + use_adapter=False, + adapter_config={}): """ Parameters @@ -186,6 +188,8 @@ def __init__(self, self._pre_norm = pre_norm self._dtype = dtype self._layout = layout + self._use_adapter = use_adapter + self._adapter_config = adapter_config assert layout in ['TN', 'NT'], 'Invalid layout received = {}. 
' \ 'Only "TN" and "NT" are accepted!'.format(layout) assert self._units % self._num_heads == 0, 'units must be divisive by the number of heads' @@ -204,6 +208,9 @@ def __init__(self, weight_initializer=weight_initializer, bias_initializer=bias_initializer, dtype=self._dtype) + + if self._use_adapter: + self.adapter_layer_attn = AdapterModule(in_units=units, adapter_config=adapter_config) attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' self.attention_cell = \ MultiHeadAttentionCell( @@ -225,7 +232,9 @@ def __init__(self, layer_norm_eps=layer_norm_eps, activation=activation, pre_norm=pre_norm, - dtype=self._dtype) + dtype=self._dtype, + use_adapter=self._use_adapter, + adapter_config=self._adapter_config) @property def layout(self) -> str: @@ -265,6 +274,8 @@ def forward(self, data, attn_mask): out, [_, attn_weight] = self.attention_cell(query, key, value, attn_mask) out = self.attention_proj(out) out = self.dropout_layer(out) + if self._use_adapter: + out = self.adapter_layer_attn(out) out = out + data if not self._pre_norm: out = self.layer_norm(out) From 867a41cbc0b0a2e38ffcd8aa3f0cb4fbbabef61f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Apr 2021 10:59:36 +0000 Subject: [PATCH 05/23] change adapter config --- .../classification/train_classification.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/scripts/classification/train_classification.py b/scripts/classification/train_classification.py index f91a745d8d..35f3a80fdf 100644 --- a/scripts/classification/train_classification.py +++ b/scripts/classification/train_classification.py @@ -94,7 +94,7 @@ def parse_args(): help='the path to training dataset') parser.add_argument('--warmup_ratio', type=float, default=0.1, help='Ratio of warmup steps in the learning rate scheduler.') - parser.add_argument('--method', type=str, default='full', choices=['full', 'bias', 'subbias', 'adapter'], + parser.add_argument('--method', type=str, default='full', choices=['full', 'bias', 'adapter', 'last_layer'], help='different finetune method') @@ -103,9 +103,18 @@ def parse_args(): def change_adapter_cfg(cfg, task): - adapter_config = {'adapter_fusion':False, + adapter_config = { + 'location_0':{ + 'adapter_fusion':False, + 'pre_operator':False, 'task_names':[task.task_name], - task.task_name:{'type':'Basic','unit':64}} + task.task_name:{'type':'Basic','units':64, 'activation':'gelu'}}, + 'location_1':{ + 'adapter_fusion':False, + 'pre_operator':False, + 'task_names':[task.task_name], + task.task_name:{'type':'Basic','units':64, 'activation':'gelu'}} + } cfg.defrost() cfg.MODEL.use_adapter = True cfg.MODEL.adapter_config = json.dumps(adapter_config) @@ -131,7 +140,6 @@ def get_network(model_name, backbone = Model.from_cfg(cfg) # Load local backbone parameters if backbone_path provided. # Otherwise, download backbone parameters from gluon zoo. 
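In the updated `change_adapter_cfg`, the settings are now grouped per insertion point: `location_0` configures the adapter placed after the output projection of the self-attention block and `location_1` the one inside the position-wise FFN (consumed by the transformer.py changes later in this series). Each `--method` then freezes a different subset of parameters in `train()` below by setting `grad_req = 'null'`; a small helper along these lines (hypothetical, not part of the patch) can be used to confirm how few parameters remain trainable:

```python
import numpy as _np

def count_trainable(net):
    """Return (trainable, total) parameter counts; frozen parameters have grad_req == 'null'.

    Assumes the network is already initialized so that parameter shapes are known.
    """
    trainable, total = 0, 0
    for _, param in net.collect_params().items():
        size = int(_np.prod(param.shape))
        total += size
        if param.grad_req != 'null':
            trainable += size
    return trainable, total
```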
- backbone_params_path = backbone_path if backbone_path else download_params_path if checkpoint_path is None: backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=True, @@ -244,7 +252,7 @@ def train(args): if not os.path.exists(detail_dir): os.mkdir(detail_dir) logging_config(detail_dir, - name='train_{}_{}_'.format(args.task_name, args.model_name) + str(rank), # avoid race + name='train_{}_{}_{}_'.format(args.task_name, args.model_name, args.method) + str(rank), # avoid race level=level, console=(local_rank == 0)) logging.info(args) @@ -285,6 +293,10 @@ def train(args): target_params_name = [key for key in classify_net.collect_params() if 'adapter' in key or 'out_proj' in key] + elif args.method == 'last_layer': + target_params_name = [key + for key in classify_net.collect_params() if + 'out_proj' in key] for name in classify_net.collect_params(): if name not in target_params_name: classify_net.collect_params()[name].grad_req = 'null' From 17ab2d7d1ec9bcb03a83d796e6d89450805a793b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Apr 2021 11:00:32 +0000 Subject: [PATCH 06/23] change adadpter input --- src/gluonnlp/layers.py | 76 ++---------------------------------------- 1 file changed, 2 insertions(+), 74 deletions(-) diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index 1036e58f11..1f1e185c1c 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -17,7 +17,7 @@ """Layers.""" __all__ = ['PositionalEmbedding', 'SinusoidalPositionalEmbedding', 'LearnedPositionalEmbedding', 'BucketPositionalEmbedding', 'AdaptiveEmbedding', - 'PositionwiseFFN', 'ProjectedAdaptiveLogSoftmaxWithLoss', 'AdapterModule'] + 'PositionwiseFFN', 'ProjectedAdaptiveLogSoftmaxWithLoss'] import math from collections import OrderedDict @@ -28,7 +28,6 @@ import numpy as _np from typing import Union, Optional, List, Dict from .op import relative_position_bucket -#from .attention_cell import MultiHeadAttentionCell @@ -647,7 +646,7 @@ def forward(self, data): out = self.ffn_2(out) out = self.dropout_layer(out) if self._use_adapter: - out = self.adapter_layer_ffn(out) + out = self.adapter_layer_ffn(out, residual) out = out + residual if not self._pre_norm: out = self.layer_norm(out) @@ -1019,76 +1018,5 @@ def forward(self, hidden, target): def __repr__(self): return _gen_repr_with_kwargs(self._kwargs, self.__class__.__name__) -@use_np -class AdapterModule(nn.HybridBlock): - def __init__(self, in_units:int, adapter_config:dict): - super().__init__() - self._adapter_config = adapter_config - self.base_adapter_stacks = nn.HybridSequential() - for name in adapter_config['task_names']: - self.base_adapter_stacks.add(get_adapter(adapter_config[name], in_units)) - if adapter_config['adapter_fusion']: - self.adapter_fusion = AdapterFusion(adapter_config['adapter_fusion_config'], in_units) - - def forward(self, data): - output = [] - for base_adapter in self.base_adapter_stacks: - output.append(base_adapter(data)) - if self._adapter_config['adapter_fusion']: - output = np.stack(output, axis=0) - output = self.adapter_fusion(output) - return output - else: - return output[0] - - - - - -@use_np -def get_adapter(base_adapter_config, in_units): - if base_adapter_config['type'] == 'Basic': - return BasicAdapter(units=base_adapter_config['unit'], in_units=in_units) - else: - pass - ##lxy: not finished - - -@use_np -class AdapterFusion(nn.HybridBlock): - def __init__(self, config, in_units): - self._config = config - self.query_proj = nn.Dense(in_units=in_units, units=in_units) - 
self.key_proj = nn.Dense(in_units=in_units, units=in_units) - self.value_proj = nn.Dense(in_units=in_units, units=in_units) - self.attention_cell = MultiHeadAttentionCell(query_units=in_units, - num_heads=1, - attention_dropout=0, - scaled=True) - - def forward(self, query, key, value): - query = self.query_proj(query) - key = self.key_proj(key) - value = self.value_proj(value) - output = self.attention_cell(query, key, value) - return output -@use_np -class BasicAdapter(nn.HybridBlock): - def __init__(self, units: int, in_units: int): - super().__init__() - self._units = units - self.down_proj = nn.Dense(in_units=in_units, - units=units, - flatten=False) - self.activate = get_activation('gelu') - self.up_proj = nn.Dense(in_units=units, - units=in_units, - flatten=False) - - def forward(self, data): - out = self.down_proj(data) - out = self.activate(out) - out = self.up_proj(out) - return out + data From 75095bd510b453df9df88f6f82a96ade5107e472 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Apr 2021 11:02:14 +0000 Subject: [PATCH 07/23] change adapter --- src/gluonnlp/models/transformer.py | 49 +++++++++++++++++++----------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 51638cb783..6bb8c0a1e8 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -16,9 +16,10 @@ from typing import Optional, Tuple, List from ..utils.registry import Registry from ..attention_cell import MultiHeadAttentionCell, gen_self_attn_mask, gen_mem_attn_mask -from ..layers import PositionalEmbedding, PositionwiseFFN, InitializerType, AdapterModule +from ..layers import PositionalEmbedding, PositionwiseFFN, InitializerType from ..utils.config import CfgNode as CN from ..sequence_sampler import BaseStepDecoder +from ..adapters import AdapterModule, PositionwiseFFN_adapter transformer_cfg_reg = Registry('transformer_cfg') @@ -209,8 +210,8 @@ def __init__(self, bias_initializer=bias_initializer, dtype=self._dtype) - if self._use_adapter: - self.adapter_layer_attn = AdapterModule(in_units=units, adapter_config=adapter_config) + if self._use_adapter and 'location_0' in self._adapter_config: + self.adapter_layer_attn = AdapterModule(in_units=units, adapter_config=self._adapter_config['location_0']) attention_layout = 'NTK' if self._layout == 'NT' else 'TNK' self.attention_cell = \ MultiHeadAttentionCell( @@ -223,18 +224,32 @@ def __init__(self, ) self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units) - self.ffn = PositionwiseFFN(units=units, - hidden_size=hidden_size, - dropout=hidden_dropout_prob, - activation_dropout=activation_dropout_prob, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - layer_norm_eps=layer_norm_eps, - activation=activation, - pre_norm=pre_norm, - dtype=self._dtype, - use_adapter=self._use_adapter, - adapter_config=self._adapter_config) + if self._use_adapter: + self.ffn = PositionwiseFFN_adapter(units=units, + hidden_size=hidden_size, + dropout=hidden_dropout_prob, + activation_dropout=activation_dropout_prob, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + layer_norm_eps=layer_norm_eps, + activation=activation, + pre_norm=pre_norm, + dtype=self._dtype, + use_adapter=self._use_adapter, + adapter_config=self._adapter_config) + else: + self.ffn = PositionwiseFFN(units=units, + hidden_size=hidden_size, + dropout=hidden_dropout_prob, + activation_dropout=activation_dropout_prob, + 
weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + layer_norm_eps=layer_norm_eps, + activation=activation, + pre_norm=pre_norm, + dtype=self._dtype, + use_adapter=self._use_adapter, + adapter_config=self._adapter_config) @property def layout(self) -> str: @@ -274,8 +289,8 @@ def forward(self, data, attn_mask): out, [_, attn_weight] = self.attention_cell(query, key, value, attn_mask) out = self.attention_proj(out) out = self.dropout_layer(out) - if self._use_adapter: - out = self.adapter_layer_attn(out) + if self._use_adapter and 'location_0' in self._adapter_config: + out = self.adapter_layer_attn(out, data) out = out + data if not self._pre_norm: out = self.layer_norm(out) From 1189a51b4fd284b19d87590c6f14cd161b8bc9bf Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Apr 2021 11:02:51 +0000 Subject: [PATCH 08/23] add adapter --- src/gluonnlp/adapters.py | 243 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 src/gluonnlp/adapters.py diff --git a/src/gluonnlp/adapters.py b/src/gluonnlp/adapters.py new file mode 100644 index 0000000000..29480f57ed --- /dev/null +++ b/src/gluonnlp/adapters.py @@ -0,0 +1,243 @@ +import math +from collections import OrderedDict +import mxnet as mx +from mxnet import np, npx +from mxnet import use_np +from mxnet.gluon import nn, HybridBlock, Parameter, Constant +import numpy as _np +from .attention_cell import MultiHeadAttentionCell +import mxnet.numpy_extension as _mx_npx +from .layers import get_activation, get_norm_layer, _gen_repr_with_kwargs + + +__all__ = ['BasicAdapter', 'AdapterFusion', 'AdapterModule', 'PositionwiseFFN_adapter'] + +@use_np +class BasicAdapter(nn.HybridBlock): + def __init__(self, config: dict, in_units: int): + super().__init__() + self._units = config['units'] + self._activation = config['activation'] + self.down_proj = nn.Dense(in_units=in_units, + units=self._units, + flatten=False, + weight_initializer=None, bias_initializer='zero') + self.activate = get_activation(self._activation) + self.up_proj = nn.Dense(in_units=self._units, + units=in_units, + flatten=False, + weight_initializer=None, bias_initializer='zero') + + + + def forward(self, data, residual): + out = self.down_proj(data) + out = self.activate(out) + out = self.up_proj(out) + return out + residual + + +@use_np +class AdapterFusion(nn.HybridBlock): + def __init__(self, in_units): + super().__init__() + self.query_proj = nn.Dense(in_units=in_units, units=in_units, flatten=False, weight_initializer=None, bias_initializer='zero') + self.key_proj = nn.Dense(in_units=in_units, units=in_units, flatten=False, weight_initializer=None, bias_initializer='zero') + self.value_proj = nn.Dense(in_units=in_units, units=in_units, flatten=False, weight_initializer=None, bias_initializer='zero') + ''' + self.attention_cell = MultiHeadAttentionCell(query_units=in_units, + num_heads=1, + attention_dropout=0, + scaled=True) + ''' + + def forward(self, query, key, value): + #query bs, length, unit + #key bs, length, num_adapters, unit + query = npx.reshape(self.query_proj(query), (-2, -2, 1, -1)) + key = self.key_proj(key).transpose((0, 1, 3, 2)) + value = self.value_proj(value) + scores = np.squeeze(npx.batch_dot(query, key), axis=2) + attn_weights = npx.softmax(scores, axis=-1) + output = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2) + + return output + +@use_np +def get_base_adapter(base_adapter_config, in_units): + if base_adapter_config['type'] == 'Basic': + base_adapter = 
BasicAdapter(config=base_adapter_config, in_units=in_units) + else: + pass + return base_adapter + +@use_np +class AdapterModule(nn.HybridBlock): + def __init__(self, in_units:int, adapter_config:dict): + super().__init__() + self._in_units = in_units + self._adapter_config = adapter_config + self.base_adapter_stacks = nn.HybridSequential() + for name in adapter_config['task_names']: + self.base_adapter_stacks.add(get_base_adapter(adapter_config[name], in_units)) + if adapter_config['adapter_fusion']: + self.adapter_fusion = AdapterFusion(in_units) + if adapter_config['pre_operator']: + self.pre_norm = nn.LayerNorm(epsilon=adapter_config['layer_norm_eps'], + in_channels=in_units) + + + def forward(self, data, residual): + new_residual = data + if self._adapter_config['pre_operator']: + data = data + residual + data = self.pre_norm(data) + + output = [] + for base_adapter in self.base_adapter_stacks: + output.append(base_adapter(data, new_residual)) + + if self._adapter_config['adapter_fusion']: + output = np.stack(output, axis=2) + #output = np.concatenate(output, axis = 1) + output = self.adapter_fusion(new_residual, output, output) + + return output + else: + return output[0] + + + +@use_np +class PositionwiseFFN_adapter(HybridBlock): + """The Position-wise FFN layer used in Transformer-like architectures, + # this architecture copy from layers.py to aviod import probles + + If pre_norm is True: + norm(data) -> fc1 -> act -> act_dropout -> fc2 -> dropout -> res(+data) + Else: + data -> fc1 -> act -> act_dropout -> fc2 -> dropout -> norm(res(+data)) + """ + def __init__(self, + units: int = 512, + hidden_size: int = 2048, + use_bias=True, + activation_dropout: float = 0.0, + dropout: float = 0.1, + weight_initializer=None, + bias_initializer='zeros', + activation='relu', + use_gated_activation=False, + normalization: str = 'layer_norm', + layer_norm_eps: float = 1E-5, + pre_norm: bool = False, + dtype='float32', + use_adapter='False', + adapter_config={}, + **kwargs): + """ + + Parameters + ---------- + units + hidden_size + activation_dropout + dropout + weight_initializer + bias_initializer + activation + normalization + layer_norm or no_norm + layer_norm_eps + pre_norm + Pre-layer normalization as proposed in the paper: + "[ACL2018] The Best of Both Worlds: Combining Recent Advances in + Neural Machine Translation" + This will stabilize the training of Transformers. 
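When `adapter_fusion` is enabled, `AdapterFusion` attends over the per-task adapter outputs: the residual stream provides the query, the stacked adapter outputs provide the keys and values, and the softmax runs over the adapter axis, so each token gets its own mixture of adapters. A shape-level sketch of that computation (batch size, sequence length and adapter count are assumptions for illustration):

```python
from mxnet import np, npx

npx.set_np()

batch_size, seq_length, num_adapters, units = 2, 8, 3, 768
query = np.ones((batch_size, seq_length, units))              # projected residual stream
key = np.ones((batch_size, seq_length, num_adapters, units))  # stacked adapter outputs
value = np.ones((batch_size, seq_length, num_adapters, units))

q = npx.reshape(query, (-2, -2, 1, -1))                                     # (B, T, 1, U)
scores = np.squeeze(npx.batch_dot(q, key.transpose((0, 1, 3, 2))), axis=2)  # (B, T, N)
attn_weights = npx.softmax(scores, axis=-1)                                 # mixture weights
out = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2)
assert out.shape == (batch_size, seq_length, units)
```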
+ You may also refer to + "[Arxiv2020] Understanding the Difficulty of Training Transformers" + """ + super().__init__() + self._dtype = dtype + self._pre_norm = pre_norm + self._use_gated_activation = use_gated_activation + self._use_adapter = use_adapter + self._adapter_config = adapter_config + self._kwargs = OrderedDict([ + ('units', units), + ('hidden_size', hidden_size), + ('activation_dropout', activation_dropout), + ('activation', activation), + ('dropout', dropout), + ('normalization', normalization), + ('layer_norm_eps', layer_norm_eps), + ('pre_norm', pre_norm), + ('dtype', self._dtype) + ]) + self.dropout_layer = nn.Dropout(dropout) + self.activation_dropout_layer = nn.Dropout(activation_dropout) + self.ffn_1 = nn.Dense(units=hidden_size, + in_units=units, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + if use_gated_activation: + self.gated_ffn_1 = nn.Dense(units=hidden_size, + in_units=units, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.activation = get_activation(activation) + self.ffn_2 = nn.Dense(units=units, + in_units=hidden_size, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + # TODO(sxjscience) We may need to set the dtype flag in LayerNorm, need to double check + self.layer_norm = get_norm_layer(in_channels=units, + normalization=normalization, + epsilon=layer_norm_eps, + **kwargs) + if self._use_adapter and 'location_1' in self._adapter_config: + self.adapter_layer_ffn = AdapterModule(in_units=units, adapter_config=adapter_config['location_1']) + + def forward(self, data): + """ + + Parameters + ---------- + F + data : + Shape (B, seq_length, C_in) + + Returns + ------- + out : + Shape (B, seq_length, C_out) + """ + residual = data + if self._pre_norm: + data = self.layer_norm(data) + if self._use_gated_activation: + gated_out = self.activation(self.gated_ffn_1(data)) + out = gated_out * self.ffn_1(data) + else: + out = self.activation(self.ffn_1(data)) + out = self.activation_dropout_layer(out) + out = self.ffn_2(out) + out = self.dropout_layer(out) + if self._use_adapter and 'location_1' in self._adapter_config: + out = self.adapter_layer_ffn(out, residual) + out = out + residual + if not self._pre_norm: + out = self.layer_norm(out) + return out + + def __repr__(self): + return _gen_repr_with_kwargs(self._kwargs, self.__class__.__name__) \ No newline at end of file From cf6c058b4f5393f6deb93200e3c4ebb80109ef18 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 9 Apr 2021 11:04:51 +0000 Subject: [PATCH 09/23] change interface --- src/gluonnlp/models/bert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/models/bert.py b/src/gluonnlp/models/bert.py index b2b4188f52..608c1fe26f 100644 --- a/src/gluonnlp/models/bert.py +++ b/src/gluonnlp/models/bert.py @@ -230,7 +230,7 @@ def __init__(self, units: int = 512, activation='gelu', layout='NT', use_adapter='False', - adapter_config={}): + adapter_config='{}'): super().__init__() assert units % num_heads == 0,\ 'In BertTransformer, The units should be divided exactly ' \ @@ -360,6 +360,7 @@ def __init__(self, self.layer_norm_eps = layer_norm_eps self._layout = layout self._use_adapter = use_adapter + self._adapter_config = adapter_config if self._use_adapter: self._adapter_config = json.loads(adapter_config) if compute_layout is None or 
compute_layout == 'auto': From 89ea09d7eff734be107ea87a16910ca2867b55f7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 03:49:07 +0000 Subject: [PATCH 10/23] remove useless code --- src/gluonnlp/layers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index 1f1e185c1c..4f24a8b921 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -617,8 +617,6 @@ def __init__(self, normalization=normalization, epsilon=layer_norm_eps, **kwargs) - if self._use_adapter: - self.adapter_layer_ffn = AdapterModule(in_units=units, adapter_config=adapter_config) def forward(self, data): """ From 3f09b7906120ddf450fdd9eb45a7f388f7c951df Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 03:58:01 +0000 Subject: [PATCH 11/23] change config to be safe --- scripts/classification/train_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/classification/train_classification.py b/scripts/classification/train_classification.py index 35f3a80fdf..ab3ea5c9fa 100644 --- a/scripts/classification/train_classification.py +++ b/scripts/classification/train_classification.py @@ -142,7 +142,7 @@ def get_network(model_name, # Otherwise, download backbone parameters from gluon zoo. backbone_params_path = backbone_path if backbone_path else download_params_path if checkpoint_path is None: - backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=True, + backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=(args.method != 'full'), ctx=ctx_l, cast_dtype=True) num_params, num_fixed_params \ = count_parameters(deduplicate_param_dict(backbone.collect_params())) From b9d451040317f29ac1003667959294342d6c51ff Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 04:39:48 +0000 Subject: [PATCH 12/23] remove useless code --- src/gluonnlp/layers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index 4f24a8b921..4189d10bf4 100644 --- a/src/gluonnlp/layers.py +++ b/src/gluonnlp/layers.py @@ -546,8 +546,6 @@ def __init__(self, layer_norm_eps: float = 1E-5, pre_norm: bool = False, dtype='float32', - use_adapter='False', - adapter_config={}, **kwargs): """ @@ -575,7 +573,6 @@ def __init__(self, self._dtype = dtype self._pre_norm = pre_norm self._use_gated_activation = use_gated_activation - self._use_adapter = use_adapter self._kwargs = OrderedDict([ ('units', units), ('hidden_size', hidden_size), @@ -643,8 +640,6 @@ def forward(self, data): out = self.activation_dropout_layer(out) out = self.ffn_2(out) out = self.dropout_layer(out) - if self._use_adapter: - out = self.adapter_layer_ffn(out, residual) out = out + residual if not self._pre_norm: out = self.layer_norm(out) From 8928a77c564ca64e45c1f6bb869e935d3af520e2 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 04:47:36 +0000 Subject: [PATCH 13/23] remove useless code --- src/gluonnlp/models/transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 6bb8c0a1e8..90a1cd623c 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -247,9 +247,7 @@ def __init__(self, layer_norm_eps=layer_norm_eps, activation=activation, pre_norm=pre_norm, - dtype=self._dtype, - use_adapter=self._use_adapter, - adapter_config=self._adapter_config) + dtype=self._dtype) @property def layout(self) -> str: From 
ada971e6d89ce989a53aa1878405384beca53b42 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 12 Apr 2021 12:49:50 +0000
Subject: [PATCH 14/23] update some result

---
 scripts/classification/README.md | 33 +++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/scripts/classification/README.md b/scripts/classification/README.md
index de584816bb..2eda8f80d1 100644
--- a/scripts/classification/README.md
+++ b/scripts/classification/README.md
@@ -51,4 +51,35 @@ here are some results with their hyperparameters
 | CoLA | Matthew Corr. | 2e-5 | 32 | 7800 | 10 | 59.23 | https://tensorboard.dev/experiment/33euRGh9SrW3p15JWgILnw/ |
 | RTE | Accuracy | 2e-5 | 32 | 1800 | 10 | 69.67 | https://tensorboard.dev/experiment/XjTxr5anRrC1LMukLJJQ3g/|
 | MRPC | Accuracy/F1 | 3e-5 | 32 | 7800 | 5 | 85.38/87.31 | https://tensorboard.dev/experiment/jEJFq2XXQ8SvCxt6eKIjwg/ |
-| MNLI | Accuracy(m/mm) | 2e-5 | 48 | 7800 | 5 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ |
\ No newline at end of file
+| MNLI | Accuracy(m/mm) | 2e-5 | 48 | 7800 | 5 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ |
+
+
+## Different fine-tuning methods
+We also offer parameter-efficient fine-tuning methods that save training time and storage. Two are currently supported: bias-finetune (only bias/LayerNorm and output-projection parameters are updated) and adapter-finetune. To use them, add the `--method` argument, for example:
+```bash
+python train_classification.py \
+ --model_name google_en_uncased_bert_base \
+ --task_name mrpc \
+ --method adapter \
+ --lr 4.5e-4 \
+ --batch_size 32 \
+ --do_train \
+ --do_eval \
+ --seed 7800 \
+ --epochs 10 \
+ --optimizer adamw \
+ --train_dir glue/mrpc/train.parquet \
+ --eval_dir glue/mrpc/dev.parquet \
+ --gpus 1
+```
+Here are some results for the different methods (a blank cell means we have not yet found suitable hyperparameters):
+
+| Task Name | Metric | full | bias-finetune | adapter |
+|-----------|-------------|-------------|-------------|-------------|
+| SST | Accuracy | 93.23 | | 93.46 |
+| STS | Pearson Corr. | 89.26 | 89.30 | 89.70 |
+| CoLA | Matthew Corr.
| 59.23 | | 61.20 | +| RTE | Accuracy | 69.67 | 69.31 | 70.75 | +| MRPC | Accuracy/F1 | 85.38/87.31 | 85.29/88.63 | 87.74/91.39| +| MNLI | Accuracy(m/mm) | 84.90/85.10 | \ No newline at end of file From c71ba1d47d4f9ef25e32a71cab3b2335f296955b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 13:50:46 +0000 Subject: [PATCH 15/23] add fusion script --- scripts/continual_learning/train_continual.py | 563 ++++++++++++++++++ 1 file changed, 563 insertions(+) create mode 100644 scripts/continual_learning/train_continual.py diff --git a/scripts/continual_learning/train_continual.py b/scripts/continual_learning/train_continual.py new file mode 100644 index 0000000000..6ae167b2f9 --- /dev/null +++ b/scripts/continual_learning/train_continual.py @@ -0,0 +1,563 @@ +import gluonnlp +from tensorboardX import SummaryWriter +import numpy as np +import mxnet as mx +import json +import random +import pandas as pd +import mxnet.numpy_extension as _mx_npx +import os +import json +import logging +import time +import argparse +import copy +from mxnet.gluon.metric import Accuracy, F1, MCC, PearsonCorrelation, CompositeEvalMetric +from classification_utils import get_task +import matplotlib.pyplot as plt +from tqdm import tqdm +from mxnet import gluon +from gluonnlp.data.sampler import SplitSampler +from mxnet.gluon import nn +from gluonnlp.models import get_backbone +from gluonnlp.utils.parameter import clip_grad_global_norm, count_parameters, deduplicate_param_dict +from gluonnlp.utils.preprocessing import get_trimmed_lengths +from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat, logging_config +from mxnet.gluon.data import batchify as bf +from mxnet.gluon.data import DataLoader +from mxnet.lr_scheduler import PolyScheduler +from gluonnlp.utils import set_seed +from gluonnlp.utils.misc import init_comm, parse_ctx +try: + import horovod.mxnet as hvd +except ImportError: + pass +from classification import TextPredictionNet + +mx.npx.set_np() + + + +CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached')) +if not os.path.exists(CACHE_PATH): + os.makedirs(CACHE_PATH, exist_ok=True) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='classification example. ' + 'We fine-tune the pretrained model on GLUE dataset to do different taks.') + parser.add_argument('--model_name', type=str, default='google_en_uncased_bert_base', + help='Name of the pretrained model.') + parser.add_argument('--task_name', type=str, default='STS', + help='Name of classification taks') + parser.add_argument('--lr', type=float, default=5E-4, + help='Initial learning rate. default is 2e-5') + parser.add_argument('--comm_backend', type=str, default='device', + choices=['horovod', 'dist_sync_device', 'device'], + help='Communication backend.') + parser.add_argument('--gpus', type=str, default='0', + help='list of gpus to run, e.g. 0 or 0,2,5. 
-1 means using cpu.') + parser.add_argument('--epochs', type=int, default=3, + help='Number of epochs, default is 3') + parser.add_argument('--do_train', action='store_true', + help='do training.') + parser.add_argument('--do_eval', action='store_true', + help='do eval.') + parser.add_argument('--param_checkpoint', type=str, default=None, + help='The parameter checkpoint for evaluating the model') + parser.add_argument('--backbone_path', type=str, default=None, + help='The parameter checkpoint of backbone model') + parser.add_argument('--overwrite_cache', action='store_true', + help='Whether to overwrite the feature cache.') + parser.add_argument('--num_accumulated', type=int, default=1, + help='The number of batches for gradients accumulation to ' + 'simulate large batch size.') + parser.add_argument('--output_dir', type=str, default='cls_dir', + help='The output directory where the model params will be written.' + ' default is cls_dir') + parser.add_argument('--log_interval', type=int, default=-1, + help='The logging interval for training') + parser.add_argument('--optimizer', type=str, default='adamw', + help='The optimization algorithm') + parser.add_argument('--batch_size', type=int, default=32, + help='Batch size. Number of examples per gpu in a minibatch. default is 64') + parser.add_argument( + '--seed', type=int, default=2, help='Random seed') + + parser.add_argument('--wd', type=float, default=0.01, help='weight decay') + parser.add_argument('--max_grad_norm', type=float, default=1.0, + help='Max gradient norm.') + parser.add_argument('--train_dir', type=str, default=None, + help='the path to training dataset') + parser.add_argument('--eval_dir', type=str, default=None, + help='the path to training dataset') + parser.add_argument('--warmup_ratio', type=float, default=0.1, + help='Ratio of warmup steps in the learning rate scheduler.') + parser.add_argument('--method', type=str, default='adapter', choices=['full', 'bias', 'adapter', 'last_layer'], + help='different finetune method') + + + args = parser.parse_args() + return args + +def load_adapters(model, config, ctx_l): + config = config['location_0'] if 'location_0' in config else config['location_1'] + index = 0 + new_loaded = {} + + for task in config['task_names']: + ckpt_name = config[task]['backbone'] + loaded = _mx_npx.load(ckpt_name) + + for key in loaded: + if 'base_adapter_stacks' in key: + new_key = '.'.join(key.split('.')[0:-3] + [str(index)] + key.split('.')[-2:]) + new_loaded.update({new_key:loaded[key]}) + index += 1 + full_dict = {'params': new_loaded, 'filename': ckpt_name} + model.load_dict(full_dict, ctx_l, allow_missing=True, + ignore_extra=True, cast_dtype=True) + return model + + + +#'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', +# 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, + +def change_adapter_cfg(cfg): + adapter_config = { + 'location_1': { + 'adapter_fusion': True, + 'pre_operator': True, + 'layer_norm_eps':1e-5, + 'task_names': ['mrpc', 'cola'], + 'mrpc': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mrpc/google_en_uncased_bert_base_mrpc_461_adapter.params'}, + 'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', + 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, + } + } + cfg.defrost() + cfg.MODEL.use_adapter = True + 
cfg.MODEL.adapter_config = json.dumps(adapter_config) + cfg.freeze() + return cfg + +def get_network(model_name, + ctx_l, + method='full', + checkpoint_path=None, + backbone_path=None, + task=None): + """ + Get the network that fine-tune the Question Answering Task + """ + use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name + Model, cfg, tokenizer, download_params_path, _ = \ + get_backbone(model_name, load_backbone=not backbone_path) + + if method == 'adapter': + cfg = change_adapter_cfg(cfg) + backbone = Model.from_cfg(cfg) + # Load local backbone parameters if backbone_path provided. + # Otherwise, download backbone parameters from gluon zoo. + backbone.initialize(ctx=ctx_l) + backbone_params_path = backbone_path if backbone_path else download_params_path + if checkpoint_path is None: + backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=True, + ctx=ctx_l, cast_dtype=True) + num_params, num_fixed_params \ + = count_parameters(deduplicate_param_dict(backbone.collect_params())) + logging.info( + 'Loading Backbone Model from {}, with total/fixd parameters={}/{}'.format( + backbone_params_path, num_params, num_fixed_params)) + classify_net = TextPredictionNet(backbone, task.class_num) + classify_net = load_adapters(classify_net, json.loads(cfg.MODEL.adapter_config), ctx_l) + #full_dict = {'params': new_loaded, 'filename': ckpt_name} + #classify_net.load_dict(full_dict, ctx_l, allow_missing=True, + # ignore_extra=True, cast_dtype=True) + if checkpoint_path is None: + # Ignore the UserWarning during initialization, + # There is no need to re-initialize the parameters of backbone + classify_net.initialize(ctx=ctx_l) + else: + classify_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True) + classify_net.hybridize() + + return cfg, tokenizer, classify_net, use_segmentation + +def project_label(label, task): + projected_label = copy.copy(label) + for i in range(len(label)): + projected_label[i] = task.proj_label[label[i]] + + return projected_label + + + +def preprocess_data(df, feature_columns, label_column, tokenizer, + max_length=128, use_label=True, use_tqdm=True, task=None): + out = [] + if isinstance(feature_columns, str): + feature_columns = [feature_columns] + cls_id = tokenizer.vocab.cls_id + sep_id = tokenizer.vocab.sep_id + iterator = tqdm(df.iterrows(), total=len(df)) if use_tqdm else df.iterrows() + for idx, row in iterator: + # Token IDs = [CLS] token_ids1 [SEP] token_ids2 [SEP] + # Segment IDs = 0 0 0 1 1 + + encoded_text_l = [tokenizer.encode(row[col_name], int) + for col_name in feature_columns] + trimmed_lengths = get_trimmed_lengths([len(ele) for ele in encoded_text_l], + max_length=max_length - len(feature_columns) - 1, + do_merge=True) + + token_ids = [cls_id] + sum([ele[:length] + [sep_id] + for length, ele in zip(trimmed_lengths, encoded_text_l)], []) + token_types = [0] + sum([[i % 2] * (length + 1) + for i, length in enumerate(trimmed_lengths)], []) + valid_length = len(token_ids) + feature = (token_ids, token_types, valid_length) + if use_label: + label = row[label_column] + if task.task_name != 'sts': + label = task.proj_label[label] + out.append((feature, label)) + else: + out.append(feature) + + return out + + +def get_task_data(args, task, tokenizer, segment): + feature_column = task.feature_column + label_column = task.label_column + if segment == 'train': + input_df = task.raw_train_data + file_name = args.train_dir.split('/')[-1] + else: + input_df = task.raw_eval_data + file_name = 
args.eval_dir.split('/')[-1] + data_cache_path = os.path.join(CACHE_PATH, + '{}_{}_{}_{}.ndjson'.format( + segment, args.model_name, task.task_name, file_name)) + if os.path.exists(data_cache_path) and not args.overwrite_cache: + processed_data = [] + with open(data_cache_path, 'r') as f: + for line in f: + processed_data.append(json.loads(line)) + logging.info('Found cached data features, load from {}'.format(data_cache_path)) + else: + processed_data = preprocess_data(input_df, feature_column, label_column, + tokenizer, use_label=True, task=task) + with open(data_cache_path, 'w') as f: + for feature in processed_data: + f.write(json.dumps(feature) + '\n') + + label = input_df[label_column] + if task.task_name != 'sts': + label = project_label(label, task) + return processed_data, label + + + + + +def train(args): + store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( + args.comm_backend, args.gpus) + task = get_task(args.task_name, args.train_dir, args.eval_dir) + #setup_logging(args, local_rank) + #random seed + set_seed(args.seed) + level = logging.INFO + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) + detail_dir = os.path.join(args.output_dir, args.task_name) + if not os.path.exists(detail_dir): + os.mkdir(detail_dir) + logging_config(detail_dir, + name='train_{}_{}_{}_'.format(args.task_name, args.model_name, args.method) + str(rank), # avoid race + level=level, + console=(local_rank == 0)) + logging.info(args) + cfg, tokenizer, classify_net, use_segmentation = \ + get_network(args.model_name, ctx_l, args.method, + args.param_checkpoint, + args.backbone_path, + task) + + logging.info('Prepare training data') + train_data, _ = get_task_data(args, task, tokenizer, segment='train') + train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), + bf.Stack()) + + rs = np.random.RandomState(100) + rs.shuffle(train_data) + sampler = SplitSampler( + len(train_data), + num_parts=num_workers, + part_index=rank, + even_size=True) + + dataloader = DataLoader(train_data, + batch_size=args.batch_size, + batchify_fn=train_batchify, + num_workers=0, + sampler=sampler) + + + if args.method == 'full': + target_params_name = classify_net.collect_params().keys() + elif args.method == 'adapter': + target_params_name = [key + for key in classify_net.collect_params() if + 'fusion' in key or 'out_proj' in key or ('adapter' in key and 'norm' in key)] + + for name in classify_net.collect_params(): + if name not in target_params_name: + classify_net.collect_params()[name].grad_req = 'null' + store_names = [key for key in classify_net.collect_params() if + 'adapter' in key] + target_params = {name:classify_net.collect_params()[name] for name in target_params_name} + param_dict = classify_net.collect_params() + # Do not apply weight decay to all the LayerNorm and bias + for _, v in classify_net.collect_params('.*beta|.*gamma|.*bias').items(): + v.wd_mult = 0.0 + # Set grad_req if gradient accumulation is required + params = [p for p in param_dict.values() if p.grad_req != 'null'] + num_accumulated = args.num_accumulated + if num_accumulated > 1: + logging.info('Using gradient accumulation. 
Effective global batch size = {}' + .format(num_accumulated * args.batch_size * len(ctx_l) * num_workers)) + for p in params: + p.grad_req = 'add' + if local_rank == 0: + writer = SummaryWriter(logdir=os.path.join(args.output_dir, + args.task_name + '_tensorboard_' + + str(args.lr) + '_' + str(args.epochs) + '_' + str(args.method))) + if args.comm_backend == 'horovod': + # Horovod: fetch and broadcast parameters + hvd.broadcast_parameters(param_dict, root_rank=0) + + epoch_size = (len(dataloader) + len(ctx_l) - 1) // len(ctx_l) + max_update = epoch_size * args.epochs + warmup_steps = int(np.ceil(max_update * args.warmup_ratio)) + + dataloader = grouper(repeat(dataloader), len(ctx_l)) + + lr_scheduler = PolyScheduler(max_update=max_update, + base_lr=args.lr, + warmup_begin_lr=0.0, + pwr=1, + final_lr=0.0, + warmup_steps=warmup_steps, + warmup_mode='linear') + optimizer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'lr_scheduler': lr_scheduler} + + + if args.comm_backend == 'horovod': + trainer = hvd.DistributedTrainer(target_params, args.optimizer, optimizer_params) + else: + trainer = mx.gluon.Trainer(target_params, + 'adamw', + optimizer_params) + + if args.task_name == 'sts': + loss_function = gluon.loss.L2Loss() + else: + loss_function = gluon.loss.SoftmaxCELoss() + metrics = task.metric + #prepare loss function + log_loss = 0 + log_gnorm = 0 + log_step = 0 + if args.log_interval > 0: + log_interval = args.log_interval + else: + log_interval = int(epoch_size * 0.5) + + start_time = time.time() + total_loss = 0 + total_grad = 0 + total_step = 0 + for i in range(max_update): + sample_l = next(dataloader) + loss_l = [] + for sample, ctx in zip(sample_l, ctx_l): + (token_ids, token_types, valid_length), label = sample + # Move to the corresponding context + token_ids = mx.np.array(token_ids, ctx=ctx) + token_types = mx.np.array(token_types, ctx=ctx) + valid_length = mx.np.array(valid_length, ctx=ctx) + label = mx.np.array(label, ctx=ctx) + with mx.autograd.record(): + scores = classify_net(token_ids, token_types, valid_length) + loss = loss_function(scores, label).mean() / len(ctx_l) + loss_l.append(loss) + if task.task_name == 'sts': + label = label.reshape((-1, 1)) + for metric in metrics: + metric.update([label], [scores]) + + for loss in loss_l: + loss.backward() + trainer.allreduce_grads() + # Begin Norm Clipping + total_norm, ratio, is_finite = clip_grad_global_norm(params, args.max_grad_norm) + trainer.update(1.0) + step_loss = sum([loss.asnumpy() for loss in loss_l]) + log_loss += step_loss + log_gnorm += total_norm + log_step += 1 + total_step += 1 + total_loss += step_loss + total_grad += total_norm + if local_rank == 0: + writer.add_scalar('train_loss_avg', total_loss * 1.0 / total_step, i) + writer.add_scalar('lr', trainer.learning_rate, i) + writer.add_scalar('train_loss', step_loss, i) + writer.add_scalar('grad_norm_avg', total_grad * 1.0 / total_step, i) + writer.add_scalar('grad_norm', total_norm, i) + for metric in metrics: + metric_name, result = metric.get() + writer.add_scalar(metric_name, result, i) + if log_step >= log_interval or i == max_update - 1: + curr_time = time.time() + metric_log = '' + for metric in metrics: + metric_nm, val = metric.get() + metric_log += ', {}: = {}'.format(metric_nm, val) + logging.info('[Iter {} / {}] avg {} = {:.2f}, avg gradient norm = {:.2f}, lr = {}, ETA={:.2f}h'.format(i + 1, + max_update, + 'loss', + log_loss / log_step, + log_gnorm / log_step, + trainer.learning_rate, + + (max_update-i)*((curr_time - start_time)/i)/3600) + 
+ metric_log) + log_loss = 0 + log_gnorm = 0 + log_step = 0 + if local_rank == 0 and (i == max_update - 1 or i%(max_update//args.epochs) == 0 and i>0): + ckpt_name = '{}_{}_{}_{}_continual.params'.format(args.model_name, + args.task_name, + (i + 1), + args.method) + + tmp_params = classify_net._collect_params_with_prefix() + params_saved = os.path.join(detail_dir, ckpt_name) + arg_dict = {key: tmp_params[key]._reduce() for key in store_names} + _mx_npx.savez(params_saved, **arg_dict) + logging.info('Params saved in: {}'.format(params_saved)) + #print(tmp_params['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.adapter_fusion.key_proj.weight']._reduce()) + #print(tmp_params[ + # 'backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.pre_norm.gamma']._reduce()) + for metric in metrics: + metric.reset() + + end_time = time.time() + logging.info('Total costs:{}'.format(end_time - start_time)) + + + +def evaluate(args): + store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( + args.comm_backend, args.gpus) + # setup_logging(args, local_rank) + task = get_task(args.task_name, args.train_dir, args.eval_dir) + level = logging.INFO + detail_dir = os.path.join(args.output_dir, args.task_name) + if not os.path.exists(detail_dir): + os.mkdir(detail_dir) + logging_config(detail_dir, + name='train_{}_{}_'.format(args.task_name, args.model_name) + str(rank), # avoid race + level=level, + console=(local_rank == 0)) + if rank != 0: + logging.info('Skipping node {}'.format(rank)) + return + ctx_l = parse_ctx(args.gpus) + logging.info( + 'Srarting inference without horovod on the first node on device {}'.format( + str(ctx_l))) + + cfg, tokenizer, classify_net, use_segmentation = \ + get_network(args.model_name, ctx_l, args.method, + args.param_checkpoint, + args.backbone_path, + task) + + candidate_ckpt = [] + detail_dir = os.path.join(args.output_dir, args.task_name) + for name in os.listdir(detail_dir): + if name.endswith(args.method + '_continual.params') and args.task_name in name and args.model_name in name: + candidate_ckpt.append(os.path.join(detail_dir, name)) + candidate_ckpt.sort(reverse=False) + best_ckpt = {} + metrics = task.metric + def evaluate_by_ckpt(ckpt_name, best_ckpt): + + loaded = _mx_npx.load(ckpt_name) + full_dict = {'params': loaded, 'filename': ckpt_name} + classify_net.load_dict(full_dict, ctx_l, allow_missing=True, + ignore_extra=True, cast_dtype=True) + #print(loaded['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.adapter_fusion.key_proj.weight']) + #print(loaded['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.pre_norm.gamma']) + logging.info('Prepare dev data') + + dev_data, label = get_task_data(args, task, tokenizer, segment='eval') + dev_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), bf.Stack()) + dataloader = DataLoader(dev_data, + batch_size=args.batch_size, + batchify_fn=dev_batchify, + shuffle=False) + + for sample_l in grouper(dataloader, len(ctx_l)): + for sample, ctx in zip(sample_l, ctx_l): + if sample is None: + continue + (token_ids, token_types, valid_length), label = sample + token_ids = mx.np.array(token_ids, ctx=ctx) + token_types = mx.np.array(token_types, ctx=ctx) + valid_length = mx.np.array(valid_length, ctx=ctx) + scores = classify_net(token_ids, token_types, valid_length) + + if task.task_name == 'sts': + label = label.reshape((-1,1)) + for metric in metrics: + metric.update([label], [scores]) + #pred.append(scores) + + + for metric in metrics: + metric_name, result = metric.get() + logging.info('checkpoint {} 
get result: {}:{}'.format(ckpt_name, metric_name, result)) + if best_ckpt.get(metric_name, [0, ''])[0] Date: Mon, 12 Apr 2021 14:59:24 +0000 Subject: [PATCH 16/23] change store parameters --- scripts/continual_learning/train_continual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/continual_learning/train_continual.py b/scripts/continual_learning/train_continual.py index 6ae167b2f9..111f8d525a 100644 --- a/scripts/continual_learning/train_continual.py +++ b/scripts/continual_learning/train_continual.py @@ -319,7 +319,7 @@ def train(args): if name not in target_params_name: classify_net.collect_params()[name].grad_req = 'null' store_names = [key for key in classify_net.collect_params() if - 'adapter' in key] + 'adapter' in key or 'out_proj' in key] target_params = {name:classify_net.collect_params()[name] for name in target_params_name} param_dict = classify_net.collect_params() # Do not apply weight decay to all the LayerNorm and bias From 43327ded9f4431a1dd81e1421f003b91e30a5d61 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 12 Apr 2021 15:13:37 +0000 Subject: [PATCH 17/23] change setting --- scripts/classification/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/classification/README.md b/scripts/classification/README.md index 2eda8f80d1..4a057daa3f 100644 --- a/scripts/classification/README.md +++ b/scripts/classification/README.md @@ -51,7 +51,7 @@ here are some results with their hyperparameters | CoLA | Matthew Corr. | 2e-5 | 32 | 7800 | 10 | 59.23 | https://tensorboard.dev/experiment/33euRGh9SrW3p15JWgILnw/ | | RTE | Accuracy | 2e-5 | 32 | 1800 | 10 | 69.67 | https://tensorboard.dev/experiment/XjTxr5anRrC1LMukLJJQ3g/| | MRPC | Accuracy/F1 | 3e-5 | 32 | 7800 | 5 | 85.38/87.31 | https://tensorboard.dev/experiment/jEJFq2XXQ8SvCxt6eKIjwg/ | -| MNLI | Accuracy(m/mm) | 2e-5 | 48 | 7800 | 5 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ | +| MNLI | Accuracy(m/mm) | 2e-5 | 48 | 7800 | 4 | 84.90/85.10 | https://tensorboard.dev/experiment/CZQlOBedRQeTZwn5o5fbKQ/ | ## different method @@ -60,6 +60,7 @@ bias-finetune() and adapter-finetune. 
To use them, you can directly add an augme ```bash python train_classification.py \ --model_name google_en_uncased_bert_base \ + --method adapter \ --task_name mrpc \ --lr 4.5e-4\ --model_name google_en_cased_bert_base \ From 25d1a31a3df723858ef695e78077516442e8491d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Apr 2021 03:25:04 +0000 Subject: [PATCH 18/23] add basic num --- src/gluonnlp/adapters.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/gluonnlp/adapters.py b/src/gluonnlp/adapters.py index 29480f57ed..436b7730e2 100644 --- a/src/gluonnlp/adapters.py +++ b/src/gluonnlp/adapters.py @@ -58,9 +58,10 @@ def forward(self, query, key, value): key = self.key_proj(key).transpose((0, 1, 3, 2)) value = self.value_proj(value) scores = np.squeeze(npx.batch_dot(query, key), axis=2) + attn_weights = npx.softmax(scores, axis=-1) - output = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2) + output = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2) return output @use_np @@ -77,9 +78,11 @@ def __init__(self, in_units:int, adapter_config:dict): super().__init__() self._in_units = in_units self._adapter_config = adapter_config + self._basic_num = 0 self.base_adapter_stacks = nn.HybridSequential() for name in adapter_config['task_names']: self.base_adapter_stacks.add(get_base_adapter(adapter_config[name], in_units)) + self._basic_num += 1 if adapter_config['adapter_fusion']: self.adapter_fusion = AdapterFusion(in_units) if adapter_config['pre_operator']: @@ -94,8 +97,9 @@ def forward(self, data, residual): data = self.pre_norm(data) output = [] - for base_adapter in self.base_adapter_stacks: - output.append(base_adapter(data, new_residual)) + for layer_idx in range(self._basic_num): + layer = self.base_adapter_stacks[layer_idx] + output.append(layer(data, new_residual)) if self._adapter_config['adapter_fusion']: output = np.stack(output, axis=2) From a099ba0f7ada0a9c78d17a5652106c4f67fed812 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Apr 2021 03:26:30 +0000 Subject: [PATCH 19/23] chaneg config --- scripts/continual_learning/train_continual.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/continual_learning/train_continual.py b/scripts/continual_learning/train_continual.py index 111f8d525a..99c00cee00 100644 --- a/scripts/continual_learning/train_continual.py +++ b/scripts/continual_learning/train_continual.py @@ -124,6 +124,8 @@ def load_adapters(model, config, ctx_l): #'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', # 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, +#'mrpc': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mrpc/google_en_uncased_bert_base_mrpc_461_adapter.params'}, + def change_adapter_cfg(cfg): adapter_config = { @@ -131,8 +133,7 @@ def change_adapter_cfg(cfg): 'adapter_fusion': True, 'pre_operator': True, 'layer_norm_eps':1e-5, - 'task_names': ['mrpc', 'cola'], - 'mrpc': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mrpc/google_en_uncased_bert_base_mrpc_461_adapter.params'}, + 'task_names': ['cola'], 'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, 
} From 4783597da63c007b517124e96997b1cf75be42d6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Apr 2021 00:40:11 +0000 Subject: [PATCH 20/23] simplify code --- src/gluonnlp/models/transformer.py | 35 ++++++++++++------------------ 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 90a1cd623c..3a13f891ad 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -12,6 +12,7 @@ import mxnet as mx from mxnet import np, npx from mxnet import use_np +import functools from mxnet.gluon import nn, HybridBlock from typing import Optional, Tuple, List from ..utils.registry import Registry @@ -225,29 +226,21 @@ def __init__(self, self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, in_channels=units) if self._use_adapter: - self.ffn = PositionwiseFFN_adapter(units=units, - hidden_size=hidden_size, - dropout=hidden_dropout_prob, - activation_dropout=activation_dropout_prob, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - layer_norm_eps=layer_norm_eps, - activation=activation, - pre_norm=pre_norm, - dtype=self._dtype, - use_adapter=self._use_adapter, + get_ffn = functools.partial(PositionwiseFFN_adapter, use_adapter=self._use_adapter, adapter_config=self._adapter_config) else: - self.ffn = PositionwiseFFN(units=units, - hidden_size=hidden_size, - dropout=hidden_dropout_prob, - activation_dropout=activation_dropout_prob, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - layer_norm_eps=layer_norm_eps, - activation=activation, - pre_norm=pre_norm, - dtype=self._dtype) + get_ffn = PositionwiseFFN + self.ffn = get_ffn(units=units, + hidden_size=hidden_size, + dropout=hidden_dropout_prob, + activation_dropout=activation_dropout_prob, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + layer_norm_eps=layer_norm_eps, + activation=activation, + pre_norm=pre_norm, + dtype=self._dtype) + @property def layout(self) -> str: From 11660e5d3d9c98a04864bf7bc5d5636d7d9c7033 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 19 May 2021 11:50:04 +0000 Subject: [PATCH 21/23] use einsum --- src/gluonnlp/adapters.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/gluonnlp/adapters.py b/src/gluonnlp/adapters.py index 436b7730e2..578bff2dc2 100644 --- a/src/gluonnlp/adapters.py +++ b/src/gluonnlp/adapters.py @@ -54,14 +54,19 @@ def __init__(self, in_units): def forward(self, query, key, value): #query bs, length, unit #key bs, length, num_adapters, unit - query = npx.reshape(self.query_proj(query), (-2, -2, 1, -1)) + + key = self.key_proj(key).transpose((0, 1, 3, 2)) value = self.value_proj(value) - scores = np.squeeze(npx.batch_dot(query, key), axis=2) - + # query = npx.reshape(self.query_proj(query), (-2, -2, 1, -1)) + query = self.query_proj(query) + #scores = np.squeeze(npx.batch_dot(query, key), axis=2) + scores = np.einsum('blu, blun -> bln', query, key) attn_weights = npx.softmax(scores, axis=-1) - - output = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2) + #attn batch size lenght, num + #value bs l, num, u + output = np.einsum('bln, blnu -> blu', attn_weights, value) + #output = np.squeeze(npx.batch_dot(npx.reshape(attn_weights, (-2, -2, 1, -1)), value), axis=2) return output @use_np From a093709a09a7ea1d57b9048fb452ccaeaa692bc1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 19 May 2021 11:50:57 +0000 Subject: [PATCH 22/23] 
add scripit --- scripts/continual_learning/train_transfer.py | 577 +++++++++++++++++++ 1 file changed, 577 insertions(+) create mode 100644 scripts/continual_learning/train_transfer.py diff --git a/scripts/continual_learning/train_transfer.py b/scripts/continual_learning/train_transfer.py new file mode 100644 index 0000000000..c96efa6bd6 --- /dev/null +++ b/scripts/continual_learning/train_transfer.py @@ -0,0 +1,577 @@ +import gluonnlp +from tensorboardX import SummaryWriter +import numpy as np +import mxnet as mx +import json +import random +import pandas as pd +import mxnet.numpy_extension as _mx_npx +import os +import json +import logging +import time +import argparse +import copy +from mxnet.gluon.metric import Accuracy, F1, MCC, PearsonCorrelation, CompositeEvalMetric +from classification_utils import get_task +import matplotlib.pyplot as plt +from tqdm import tqdm +from mxnet import gluon +from gluonnlp.data.sampler import SplitSampler +from mxnet.gluon import nn +from gluonnlp.models import get_backbone +from gluonnlp.utils.parameter import clip_grad_global_norm, count_parameters, deduplicate_param_dict +from gluonnlp.utils.preprocessing import get_trimmed_lengths +from gluonnlp.utils.misc import get_mxnet_visible_ctx, grouper, repeat, logging_config +from mxnet.gluon.data import batchify as bf +from mxnet.gluon.data import DataLoader +from mxnet.lr_scheduler import PolyScheduler +from gluonnlp.utils import set_seed +from gluonnlp.utils.misc import init_comm, parse_ctx +try: + import horovod.mxnet as hvd +except ImportError: + pass +from classification import TextPredictionNet + +mx.npx.set_np() + + + +CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached')) +if not os.path.exists(CACHE_PATH): + os.makedirs(CACHE_PATH, exist_ok=True) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='classification example. ' + 'We fine-tune the pretrained model on GLUE dataset to do different taks.') + parser.add_argument('--model_name', type=str, default='google_en_uncased_bert_base', + help='Name of the pretrained model.') + parser.add_argument('--task_name', type=str, default='STS', + help='Name of classification taks') + parser.add_argument('--lr', type=float, default=5E-4, + help='Initial learning rate. default is 2e-5') + parser.add_argument('--comm_backend', type=str, default='device', + choices=['horovod', 'dist_sync_device', 'device'], + help='Communication backend.') + parser.add_argument('--gpus', type=str, default='0', + help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.') + parser.add_argument('--epochs', type=int, default=3, + help='Number of epochs, default is 3') + parser.add_argument('--do_train', action='store_true', + help='do training.') + parser.add_argument('--do_eval', action='store_true', + help='do eval.') + parser.add_argument('--param_checkpoint', type=str, default=None, + help='The parameter checkpoint for evaluating the model') + parser.add_argument('--backbone_path', type=str, default=None, + help='The parameter checkpoint of backbone model') + parser.add_argument('--overwrite_cache', action='store_true', + help='Whether to overwrite the feature cache.') + parser.add_argument('--num_accumulated', type=int, default=1, + help='The number of batches for gradients accumulation to ' + 'simulate large batch size.') + parser.add_argument('--output_dir', type=str, default='cls_dir', + help='The output directory where the model params will be written.' 
+ ' default is cls_dir') + parser.add_argument('--log_interval', type=int, default=-1, + help='The logging interval for training') + parser.add_argument('--optimizer', type=str, default='adamw', + help='The optimization algorithm') + parser.add_argument('--batch_size', type=int, default=32, + help='Batch size. Number of examples per gpu in a minibatch. default is 64') + parser.add_argument( + '--seed', type=int, default=2, help='Random seed') + + parser.add_argument('--wd', type=float, default=0.01, help='weight decay') + parser.add_argument('--max_grad_norm', type=float, default=1.0, + help='Max gradient norm.') + parser.add_argument('--train_dir', type=str, default=None, + help='the path to training dataset') + parser.add_argument('--eval_dir', type=str, default=None, + help='the path to training dataset') + parser.add_argument('--warmup_ratio', type=float, default=0.1, + help='Ratio of warmup steps in the learning rate scheduler.') + parser.add_argument('--method', type=str, default='adapter', choices=['full', 'bias', 'adapter', 'last_layer'], + help='different finetune method') + + + args = parser.parse_args() + return args + +def load_adapters(model, config, ctx_l): + config = config['location_0'] if 'location_0' in config else config['location_1'] + index = 0 + new_loaded = {} + + for task in config['task_names']: + ckpt_name = config[task]['backbone'] + loaded = _mx_npx.load(ckpt_name) + + for key in loaded: + if 'base_adapter_stacks' in key: + new_key = '.'.join(key.split('.')[0:-3] + [str(index)] + key.split('.')[-2:]) + new_loaded.update({new_key:loaded[key]}) + + #print(loaded['backbone.encoder.all_layers.11.ffn.adapter_layer_ffn.base_adapter_stacks.0.down_proj.weight']) + #print(index, ckpt_name) + + index += 1 + full_dict = {'params': new_loaded, 'filename': ckpt_name} + model.load_dict(full_dict, ctx_l, allow_missing=True, + ignore_extra=True, cast_dtype=True) + #print(model.collect_params()['backbone.encoder.all_layers.11.ffn.adapter_layer_ffn.base_adapter_stacks.1.down_proj.weight']._reduce()) + #exit() + return model + + + +#'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', +# 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, +#'mrpc': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mrpc/google_en_uncased_bert_base_mrpc_461_adapter.params'}, + + +def change_adapter_cfg(cfg): + adapter_config = { + 'location_1': { + 'adapter_fusion': True, + 'pre_operator': True, + 'layer_norm_eps':1e-5, + 'task_names': ['mrpc', 'mnli', 'qqp'], + 'mrpc': {'type': 'Basic', 'units': 64, 'activation': 'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mrpc/google_en_uncased_bert_base_mrpc_461_adapter.params'}, + 'qqp':{'type':'Basic', 'units':64, 'activation':'gelu', 'backbone':'/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/qqp/google_en_uncased_bert_base_qqp_102340_adapter_continual.params'}, + 'mnli': {'type': 'Basic', 'units': 64, 'activation': 'gelu', + 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/mnli/google_en_uncased_bert_base_mnli_24547_adapter.params'} + #'cola': {'type': 'Basic', 'units': 64, 'activation': 'gelu', + # 'backbone': '/home/ubuntu/gluon-nlp/scripts/continual_learning/cls_dir/cola/google_en_uncased_bert_base_cola_2680_adapter.params'}, + } + } + cfg.defrost() + cfg.MODEL.use_adapter = True + cfg.MODEL.adapter_config = 
json.dumps(adapter_config) + cfg.freeze() + return cfg + +def get_network(model_name, + ctx_l, + method='full', + checkpoint_path=None, + backbone_path=None, + task=None): + """ + Get the network that fine-tune the Question Answering Task + """ + use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name + Model, cfg, tokenizer, download_params_path, _ = \ + get_backbone(model_name, load_backbone=not backbone_path) + + if method == 'adapter': + cfg = change_adapter_cfg(cfg) + backbone = Model.from_cfg(cfg) + # Load local backbone parameters if backbone_path provided. + # Otherwise, download backbone parameters from gluon zoo. + backbone.initialize(ctx=ctx_l) + backbone_params_path = backbone_path if backbone_path else download_params_path + if checkpoint_path is None: + backbone.load_parameters(backbone_params_path, ignore_extra=True, allow_missing=True, + ctx=ctx_l, cast_dtype=True) + num_params, num_fixed_params \ + = count_parameters(deduplicate_param_dict(backbone.collect_params())) + logging.info( + 'Loading Backbone Model from {}, with total/fixd parameters={}/{}'.format( + backbone_params_path, num_params, num_fixed_params)) + classify_net = TextPredictionNet(backbone, task.class_num) + classify_net = load_adapters(classify_net, json.loads(cfg.MODEL.adapter_config), ctx_l) + #full_dict = {'params': new_loaded, 'filename': ckpt_name} + #classify_net.load_dict(full_dict, ctx_l, allow_missing=True, + # ignore_extra=True, cast_dtype=True) + if checkpoint_path is None: + # Ignore the UserWarning during initialization, + # There is no need to re-initialize the parameters of backbone + classify_net.initialize(ctx=ctx_l) + else: + classify_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True) + classify_net.hybridize() + + return cfg, tokenizer, classify_net, use_segmentation + +def project_label(label, task): + projected_label = copy.copy(label) + for i in range(len(label)): + projected_label[i] = task.proj_label[label[i]] + + return projected_label + + + +def preprocess_data(df, feature_columns, label_column, tokenizer, + max_length=128, use_label=True, use_tqdm=True, task=None): + out = [] + if isinstance(feature_columns, str): + feature_columns = [feature_columns] + cls_id = tokenizer.vocab.cls_id + sep_id = tokenizer.vocab.sep_id + iterator = tqdm(df.iterrows(), total=len(df)) if use_tqdm else df.iterrows() + for idx, row in iterator: + # Token IDs = [CLS] token_ids1 [SEP] token_ids2 [SEP] + # Segment IDs = 0 0 0 1 1 + + encoded_text_l = [tokenizer.encode(row[col_name], int) + for col_name in feature_columns] + trimmed_lengths = get_trimmed_lengths([len(ele) for ele in encoded_text_l], + max_length=max_length - len(feature_columns) - 1, + do_merge=True) + + token_ids = [cls_id] + sum([ele[:length] + [sep_id] + for length, ele in zip(trimmed_lengths, encoded_text_l)], []) + token_types = [0] + sum([[i % 2] * (length + 1) + for i, length in enumerate(trimmed_lengths)], []) + valid_length = len(token_ids) + feature = (token_ids, token_types, valid_length) + if use_label: + label = row[label_column] + if task.task_name != 'sts': + label = task.proj_label[label] + out.append((feature, label)) + else: + out.append(feature) + + return out + + +def get_task_data(args, task, tokenizer, segment): + feature_column = task.feature_column + label_column = task.label_column + if segment == 'train': + input_df = task.raw_train_data + file_name = args.train_dir.split('/')[-1] + else: + input_df = task.raw_eval_data + file_name = args.eval_dir.split('/')[-1] + data_cache_path 
= os.path.join(CACHE_PATH, + '{}_{}_{}_{}.ndjson'.format( + segment, args.model_name, task.task_name, file_name)) + if os.path.exists(data_cache_path) and not args.overwrite_cache: + processed_data = [] + with open(data_cache_path, 'r') as f: + for line in f: + processed_data.append(json.loads(line)) + logging.info('Found cached data features, load from {}'.format(data_cache_path)) + else: + processed_data = preprocess_data(input_df, feature_column, label_column, + tokenizer, use_label=True, task=task) + with open(data_cache_path, 'w') as f: + for feature in processed_data: + f.write(json.dumps(feature) + '\n') + + label = input_df[label_column] + if task.task_name != 'sts': + label = project_label(label, task) + return processed_data, label + + + + + +def train(args): + store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( + args.comm_backend, args.gpus) + task = get_task(args.task_name, args.train_dir, args.eval_dir) + #setup_logging(args, local_rank) + #random seed + set_seed(args.seed) + level = logging.INFO + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) + detail_dir = os.path.join(args.output_dir, args.task_name) + if not os.path.exists(detail_dir): + os.mkdir(detail_dir) + logging_config(detail_dir, + name='train_{}_{}_{}_'.format(args.task_name, args.model_name, args.method) + str(rank), # avoid race + level=level, + console=(local_rank == 0)) + logging.info(args) + cfg, tokenizer, classify_net, use_segmentation = \ + get_network(args.model_name, ctx_l, args.method, + args.param_checkpoint, + args.backbone_path, + task) + #print(classify_net.backbone) + logging.info('Prepare training data') + train_data, _ = get_task_data(args, task, tokenizer, segment='train') + train_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), + bf.Stack()) + + rs = np.random.RandomState(100) + rs.shuffle(train_data) + sampler = SplitSampler( + len(train_data), + num_parts=num_workers, + part_index=rank, + even_size=True) + + dataloader = DataLoader(train_data, + batch_size=args.batch_size, + batchify_fn=train_batchify, + num_workers=0, + sampler=sampler) + + + if args.method == 'full': + target_params_name = classify_net.collect_params().keys() + elif args.method == 'adapter': + target_params_name = [key + for key in classify_net.collect_params() if + 'fusion' in key or 'out_proj' in key or ('adapter' in key and 'norm' in key)] + + for name in classify_net.collect_params(): + if name not in target_params_name: + classify_net.collect_params()[name].grad_req = 'null' + #print(classify_net.collect_params()) + #print(classify_net.collect_params()['backbone.encoder.all_layers.11.ffn.adapter_layer_ffn.base_adapter_stacks.0.down_proj.weight']._reduce()) + #exit() + store_names = [key for key in classify_net.collect_params() if + 'adapter' in key or 'out_proj' in key] + target_params = {name:classify_net.collect_params()[name] for name in target_params_name} + param_dict = classify_net.collect_params() + # Do not apply weight decay to all the LayerNorm and bias + for _, v in classify_net.collect_params('.*beta|.*gamma|.*bias').items(): + v.wd_mult = 0.0 + # Set grad_req if gradient accumulation is required + params = [p for p in param_dict.values() if p.grad_req != 'null'] + num_accumulated = args.num_accumulated + if num_accumulated > 1: + logging.info('Using gradient accumulation. 
Effective global batch size = {}' + .format(num_accumulated * args.batch_size * len(ctx_l) * num_workers)) + for p in params: + p.grad_req = 'add' + if local_rank == 0: + writer = SummaryWriter(logdir=os.path.join(args.output_dir, + args.task_name + '_tensorboard_' + + str(args.lr) + '_' + str(args.epochs) + '_' + str(args.method))) + if args.comm_backend == 'horovod': + # Horovod: fetch and broadcast parameters + hvd.broadcast_parameters(param_dict, root_rank=0) + + epoch_size = (len(dataloader) + len(ctx_l) - 1) // len(ctx_l) + max_update = epoch_size * args.epochs + warmup_steps = int(np.ceil(max_update * args.warmup_ratio)) + + dataloader = grouper(repeat(dataloader), len(ctx_l)) + + lr_scheduler = PolyScheduler(max_update=max_update, + base_lr=args.lr, + warmup_begin_lr=0.0, + pwr=1, + final_lr=0.0, + warmup_steps=warmup_steps, + warmup_mode='linear') + optimizer_params = {'learning_rate': args.lr, + 'wd': args.wd, + 'lr_scheduler': lr_scheduler} + + + if args.comm_backend == 'horovod': + trainer = hvd.DistributedTrainer(target_params, args.optimizer, optimizer_params) + else: + trainer = mx.gluon.Trainer(target_params, + 'adamw', + optimizer_params) + + if args.task_name == 'sts': + loss_function = gluon.loss.L2Loss() + else: + loss_function = gluon.loss.SoftmaxCELoss() + metrics = task.metric + #prepare loss function + log_loss = 0 + log_gnorm = 0 + log_step = 0 + if args.log_interval > 0: + log_interval = args.log_interval + else: + log_interval = int(epoch_size * 0.5) + + start_time = time.time() + total_loss = 0 + total_grad = 0 + total_step = 0 + for i in range(max_update): + sample_l = next(dataloader) + loss_l = [] + for sample, ctx in zip(sample_l, ctx_l): + (token_ids, token_types, valid_length), label = sample + # Move to the corresponding context + token_ids = mx.np.array(token_ids, ctx=ctx) + token_types = mx.np.array(token_types, ctx=ctx) + valid_length = mx.np.array(valid_length, ctx=ctx) + label = mx.np.array(label, ctx=ctx) + with mx.autograd.record(): + scores = classify_net(token_ids, token_types, valid_length) + loss = loss_function(scores, label).mean() / len(ctx_l) + loss_l.append(loss) + if task.task_name == 'sts': + label = label.reshape((-1, 1)) + for metric in metrics: + metric.update([label], [scores]) + + for loss in loss_l: + loss.backward() + trainer.allreduce_grads() + # Begin Norm Clipping + total_norm, ratio, is_finite = clip_grad_global_norm(params, args.max_grad_norm) + trainer.update(1.0) + step_loss = sum([loss.asnumpy() for loss in loss_l]) + log_loss += step_loss + log_gnorm += total_norm + log_step += 1 + total_step += 1 + total_loss += step_loss + total_grad += total_norm + if local_rank == 0: + writer.add_scalar('train_loss_avg', total_loss * 1.0 / total_step, i) + writer.add_scalar('lr', trainer.learning_rate, i) + writer.add_scalar('train_loss', step_loss, i) + writer.add_scalar('grad_norm_avg', total_grad * 1.0 / total_step, i) + writer.add_scalar('grad_norm', total_norm, i) + for metric in metrics: + metric_name, result = metric.get() + writer.add_scalar(metric_name, result, i) + if log_step >= log_interval or i == max_update - 1: + curr_time = time.time() + metric_log = '' + for metric in metrics: + metric_nm, val = metric.get() + metric_log += ', {}: = {}'.format(metric_nm, val) + logging.info('[Iter {} / {}] avg {} = {:.2f}, avg gradient norm = {:.2f}, lr = {}, ETA={:.2f}h'.format(i + 1, + max_update, + 'loss', + log_loss / log_step, + log_gnorm / log_step, + trainer.learning_rate, + + (max_update-i)*((curr_time - start_time)/i)/3600) + 
+ metric_log) + log_loss = 0 + log_gnorm = 0 + log_step = 0 + if local_rank == 0 and (i == max_update - 1 or i%(max_update//args.epochs) == 0 and i>0): + ckpt_name = '{}_{}_{}_{}_continual.params'.format(args.model_name, + args.task_name, + (i + 1), + args.method) + + tmp_params = classify_net._collect_params_with_prefix() + params_saved = os.path.join(detail_dir, ckpt_name) + arg_dict = {key: tmp_params[key]._reduce() for key in store_names} + _mx_npx.savez(params_saved, **arg_dict) + logging.info('Params saved in: {}'.format(params_saved)) + #print(tmp_params['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.adapter_fusion.key_proj.weight']._reduce()) + #print(tmp_params[ + # 'backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.pre_norm.gamma']._reduce()) + for metric in metrics: + metric.reset() + + end_time = time.time() + logging.info('Total costs:{}'.format(end_time - start_time)) + + + +def evaluate(args): + store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( + args.comm_backend, args.gpus) + # setup_logging(args, local_rank) + task = get_task(args.task_name, args.train_dir, args.eval_dir) + level = logging.INFO + detail_dir = os.path.join(args.output_dir, args.task_name) + if not os.path.exists(detail_dir): + os.mkdir(detail_dir) + logging_config(detail_dir, + name='train_{}_{}_'.format(args.task_name, args.model_name) + str(rank), # avoid race + level=level, + console=(local_rank == 0)) + if rank != 0: + logging.info('Skipping node {}'.format(rank)) + return + ctx_l = parse_ctx(args.gpus) + logging.info( + 'Srarting inference without horovod on the first node on device {}'.format( + str(ctx_l))) + + cfg, tokenizer, classify_net, use_segmentation = \ + get_network(args.model_name, ctx_l, args.method, + args.param_checkpoint, + args.backbone_path, + task) + + candidate_ckpt = [] + detail_dir = os.path.join(args.output_dir, args.task_name) + for name in os.listdir(detail_dir): + if name.endswith(args.method + '_continual.params') and args.task_name in name and args.model_name in name: + candidate_ckpt.append(os.path.join(detail_dir, name)) + candidate_ckpt.sort(reverse=False) + best_ckpt = {} + metrics = task.metric + def evaluate_by_ckpt(ckpt_name, best_ckpt): + + loaded = _mx_npx.load(ckpt_name) + full_dict = {'params': loaded, 'filename': ckpt_name} + classify_net.load_dict(full_dict, ctx_l, allow_missing=True, + ignore_extra=True, cast_dtype=True) + #print(loaded['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.adapter_fusion.key_proj.weight']) + #print(loaded['backbone.encoder.all_layers.6.ffn.adapter_layer_ffn.pre_norm.gamma']) + logging.info('Prepare dev data') + + dev_data, label = get_task_data(args, task, tokenizer, segment='eval') + dev_batchify = bf.Group(bf.Group(bf.Pad(), bf.Pad(), bf.Stack()), bf.Stack()) + dataloader = DataLoader(dev_data, + batch_size=args.batch_size, + batchify_fn=dev_batchify, + shuffle=False) + + for sample_l in grouper(dataloader, len(ctx_l)): + for sample, ctx in zip(sample_l, ctx_l): + if sample is None: + continue + (token_ids, token_types, valid_length), label = sample + token_ids = mx.np.array(token_ids, ctx=ctx) + token_types = mx.np.array(token_types, ctx=ctx) + valid_length = mx.np.array(valid_length, ctx=ctx) + scores = classify_net(token_ids, token_types, valid_length) + + if task.task_name == 'sts': + label = label.reshape((-1,1)) + for metric in metrics: + metric.update([label], [scores]) + #pred.append(scores) + + + for metric in metrics: + metric_name, result = metric.get() + logging.info('checkpoint {} 
get result: {}:{}'.format(ckpt_name, metric_name, result))
+        if best_ckpt.get(metric_name, [0, ''])[0]
Date: Wed, 19 May 2021 12:11:19 +0000
Subject: [PATCH 23/23] add comment

---
 src/gluonnlp/adapters.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gluonnlp/adapters.py b/src/gluonnlp/adapters.py
index 578bff2dc2..46b7687d4e 100644
--- a/src/gluonnlp/adapters.py
+++ b/src/gluonnlp/adapters.py
@@ -58,10 +58,14 @@ def forward(self, query, key, value):
 
         key = self.key_proj(key).transpose((0, 1, 3, 2))
         value = self.value_proj(value)
+
+        # previous implementation
         # query = npx.reshape(self.query_proj(query), (-2, -2, 1, -1))
+        # scores = np.squeeze(npx.batch_dot(query, key), axis=2)
+        # with einsum
         query = self.query_proj(query)
         scores = np.einsum('blu, blun -> bln', query, key)
+
         attn_weights = npx.softmax(scores, axis=-1)
         #attn batch size length, num
         #value bs l, num, u
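
The last two adapter patches ([PATCH 21/23] and [PATCH 23/23]) replace the `batch_dot`/`squeeze` attention in `AdapterFusion.forward` with `np.einsum`. Below is a minimal NumPy sketch of that equivalence; the toy shapes, the plain softmax, and the use of vanilla NumPy in place of `mxnet.np` are assumptions made for illustration and are not part of the patches themselves.

```python
import numpy as np

# Toy sizes: batch, sequence length, number of adapters, hidden units.
b, l, n, u = 2, 4, 3, 8
rng = np.random.default_rng(0)
query = rng.normal(size=(b, l, u))     # already-projected query, as in AdapterFusion.forward
key = rng.normal(size=(b, l, n, u))    # projected keys, one per adapter
value = rng.normal(size=(b, l, n, u))  # projected values, one per adapter

key_t = key.transpose(0, 1, 3, 2)      # (b, l, u, n), mirrors key_proj(...).transpose((0, 1, 3, 2))

# Old path: reshape query to (b, l, 1, u), batched matmul, squeeze the singleton axis.
scores_old = np.squeeze(query[:, :, None, :] @ key_t, axis=2)   # (b, l, n)
# New path: a single einsum contracting the unit axis.
scores_new = np.einsum('blu,blun->bln', query, key_t)           # (b, l, n)
assert np.allclose(scores_old, scores_new)

# Softmax over the adapter axis, then mix the per-adapter values.
attn = np.exp(scores_new) / np.exp(scores_new).sum(axis=-1, keepdims=True)
out_old = np.squeeze(attn[:, :, None, :] @ value, axis=2)       # (b, l, u)
out_new = np.einsum('bln,blnu->blu', attn, value)               # (b, l, u)
assert np.allclose(out_old, out_new)
```

The einsum form avoids the explicit reshape and squeeze, and the subscript string documents the tensor shapes directly, which is why the later commit keeps the old formulation only as comments.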