diff --git a/De-identification NB.ipynb b/De-identification NB.ipynb
new file mode 100644
index 0000000..a7094bc
--- /dev/null
+++ b/De-identification NB.ipynb
@@ -0,0 +1,401 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "from torch.utils import data\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "import torch.backends.cudnn as cudnn\n",
+    "import torch.nn.parallel\n",
+    "from torch import optim\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm, trange\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "import torch\n",
+    "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import precision_score, recall_score\n",
+    "from transformers import BertForTokenClassification, AdamW\n",
+    "from transformers import get_linear_schedule_with_warmup\n",
+    "from seqeval.metrics import f1_score\n",
+    "\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "n_gpu = torch.cuda.device_count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Dictionary to map label names to int values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tag_values = [\"0\",\"1\",\"2\",\"3\",\"4\",\"5\"]\n",
+    "tag2idx = {t: i for i, t in enumerate(tag_values)}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load preprocessed data. Note that we drop the last sentence because it does not have 8 tokens; this saves us from having to pad the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentences = pickle.load( open( \"sentences_8.p\", \"rb\" ))[:-1]\n",
+    "sentence_labels = pickle.load( open( \"sentence_labels_8.p\", \"rb\" ))[:-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MAX_LEN = 75\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Creating attention masks: 1 for every non-zero token id, which here means 1 for every token since the data is not padded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "attention_masks = [[float(i != 0.0) for i in ii] for ii in sentences]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Performing the train/validation split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(sentences, sentence_labels, random_state=2018, test_size=0.25)\n",
+    "tr_masks, val_masks, _, _ = train_test_split(attention_masks, sentences, random_state=2018, test_size=0.25)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Converting the data into tensors for PyTorch processing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tr_inputs = torch.tensor(tr_inputs).to(torch.long)\n",
+    "val_inputs = torch.tensor(val_inputs).to(torch.long)\n",
+    "tr_tags = torch.tensor(tr_tags).to(torch.long)\n",
+    "val_tags = torch.tensor(val_tags).to(torch.long)\n",
+    "\n",
+    "tr_masks = torch.tensor(tr_masks).to(torch.long)\n",
+    "val_masks = torch.tensor(val_masks).to(torch.long)"
+   ]
+  },
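+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An optional sanity check: the input, tag, and mask tensors built above should all share the shape (number of sentences, 8), assuming every sentence was preprocessed to exactly 8 token ids."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sanity check (assumes the 8-token preprocessing described above)\n",
+    "print(tr_inputs.shape, tr_tags.shape, tr_masks.shape)\n",
+    "print(val_inputs.shape, val_tags.shape, val_masks.shape)\n",
+    "assert tr_inputs.shape == tr_tags.shape == tr_masks.shape"
+   ]
+  },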
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Building data loaders for the model (batch size is 32, as in the BERT paper)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bs = 32\n",
+    "\n",
+    "train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)\n",
+    "train_sampler = RandomSampler(train_data)\n",
+    "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)\n",
+    "\n",
+    "valid_data = TensorDataset(val_inputs, val_masks, val_tags)\n",
+    "valid_sampler = SequentialSampler(valid_data)\n",
+    "valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading the SciBERT model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = BertForTokenClassification.from_pretrained(\n",
+    "    'allenai/scibert_scivocab_uncased',\n",
+    "    num_labels=6,\n",
+    "    output_attentions = False,\n",
+    "    output_hidden_states = False\n",
+    ")\n",
+    "\n",
+    "model.cuda();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Choosing whether to fine-tune all parameters or only those of the classification layer on top of the BERT model. For best performance we fine-tune all the weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FULL_FINETUNING = True\n",
+    "if FULL_FINETUNING:\n",
+    "    param_optimizer = list(model.named_parameters())\n",
+    "    no_decay = ['bias', 'gamma', 'beta']\n",
+    "    optimizer_grouped_parameters = [\n",
+    "        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],\n",
+    "         'weight_decay_rate': 0.01},\n",
+    "        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],\n",
+    "         'weight_decay_rate': 0.0}\n",
+    "    ]\n",
+    "else:\n",
+    "    param_optimizer = list(model.classifier.named_parameters())\n",
+    "    optimizer_grouped_parameters = [{\"params\": [p for n, p in param_optimizer]}]\n",
+    "\n",
+    "optimizer = AdamW(\n",
+    "    optimizer_grouped_parameters,\n",
+    "    lr=3e-5,\n",
+    "    eps=1e-8\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Train for 3 epochs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "epochs = 3\n",
+    "max_grad_norm = 1.0\n",
+    "\n",
+    "# Total number of training steps is number of batches * number of epochs.\n",
+    "total_steps = len(train_dataloader) * epochs\n",
+    "\n",
+    "# Create the learning rate scheduler.\n",
+    "scheduler = get_linear_schedule_with_warmup(\n",
+    "    optimizer,\n",
+    "    num_warmup_steps=0,\n",
+    "    num_training_steps=total_steps\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define function to measure accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def flat_accuracy(preds, labels):\n",
+    "    pred_flat = np.argmax(preds, axis=2).flatten()\n",
+    "    labels_flat = labels.flatten()\n",
+    "    return np.sum(pred_flat == labels_flat) / len(labels_flat)"
+   ]
+  },
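+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick, purely illustrative use of `flat_accuracy` on random dummy arrays; the shapes (2 sentences, 8 tokens, 6 classes) are hypothetical and chosen only to mirror the logits/labels layout used in the validation loop below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example: 2 sentences of 8 tokens each, 6 label classes\n",
+    "dummy_logits = np.random.rand(2, 8, 6)\n",
+    "dummy_labels = np.random.randint(0, 6, size=(2, 8))\n",
+    "flat_accuracy(dummy_logits, dummy_labels)  # fraction of token-level matches"
+   ]
+  },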
"source": [ + "## Store the average loss after each epoch so we can plot them.\n", + "loss_values, validation_loss_values = [], []\n", + "\n", + "for _ in trange(epochs, desc=\"Epoch\"):\n", + " # ========================================\n", + " # Training\n", + " # ========================================\n", + " # Perform one full pass over the training set.\n", + " # Put the model into training mode.\n", + " model.train()\n", + " # Reset the total loss for this epoch.\n", + " total_loss = 0\n", + " # Training loop\n", + " for step, batch in enumerate(train_dataloader):\n", + " # add batch to gpu\n", + " batch = tuple(t.to(device) for t in batch)\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " # Always clear any previously calculated gradients before performing a backward pass.\n", + " model.zero_grad()\n", + " # forward pass\n", + " # This will return the loss (rather than the model output)\n", + " # because we have provided the `labels`.\n", + " outputs = model(b_input_ids, token_type_ids=None,\n", + " attention_mask=b_input_mask, labels=b_labels)\n", + " # get the loss\n", + " loss = outputs[0]\n", + " # Perform a backward pass to calculate the gradients.\n", + " loss.backward()\n", + " # track train loss\n", + " total_loss += loss.item()\n", + " # Clip the norm of the gradient\n", + " # This is to help prevent the \"exploding gradients\" problem.\n", + " torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)\n", + " # update parameters\n", + " optimizer.step()\n", + " # Update the learning rate.\n", + " scheduler.step()\n", + " # Calculate the average loss over the training data.\n", + " avg_train_loss = total_loss / len(train_dataloader)\n", + " print(\"Average train loss: {}\".format(avg_train_loss))\n", + " # Store the loss value for plotting the learning curve.\n", + " loss_values.append(avg_train_loss)\n", + " # ========================================\n", + " # Validation\n", + " # ========================================\n", + " # After the completion of each training epoch, measure our performance on\n", + " # our validation set.\n", + " # Put the model into evaluation mode\n", + " model.eval()\n", + " # Reset the validation loss for this epoch.\n", + " eval_loss, eval_accuracy = 0, 0\n", + " nb_eval_steps, nb_eval_examples = 0, 0\n", + " predictions , true_labels = [], []\n", + " for batch in valid_dataloader:\n", + " batch = tuple(t.to(device) for t in batch)\n", + " b_input_ids, b_input_mask, b_labels = batch\n", + " # Telling the model not to compute or store gradients,\n", + " # saving memory and speeding up validation\n", + " with torch.no_grad():\n", + " # Forward pass, calculate logit predictions.\n", + " # This will return the logits rather than the loss because we have not provided labels.\n", + " outputs = model(b_input_ids, token_type_ids=None,\n", + " attention_mask=b_input_mask, labels=b_labels)\n", + " # Move logits and labels to CPU\n", + " logits = outputs[1].detach().cpu().numpy()\n", + " label_ids = b_labels.to('cpu').numpy()\n", + " # Calculate the accuracy for this batch of test sentences.\n", + " eval_loss += outputs[0].mean().item()\n", + " eval_accuracy += flat_accuracy(logits, label_ids)\n", + " predictions.extend([list(p) for p in np.argmax(logits, axis=2)])\n", + " true_labels.extend(label_ids)\n", + " nb_eval_examples += b_input_ids.size(0)\n", + " nb_eval_steps += 1\n", + " eval_loss = eval_loss / nb_eval_steps\n", + " validation_loss_values.append(eval_loss)\n", + " print(\"Validation loss: 
{}\".format(eval_loss))\n", + " print(\"Validation Accuracy: {}\".format(eval_accuracy/nb_eval_steps))\n", + " pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)\n", + " for p_i, l_i in zip(p, l) if tag_values[l_i] != \"PAD\"]\n", + " valid_tags = [tag_values[l_i] for l in true_labels\n", + " for l_i in l if tag_values[l_i] != \"PAD\"]\n", + " #print(\"Validation F1-Score: {}\".format(f1_score(pred_tags, valid_tags)))\n", + " print(\"Validation F1-Score: {}\".format(f1_score(pred_tags, valid_tags)))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save trained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"model_full.pth\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}