OOM using EfficientNet B3 #5801

finnBsch · 2021-01-14T15:45:39Z

finnBsch
Jan 14, 2021

Hi,
I am currently working on the ISIC 2019 Challenge using a pretrained EfficientNet-B3 on Kaggle with a 16 GB GPU. I instantly run into out-of-memory problems; I tried reducing the batch size and using 16-bit precision, but neither helps. I don't know how to debug this and can't find anything unusual in my code. Here are the important parts of the model:
`
class LightModel(pl.LightningModule):

 def __init__(self, classes_named, id_train, id_val, class_weights, header_line, hparams):
        """Build the pretrained EfficientNet-B3 backbone with 8 output classes.

        Only the backbone construction is shown in this excerpt; the other
        constructor arguments are accepted but not used by the visible code.
        """
        # nn.Module (and therefore LightningModule) has to be initialised before
        # a sub-module can be assigned; without this call the assignment below
        # raises instead of registering the model.
        super().__init__()
        self.model = EfficientNet.from_pretrained('efficientnet-b3', num_classes=8)

def forward(self, x):
    """Run ``x`` through the wrapped backbone and return its output."""
    return self.model(x)

def prepare_data(self):
    """Hook called at the start of training; intentionally does nothing."""
    return None

def train_dataloader(self):
    """Return the shuffled, batched loader over ``self.trainset``.

    The batch size comes from ``self.hparams.batch_size``; the loader is
    configured with four workers and pinned memory.
    """
    dataset = self.trainset
    loader_options = dict(
        batch_size=self.hparams.batch_size,
        num_workers=4,
        shuffle=True,
        pin_memory=True,
    )
    return DataLoader(dataset, **loader_options)

def val_dataloader(self):
    """Return a single, unshuffled loader over ``self.valset``.

    The batch size comes from ``self.hparams.batch_size``; the loader is
    configured with four workers and pinned memory.
    """
    dataset = self.valset
    loader_options = dict(
        batch_size=self.hparams.batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )
    return DataLoader(dataset, **loader_options)

def test_dataloader(self):
    test_dl = DataLoader(self.testset, batch_size=self.hparams.batch_size, shuffle=False,num_workers = 4, pin_memory=True,
                              ) 
    return test_dl

def configure_optimizers(self):
    """Create the Adam optimizer and its one-cycle learning-rate schedule.

    The schedule peaks at ten times ``self.hparams.lr`` and spans
    ``self.hparams.epochs`` epochs of ``len(self.train_dataloader())``
    batches each.

    Returns:
        A ``([optimizer], [scheduler_config])`` pair, where the scheduler
        config is a dict asking Lightning to step the scheduler every batch.
    """
    optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=10 * self.hparams.lr,
        epochs=self.hparams.epochs,
        steps_per_epoch=len(self.train_dataloader()),
    )
    # OneCycleLR is sized in optimizer steps (epochs * steps_per_epoch), so it
    # must advance once per batch. A bare scheduler is stepped once per epoch
    # by Lightning, which would leave the cycle almost entirely unused.
    return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

def step(self, batch):
    """Run the model on one batch and compute its loss.

    Reads the ``'image'`` and ``'target'`` entries of ``batch`` and returns
    the tuple ``(loss, targets, outputs)``, with the loss coming from
    ``self.criterion_``.
    """
    images = batch['image']
    targets = batch['target']
    outputs = self(images)
    return self.criterion_(outputs, targets), targets, outputs

def training_step(self, batch, batch_idx):
    """Compute loss and accuracy for one training batch and log both.

    Both values are logged per epoch only (not per step). Returns a dict with
    the ``'loss'`` from ``self.step`` and the batch ``'acc'`` as a Python float.
    """
    loss, targets, outputs = self.step(batch)
    # Index of the largest output per sample, kept as a column vector.
    _, predicted = outputs.max(1, keepdim=True)
    n_correct = predicted.eq(targets.view_as(predicted)).sum().item()
    acc = n_correct / len(batch['image'])
    log_options = dict(on_epoch=True, on_step=False, logger=True)
    self.log('train_loss', loss, **log_options)
    self.log('train_acc', acc, **log_options)
    return {'loss': loss, 'acc': acc}
    

def validation_step(self, batch, batch_idx):
    """Evaluate one validation batch and return its detached results.

    Returns a dict with ``'val_loss'``, ``'y'`` (targets) and ``'y_hat'``
    (model outputs), each detached from the autograd graph.
    """
    loss, targets, outputs = self.step(batch)
    results = {'val_loss': loss, 'y': targets, 'y_hat': outputs}
    return {key: tensor.detach() for key, tensor in results.items()}

def test_step(self, batch, batch_idx):
    x, _ = batch['image'], batch['target']
    y_hat = self(x)
    return {'y_hat': y_hat}

def validation_epoch_end(self, outputs):
    """Aggregate all validation batches, log epoch metrics and print a summary.

    Each entry of ``outputs`` must provide ``'val_loss'``, ``'y'`` and
    ``'y_hat'``. Logs ``val_loss``, ``val_recall`` and ``w_b_acc``, then
    prints the matrix returned by ``self.conf`` and a one-line epoch summary.
    """
    mean_loss = torch.stack([out['val_loss'] for out in outputs]).mean()
    targets = torch.cat([out['y'] for out in outputs])
    raw_outputs = torch.cat([out['y_hat'] for out in outputs])

    probs = torch.softmax(raw_outputs, dim=1)
    # Index of the largest output per sample, kept as a column vector.
    _, predicted = raw_outputs.max(1, keepdim=True)
    n_correct = predicted.eq(targets.view_as(predicted)).sum().item()
    acc = n_correct / raw_outputs.size(0)

    conf = self.conf(probs, targets)
    # Diagonal entry over row sum for every row, then averaged over 8 classes.
    per_class = conf.diagonal() / conf.sum(dim=1)
    w_b_acc = per_class.sum() / 8

    self.log('val_loss', mean_loss, logger=True)
    self.log('val_recall', self.recall(probs, targets), logger=True)
    self.log('w_b_acc', w_b_acc, logger=True)
    print(conf)
    print(f'Epoch {self.current_epoch}: loss: {mean_loss:.2f}, acc: {acc:.4f}, auc: {self.auc(probs, targets,num_classes=8):.4f}, recall: {self.recall(probs, targets):.4f}, w_acc: {w_b_acc:.4f}')

def test_epoch_end(self, outputs):
    y_hat = torch.cat([x['y_hat'] for x in outputs])
    test_df = dict()
    test_df[header_line[0]] = [x[:-4] for x in self.ids_val]
    decision = torch.argmax(y_hat, dim=1)
    y_out = pl.metrics.utils.to_onehot(decision, num_classes=9).cpu().detach().numpy()
    for i in range(y_out.shape[1]):
        test_df[header_line[1 + i]] = y_out[:, i]
    df = pd.DataFrame(data=test_df)
    df.to_csv('/kaggle/working/submission.csv', index=False)`

justusschock · 2021-01-14T19:22:25Z

justusschock
Jan 14, 2021
Maintainer

Looks fine to me. Have you successfully trained the same network without Lightning? I remember it being quite a large network, so it is likely that you run out of memory simply because of that.

0 replies

Borda · 2021-01-15T18:07:37Z

Borda
Jan 15, 2021
Maintainer

@SeanNaren could sharded training help, since you showed that it reduces memory usage?

0 replies

finnBsch · 2021-01-15T21:34:31Z

finnBsch
Jan 15, 2021
Author

@justusschock No, I have not. However, I've read a paper that managed to train the network with 11 GB of VRAM in a similar setup; I can't access its code, though.

0 replies

jspaezp · 2021-01-20T19:03:03Z

jspaezp
Jan 20, 2021

How are you defining self.trainset? It could also be that the data loader is filling your memory; try setting the number of workers to 0 and removing the memory pinning.
Let me know if it helps.

0 replies

sakvaua · 2021-02-22T15:14:44Z

sakvaua
Feb 22, 2021

What's the image size you're using?

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

OOM using EfficientNet B3 #5801

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 5 comments

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

Select a reply

OOM using EfficientNet B3 #5801

finnBsch Jan 14, 2021

Replies: 5 comments

justusschock Jan 14, 2021 Maintainer

Borda Jan 15, 2021 Maintainer

finnBsch Jan 15, 2021 Author

jspaezp Jan 20, 2021

sakvaua Feb 22, 2021

finnBsch
Jan 14, 2021

justusschock
Jan 14, 2021
Maintainer

Borda
Jan 15, 2021
Maintainer

finnBsch
Jan 15, 2021
Author

jspaezp
Jan 20, 2021

sakvaua
Feb 22, 2021