Commit
Working model test for tf2 lstm with deep supervision, but torch learning too slow
ga84jog committed Jun 20, 2024
1 parent e4dbb2d commit 8d765c6
Showing 11 changed files with 1,583 additions and 1,648 deletions.
2,535 changes: 1,167 additions & 1,368 deletions datalab/read_dataset.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/datasets/readers.py
@@ -984,7 +984,7 @@ def to_numpy(self,
scaler=None,
imputer=None,
subject_ids: Union[List[str], List[int]] = None,
read_masks: bool = False,
deep_supervision: bool = False,
read_timestamps: bool = False,
data_type=None,
return_ids: bool = False,
@@ -1037,15 +1037,15 @@ def to_numpy(self,

dataset = self.read_samples(subject_ids,
read_timestamps=read_timestamps,
read_masks=read_masks,
read_masks=deep_supervision,
data_type=data_type)
else:

dataset, subject_ids = self.random_samples(n_subjects=len(self.subject_ids),
read_timestamps=read_timestamps,
data_type=data_type,
return_ids=True,
read_masks=read_masks,
read_masks=deep_supervision,
seed=seed)
if n_samples is not None:
for prefix in deepcopy(list(dataset.keys())):
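The change above renames the to_numpy keyword from read_masks to deep_supervision and forwards it to read_samples / random_samples. A minimal call-site sketch after this commit, assuming a reader object such as the one produced by datasets.load_data further down; the subject IDs are made up for illustration:

data = reader.to_numpy(subject_ids=[10006, 10011],   # hypothetical IDs
                       deep_supervision=True,         # formerly read_masks=True
                       read_timestamps=False,
                       return_ids=False)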
37 changes: 26 additions & 11 deletions src/models/pytorch/__init__.py
@@ -19,9 +19,17 @@

class AbstractTorchNetwork(nn.Module):

def __init__(self, output_dim: int, model_path: Path = None):
def __init__(self, final_activation, output_dim: int, model_path: Path = None):
super(AbstractTorchNetwork, self).__init__()
self._model_path = model_path
if final_activation is None:
if output_dim == 1:
self._final_activation = nn.Sigmoid()
else:
self._final_activation = nn.Softmax(dim=-1)
else:
self._final_activation = activation_mapping[final_activation]

if self._model_path is not None:
# Persistent history
self._model_path.mkdir(parents=True, exist_ok=True)
@@ -119,7 +127,10 @@ def _init_metrics(self, metrics, prefix: str = None) -> Dict[str, Metric]:
metric_name = metric
metric = metric_mapping[metric]
else:
metric_name = to_snake_case(metric.__name__)
try:
metric_name = to_snake_case(metric.__name__)
except:
metric_name = "unknonw"
if isinstance(metric, type):
metric = metric(**settings)
if prefix is not None:
@@ -267,15 +278,16 @@ def _train_with_arrays(self,
self._optimizer.zero_grad()
outputs = self(input_batch, masks=masks_batch)
loss = self._loss(outputs, label_batch)
self._update_metrics(self._train_metrics, outputs, label_batch)
loss.backward()
self._optimizer.step()
train_losses.append(loss.item())

self._train_progbar.update(batch_idx + 1,
values=[('loss', loss.item())] +
self._get_metrics(self._train_metrics),
finalize=(batch_idx == generator_size - 1 and not has_val))
with torch.no_grad():
self._update_metrics(self._train_metrics, outputs, label_batch)
self._train_progbar.update(
batch_idx + 1,
values=[('loss', loss.item())] + self._get_metrics(self._train_metrics),
finalize=(batch_idx == generator_size - 1 and not has_val))

avg_train_loss = np.mean(train_losses)
self._history.train_loss[epoch] = avg_train_loss
@@ -297,22 +309,25 @@ def _train_with_dataloader(self,
self._train_progbar = Progbar(generator_size)

for batch_idx, (inputs, labels) in enumerate(train_generator):
self._optimizer.zero_grad()
if self._deep_supervision:
inputs, masks = inputs
inputs = inputs.to(self._device)
# Set labels to zero when masking since forward does the same
labels = labels * masks
masks = masks.to(self._device)
masks = masks.to(self._device).bool()
else:
inputs = inputs.to(self._device)
masks = None
labels = labels.to(self._device)
self._optimizer.zero_grad()
# labels = labels * masks
outputs = self(inputs, masks=masks)
loss = self._loss(outputs, labels)
self._update_metrics(self._train_metrics, outputs, labels)
outputs = torch.masked_select(outputs, masks)
labels = torch.masked_select(labels, masks)
loss.backward()
self._optimizer.step()
torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
self._update_metrics(self._train_metrics, outputs, labels)
train_losses.append(loss.item())

self._train_progbar.update(batch_idx + 1,
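In _train_with_dataloader the masks are now cast to bool and the outputs and labels are filtered with torch.masked_select, while the metric update moves after the optimizer step. A self-contained sketch of the masking idea with made-up shapes, not the repository's exact training step:

import torch
import torch.nn as nn

criterion = nn.BCELoss()
outputs = torch.rand(4, 10, 1)                    # (batch, timesteps, 1) sigmoid outputs
labels = torch.randint(0, 2, (4, 10, 1)).float()  # per-timestep targets
masks = torch.randint(0, 2, (4, 10, 1)).bool()    # True where a target exists

# Compute the loss only over supervised timesteps
loss = criterion(torch.masked_select(outputs, masks),
                 torch.masked_select(labels, masks))
print(loss.item())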
120 changes: 88 additions & 32 deletions src/models/pytorch/lstm.py
@@ -1,3 +1,4 @@
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
@@ -6,6 +7,54 @@
from settings import *
from models.pytorch.mappings import *
from models.pytorch import AbstractTorchNetwork
from utils import is_iterable
import torch.nn as nn


class TimeDistributed(nn.Module):

def __init__(self, module, batch_first=True):
super(TimeDistributed, self).__init__()
self.module = module
self.batch_first = batch_first

def forward(self, x):
if len(x.size()) <= 2:
return self.module(x)

# Compute TimeDistributed layer
batch_size = x.size(0)
time_steps = x.size(1)
remaining_dims = x.size()[2:]

# Reshape input tensor for module
if self.batch_first:
x = x.contiguous().view(batch_size * time_steps, *remaining_dims)
else:
x = x.contiguous().view(time_steps, batch_size, *remaining_dims).transpose(0, 1)

# Apply module and reshape output
y = self.module(x)
if self.batch_first:
y = y.view(batch_size, time_steps, *y.size()[1:])
else:
y = y.view(time_steps, batch_size, *y.size()[1:]).transpose(0, 1)

return y


class TimeDistributedDense(nn.Module):

def __init__(self, input_size, output_size):
super().__init__()
self.dense = nn.Linear(input_size, output_size)

def forward(self, x):
batch_size, seq_len, _ = x.size()
x_reshaped = x.contiguous().view(-1, x.size(-1)) # Combine batch_size and seq_len
y = self.dense(x_reshaped)
y = y.view(batch_size, seq_len, -1) # Restore batch_size and seq_len
return y


class LSTMNetwork(AbstractTorchNetwork):
@@ -20,31 +69,26 @@ def __init__(self,
output_dim: int = 1,
depth: int = 1,
model_path: Path = None):
super().__init__(output_dim, model_path)
super().__init__(final_activation, output_dim, model_path)

self._layer_size = layer_size
self._dropout_rate = dropout
self._recurrent_dropout = recurrent_dropout
self._depth = depth
self._deep_supervision = deep_supervision

if final_activation is None:
if output_dim == 1:
self._final_activation = nn.Sigmoid()
else:
self._final_activation = nn.Softmax(dim=-1)
else:
self._final_activation = activation_mapping[final_activation]

self._output_dim = output_dim

if isinstance(layer_size, int):
self._hidden_sizes = [layer_size] * depth
else:
self._hidden_sizes = layer_size
self._hidden_sizes = [layer_size] * (depth - 1)
last_layer_size = layer_size
elif is_iterable(layer_size):
self._hidden_sizes = layer_size[:-1]
last_layer_size = layer_size[-1]
if depth != 1:
warn_io("Specified hidden sizes and depth are not consistent. "
"Using hidden sizes and ignoring depth.")
else:
raise ValueError("Layer size must be an integer or a list of integers.")

self.lstm_layers = nn.ModuleList()
input_size = input_dim
@@ -58,38 +102,51 @@ def __init__(self,
input_size = hidden_size

self._lstm_final = nn.LSTM(input_size=input_size,
hidden_size=hidden_size,
hidden_size=last_layer_size,
num_layers=1,
batch_first=True,
dropout=(recurrent_dropout if i < depth - 1 else 0))
batch_first=True)

self._dropout = nn.Dropout(dropout)
self._output_layer = nn.Linear(input_size, self._output_dim)
self._output_layer = nn.Linear(
last_layer_size,
self._output_dim) # TimeDistributedDense(last_layer_size, self._output_dim)
#

for lstm in [self._lstm_final, self._output_layer]:
for name, param in lstm.named_parameters():
if 'weight' in name:
nn.init.xavier_uniform_(param)
elif 'bias' in name:
nn.init.zeros_(param)

def forward(self, x, masks=None):
if masks is not None:
masks = masks.to(self._device)
x = x.to(self._device)

# Masking is not natively supported in PyTorch LSTM, assume x is already preprocessed if necessary
for lstm in self.lstm_layers:
x, _ = lstm(x)
x, _ = self._lstm_final(x)
x = self._dropout(x)

if self._deep_supervision:
# return predictions for all timesteps
masks.to(self._device)
x = self._output_layer(x)
outputs = list()
# Apply the linear layer to each timestep
for ts in range(x.shape[1]):
outputs.append(self._output_layer(x[:, ts, :]))
# Concatenate the per-timestep predictions
x = torch.cat(outputs, dim=1)
if len(x.shape) < 3:
x = x.unsqueeze(-1)

else:
# Only return the last prediction
x = x[:, -1, :]
x = self._output_layer(x)

if self._final_activation:
x = self._final_activation(x)
# mask predictions
if masks is not None:
x = masks * x

return x


@@ -100,7 +157,7 @@ def forward(self, x, masks=None):
from generators.pytorch import TorchGenerator
reader = datasets.load_data(chunksize=75836,
source_path=TEST_DATA_DEMO,
storage_path=Path(SEMITEMP_DIR, "deep_supervision"),
storage_path=Path(SEMITEMP_DIR),
discretize=True,
time_step_size=1.0,
start_at_zero=True,
@@ -126,15 +183,14 @@ def forward(self, x, masks=None):
model_path.mkdir(parents=True, exist_ok=True)
model = LSTMNetwork(1000,
59,
0.2,
recurrent_dropout=0.,
output_dim=1,
depth=3,
depth=1,
final_activation="softmax",
deep_supervision=True)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.compile(optimizer=optimizer, loss=criterion)
import torch
criterion = nn.BCELoss(torch.tensor([4.0], dtype=torch.float32))
optimizer = optim.Adam(model.parameters(), lr=0.00001)
model.compile(optimizer=optimizer, loss=criterion, metrics=["pr_auc", "roc_auc"])
# Example training loop
history = model.fit(train_generator=train_generator, epochs=40)
print(history)
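The deep-supervision branch in forward applies the output layer once per timestep and concatenates the results. A small sketch, not part of the commit, showing that nn.Linear already broadcasts over leading dimensions, so the loop is equivalent to a single call on the (batch, time, hidden) tensor:

import torch
import torch.nn as nn

layer = nn.Linear(16, 1)
x = torch.randn(8, 24, 16)   # (batch, timesteps, hidden)

# Per-timestep loop as in forward(), then restore the trailing dimension
looped = torch.cat([layer(x[:, t, :]) for t in range(x.shape[1])], dim=1).unsqueeze(-1)

# Single call; Linear acts on the last dimension only
vectorised = layer(x)        # (8, 24, 1)

assert torch.allclose(looped, vectorised, atol=1e-6)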
38 changes: 27 additions & 11 deletions src/models/tf2/__init__.py
@@ -1,27 +1,43 @@
from models.tf2.mappings import metric_mapping
from tensorflow import config
from tensorflow.keras import Model
from utils.IO import *

import tensorflow as tf

try:
gpus = config.experimental.list_physical_devices('GPU')
for gpu in gpus:
config.experimental.set_memory_growth(gpu, True)
except:
warn_io("Could not set dynamic memory growth for GPUs. This may lead to memory errors.")

from models.tf2.mappings import metric_mapping


class AbstractTf2Model(Model):

def compile(self,
optimizer='rmsprop',
loss=None,
metrics=None,
loss_weights=None,
metrics=[],
weighted_metrics=None,
run_eagerly=False,
steps_per_execution=1,
jit_compile='auto',
auto_scale_loss=True):
for metric in metrics:
if metric in metric_mapping:
metrics[metrics.index(metric)] = metric_mapping[metric]
run_eagerly=None,
steps_per_execution=None,
jit_compile=None,
pss_evaluation_shards=0,
**kwargs):
if metrics is not None:
for metric in metrics:
if metric in metric_mapping:
metrics[metrics.index(metric)] = metric_mapping[metric]
super().compile(optimizer=optimizer,
loss=loss,
loss_weights=loss_weights,
metrics=metrics,
loss_weights=loss_weights,
weighted_metrics=weighted_metrics,
run_eagerly=run_eagerly,
steps_per_execution=steps_per_execution,
jit_compile=jit_compile)
jit_compile=jit_compile,
pss_evaluation_shards=pss_evaluation_shards,
**kwargs)
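The updated compile signature resolves string metric names through metric_mapping before delegating to Keras. A hedged usage sketch, where lstm_model stands for any subclass of AbstractTf2Model:

lstm_model.compile(optimizer="adam",
                   loss="binary_crossentropy",
                   metrics=["pr_auc", "roc_auc"])   # mapped to keras.metrics.AUC objects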
4 changes: 2 additions & 2 deletions src/models/tf2/lstm.py
@@ -1,3 +1,4 @@
from models.tf2 import AbstractTf2Model
import pdb
import tensorflow as tf
from typing import List, Union
@@ -6,10 +7,9 @@
from tensorflow.keras import layers
from utils.IO import *
from models.tf2.mappings import activation_names
from models.tf2 import AbstractTf2Model


class LSTMNetwork(Model):
class LSTMNetwork(AbstractTf2Model):
"""
"""

2 changes: 1 addition & 1 deletion src/models/tf2/mappings.py
@@ -8,7 +8,7 @@

metric_mapping = {
"roc_auc": metrics.AUC(50, curve="ROC"),
"roc_pr": metrics.AUC(50, curve="PR"),
"pr_auc": metrics.AUC(50, curve="PR"),
"accuracy": "accuracy",
"acc": "acc",
"binary_accuracy": "binary_accuracy",