LSTM-FNN example does not work anymore #1391

mg64ve · 2024-01-04T10:06:16Z

Hello,
I took my code from Posit AI Blog:

https://blogs.rstudio.com/ai/posts/2020-07-20-fnn-lstm/

the code is the following:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k[[1]]
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 1
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}


train <- gen_timesteps(roessler[1:(n/2)], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n], 2 * n_timesteps) 

dim(train) <- c(dim(train), 1)
dim(test) <- c(dim(test), 1)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
optimizer <- optimizer_adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:200) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}

In my installation is failing with the following error:

Error in py_call_impl(callable, call_args$unnamed, call_args$named) : 
  KeyError: 'The optimizer cannot recognize variable lstm_8/lstm_cell/kernel:0. This usually means you are trying to call the optimizer to update different parts of the model separately. Please call `optimizer.build(variables)` with the full list of trainable variables before the training loop or use legacy optimizer `tf.keras.optimizers.legacy.Adam.'
Run `reticulate::py_last_error()` for details.

is this related to a problem in my installation or this model is not working anymore?

The text was updated successfully, but these errors were encountered:

t-kalinowski · 2024-01-04T13:55:22Z

Thanks for reporting. I am able to reproduce the error, and am able to run the example after two edits:

# optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)

and

# top_indices <- top_k[[1]]
top_indices <- top_k$indices

(this is on TF version 2.15, Ubuntu 22.04)

> tf_config()
TensorFlow v2.15.0 (~/.virtualenvs/r-tensorflow/lib/python3.9/site-packages/tensorflow)
Python v3.9 (~/.virtualenvs/r-tensorflow/bin/python)

mg64ve · 2024-01-04T17:46:06Z

Thanks @t-kalinowski for your quick response.
In case I want to adjust this concept to multiple features, do you have any suggestion on how to change the following:

gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}

for multiple features or do you have any recommendation to use any other function for that?

I have tried with the following:

gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}

But it produces NA that I can't remove. The problem is also due to the fact that in R, subsetting a vector produces NA if the dimensions are exceeded, that is not the case of array where you get the message that index is out of bounds.

As a consequence, the following does not give me any error but it produces NA during training:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler1)

roessler <- array(c(roessler1, roessler2), dim = c(n,2))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 2
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(nam# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler1)

roessler <- array(c(roessler1, roessler2), dim = c(n,2))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 2
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)





e = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)

t-kalinowski · 2024-01-04T18:07:11Z

I think you can redefine gen_timesteps() to use timeseries_dataset_from_array(), which takes care of many of these fiddly details.

gen_timesteps <- function(x, n_timesteps) {
  ds <- timeseries_dataset_from_array(
    x, NULL,
    sequence_length = n_timesteps,
    batch_size = NROW(x)
  )
  ds |>
    as_array_iterator() |>
    iter_next()
}

x <- 1:40
gen_timesteps(x, 10)

out <- gen_timesteps(cbind(x * .1, x, x * 10), 10)

out[1, , ] # first datapoint

mg64ve · 2024-01-04T18:35:45Z

Ok thanks @t-kalinowski , it seems to work.
However the following working code with 14 features:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler3 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler4 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler5 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler6 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler7 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler8 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler9 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler10 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler11 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler12 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler13 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler14 <- roessler + rnorm(10000, mean = 0, sd = noise)

str(roessler1)

roessler <- array(c(roessler1, roessler2, roessler3, roessler4, roessler5, roessler6
                    , roessler7, roessler8, roessler9, roessler10, roessler11, roessler12, roessler13, roessler14), dim = c(n,14))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 14
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  ds <- timeseries_dataset_from_array(
    x, NULL,
    sequence_length = n_timesteps,
    batch_size = NROW(x)
  )
  ds |>
    as_array_iterator() |>
    iter_next()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

prediction_fnn <- decoder(encoder(test_batch[[1]]))
fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)

gives me the following warning at the end of training:

Warning message:
Negative numbers are interpreted python-style when subsetting tensorflow tensors.
See: ?`[.tensorflow.tensor` for details.
To turn off this warning, set `options(tensorflow.extract.warn_negatives_pythonic = FALSE)`

what is it all about?

t-kalinowski · 2024-01-04T19:15:46Z

Somewhere you're calling [ on a tensorfow tensor with a negative number.

You can trace which call it is by setting options(warn = 2, error = quote(print(print(rlang::trace_back()))) before evaluating your script.

See ?`[.tensorflow.tensor` (particularly the examples) to understand what a "pythonic" negative index is.

t-kalinowski mentioned this issue Jan 4, 2024

VAE with FNN rstudio/ai-blog#224

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

LSTM-FNN example does not work anymore #1391

LSTM-FNN example does not work anymore #1391

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024 •

edited

Loading

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024

LSTM-FNN example does not work anymore #1391

LSTM-FNN example does not work anymore #1391

Comments

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024 • edited Loading

mg64ve commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024

t-kalinowski commented Jan 4, 2024 •

edited

Loading