Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LSTM-FNN example does not work anymore #1391

Open
mg64ve opened this issue Jan 4, 2024 · 5 comments
Open

LSTM-FNN example does not work anymore #1391

mg64ve opened this issue Jan 4, 2024 · 5 comments

Comments

@mg64ve
Copy link

mg64ve commented Jan 4, 2024

Hello,
I took my code from Posit AI Blog:

https://blogs.rstudio.com/ai/posts/2020-07-20-fnn-lstm/

the code is the following:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k[[1]]
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 1
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}


train <- gen_timesteps(roessler[1:(n/2)], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n], 2 * n_timesteps) 

dim(train) <- c(dim(train), 1)
dim(test) <- c(dim(test), 1)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
optimizer <- optimizer_adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:200) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}

In my installation is failing with the following error:

Error in py_call_impl(callable, call_args$unnamed, call_args$named) : 
  KeyError: 'The optimizer cannot recognize variable lstm_8/lstm_cell/kernel:0. This usually means you are trying to call the optimizer to update different parts of the model separately. Please call `optimizer.build(variables)` with the full list of trainable variables before the training loop or use legacy optimizer `tf.keras.optimizers.legacy.Adam.'
Run `reticulate::py_last_error()` for details.

is this related to a problem in my installation or this model is not working anymore?

@t-kalinowski
Copy link
Member

Thanks for reporting. I am able to reproduce the error, and am able to run the example after two edits:

# optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)

and

# top_indices <- top_k[[1]]
top_indices <- top_k$indices

(this is on TF version 2.15, Ubuntu 22.04)

> tf_config()
TensorFlow v2.15.0 (~/.virtualenvs/r-tensorflow/lib/python3.9/site-packages/tensorflow)
Python v3.9 (~/.virtualenvs/r-tensorflow/bin/python)

@mg64ve
Copy link
Author

mg64ve commented Jan 4, 2024

Thanks @t-kalinowski for your quick response.
In case I want to adjust this concept to multiple features, do you have any suggestion on how to change the following:

gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}

for multiple features or do you have any recommendation to use any other function for that?

I have tried with the following:

gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}

But it produces NA that I can't remove. The problem is also due to the fact that in R, subsetting a vector produces NA if the dimensions are exceeded, that is not the case of array where you get the message that index is out of bounds.

As a consequence, the following does not give me any error but it produces NA during training:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler1)

roessler <- array(c(roessler1, roessler2), dim = c(n,2))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 2
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(nam# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler1)

roessler <- array(c(roessler1, roessler2), dim = c(n,2))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 2
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  do.call(abind,
          c(purrr::map(seq_along_rows(x),
                       function(i) {
                         f <- dim(x)[2]
                         start <- i
                         end <- i + n_timesteps - 1
                         if (end <= dim(x)[1]) {
                           out <- x[start:end,]
                           dim(out) <- c(1, dim(out))
                           out
                         }
                         else {
                           return(array(rep(NA, n_timesteps*f), dim = c(1, n_timesteps, f)))
                         }
                       }) %>% na.omit(),
            along=1)
  ) %>% na.omit()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)





e = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)

@t-kalinowski
Copy link
Member

t-kalinowski commented Jan 4, 2024

I think you can redefine gen_timesteps() to use timeseries_dataset_from_array(), which takes care of many of these fiddly details.

gen_timesteps <- function(x, n_timesteps) {
  ds <- timeseries_dataset_from_array(
    x, NULL,
    sequence_length = n_timesteps,
    batch_size = NROW(x)
  )
  ds |>
    as_array_iterator() |>
    iter_next()
}

x <- 1:40
gen_timesteps(x, 10)

out <- gen_timesteps(cbind(x * .1, x, x * 10), 10)

out[1, , ] # first datapoint

@mg64ve
Copy link
Author

mg64ve commented Jan 4, 2024

Ok thanks @t-kalinowski , it seems to work.
However the following working code with 14 features:

# DL-related packages
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)

# going to need those later
library(tidyverse)
library(cowplot)

library(deSolve)


parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz = b + z * (x - c)
    
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)



# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler1 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler2 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler3 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler4 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler5 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler6 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler7 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler8 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler9 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler10 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler11 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler12 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler13 <- roessler + rnorm(10000, mean = 0, sd = noise)
roessler14 <- roessler + rnorm(10000, mean = 0, sd = noise)

str(roessler1)

roessler <- array(c(roessler1, roessler2, roessler3, roessler4, roessler5, roessler6
                    , roessler7, roessler8, roessler9, roessler10, roessler11, roessler12, roessler13, roessler14), dim = c(n,14))
str(roessler)



encoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm1 <-  layer_lstm(
      units = n_recurrent,
      input_shape = c(n_timesteps, n_features),
      return_sequences = TRUE
    ) 
    self$batchnorm1 <- layer_batch_normalization()
    self$lstm2 <-  layer_lstm(
      units = n_latent,
      return_sequences = FALSE
    ) 
    self$batchnorm2 <- layer_batch_normalization()
    
    function (x, mask = NULL) {
      x %>%
        self$noise() %>%
        self$lstm1() %>%
        self$batchnorm1() %>%
        self$lstm2() %>%
        self$batchnorm2() 
    }
  })
}

decoder_model <- function(n_timesteps,
                          n_features,
                          n_recurrent,
                          n_latent,
                          name = NULL) {
  
  keras_model_custom(name = name, function(self) {
    
    self$repeat_vector <- layer_repeat_vector(n = n_timesteps)
    self$noise <- layer_gaussian_noise(stddev = 0.5)
    self$lstm <- layer_lstm(
      units = n_recurrent,
      return_sequences = TRUE,
      go_backwards = TRUE
    ) 
    self$batchnorm <- layer_batch_normalization()
    self$elu <- layer_activation_elu() 
    self$time_distributed <- time_distributed(layer = layer_dense(units = n_features))
    
    function (x, mask = NULL) {
      x %>%
        self$repeat_vector() %>%
        self$noise() %>%
        self$lstm() %>%
        self$batchnorm() %>%
        self$elu() %>%
        self$time_distributed()
    }
  })
}

# loss FNN
loss_false_nn <- function(x) {
  
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01
  
  k <- max(1, floor(k_frac * batch_size))
  
  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )
  
  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])
  
  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)
  
  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))
  
  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))
  
  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))
  
  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (#(latent, batch_size, batch_size)
  top_indices <- top_k$indices
  
  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)
  
  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )
  
  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]
  
  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))
  
  # Find batch average activity
  
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))
  
  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
  
}

n_latent <- 10L
n_features <- 14
n_timesteps <- 120
batch_size <- 32
n_hidden <- 32


gen_timesteps <- function(x, n_timesteps) {
  ds <- timeseries_dataset_from_array(
    x, NULL,
    sequence_length = n_timesteps,
    batch_size = NROW(x)
  )
  ds |>
    as_array_iterator() |>
    iter_next()
}


train <- gen_timesteps(roessler[1:(n/2),], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n,], 2 * n_timesteps) 

dim(train)
dim(test)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))


train_step <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    prediction <- decoder(code)
    
    l_mse <- mse_loss(batch[[2]], prediction)
    l_fnn <- loss_false_nn(code)
    loss <- l_mse + fnn_weight * l_fnn
  })
  
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  
  
}

training_loop <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step(batch)
  }
  
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  
}))


mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')

# fnn_multiplier should be chosen individually per dataset
# this is the value we used on the geyser dataset
fnn_multiplier <- 0.7
fnn_weight <- fnn_multiplier * nrow(x_train)/batch_size

# learning rate may also need adjustment
#optimizer <- optimizer_adam(learning_rate = 1e-3)
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)


encoder <- encoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)

decoder <- decoder_model(n_timesteps,
                         n_features,
                         n_hidden,
                         n_latent)


for (epoch in 1:10) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop(ds_train)
  
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]]) 
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}




test_batch <- as_iterator(ds_test) %>% iter_next()
encoded <- encoder(test_batch[[1]]) %>%
  as.array() %>%
  as_tibble()

encoded %>% summarise_all(var)


given <- data.frame(as.array(tf$concat(list(
  test_batch[[1]][, , 1], test_batch[[2]][, , 1]
),
axis = 1L)) %>% t()) %>%
  add_column(type = "given") %>%
  add_column(num = 1:(2 * n_timesteps))

prediction_fnn <- decoder(encoder(test_batch[[1]]))
fnn <- data.frame(as.array(prediction_fnn[, , 1]) %>%
                    t()) %>%
  add_column(type = "fnn") %>%
  add_column(num = (n_timesteps  + 1):(2 * n_timesteps))

compare_preds_df <- bind_rows(given, fnn)

plots <- 
  purrr::map(sample(1:dim(compare_preds_df)[2], 16),
             function(v) {
               ggplot(compare_preds_df, aes(num, .data[[paste0("X", v)]], color = type)) +
                 geom_line() +
                 theme_classic() +
                 theme(legend.position = "none", axis.title = element_blank()) +
                 scale_color_manual(values = c("#00008B", "#DB7093"))
             })

plot_grid(plotlist = plots, ncol = 4)

gives me the following warning at the end of training:

Warning message:
Negative numbers are interpreted python-style when subsetting tensorflow tensors.
See: ?`[.tensorflow.tensor` for details.
To turn off this warning, set `options(tensorflow.extract.warn_negatives_pythonic = FALSE)` 

what is it all about?

@t-kalinowski
Copy link
Member

Somewhere you're calling [ on a tensorfow tensor with a negative number.

You can trace which call it is by setting options(warn = 2, error = quote(print(print(rlang::trace_back()))) before evaluating your script.

See ?`[.tensorflow.tensor` (particularly the examples) to understand what a "pythonic" negative index is.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants