Complements on the description, corrections on #791

Open · wants to merge 1 commit into base: main
54 changes: 37 additions & 17 deletions python_scripts/cross_validation_time.py
@@ -18,7 +18,8 @@
# ```
#
# This assumption is usually violated when dealing with time series. A sample
-# depends on past information.
+# depends on past information; in other words, the generative process has
+# memory or is influenced by past data.
#
# We will take an example to highlight such issues with non-i.i.d. data in the
# previous cross-validation strategies presented. We are going to load financial
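To make the idea of a generative process with memory concrete, here is a minimal, illustrative sketch (not part of the patch) that simulates an AR(1) process with `numpy`: each value is built from the previous one, so neighboring samples are strongly correlated and the i.i.d. assumption breaks down.

```python
# Illustrative only: simulate an AR(1) process, x_t = phi * x_{t-1} + noise
import numpy as np

rng = np.random.default_rng(0)
n_samples, phi = 500, 0.9

x = np.zeros(n_samples)
for t in range(1, n_samples):
    # each sample carries information from the previous one ("memory")
    x[t] = phi * x[t - 1] + rng.normal()

# the correlation between consecutive samples is close to phi, far from 0
print(f"lag-1 autocorrelation: {np.corrcoef(x[:-1], x[1:])[0, 1]:.2f}")
```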
@@ -59,6 +60,8 @@
# Here, we want to predict the quotation of Chevron using all other energy
# companies' quotes. To make explanatory plots, we first use a train-test split
# and then we evaluate other cross-validation methods.
+#
+

# %%
from sklearn.model_selection import train_test_split
@@ -68,9 +71,16 @@
data, target, shuffle=True, random_state=0
)

+# Sort by index to restore the time order after the shuffled split
+data_train.sort_index(ascending=True, inplace=True)
+data_test.sort_index(ascending=True, inplace=True)
+target_train.sort_index(ascending=True, inplace=True)
+target_test.sort_index(ascending=True, inplace=True)
+
+
# %% [markdown]
# We will use a decision tree regressor that we expect to overfit and thus not
-# generalize to unseen data. We will use a `ShuffleSplit` cross-validation to
+# generalize to unseen data of the time series. We will use a `ShuffleSplit` cross-validation to
# check the generalization performance of our model.
#
# Let's first define our model
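The model definition itself is collapsed in this diff. Since the surrounding text announces a decision tree regressor, it is presumably along these lines (a sketch, not the hidden code):

```python
# An unconstrained decision tree: deep enough to memorize the training set
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
```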
@@ -89,7 +99,7 @@
cv = ShuffleSplit(random_state=0)

# %% [markdown]
-# Finally, we perform the evaluation.
+# Finally, we perform the evaluation using the `ShuffleSplit` cross-validation.

# %%
from sklearn.model_selection import cross_val_score
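The evaluation call is also collapsed by the diff; based on the pattern used later in the same file, it presumably looks like this (a sketch under that assumption):

```python
# Cross-validate the tree with the shuffled strategy defined above
results = cross_val_score(regressor, data, target, cv=cv, n_jobs=2)
print(f"The mean R2 is: {results.mean():.2f} ± {results.std():.2f}")
```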
@@ -103,7 +113,9 @@
# Surprisingly, we get outstanding generalization performance. We will
# investigate and find the reason for such good results with a model that is
# expected to fail. We previously mentioned that `ShuffleSplit` is an iterative
-# cross-validation scheme that shuffles data and split. We will simplify this
+# cross-validation scheme that shuffles the data and then splits it.
+#
+# We will simplify this
# procedure with a single split and plot the prediction. We can use
# `train_test_split` for this purpose.
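The single-split code is collapsed at the hunk boundary; reading from the earlier hunk, it presumably mirrors the shuffled `train_test_split` call followed by a fit/predict round trip (a sketch, assuming `pandas` is imported as `pd` as elsewhere in the script):

```python
# One shuffled split, i.e. a single iteration of ShuffleSplit
data_train, data_test, target_train, target_test = train_test_split(
    data, target, shuffle=True, random_state=0
)
regressor.fit(data_train, target_train)
target_predicted = pd.Series(
    regressor.predict(data_test), index=data_test.index
)
```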

@@ -117,37 +129,42 @@
# Let's check the generalization performance of our model on this split.

# %%
-from sklearn.metrics import r2_score
+from sklearn.metrics import r2_score, mean_squared_error

test_score = r2_score(target_test, target_predicted)
+mse_score = mean_squared_error(target_test, target_predicted)
print(f"The R2 on this single split is: {test_score:.2f}")
print(f"The mean squared error on this single split is: {mse_score:.2f}")


# %% [markdown]
-# Similarly, we obtain good results in terms of $R^2$. We will plot the
+# Similarly, we obtain good results in terms of $R^2$ and the mean squared error. We will plot the
# training, testing and prediction samples.

# %%
target_train.plot(label="Training")
target_test.plot(label="Testing")
target_predicted.plot(label="Prediction")

plt.ylabel("Quote value")
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = plt.title("Model predictions using a ShuffleSplit strategy")

# %% [markdown]
-# So in this context, it seems that the model predictions are following the
-# testing. But we can also see that the testing samples are next to some
-# training sample. And with these time-series, we see a relationship between a
-# sample at the time `t` and a sample at `t+1`. In this case, we are violating
-# the i.i.d. assumption. The insight to get is the following: a model can output
-# of its training set at the time `t` for a testing sample at the time `t+1`.
-# This prediction would be close to the true value even if our model did not
-# learn anything, but just memorized the training dataset.
+# In this time series, we see a relationship between a sample at time `t` and
+# a sample at time `t+1`: by the very structure of a time series, we cannot
+# know the value at time `t+1` without knowing the value at time `t`. In that
+# sense, we are violating the i.i.d. assumption.
+#
+# The insight to get is the following: the model predicts quite well in terms
+# of $R^2$ and mean squared error, but only because the testing samples lie
+# next to training samples. The model is memorizing the training data and
+# interpolating to fill the gaps, not learning anything general from the data.
+#
# An easy way to verify this hypothesis is to not shuffle the data when doing
# the split. In this case, we will use the first 75% of the data to train and
-# the remaining data to test.
+# the remaining data to test. This way of splitting the data preserves its
+# time order.

# %%
data_train, data_test, target_train, target_test = train_test_split(
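The call is cut off at the hunk boundary; to match the stated 75%/25% temporal split, it presumably continues along these lines (the exact parameter values are an assumption):

```python
# No shuffling: the first 75% of samples train, the final 25% test
data_train, data_test, target_train, target_test = train_test_split(
    data, target, shuffle=False, test_size=0.25
)
```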
@@ -161,8 +178,11 @@
target_predicted = pd.Series(target_predicted, index=target_test.index)

# %%

test_score = r2_score(target_test, target_predicted)
+mse_score = mean_squared_error(target_test, target_predicted)
print(f"The R2 on this single split is: {test_score:.2f}")
print(f"The mean squared error on this single split is: {mse_score:.2f}")

# %% [markdown]
# In this case, we see that our model is not magical anymore. Indeed, it
@@ -213,7 +233,7 @@

cv = TimeSeriesSplit(n_splits=groups.nunique())
test_score = cross_val_score(
-    regressor, data, target, cv=cv, groups=groups, n_jobs=2
+    regressor, data, target, cv=cv, n_jobs=2
)
print(f"The mean R2 is: {test_score.mean():.2f} ± {test_score.std():.2f}")
