diff --git a/wtpsplit/train/evaluate.py b/wtpsplit/train/evaluate.py
index c2cbc34f..c9991276 100644
--- a/wtpsplit/train/evaluate.py
+++ b/wtpsplit/train/evaluate.py
@@ -268,9 +268,9 @@ def evaluate_sentence_kmers(
                 info["threshold_best"] / (1 - info["threshold_best"])
             )  # inverse sigmoid
             # For accuracy, check if all the labels in between are correctly predicted (ignore the one at the end)
-            intermediate_newline_labels = newline_labels[: -len(separator)]  # Exclude the end
-            intermediate_predicted_labels = predicted_labels[: -len(separator)]  # Exclude the end
-            intermediate_predicted_labels_opt = predicted_labels_optimal[: -len(separator)]  # Exclude the end
+            intermediate_newline_labels = newline_labels[:-1]  # Exclude the end
+            intermediate_predicted_labels = predicted_labels[:-1]
+            intermediate_predicted_labels_opt = predicted_labels_optimal[:-1]
             correct = np.array_equal(intermediate_newline_labels, intermediate_predicted_labels)
             correct_optimal = np.array_equal(intermediate_newline_labels, intermediate_predicted_labels_opt)
             accuracy_list.append(correct)
diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py
index 3a2d61f6..a4299768 100644
--- a/wtpsplit/train/train.py
+++ b/wtpsplit/train/train.py
@@ -570,8 +570,8 @@ def compute_metrics(trainer):
                 lang_code,
                 dataset["data"],
                 model,
-                stride=args.eval_stride,
-                block_size=args.block_size,
+                stride=128,
+                block_size=512,
                 batch_size=training_args.per_device_eval_batch_size,
             )
             metrics[f"{lang_code}_{dataset_name}_pr_auc"] = score
@@ -608,8 +608,8 @@ def compute_metrics(trainer):
                 lang_code,
                 dataset["data"],
                 model,
-                stride=args.eval_stride,
-                block_size=args.block_size,
+                stride=128,
+                block_size=512,
                 batch_size=training_args.per_device_eval_batch_size,
                 k=k,
                 # sample_pct=0.1
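
Note on the evaluate.py hunk: slicing with [: -len(separator)] trims as many trailing entries as the separator has characters, while [:-1] always trims exactly one. A minimal standalone sketch of that difference, using hypothetical label arrays and a hypothetical multi-character separator (not part of the patch):

    import numpy as np

    newline_labels = np.array([1, 0, 1, 0, 1])    # hypothetical gold labels
    predicted_labels = np.array([1, 0, 1, 0, 0])  # hypothetical predictions
    separator = " || "                            # hypothetical 4-character separator

    # Separator-dependent trim: drops len(separator) == 4 trailing entries, leaving [1]
    old_trim = newline_labels[: -len(separator)]

    # Fixed trim: drops exactly the final entry, leaving [1, 0, 1, 0]
    new_trim = newline_labels[:-1]

    # With the [:-1] slice, only the final (ignored) position differs here,
    # so the intermediate labels compare as equal
    print(np.array_equal(newline_labels[:-1], predicted_labels[:-1]))  # True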