diff --git a/wtpsplit/train/evaluate.py b/wtpsplit/train/evaluate.py
index c2cbc34f..c9991276 100644
--- a/wtpsplit/train/evaluate.py
+++ b/wtpsplit/train/evaluate.py
@@ -268,9 +268,9 @@ def evaluate_sentence_kmers(
                 info["threshold_best"] / (1 - info["threshold_best"])
             )  # inverse sigmoid
             # For accuracy, check if all the labels in between are correctly predicted (ignore the one at the end)
-            intermediate_newline_labels = newline_labels[: -len(separator)]  # Exclude the end
-            intermediate_predicted_labels = predicted_labels[: -len(separator)]  # Exclude the end
-            intermediate_predicted_labels_opt = predicted_labels_optimal[: -len(separator)]  # Exclude the end
+            intermediate_newline_labels = newline_labels[:-1]  # Exclude the end
+            intermediate_predicted_labels = predicted_labels[:-1]
+            intermediate_predicted_labels_opt = predicted_labels_optimal[:-1]
             correct = np.array_equal(intermediate_newline_labels, intermediate_predicted_labels)
             correct_optimal = np.array_equal(intermediate_newline_labels, intermediate_predicted_labels_opt)
             accuracy_list.append(correct)
diff --git a/wtpsplit/train/train.py b/wtpsplit/train/train.py
index 3a2d61f6..a4299768 100644
--- a/wtpsplit/train/train.py
+++ b/wtpsplit/train/train.py
@@ -570,8 +570,8 @@ def compute_metrics(trainer):
                 lang_code,
                 dataset["data"],
                 model,
-                stride=args.eval_stride,
-                block_size=args.block_size,
+                stride=128,
+                block_size=512,
                 batch_size=training_args.per_device_eval_batch_size,
             )
             metrics[f"{lang_code}_{dataset_name}_pr_auc"] = score
@@ -608,8 +608,8 @@ def compute_metrics(trainer):
                 lang_code,
                 dataset["data"],
                 model,
-                stride=args.eval_stride,
-                block_size=args.block_size,
+                stride=128,
+                block_size=512,
                 batch_size=training_args.per_device_eval_batch_size,
                 k=k,
                 # sample_pct=0.1
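
Note on the evaluate.py hunk: slicing with [: -len(separator)] trims as many trailing entries as the separator has characters, while [:-1] always trims exactly one. A minimal standalone sketch of that difference, using hypothetical label arrays and a hypothetical multi-character separator (not part of the patch):

    import numpy as np

    newline_labels = np.array([1, 0, 1, 0, 1])    # hypothetical gold labels
    predicted_labels = np.array([1, 0, 1, 0, 0])  # hypothetical predictions
    separator = " || "                            # hypothetical 4-character separator

    # Separator-dependent trim: drops len(separator) == 4 trailing entries, leaving [1]
    old_trim = newline_labels[: -len(separator)]

    # Fixed trim: drops exactly the final entry, leaving [1, 0, 1, 0]
    new_trim = newline_labels[:-1]

    # With the [:-1] slice, only the final (ignored) position differs here,
    # so the intermediate labels compare as equal
    print(np.array_equal(newline_labels[:-1], predicted_labels[:-1]))  # True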