From 74696c68f6b3b7b8e1d4b220f8feff02a82495f7 Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Fri, 24 May 2024 18:17:15 +0300 Subject: [PATCH 1/8] add ru-mteb tasks --- mteb/tasks/Classification/__init__.py | 7 +++ .../rus/GeoreviewClassification.py | 34 +++++++++++ .../rus/HeadlineClassification.py | 57 +++++++++++++++++ .../rus/InappropriatenessClassification.py | 61 +++++++++++++++++++ .../rus/KinopoiskClassification.py | 42 +++++++++++++ .../rus/RuReviewsClassification.py | 45 ++++++++++++++ .../rus/RuSciBenchGRNTIClassification.py | 34 +++++++++++ .../rus/RuSciBenchOECDClassification.py | 34 +++++++++++ mteb/tasks/Classification/rus/__init__.py | 0 mteb/tasks/Clustering/__init__.py | 3 + .../Clustering/rus/GeoreviewClusteringP2P.py | 34 +++++++++++ .../rus/RuSciBenchGRNTIClusteringP2P.py | 34 +++++++++++ .../rus/RuSciBenchOECDClusteringP2P.py | 34 +++++++++++ mteb/tasks/Clustering/rus/__init__.py | 0 mteb/tasks/PairClassification/__init__.py | 1 + mteb/tasks/PairClassification/rus/TERRa.py | 49 +++++++++++++++ mteb/tasks/PairClassification/rus/__init__.py | 0 mteb/tasks/Retrieval/__init__.py | 3 + mteb/tasks/Retrieval/rus/MMarcoRetrieval.py | 47 ++++++++++++++ mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py | 39 ++++++++++++ mteb/tasks/Retrieval/rus/RuBQRetrieval.py | 40 ++++++++++++ mteb/tasks/Retrieval/rus/__init__.py | 0 mteb/tasks/STS/__init__.py | 1 + mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py | 42 +++++++++++++ 24 files changed, 641 insertions(+) create mode 100644 mteb/tasks/Classification/rus/GeoreviewClassification.py create mode 100644 mteb/tasks/Classification/rus/HeadlineClassification.py create mode 100644 mteb/tasks/Classification/rus/InappropriatenessClassification.py create mode 100644 mteb/tasks/Classification/rus/KinopoiskClassification.py create mode 100644 mteb/tasks/Classification/rus/RuReviewsClassification.py create mode 100644 mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py create mode 100644 mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py create mode 100644 mteb/tasks/Classification/rus/__init__.py create mode 100644 mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py create mode 100644 mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py create mode 100644 mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py create mode 100644 mteb/tasks/Clustering/rus/__init__.py create mode 100644 mteb/tasks/PairClassification/rus/TERRa.py create mode 100644 mteb/tasks/PairClassification/rus/__init__.py create mode 100644 mteb/tasks/Retrieval/rus/MMarcoRetrieval.py create mode 100644 mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py create mode 100644 mteb/tasks/Retrieval/rus/RuBQRetrieval.py create mode 100644 mteb/tasks/Retrieval/rus/__init__.py create mode 100644 mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index bfee0a6ae..89826de5c 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -105,6 +105,13 @@ from .ron.Moroco import * from .ron.RomanianReviewsSentiment import * from .ron.RomanianSentimentClassification import * +from .rus.GeoreviewClassification import * +from .rus.HeadlineClassification import * +from .rus.InappropriatenessClassification import * +from .rus.KinopoiskClassification import * +from .rus.RuReviewsClassification import * +from .rus.RuSciBenchGRNTIClassification import * +from .rus.RuSciBenchOECDClassification import * from .san.SanskritShlokasClassification import * from 
.sin.SinhalaNewsClassification import * from .sin.SinhalaNewsSourceClassification import * diff --git a/mteb/tasks/Classification/rus/GeoreviewClassification.py b/mteb/tasks/Classification/rus/GeoreviewClassification.py new file mode 100644 index 000000000..51fc89ff5 --- /dev/null +++ b/mteb/tasks/Classification/rus/GeoreviewClassification.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class GeoreviewClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="GeoreviewClassification", + dataset={ + "path": "ai-forever/georeview-classification", + "revision": "3765c0d1de6b7d264bc459433c45e5a75513839c", + }, + description="Review classification (5-point scale) based on Yandex Georeview dataset", + reference="https://github.com/yandex/geo-reviews-dataset-2023", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2023-01-01", "2023-08-01"), + form=["written"], + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="mit", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"validation": 5000, "test": 5000}, + avg_character_length={"validation": 412.9, "test": 409.0}, + ) diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py new file mode 100644 index 000000000..f89072059 --- /dev/null +++ b/mteb/tasks/Classification/rus/HeadlineClassification.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class HeadlineClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="HeadlineClassification", + dataset={ + "path": "ai-forever/headline-classification", + "revision": "2fe05ee6b5832cda29f2ef7aaad7b7fe6a3609eb", + }, + description="Headline rubric classification based on the ParaPhraser Plus dataset.", + reference="https://aclanthology.org/2020.ngt-1.6/", + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2009-01-01", "2020-01-01"), + form=["written"], + domains=["News"], + task_subtypes=["Topic classification"], + license="mit", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""@inproceedings{gudkov-etal-2020-automatically, + title = "Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation", + author = "Gudkov, Vadim and + Mitrofanova, Olga and + Filippskikh, Elizaveta", + editor = "Birch, Alexandra and + Finch, Andrew and + Hayashi, Hiroaki and + Heafield, Kenneth and + Junczys-Dowmunt, Marcin and + Konstas, Ioannis and + Li, Xian and + Neubig, Graham and + Oda, Yusuke", + booktitle = "Proceedings of the Fourth Workshop on Neural Generation and Translation", + month = jul, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2020.ngt-1.6", + doi = "10.18653/v1/2020.ngt-1.6", + pages = "54--59", + abstract = "The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics.
Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.", + }""", + n_samples={"validation": 12000, "test": 12000}, + avg_character_length={"validation": 61.7, "test": 61.6}, + ) diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py new file mode 100644 index 000000000..8bc541113 --- /dev/null +++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class InappropriatenessClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="InappropriatenessClassification", + dataset={ + "path": "ai-forever/inappropriateness-classification", + "revision": "601651fdc45ef243751676e62dd7a19f491c0285", + }, + description="Inappropriateness identification in the form of binary classification", + reference="https://aclanthology.org/2021.bsnlp-1.4", + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2006-01-01", "2021-04-01"), + form=["written"], + domains=["Web", "Social"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-nc-sa-4.0", + socioeconomic_status="mixed", + annotations_creators="human-annotated", + dialect=[], + text_creation="found", + bibtex_citation="""@inproceedings{babakov-etal-2021-detecting, + title = "Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation", + author = "Babakov, Nikolay and + Logacheva, Varvara and + Kozlova, Olga and + Semenov, Nikita and + Panchenko, Alexander", + editor = "Babych, Bogdan and + Kanishcheva, Olga and + Nakov, Preslav and + Piskorski, Jakub and + Pivovarova, Lidia and + Starko, Vasyl and + Steinberger, Josef and + Yangarber, Roman and + Marci{\'n}czuk, Micha{\l} and + Pollak, Senja and + P{\v{r}}ib{\'a}{\v{n}}, Pavel and + Robnik-{\v{S}}ikonja, Marko", + booktitle = "Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing", + month = apr, + year = "2021", + address = "Kiyv, Ukraine", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.bsnlp-1.4", + pages = "26--36", + abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. 
The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.", + }""", + n_samples={"validation": 4000, "test": 10000}, + avg_character_length={"validation": 96.8, "test": 97.7}, + ) diff --git a/mteb/tasks/Classification/rus/KinopoiskClassification.py b/mteb/tasks/Classification/rus/KinopoiskClassification.py new file mode 100644 index 000000000..2d6198e63 --- /dev/null +++ b/mteb/tasks/Classification/rus/KinopoiskClassification.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class KinopoiskClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="KinopoiskClassification", + dataset={ + "path": "ai-forever/kinopoisk-sentiment-classification", + "revision": "5911f26666ac11af46cb9c6849d0dc80a378af24", + }, + description="Kinopoisk review sentiment classification", + reference="https://www.dialog-21.ru/media/1226/blinovpd.pdf", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2004-07-01", "2012-12-01"), + form=["written"], + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""@article{blinov2013research, + title={Research of lexical approach and machine learning methods for sentiment analysis}, + author={Blinov, PD and Klekovkina, Maria and Kotelnikov, Eugeny and Pestov, Oleg}, + journal={Computational Linguistics and Intellectual Technologies}, + volume={2}, + number={12}, + pages={48--58}, + year={2013} + }""", + n_samples={"validation": 1500, "test": 1500}, + avg_character_length={"validation": 1941.7, "test": 1897.3}, + ) diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py new file mode 100644 index 000000000..9a06ac9c4 --- /dev/null +++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class RuReviewsClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="RuReviewsClassification", + dataset={ + "path": "ai-forever/ru-reviews-classification", + "revision": "f6d2c31f4dc6b88f468552750bfec05b4b41b05a", + }, + description="Product review classification (3-point scale) based on RuReviews dataset", + reference="https://github.com/sismetanin/rureviews", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("2000-01-01", "2020-01-01"), + form=["written"], + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="apache-2.0", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""@INPROCEEDINGS{Smetanin-SA-2019, + author={Sergey Smetanin and Michail Komarov}, + booktitle={2019 IEEE 21st Conference on Business Informatics (CBI)}, + title={Sentiment
Analysis of Product Reviews in Russian using Convolutional Neural Networks}, + year={2019}, + volume={01}, + number={}, + pages={482-486}, + doi={10.1109/CBI.2019.00062}, + ISSN={2378-1963}, + month={July} + }""", + n_samples={"validation": 15000, "test": 15000}, + avg_character_length={"validation": 132.8, "test": 133.2}, + ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py new file mode 100644 index 000000000..9cc7d7d53 --- /dev/null +++ b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class RuSciBenchGRNTIClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchGRNTIClassification", + dataset={ + "path": "ai-forever/ru-scibench-grnti-classification", + "revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", + }, + description="Classification of scientific papers (title+abstract) by rubric", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("1999-01-01", "2024-01-01"), + form=["written"], + domains=["Academic"], + task_subtypes=["Topic classification"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"test": 2772}, + avg_character_length={"test": 890.1}, + ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py new file mode 100644 index 000000000..8b6ee984e --- /dev/null +++ b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks import AbsTaskClassification + + +class RuSciBenchOECDClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="RuSciBenchOECDClassification", + dataset={ + "path": "ai-forever/ru-scibench-oecd-classification", + "revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", + }, + description="Classification of scientific papers (title+abstract) by rubric", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="accuracy", + date=("1999-01-01", "2024-01-01"), + form=["written"], + domains=["Academic"], + task_subtypes=["Topic classification"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"test": 3219}, + avg_character_length={"test": 838.9}, + ) diff --git a/mteb/tasks/Classification/rus/__init__.py b/mteb/tasks/Classification/rus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index d5d4de73b..c8533f87e 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -39,6 +39,9 @@ from .nob.vg_clustering import * from .pol.PolishClustering import * from .rom.RomaniBibleClustering import * +from .rus.GeoreviewClusteringP2P import * +from .rus.RuSciBenchGRNTIClusteringP2P import * +from .rus.RuSciBenchOECDClusteringP2P import 
* from .spa.FloresClusteringS2S import * from .spa.SpanishNewsClusteringP2P import * from .swe.swedn_clustering import * diff --git a/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py new file mode 100644 index 000000000..eb3fd0c57 --- /dev/null +++ b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class GeoreviewClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="GeoreviewClusteringP2P", + dataset={ + "path": "ai-forever/georeview-clustering-p2p", + "revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", + }, + description="Review clustering based on Yandex Georeview dataset", + reference="https://github.com/yandex/geo-reviews-dataset-2023", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="v_measure", + date=("2023-01-01", "2023-07-01"), + form=["written"], + domains=["Reviews"], + task_subtypes=[], + license="mit", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"test": 301510}, + avg_character_length={"test": 290.5}, + ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py new file mode 100644 index 000000000..d309fc474 --- /dev/null +++ b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="RuSciBenchGRNTIClusteringP2P", + dataset={ + "path": "ai-forever/ru-scibench-grnti-clustering-p2p", + "revision": "5add37c2d5028dda82cf115a659b56153580c203", + }, + description="Clustering of scientific papers (title+abstract) by rubric", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="v_measure", + date=("1999-01-01", "2024-01-01"), + form=["written"], + domains=["Academic"], + task_subtypes=["Thematic clustering"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"test": 31080}, + avg_character_length={"test": 863.3}, + ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py new file mode 100644 index 000000000..15b7b97c9 --- /dev/null +++ b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskClustering import AbsTaskClustering + + +class RuSciBenchOECDClusteringP2P(AbsTaskClustering): + metadata = TaskMetadata( + name="RuSciBenchOECDClusteringP2P", + dataset={ + "path": "ai-forever/ru-scibench-oecd-clustering-p2p", + "revision": "08475cf0f71cd474bdc3525ee49d8495a12a9a6a", + }, + description="Clustering of scientific papers (title+abstract) by rubric", + reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + 
main_score="v_measure", + date=("1999-01-01", "2024-01-01"), + form=["written"], + domains=["Academic"], + task_subtypes=["Thematic clustering"], + license="Not specified", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""""", + n_samples={"test": 30740}, + avg_character_length={"test": 838.7}, + ) diff --git a/mteb/tasks/Clustering/rus/__init__.py b/mteb/tasks/Clustering/rus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index c06e939c8..8e433eee5 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -18,4 +18,5 @@ from .pol.PolishPC import * from .por.Assin2RTE import * from .por.SickBrPC import * +from .rus.TERRa import * from .zho.CMTEBPairClassification import * diff --git a/mteb/tasks/PairClassification/rus/TERRa.py b/mteb/tasks/PairClassification/rus/TERRa.py new file mode 100644 index 000000000..7f7b44288 --- /dev/null +++ b/mteb/tasks/PairClassification/rus/TERRa.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification + + +class TERRa(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TERRa", + dataset={ + "path": "ai-forever/terra-pairclassification", + "revision": "7b58f24536063837d644aab9a023c62199b2a612", + }, + description="Textual Entailment Recognition for Russian. This task requires to recognize, given two text fragments, " + "whether the meaning of one text is entailed (can be inferred) from the other text.", + reference="https://arxiv.org/pdf/2010.15925", + type="PairClassification", + category="s2s", + eval_splits=["dev"], + eval_langs=["rus-Cyrl"], + main_score="ap", + date=("2000-01-01", "2018-01-01"), + form=["written"], + domains=["News", "Web"], + task_subtypes=[], + license="mit", + socioeconomic_status="mixed", + annotations_creators="human-annotated", + dialect=[], + text_creation="found", + bibtex_citation="""@article{shavrina2020russiansuperglue, + title={RussianSuperGLUE: A Russian Language Understanding Evaluation Benchmark}, + author={Shavrina, Tatiana + and Fenogenova, Alena + and Emelyanov, Anton + and Shevelev, Denis + and Artemova, Ekaterina + and Malykh, Valentin + and Mikhailov, Vladislav + and Tikhonova, Maria + and Chertok, Andrey + and Evlampiev, Andrey}, + journal={arXiv preprint arXiv:2010.15925}, + year={2020} + }""", + n_samples={"dev": 307}, + avg_character_length={"dev": 138.2}, + ) diff --git a/mteb/tasks/PairClassification/rus/__init__.py b/mteb/tasks/PairClassification/rus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 2cbddaed9..9efa36867 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -91,6 +91,9 @@ from .pol.SCIDOCSPLRetrieval import * from .pol.SciFactPLRetrieval import * from .pol.TRECCOVIDPLRetrieval import * +from .rus.MMarcoRetrieval import * +from .rus.RiaNewsRetrieval import * +from .rus.RuBQRetrieval import * from .slk.SlovakSumRetrieval import * from .spa.SpanishPassageRetrievalS2P import * from .spa.SpanishPassageRetrievalS2S import * diff --git a/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py b/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py new file mode 100644 index 000000000..317185960 --- /dev/null +++ 
b/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class RuMMarcoRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="RuMMarcoRetrieval", + dataset={ + "path": "ai-forever/ru-mmarco-retrieval", + "revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", + }, + description="mMARCO: A Multilingual Version of the MS MARCO Passage Ranking Dataset", + reference="https://arxiv.org/abs/2108.13897", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["rus-Cyrl"], + main_score="ndcg_at_10", + date=("2000-01-01", "2019-01-01"), + form=["written"], + domains=["Web"], + task_subtypes=["Question answering"], + license="apache-2.0", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="machine-translated", + bibtex_citation="""@misc{bonifacio2022mmarco, + title={mMARCO: A Multilingual Version of the MS MARCO Passage Ranking Dataset}, + author={Luiz Bonifacio + and Vitor Jeronymo + and Hugo Queiroz Abonizio + and Israel Campiotti + and Marzieh Fadaee + and Roberto Lotufo + and Rodrigo Nogueira}, + year={2022}, + eprint={2108.13897}, + archivePrefix={arXiv}, + primaryClass={cs.CL} + }""", + n_samples={"dev": 7437}, + avg_character_length={"dev": 385.9}, + ) diff --git a/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py b/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py new file mode 100644 index 000000000..7177982c7 --- /dev/null +++ b/mteb/tasks/Retrieval/rus/RiaNewsRetrieval.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class RiaNewsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="RiaNewsRetrieval", + dataset={ + "path": "ai-forever/ria-news-retrieval", + "revision": "82374b0bbacda6114f39ff9c5b925fa1512ca5d7", + }, + description="News article retrieval by headline. Based on Rossiya Segodnya dataset.", + reference="https://arxiv.org/abs/1901.07786", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="ndcg_at_10", + date=("2010-01-01", "2014-12-31"), + form=["written"], + domains=["News"], + task_subtypes=["Article retrieval"], + license="cc-by-nc-nd-4.0", + socioeconomic_status="mixed", + annotations_creators="derived", + dialect=[], + text_creation="found", + bibtex_citation="""@inproceedings{gavrilov2018self, + title={Self-Attentive Model for Headline Generation}, + author={Gavrilov, Daniil and Kalaidin, Pavel and Malykh, Valentin}, + booktitle={Proceedings of the 41st European Conference on Information Retrieval}, + year={2019} + }""", + n_samples={"test": 10000}, + avg_character_length={"test": 1230.8}, + ) diff --git a/mteb/tasks/Retrieval/rus/RuBQRetrieval.py b/mteb/tasks/Retrieval/rus/RuBQRetrieval.py new file mode 100644 index 000000000..fad818a99 --- /dev/null +++ b/mteb/tasks/Retrieval/rus/RuBQRetrieval.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class RuBQRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="RuBQRetrieval", + dataset={ + "path": "ai-forever/rubq-retrieval", + "revision": "e19b6ffa60b3bc248e0b41f4cc37c26a55c2a67b", + }, + description="Paragraph retrieval based on RuBQ 2.0. 
Retrieve paragraphs from Wikipedia that answer the question.", + reference="https://openreview.net/pdf?id=P5UQFFoQ4PJ", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["rus-Cyrl"], + main_score="ndcg_at_10", + date=("2001-01-01", "2021-01-01"), + form=["written"], + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + socioeconomic_status="mixed", + annotations_creators="human-annotated", + dialect=[], + text_creation="created", + bibtex_citation="""@inproceedings{RuBQ2021, + title={RuBQ 2.0: An Innovated Russian Question Answering Dataset}, + author={Ivan Rybin and Vladislav Korablinov and Pavel Efimov and Pavel Braslavski}, + booktitle={ESWC}, + year={2021}, + pages={532--547} + }""", + n_samples={"test": 2845}, + avg_character_length={"test": 509.5}, + ) diff --git a/mteb/tasks/Retrieval/rus/__init__.py b/mteb/tasks/Retrieval/rus/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index a63308bff..4b34eece4 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -25,5 +25,6 @@ from .por.SickBrSTS import * from .ron.RonSTS import * from .rus.RUParaPhraserSTS import * +from .rus.RuSTSBenchmarkSTS import * from .spa.STSES import * from .zho.CMTEBSTS import * diff --git a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py new file mode 100644 index 000000000..6da4acaed --- /dev/null +++ b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskSTS import AbsTaskSTS + + +class RuSTSBenchmarkSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="RuSTSBenchmarkSTS", + dataset={ + "path": "ai-forever/ru-stsbenchmark-sts", + "revision": "7cf24f325c6da6195df55bef3d86b5e0616f3018", + }, + description="Semantic Textual Similarity Benchmark (STSbenchmark) dataset translated into Russian and verified. 
" + "The dataset was checked with RuCOLA model to ensure that the translation is good and filtered.", + reference="https://github.com/PhilipMay/stsb-multi-mt/", + type="STS", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["rus-Cyrl"], + main_score="cosine_spearman", + date=("2012-01-01", "2018-01-01"), + form=["written"], + domains=["News", "Social", "Web"], + task_subtypes=[], + license="cc-by-sa-4.0", + socioeconomic_status="mixed", + annotations_creators="human-annotated", + dialect=[], + text_creation="machine-translated and verified", + bibtex_citation="""""", + n_samples={"validation": 1336, "test": 1264}, + avg_character_length={"validation": 65.4, "test": 54.2}, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict From 166400e9ad7959c54713c6a93af5eea95bdacd92 Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Fri, 24 May 2024 19:46:28 +0300 Subject: [PATCH 2/8] add results for new tasks --- .../GeoreviewClassification.json | 62 ++++++++++++++++ .../GeoreviewClusteringP2P.json | 32 ++++++++ .../HeadlineClassification.json | 62 ++++++++++++++++ .../InappropriatenessClassification.json | 73 +++++++++++++++++++ .../KinopoiskClassification.json | 62 ++++++++++++++++ .../RiaNewsRetrieval.json | 53 ++++++++++++++ .../RuBQRetrieval.json | 53 ++++++++++++++ .../RuMMarcoRetrieval.json | 53 ++++++++++++++ .../RuReviewsClassification.json | 62 ++++++++++++++++ .../RuSTSBenchmarkSTS.json | 72 ++++++++++++++++++ .../RuSciBenchGRNTIClassification.json | 62 ++++++++++++++++ .../RuSciBenchGRNTIClusteringP2P.json | 32 ++++++++ .../RuSciBenchOECDClassification.json | 62 ++++++++++++++++ .../RuSciBenchOECDClusteringP2P.json | 32 ++++++++ .../TERRa.json | 59 +++++++++++++++ .../GeoreviewClassification.json | 62 ++++++++++++++++ .../GeoreviewClusteringP2P.json | 32 ++++++++ .../HeadlineClassification.json | 62 ++++++++++++++++ .../InappropriatenessClassification.json | 73 +++++++++++++++++++ .../KinopoiskClassification.json | 62 ++++++++++++++++ .../RiaNewsRetrieval.json | 53 ++++++++++++++ .../RuBQRetrieval.json | 53 ++++++++++++++ .../RuMMarcoRetrieval.json | 53 ++++++++++++++ .../RuReviewsClassification.json | 62 ++++++++++++++++ .../RuSTSBenchmarkSTS.json | 72 ++++++++++++++++++ .../RuSciBenchGRNTIClassification.json | 62 ++++++++++++++++ .../RuSciBenchGRNTIClusteringP2P.json | 32 ++++++++ .../RuSciBenchOECDClassification.json | 62 ++++++++++++++++ .../RuSciBenchOECDClusteringP2P.json | 32 ++++++++ .../TERRa.json | 59 +++++++++++++++ 30 files changed, 1662 insertions(+) create mode 100644 results/intfloat__multilingual-e5-small/GeoreviewClassification.json create mode 100644 results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json create mode 100644 results/intfloat__multilingual-e5-small/HeadlineClassification.json create mode 100644 results/intfloat__multilingual-e5-small/InappropriatenessClassification.json create mode 100644 results/intfloat__multilingual-e5-small/KinopoiskClassification.json create mode 100644 results/intfloat__multilingual-e5-small/RiaNewsRetrieval.json create mode 100644 results/intfloat__multilingual-e5-small/RuBQRetrieval.json create mode 100644 results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json create mode 100644 results/intfloat__multilingual-e5-small/RuReviewsClassification.json create mode 100644 results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json create mode 100644 
results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json create mode 100644 results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json create mode 100644 results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json create mode 100644 results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json create mode 100644 results/intfloat__multilingual-e5-small/TERRa.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RiaNewsRetrieval.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuBQRetrieval.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TERRa.json diff --git a/results/intfloat__multilingual-e5-small/GeoreviewClassification.json b/results/intfloat__multilingual-e5-small/GeoreviewClassification.json new file mode 100644 index 000000000..9a36b9d27 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/GeoreviewClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "3765c0d1de6b7d264bc459433c45e5a75513839c", + "evaluation_time": 15.669439792633057, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.41904, + "f1": 0.40292348540107337, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.41904, + "scores_per_experiment": [ + { + "accuracy": 0.411, + "f1": 0.3928318478049264 + }, + { + "accuracy": 0.4246, + "f1": 0.41875134731430447 + }, + { + "accuracy": 0.4316, + "f1": 0.4027451272640956 + }, + { + "accuracy": 0.447, + "f1": 0.425627940731658 + }, + { + "accuracy": 0.3944, + "f1": 0.38064524661195753 + }, + { + "accuracy": 0.4076, + "f1": 0.3986492952787144 + }, + { + "accuracy": 0.4124, + "f1": 0.3960854031159166 + }, + { + "accuracy": 0.432, + "f1": 0.42863585110751046 + }, + { + "accuracy": 0.3824, + "f1": 0.37616986314412426 + }, + { + "accuracy": 0.4474, + "f1": 0.4090929316375262 + } + ] + } + ] + }, + "task_name": "GeoreviewClassification" +} \ No newline at end of file diff --git 
a/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json b/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json new file mode 100644 index 000000000..ea797c4fd --- /dev/null +++ b/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", + "evaluation_time": 159.50612330436707, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5425637559995888, + "v_measure": 0.5425637559995888, + "v_measure_std": 0.08710098611855956, + "v_measures": [ + 0.6023455440259576, + 0.6599726508444369, + 0.46536913827747883, + 0.5202244461986688, + 0.42112427118783446, + 0.43658962026432985, + 0.4915792990438388, + 0.5356736921699828, + 0.6769466153847908, + 0.6158122825985685 + ] + } + ] + }, + "task_name": "GeoreviewClusteringP2P" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/HeadlineClassification.json b/results/intfloat__multilingual-e5-small/HeadlineClassification.json new file mode 100644 index 000000000..6b99331f3 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/HeadlineClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "2fe05ee6b5832cda29f2ef7aaad7b7fe6a3609eb", + "evaluation_time": 11.093019485473633, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.7409333333333333, + "f1": 0.7403291196935363, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.7409333333333333, + "scores_per_experiment": [ + { + "accuracy": 0.722, + "f1": 0.7214219292530725 + }, + { + "accuracy": 0.7549166666666667, + "f1": 0.7535662878680841 + }, + { + "accuracy": 0.7399166666666667, + "f1": 0.7400112761607208 + }, + { + "accuracy": 0.7433333333333333, + "f1": 0.7444121839162109 + }, + { + "accuracy": 0.7480833333333333, + "f1": 0.7504724205846119 + }, + { + "accuracy": 0.7578333333333334, + "f1": 0.7581892120223918 + }, + { + "accuracy": 0.7254166666666667, + "f1": 0.7231835308563578 + }, + { + "accuracy": 0.7301666666666666, + "f1": 0.7287202972794837 + }, + { + "accuracy": 0.724, + "f1": 0.7198580630027918 + }, + { + "accuracy": 0.7636666666666667, + "f1": 0.7634559959916389 + } + ] + } + ] + }, + "task_name": "HeadlineClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json b/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json new file mode 100644 index 000000000..33c7ee9d7 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "601651fdc45ef243751676e62dd7a19f491c0285", + "evaluation_time": 13.447438716888428, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.58024, + "ap": 0.5476512898306003, + "f1": 0.5758618133958813, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.58024, + "scores_per_experiment": [ + { + "accuracy": 0.5999, + "ap": 0.5623475279503105, + "f1": 0.5960600458104859 + }, + { + "accuracy": 0.6063, + "ap": 0.5631799041363394, + "f1": 0.6047161410584456 + }, + { + "accuracy": 0.6141, + "ap": 0.5738311420469193, + "f1": 0.6091889075801037 + }, + { + "accuracy": 0.5248, + "ap": 0.5130003904724717, + "f1": 0.5247292607031631 + }, + { + "accuracy": 0.5543, + "ap": 
0.530097900419916, + "f1": 0.5542999955429999 + }, + { + "accuracy": 0.5694, + "ap": 0.5384336124031008, + "f1": 0.5601522000051073 + }, + { + "accuracy": 0.6176, + "ap": 0.5731581395348837, + "f1": 0.6174704908093684 + }, + { + "accuracy": 0.5503, + "ap": 0.5269225164635001, + "f1": 0.5287804936626321 + }, + { + "accuracy": 0.5768, + "ap": 0.5442467882632831, + "f1": 0.5767918066893775 + }, + { + "accuracy": 0.5889, + "ap": 0.5512949766152779, + "f1": 0.5864287920971301 + } + ] + } + ] + }, + "task_name": "InappropriatenessClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/KinopoiskClassification.json b/results/intfloat__multilingual-e5-small/KinopoiskClassification.json new file mode 100644 index 000000000..258d27240 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/KinopoiskClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "5911f26666ac11af46cb9c6849d0dc80a378af24", + "evaluation_time": 7.126714468002319, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.47573333333333323, + "f1": 0.46508168540748, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.47573333333333323, + "scores_per_experiment": [ + { + "accuracy": 0.404, + "f1": 0.40434158024390915 + }, + { + "accuracy": 0.4866666666666667, + "f1": 0.47198679046105146 + }, + { + "accuracy": 0.49466666666666664, + "f1": 0.4811370783030105 + }, + { + "accuracy": 0.44733333333333336, + "f1": 0.44844883362018234 + }, + { + "accuracy": 0.4806666666666667, + "f1": 0.47808004908479934 + }, + { + "accuracy": 0.48933333333333334, + "f1": 0.4711961973486585 + }, + { + "accuracy": 0.4866666666666667, + "f1": 0.4706516940071077 + }, + { + "accuracy": 0.47733333333333333, + "f1": 0.478896312108459 + }, + { + "accuracy": 0.5033333333333333, + "f1": 0.47733021888146904 + }, + { + "accuracy": 0.48733333333333334, + "f1": 0.468748100016153 + } + ] + } + ] + }, + "task_name": "KinopoiskClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RiaNewsRetrieval.json b/results/intfloat__multilingual-e5-small/RiaNewsRetrieval.json new file mode 100644 index 000000000..48999c70b --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RiaNewsRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "82374b0bbacda6114f39ff9c5b925fa1512ca5d7", + "evaluation_time": 1019.5961062908173, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.66664, + "map_at_1": 0.5157, + "map_at_10": 0.61824, + "map_at_100": 0.62344, + "map_at_1000": 0.62361, + "map_at_20": 0.62147, + "map_at_3": 0.59615, + "map_at_5": 0.6102, + "mrr_at_1": 0.5157, + "mrr_at_10": 0.61824, + "mrr_at_100": 0.62343, + "mrr_at_1000": 0.6236, + "mrr_at_20": 0.62147, + "mrr_at_3": 0.59615, + "mrr_at_5": 0.61018, + "ndcg_at_1": 0.5157, + "ndcg_at_10": 0.66664, + "ndcg_at_100": 0.6921, + "ndcg_at_1000": 0.69676, + "ndcg_at_20": 0.67824, + "ndcg_at_3": 0.622, + "ndcg_at_5": 0.64727, + "precision_at_1": 0.5157, + "precision_at_10": 0.08176, + "precision_at_100": 0.00937, + "precision_at_1000": 0.00097, + "precision_at_20": 0.04316, + "precision_at_3": 0.23223, + "precision_at_5": 0.15158, + "recall_at_1": 0.5157, + "recall_at_10": 0.8176, + "recall_at_100": 0.937, + "recall_at_1000": 0.9739, + "recall_at_20": 0.8631, + "recall_at_3": 0.6967, + "recall_at_5": 0.7579 + } + ] + }, + "task_name": 
"RiaNewsRetrieval" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuBQRetrieval.json b/results/intfloat__multilingual-e5-small/RuBQRetrieval.json new file mode 100644 index 000000000..a1373e600 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuBQRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "e19b6ffa60b3bc248e0b41f4cc37c26a55c2a67b", + "evaluation_time": 35.930275678634644, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.6635, + "map_at_1": 0.39342, + "map_at_10": 0.58223, + "map_at_100": 0.59224, + "map_at_1000": 0.59259, + "map_at_20": 0.58905, + "map_at_3": 0.5259, + "map_at_5": 0.56096, + "mrr_at_1": 0.5656, + "mrr_at_10": 0.67727, + "mrr_at_100": 0.68071, + "mrr_at_1000": 0.68082, + "mrr_at_20": 0.67956, + "mrr_at_3": 0.65317, + "mrr_at_5": 0.66836, + "ndcg_at_1": 0.56383, + "ndcg_at_10": 0.6635, + "ndcg_at_100": 0.69628, + "ndcg_at_1000": 0.70337, + "ndcg_at_20": 0.681, + "ndcg_at_3": 0.58575, + "ndcg_at_5": 0.62663, + "precision_at_1": 0.56383, + "precision_at_10": 0.1318, + "precision_at_100": 0.01557, + "precision_at_1000": 0.00164, + "precision_at_20": 0.07151, + "precision_at_3": 0.31836, + "precision_at_5": 0.22754, + "recall_at_1": 0.39342, + "recall_at_10": 0.80586, + "recall_at_100": 0.93322, + "recall_at_1000": 0.98254, + "recall_at_20": 0.86433, + "recall_at_3": 0.61409, + "recall_at_5": 0.70917 + } + ] + }, + "task_name": "RuBQRetrieval" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json b/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json new file mode 100644 index 000000000..742986080 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", + "evaluation_time": 5528.388280630112, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "dev": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.28082, + "map_at_1": 0.14021, + "map_at_10": 0.22886, + "map_at_100": 0.24022, + "map_at_1000": 0.24096, + "map_at_20": 0.23568, + "map_at_3": 0.19892, + "map_at_5": 0.2155, + "mrr_at_1": 0.14435, + "mrr_at_10": 0.23293, + "mrr_at_100": 0.2441, + "mrr_at_1000": 0.24479, + "mrr_at_20": 0.23978, + "mrr_at_3": 0.20303, + "mrr_at_5": 0.21966, + "ndcg_at_1": 0.14435, + "ndcg_at_10": 0.28082, + "ndcg_at_100": 0.33788, + "ndcg_at_1000": 0.35838, + "ndcg_at_20": 0.30552, + "ndcg_at_3": 0.21925, + "ndcg_at_5": 0.24894, + "precision_at_1": 0.14435, + "precision_at_10": 0.04616, + "precision_at_100": 0.0075, + "precision_at_1000": 0.00093, + "precision_at_20": 0.02818, + "precision_at_3": 0.09466, + "precision_at_5": 0.07182, + "recall_at_1": 0.14021, + "recall_at_10": 0.44217, + "recall_at_100": 0.71052, + "recall_at_1000": 0.87136, + "recall_at_20": 0.53845, + "recall_at_3": 0.27413, + "recall_at_5": 0.34533 + } + ] + }, + "task_name": "RuMMarcoRetrieval" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuReviewsClassification.json b/results/intfloat__multilingual-e5-small/RuReviewsClassification.json new file mode 100644 index 000000000..39eee2760 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuReviewsClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "f6d2c31f4dc6b88f468552750bfec05b4b41b05a", + "evaluation_time": 
7.535195827484131, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.6060933333333334, + "f1": 0.597007474521683, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.6060933333333334, + "scores_per_experiment": [ + { + "accuracy": 0.6346, + "f1": 0.6329801908707503 + }, + { + "accuracy": 0.5249333333333334, + "f1": 0.4981585587582864 + }, + { + "accuracy": 0.6071333333333333, + "f1": 0.6088541683884725 + }, + { + "accuracy": 0.6487333333333334, + "f1": 0.6501016672839444 + }, + { + "accuracy": 0.6554, + "f1": 0.6544128367544028 + }, + { + "accuracy": 0.5646666666666667, + "f1": 0.5515952394177405 + }, + { + "accuracy": 0.6053333333333333, + "f1": 0.5900342341506833 + }, + { + "accuracy": 0.5972, + "f1": 0.5817886490218223 + }, + { + "accuracy": 0.5809333333333333, + "f1": 0.5672444304738421 + }, + { + "accuracy": 0.642, + "f1": 0.6349047700968856 + } + ] + } + ] + }, + "task_name": "RuReviewsClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json b/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json new file mode 100644 index 000000000..7e7970225 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json @@ -0,0 +1,72 @@ +{ + "dataset_revision": "7cf24f325c6da6195df55bef3d86b5e0616f3018", + "evaluation_time": 2.9129486083984375, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "cos_sim": { + "pearson": 0.7804893678286633, + "spearman": 0.7772461027539413 + }, + "euclidean": { + "pearson": 0.7688724664736094, + "spearman": 0.7772461027539413 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.7772461027539413, + "manhattan": { + "pearson": 0.7672934089369625, + "spearman": 0.7758844882898225 + } + } + ], + "train": [ + { + "cos_sim": { + "pearson": 0.7978802401575843, + "spearman": 0.7781758504177039 + }, + "euclidean": { + "pearson": 0.7786276295006407, + "spearman": 0.7781731374641703 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.7781758504177039, + "manhattan": { + "pearson": 0.7785232719634329, + "spearman": 0.7775080206948711 + } + } + ], + "validation": [ + { + "cos_sim": { + "pearson": 0.8230354096147491, + "spearman": 0.8244747741010346 + }, + "euclidean": { + "pearson": 0.8118130979247168, + "spearman": 0.8244747741010346 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.8244747741010346, + "manhattan": { + "pearson": 0.8114795850180648, + "spearman": 0.8241630998227286 + } + } + ] + }, + "task_name": "RuSTSBenchmarkSTS" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json new file mode 100644 index 000000000..002b2d12e --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", + "evaluation_time": 9.047378063201904, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.5348484848484848, + "f1": 0.5210756834528681, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5348484848484848, + "scores_per_experiment": [ + { + "accuracy": 0.5404040404040404, + "f1": 0.5244669347212237 + }, + { + "accuracy": 0.5382395382395382, + "f1": 
0.5273913015496474 + }, + { + "accuracy": 0.5256132756132756, + "f1": 0.5152792170693732 + }, + { + "accuracy": 0.553030303030303, + "f1": 0.5435989736187034 + }, + { + "accuracy": 0.5393217893217893, + "f1": 0.529493005068726 + }, + { + "accuracy": 0.5003607503607503, + "f1": 0.4830755796054487 + }, + { + "accuracy": 0.5436507936507936, + "f1": 0.5250488962586415 + }, + { + "accuracy": 0.5263347763347763, + "f1": 0.5093739702927322 + }, + { + "accuracy": 0.5429292929292929, + "f1": 0.5289516187196283 + }, + { + "accuracy": 0.5386002886002886, + "f1": 0.5240773376245568 + } + ] + } + ] + }, + "task_name": "RuSciBenchGRNTIClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json new file mode 100644 index 000000000..9719a4d7e --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "5add37c2d5028dda82cf115a659b56153580c203", + "evaluation_time": 43.90299892425537, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.4907036398379492, + "v_measure": 0.4907036398379492, + "v_measure_std": 0.01442403310059541, + "v_measures": [ + 0.5111910016583134, + 0.4826920493259084, + 0.5037578196331924, + 0.46108371524279357, + 0.4830341704583758, + 0.4968990747062376, + 0.5026536055346332, + 0.47474077277521115, + 0.49751454520339666, + 0.4934696438414291 + ] + } + ] + }, + "task_name": "RuSciBenchGRNTIClusteringP2P" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json new file mode 100644 index 000000000..aeae193c8 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", + "evaluation_time": 9.497691631317139, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.40465983224603913, + "f1": 0.3813156482688752, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.40465983224603913, + "scores_per_experiment": [ + { + "accuracy": 0.4044734389561976, + "f1": 0.3791045524892834 + }, + { + "accuracy": 0.39577508543025786, + "f1": 0.3724397241737727 + }, + { + "accuracy": 0.40509474992233613, + "f1": 0.38530126616056976 + }, + { + "accuracy": 0.3979496738117428, + "f1": 0.36366224022921334 + }, + { + "accuracy": 0.4016775396085741, + "f1": 0.3656766512184062 + }, + { + "accuracy": 0.4113078595837217, + "f1": 0.3938519036434973 + }, + { + "accuracy": 0.4119291705498602, + "f1": 0.39099679411232896 + }, + { + "accuracy": 0.4200062131096614, + "f1": 0.4073591469425345 + }, + { + "accuracy": 0.39422180801491147, + "f1": 0.37126427433989134 + }, + { + "accuracy": 0.4041627834731283, + "f1": 0.38349992937925514 + } + ] + } + ] + }, + "task_name": "RuSciBenchOECDClassification" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json new file mode 100644 index 000000000..a4a3f770b --- /dev/null +++ b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": 
"08475cf0f71cd474bdc3525ee49d8495a12a9a6a", + "evaluation_time": 41.33925414085388, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.40860295286491655, + "v_measure": 0.40860295286491655, + "v_measure_std": 0.008542546238438057, + "v_measures": [ + 0.40474303948152135, + 0.4161826607911408, + 0.3916797573391887, + 0.3997524368597952, + 0.4197209726992979, + 0.4107211103601622, + 0.41239273911509505, + 0.40698654817604785, + 0.41984750311820723, + 0.40400276070870866 + ] + } + ] + }, + "task_name": "RuSciBenchOECDClusteringP2P" +} \ No newline at end of file diff --git a/results/intfloat__multilingual-e5-small/TERRa.json b/results/intfloat__multilingual-e5-small/TERRa.json new file mode 100644 index 000000000..1de0dc014 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/TERRa.json @@ -0,0 +1,59 @@ +{ + "dataset_revision": "7b58f24536063837d644aab9a023c62199b2a612", + "evaluation_time": 1.423902988433838, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "dev": [ + { + "cos_sim": { + "accuracy": 0.5830618892508144, + "accuracy_threshold": 0.8752437829971313, + "ap": 0.5751046579848809, + "f1": 0.6728971962616822, + "f1_threshold": 0.8406317234039307, + "precision": 0.5236363636363637, + "recall": 0.9411764705882353 + }, + "dot": { + "accuracy": 0.5830618892508144, + "accuracy_threshold": 0.8752437829971313, + "ap": 0.5751046579848809, + "f1": 0.6728971962616822, + "f1_threshold": 0.8406317234039307, + "precision": 0.5236363636363637, + "recall": 0.9411764705882353 + }, + "euclidean": { + "accuracy": 0.5830618892508144, + "accuracy_threshold": 0.49951231479644775, + "ap": 0.5751046579848809, + "f1": 0.6728971962616822, + "f1_threshold": 0.5645672082901001, + "precision": 0.5236363636363637, + "recall": 0.9411764705882353 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5751046579848809, + "manhattan": { + "accuracy": 0.5765472312703583, + "accuracy_threshold": 8.063196182250977, + "ap": 0.5728843427691928, + "f1": 0.6699507389162561, + "f1_threshold": 8.409059524536133, + "precision": 0.5375494071146245, + "recall": 0.8888888888888888 + }, + "max": { + "accuracy": 0.5830618892508144, + "ap": 0.5751046579848809, + "f1": 0.6728971962616822 + } + } + ] + }, + "task_name": "TERRa" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json new file mode 100644 index 000000000..369e85f13 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "3765c0d1de6b7d264bc459433c45e5a75513839c", + "evaluation_time": 15.76297402381897, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.37874, + "f1": 0.3745824725441968, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.37874, + "scores_per_experiment": [ + { + "accuracy": 0.3482, + "f1": 0.3478295316844259 + }, + { + "accuracy": 0.3712, + "f1": 0.37478419559913767 + }, + { + "accuracy": 0.4204, + "f1": 0.4033108154167591 + }, + { + "accuracy": 0.3724, + "f1": 0.37528403563866586 + }, + { + "accuracy": 0.3786, + "f1": 0.37474862797257674 + }, + { + "accuracy": 0.3346, + "f1": 0.3328824346726555 + }, + { + "accuracy": 
0.3936, + "f1": 0.38690113762564987 + }, + { + "accuracy": 0.4156, + "f1": 0.4093053648237972 + }, + { + "accuracy": 0.3528, + "f1": 0.3468968846000202 + }, + { + "accuracy": 0.4, + "f1": 0.3938816974082802 + } + ] + } + ] + }, + "task_name": "GeoreviewClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json new file mode 100644 index 000000000..c890de988 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", + "evaluation_time": 147.46828031539917, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.4547073107289486, + "v_measure": 0.4547073107289486, + "v_measure_std": 0.062205229475387265, + "v_measures": [ + 0.48664109686626755, + 0.5408877316411709, + 0.4331384047503243, + 0.4636243738216654, + 0.3738562677328548, + 0.35238197464166343, + 0.4378906693865557, + 0.40749948572778594, + 0.5172189167028366, + 0.5339341860183607 + ] + } + ] + }, + "task_name": "GeoreviewClusteringP2P" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json new file mode 100644 index 000000000..40d61425b --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "2fe05ee6b5832cda29f2ef7aaad7b7fe6a3609eb", + "evaluation_time": 11.46289873123169, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.6805333333333333, + "f1": 0.6816775809403113, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.6805333333333333, + "scores_per_experiment": [ + { + "accuracy": 0.6300833333333333, + "f1": 0.6369181634277693 + }, + { + "accuracy": 0.6569166666666667, + "f1": 0.6556779102488896 + }, + { + "accuracy": 0.70575, + "f1": 0.7063579593355636 + }, + { + "accuracy": 0.7003333333333334, + "f1": 0.7022311853029809 + }, + { + "accuracy": 0.70625, + "f1": 0.708128015241555 + }, + { + "accuracy": 0.7210833333333333, + "f1": 0.7209020613658228 + }, + { + "accuracy": 0.6421666666666667, + "f1": 0.6417652955002949 + }, + { + "accuracy": 0.6639166666666667, + "f1": 0.6654400013293077 + }, + { + "accuracy": 0.6651666666666667, + "f1": 0.6653355970756962 + }, + { + "accuracy": 0.7136666666666667, + "f1": 0.7140196205752338 + } + ] + } + ] + }, + "task_name": "HeadlineClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json new file mode 100644 index 000000000..2c8607810 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json @@ -0,0 +1,73 @@ +{ + "dataset_revision": "601651fdc45ef243751676e62dd7a19f491c0285", + "evaluation_time": 11.479481220245361, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 
0.5788, + "ap": 0.5478227138673264, + "f1": 0.574297812767316, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5788, + "scores_per_experiment": [ + { + "accuracy": 0.5935, + "ap": 0.5552890212932213, + "f1": 0.5934424273821416 + }, + { + "accuracy": 0.5946, + "ap": 0.5569062258480034, + "f1": 0.5941252726839421 + }, + { + "accuracy": 0.5575, + "ap": 0.5327556336321784, + "f1": 0.5541016795896397 + }, + { + "accuracy": 0.5911, + "ap": 0.553201862437765, + "f1": 0.5903670478550566 + }, + { + "accuracy": 0.6126, + "ap": 0.5709001381851682, + "f1": 0.6109154037687734 + }, + { + "accuracy": 0.4817, + "ap": 0.4912873073909637, + "f1": 0.4744940469138418 + }, + { + "accuracy": 0.6019, + "ap": 0.5626143563244215, + "f1": 0.6006964952436292 + }, + { + "accuracy": 0.5635, + "ap": 0.5348987193503045, + "f1": 0.5547353709683239 + }, + { + "accuracy": 0.5764, + "ap": 0.5485345609065155, + "f1": 0.5553457268850318 + }, + { + "accuracy": 0.6152, + "ap": 0.571839313304721, + "f1": 0.6147546563827785 + } + ] + } + ] + }, + "task_name": "InappropriatenessClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json new file mode 100644 index 000000000..72e76f62c --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "5911f26666ac11af46cb9c6849d0dc80a378af24", + "evaluation_time": 4.948300123214722, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.4145333333333333, + "f1": 0.4116865556807029, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.4145333333333333, + "scores_per_experiment": [ + { + "accuracy": 0.33466666666666667, + "f1": 0.3308025264578252 + }, + { + "accuracy": 0.448, + "f1": 0.44411053266482714 + }, + { + "accuracy": 0.40066666666666667, + "f1": 0.3999889004297336 + }, + { + "accuracy": 0.396, + "f1": 0.39615783396819343 + }, + { + "accuracy": 0.428, + "f1": 0.4288487670299537 + }, + { + "accuracy": 0.4206666666666667, + "f1": 0.4208413300162474 + }, + { + "accuracy": 0.4146666666666667, + "f1": 0.40380422768674284 + }, + { + "accuracy": 0.46, + "f1": 0.45727075700781117 + }, + { + "accuracy": 0.416, + "f1": 0.41602445256093956 + }, + { + "accuracy": 0.4266666666666667, + "f1": 0.41901622898475455 + } + ] + } + ] + }, + "task_name": "KinopoiskClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RiaNewsRetrieval.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RiaNewsRetrieval.json new file mode 100644 index 000000000..81e26e323 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RiaNewsRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "82374b0bbacda6114f39ff9c5b925fa1512ca5d7", + "evaluation_time": 442.88722825050354, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.44824, + "map_at_1": 0.335, + "map_at_10": 0.40885, + "map_at_100": 0.41582, + "map_at_1000": 0.41639, + "map_at_20": 0.41289, + "map_at_3": 0.38805, + "map_at_5": 0.40023, + "mrr_at_1": 0.336, + "mrr_at_10": 0.40937, + "mrr_at_100": 0.41636, + "mrr_at_1000": 0.41693, 
+ "mrr_at_20": 0.41343, + "mrr_at_3": 0.38862, + "mrr_at_5": 0.40075, + "ndcg_at_1": 0.335, + "ndcg_at_10": 0.44824, + "ndcg_at_100": 0.48452, + "ndcg_at_1000": 0.50126, + "ndcg_at_20": 0.463, + "ndcg_at_3": 0.40513, + "ndcg_at_5": 0.42714, + "precision_at_1": 0.335, + "precision_at_10": 0.05738, + "precision_at_100": 0.00749, + "precision_at_1000": 0.00088, + "precision_at_20": 0.03161, + "precision_at_3": 0.1515, + "precision_at_5": 0.10162, + "recall_at_1": 0.335, + "recall_at_10": 0.5738, + "recall_at_100": 0.7489, + "recall_at_1000": 0.8841, + "recall_at_20": 0.6322, + "recall_at_3": 0.4545, + "recall_at_5": 0.5081 + } + ] + }, + "task_name": "RiaNewsRetrieval" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuBQRetrieval.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuBQRetrieval.json new file mode 100644 index 000000000..c1f482185 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuBQRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "e19b6ffa60b3bc248e0b41f4cc37c26a55c2a67b", + "evaluation_time": 23.76644492149353, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.29702, + "map_at_1": 0.13875, + "map_at_10": 0.23727, + "map_at_100": 0.24935, + "map_at_1000": 0.25051, + "map_at_20": 0.24354, + "map_at_3": 0.20521, + "map_at_5": 0.22361, + "mrr_at_1": 0.20922, + "mrr_at_10": 0.30287, + "mrr_at_100": 0.31193, + "mrr_at_1000": 0.31262, + "mrr_at_20": 0.30789, + "mrr_at_3": 0.27709, + "mrr_at_5": 0.2924, + "ndcg_at_1": 0.20863, + "ndcg_at_10": 0.29702, + "ndcg_at_100": 0.353, + "ndcg_at_1000": 0.38148, + "ndcg_at_20": 0.31696, + "ndcg_at_3": 0.2414, + "ndcg_at_5": 0.26828, + "precision_at_1": 0.20863, + "precision_at_10": 0.06602, + "precision_at_100": 0.01064, + "precision_at_1000": 0.00141, + "precision_at_20": 0.0388, + "precision_at_3": 0.13948, + "precision_at_5": 0.10532, + "recall_at_1": 0.13875, + "recall_at_10": 0.40745, + "recall_at_100": 0.64716, + "recall_at_1000": 0.84964, + "recall_at_20": 0.47623, + "recall_at_3": 0.26812, + "recall_at_5": 0.33012 + } + ] + }, + "task_name": "RuBQRetrieval" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json new file mode 100644 index 000000000..c34f4fc94 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json @@ -0,0 +1,53 @@ +{ + "dataset_revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", + "evaluation_time": 4171.972459077835, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "dev": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.10022, + "map_at_1": 0.04642, + "map_at_10": 0.07956, + "map_at_100": 0.08654, + "map_at_1000": 0.08742, + "map_at_20": 0.08308, + "map_at_3": 0.06717, + "map_at_5": 0.07386, + "mrr_at_1": 0.04817, + "mrr_at_10": 0.08199, + "mrr_at_100": 0.08905, + "mrr_at_1000": 0.0899, + "mrr_at_20": 0.08558, + "mrr_at_3": 0.06943, + "mrr_at_5": 0.07616, + "ndcg_at_1": 0.04817, + "ndcg_at_10": 0.10022, + "ndcg_at_100": 0.13959, + "ndcg_at_1000": 0.1665, + "ndcg_at_20": 0.11301, + "ndcg_at_3": 0.0744, + "ndcg_at_5": 0.08635, + "precision_at_1": 0.04817, + "precision_at_10": 0.01716, + 
"precision_at_100": 0.00378, + "precision_at_1000": 0.00061, + "precision_at_20": 0.01122, + "precision_at_3": 0.0324, + "precision_at_5": 0.0254, + "recall_at_1": 0.04642, + "recall_at_10": 0.16416, + "recall_at_100": 0.35889, + "recall_at_1000": 0.57588, + "recall_at_20": 0.21396, + "recall_at_3": 0.09307, + "recall_at_5": 0.12164 + } + ] + }, + "task_name": "RuMMarcoRetrieval" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json new file mode 100644 index 000000000..41dbb4cd0 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "f6d2c31f4dc6b88f468552750bfec05b4b41b05a", + "evaluation_time": 7.741372585296631, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.5871999999999999, + "f1": 0.5837261469604392, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5871999999999999, + "scores_per_experiment": [ + { + "accuracy": 0.63, + "f1": 0.6303589229215918 + }, + { + "accuracy": 0.5772666666666667, + "f1": 0.5767187453017142 + }, + { + "accuracy": 0.5995333333333334, + "f1": 0.6066554960243958 + }, + { + "accuracy": 0.6422, + "f1": 0.6468765408412193 + }, + { + "accuracy": 0.6464666666666666, + "f1": 0.6480125690154023 + }, + { + "accuracy": 0.5368666666666667, + "f1": 0.5347438669396679 + }, + { + "accuracy": 0.5526666666666666, + "f1": 0.5275844379060841 + }, + { + "accuracy": 0.5902, + "f1": 0.5699928819977857 + }, + { + "accuracy": 0.47533333333333333, + "f1": 0.47710990793388125 + }, + { + "accuracy": 0.6214666666666666, + "f1": 0.6192081007226498 + } + ] + } + ] + }, + "task_name": "RuReviewsClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json new file mode 100644 index 000000000..239fc37b5 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json @@ -0,0 +1,72 @@ +{ + "dataset_revision": "7cf24f325c6da6195df55bef3d86b5e0616f3018", + "evaluation_time": 2.954987049102783, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "cos_sim": { + "pearson": 0.789283710553957, + "spearman": 0.7954850999929017 + }, + "euclidean": { + "pearson": 0.77346567425632, + "spearman": 0.7727314522939311 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.7954850999929017, + "manhattan": { + "pearson": 0.7734820755444827, + "spearman": 0.7725759998467114 + } + } + ], + "train": [ + { + "cos_sim": { + "pearson": 0.8167225483626231, + "spearman": 0.8004316378621577 + }, + "euclidean": { + "pearson": 0.814022124536146, + "spearman": 0.7983481288726039 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.8004316378621577, + "manhattan": { + "pearson": 0.81379986366986, + "spearman": 0.7983336550748988 + } + } + ], + "validation": [ + { + "cos_sim": { + "pearson": 0.8405088639041084, + "spearman": 0.8429697776337159 + }, + "euclidean": { + "pearson": 0.8170485377169829, + "spearman": 0.8225506394065449 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.8429697776337159, + 
"manhattan": { + "pearson": 0.815989675052089, + "spearman": 0.820892726176619 + } + } + ] + }, + "task_name": "RuSTSBenchmarkSTS" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json new file mode 100644 index 000000000..961d7db8f --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", + "evaluation_time": 6.399334907531738, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.5295454545454545, + "f1": 0.5267719307729761, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5295454545454545, + "scores_per_experiment": [ + { + "accuracy": 0.5523088023088023, + "f1": 0.5485584626187086 + }, + { + "accuracy": 0.5378787878787878, + "f1": 0.5383560398266264 + }, + { + "accuracy": 0.4935064935064935, + "f1": 0.490176752799251 + }, + { + "accuracy": 0.5555555555555556, + "f1": 0.5516142577765969 + }, + { + "accuracy": 0.538961038961039, + "f1": 0.539290641279128 + }, + { + "accuracy": 0.5191197691197691, + "f1": 0.5132684814984376 + }, + { + "accuracy": 0.5086580086580087, + "f1": 0.5049038330254615 + }, + { + "accuracy": 0.5256132756132756, + "f1": 0.5202828911043906 + }, + { + "accuracy": 0.5248917748917749, + "f1": 0.5239214029959183 + }, + { + "accuracy": 0.538961038961039, + "f1": 0.5373465448052426 + } + ] + } + ] + }, + "task_name": "RuSciBenchGRNTIClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json new file mode 100644 index 000000000..1e0f4080e --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "5add37c2d5028dda82cf115a659b56153580c203", + "evaluation_time": 27.22791576385498, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.4707243048007701, + "v_measure": 0.4707243048007701, + "v_measure_std": 0.008993513565805002, + "v_measures": [ + 0.4568940516605606, + 0.4647838252635624, + 0.4747646295876639, + 0.46754763831760426, + 0.47791105336236367, + 0.47689738622690375, + 0.464306841618253, + 0.48971364051921895, + 0.4716642112374646, + 0.4627597702141062 + ] + } + ] + }, + "task_name": "RuSciBenchGRNTIClusteringP2P" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json new file mode 100644 index 000000000..5f8c62631 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json @@ -0,0 +1,62 @@ +{ + "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", + "evaluation_time": 6.813362121582031, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "accuracy": 0.41556383970177074, + "f1": 0.4073756403133939, + "hf_subset": "default", + 
"languages": [ + "rus-Cyrl" + ], + "main_score": 0.41556383970177074, + "scores_per_experiment": [ + { + "accuracy": 0.42684063373718545, + "f1": 0.4157660008020526 + }, + { + "accuracy": 0.40726933830382106, + "f1": 0.4018935354837055 + }, + { + "accuracy": 0.418452935694315, + "f1": 0.41188049861474574 + }, + { + "accuracy": 0.40726933830382106, + "f1": 0.39853740983349073 + }, + { + "accuracy": 0.41876359117738426, + "f1": 0.4081694037369186 + }, + { + "accuracy": 0.4181422802112457, + "f1": 0.4097209556016289 + }, + { + "accuracy": 0.40602671637154397, + "f1": 0.39937285745788964 + }, + { + "accuracy": 0.42684063373718545, + "f1": 0.4226815911158912 + }, + { + "accuracy": 0.4113078595837217, + "f1": 0.396924130188379 + }, + { + "accuracy": 0.4147250698974837, + "f1": 0.40881002029923663 + } + ] + } + ] + }, + "task_name": "RuSciBenchOECDClassification" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json new file mode 100644 index 000000000..b7b854d0d --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json @@ -0,0 +1,32 @@ +{ + "dataset_revision": "08475cf0f71cd474bdc3525ee49d8495a12a9a6a", + "evaluation_time": 26.567129611968994, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "test": [ + { + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.4051600422450699, + "v_measure": 0.4051600422450699, + "v_measure_std": 0.011396762847751415, + "v_measures": [ + 0.40430177897104075, + 0.4013933520419551, + 0.4109438030061019, + 0.4083699017502154, + 0.4094881739139133, + 0.41200961686882875, + 0.3780387081159579, + 0.4070882848354379, + 0.3960421227057608, + 0.4239246802414878 + ] + } + ] + }, + "task_name": "RuSciBenchOECDClusteringP2P" +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TERRa.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TERRa.json new file mode 100644 index 000000000..2b18b4165 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TERRa.json @@ -0,0 +1,59 @@ +{ + "dataset_revision": "7b58f24536063837d644aab9a023c62199b2a612", + "evaluation_time": 0.2334122657775879, + "kg_co2_emissions": null, + "mteb_version": "1.11.6", + "scores": { + "dev": [ + { + "cos_sim": { + "accuracy": 0.5928338762214984, + "accuracy_threshold": 0.4586835503578186, + "ap": 0.5855563164758582, + "f1": 0.6987951807228915, + "f1_threshold": 0.44296181201934814, + "precision": 0.5534351145038168, + "recall": 0.9477124183006536 + }, + "dot": { + "accuracy": 0.5570032573289903, + "accuracy_threshold": 6.291913032531738, + "ap": 0.5616434201070278, + "f1": 0.6820276497695853, + "f1_threshold": 3.9489879608154297, + "precision": 0.5266903914590747, + "recall": 0.9673202614379085 + }, + "euclidean": { + "accuracy": 0.5960912052117264, + "accuracy_threshold": 3.2749500274658203, + "ap": 0.5711044697568897, + "f1": 0.6835443037974683, + "f1_threshold": 3.773308753967285, + "precision": 0.5578512396694215, + "recall": 0.8823529411764706 + }, + "hf_subset": "default", + "languages": [ + "rus-Cyrl" + ], + "main_score": 0.5855563164758582, + "manhattan": { + "accuracy": 0.5960912052117264, + "accuracy_threshold": 51.012184143066406, + "ap": 0.5704622074255113, + "f1": 
0.6799007444168735, + "f1_threshold": 59.606590270996094, + "precision": 0.548, + "recall": 0.8954248366013072 + }, + "max": { + "accuracy": 0.5960912052117264, + "ap": 0.5855563164758582, + "f1": 0.6987951807228915 + } + } + ] + }, + "task_name": "TERRa" +} \ No newline at end of file From 03453afd64d8a07822db167ee90955ef52479085 Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Mon, 27 May 2024 13:05:18 +0300 Subject: [PATCH 3/8] downsample classification tasks & remove validation splits --- .../rus/GeoreviewClassification.py | 9 ++- .../rus/HeadlineClassification.py | 11 ++- .../rus/InappropriatenessClassification.py | 11 ++- .../rus/KinopoiskClassification.py | 4 +- .../rus/RuReviewsClassification.py | 9 ++- .../rus/RuSciBenchGRNTIClassification.py | 5 ++ .../rus/RuSciBenchOECDClassification.py | 5 ++ .../GeoreviewClassification.json | 48 ++++++------ .../HeadlineClassification.json | 48 ++++++------ .../InappropriatenessClassification.json | 70 +++++++++---------- .../KinopoiskClassification.json | 2 +- .../RuReviewsClassification.json | 48 ++++++------ .../RuSciBenchGRNTIClassification.json | 48 ++++++------ .../RuSciBenchOECDClassification.json | 48 ++++++------ .../GeoreviewClassification.json | 48 ++++++------ .../HeadlineClassification.json | 48 ++++++------ .../InappropriatenessClassification.json | 70 +++++++++---------- .../KinopoiskClassification.json | 2 +- .../RuReviewsClassification.json | 48 ++++++------ .../RuSciBenchGRNTIClassification.json | 48 ++++++------ .../RuSciBenchOECDClassification.json | 48 ++++++------ 21 files changed, 354 insertions(+), 324 deletions(-) diff --git a/mteb/tasks/Classification/rus/GeoreviewClassification.py b/mteb/tasks/Classification/rus/GeoreviewClassification.py index 51fc89ff5..f91da2a29 100644 --- a/mteb/tasks/Classification/rus/GeoreviewClassification.py +++ b/mteb/tasks/Classification/rus/GeoreviewClassification.py @@ -29,6 +29,11 @@ class GeoreviewClassification(AbsTaskClassification): dialect=[], text_creation="found", bibtex_citation="""""", - n_samples={"validation": 5000, "test": 5000}, - avg_character_length={"validation": 412.9, "test": 409.0}, + n_samples={"test": 5000}, + avg_character_length={"test": 409.0}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py index f89072059..420098c7b 100644 --- a/mteb/tasks/Classification/rus/HeadlineClassification.py +++ b/mteb/tasks/Classification/rus/HeadlineClassification.py @@ -16,7 +16,7 @@ class HeadlineClassification(AbsTaskClassification): reference="https://aclanthology.org/2020.ngt-1.6/", type="Classification", category="s2s", - eval_splits=["validation", "test"], + eval_splits=["test"], eval_langs=["rus-Cyrl"], main_score="accuracy", date=("2009-01-01", "2020-01-01"), @@ -52,6 +52,11 @@ class HeadlineClassification(AbsTaskClassification): pages = "54--59", abstract = "The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc.
Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.", }""", - n_samples={"validation": 12000, "test": 12000}, - avg_character_length={"validation": 61.7, "test": 61.6}, + n_samples={"test": 12000}, + avg_character_length={"test": 61.6}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py index 8bc541113..c69846a97 100644 --- a/mteb/tasks/Classification/rus/InappropriatenessClassification.py +++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py @@ -16,7 +16,7 @@ class InappropriatenessClassification(AbsTaskClassification): reference="https://aclanthology.org/2021.bsnlp-1.4", type="Classification", category="s2s", - eval_splits=["validation", "test"], + eval_splits=["test"], eval_langs=["rus-Cyrl"], main_score="accuracy", date=("2006-01-01", "2021-04-01"), @@ -56,6 +56,11 @@ class InappropriatenessClassification(AbsTaskClassification): pages = "26--36", abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.", }""", - n_samples={"validation": 4000, "test": 10000}, - avg_character_length={"validation": 96.8, "test": 97.7}, + n_samples={"test": 10000}, + avg_character_length={"test": 97.7}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/KinopoiskClassification.py b/mteb/tasks/Classification/rus/KinopoiskClassification.py index 2d6198e63..ee680c0f5 100644 --- a/mteb/tasks/Classification/rus/KinopoiskClassification.py +++ b/mteb/tasks/Classification/rus/KinopoiskClassification.py @@ -37,6 +37,6 @@ class KinopoiskClassification(AbsTaskClassification): pages={48--58}, year={2013} }""", - n_samples={"validation": 1500, "test": 1500}, - avg_character_length={"validation": 1941.7, "test": 1897.3}, + n_samples={"test": 1500}, + avg_character_length={"test": 1897.3}, ) diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py index 9a06ac9c4..32cc4de2b 100644 --- a/mteb/tasks/Classification/rus/RuReviewsClassification.py +++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py @@ -40,6 +40,11 @@ class RuReviewsClassification(AbsTaskClassification): ISSN={2378-1963}, month={July} }""", - n_samples={"validation": 15000, "test": 15000}, - avg_character_length={"validation": 132.8, "test": 133.2}, + n_samples={"test": 15000}, + avg_character_length={"test": 133.2}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py index 9cc7d7d53..96c4eb22f 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py @@ -32,3 +32,8 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification): n_samples={"test": 2772}, avg_character_length={"test": 890.1}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py index 8b6ee984e..e21e9f496 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py @@ -32,3 +32,8 @@ class RuSciBenchOECDClassification(AbsTaskClassification): n_samples={"test": 3219}, avg_character_length={"test": 838.9}, ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/results/intfloat__multilingual-e5-small/GeoreviewClassification.json b/results/intfloat__multilingual-e5-small/GeoreviewClassification.json index 9a36b9d27..ec8e39030 100644 --- a/results/intfloat__multilingual-e5-small/GeoreviewClassification.json +++ b/results/intfloat__multilingual-e5-small/GeoreviewClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "3765c0d1de6b7d264bc459433c45e5a75513839c", - "evaluation_time": 15.669439792633057, + "evaluation_time": 14.2263662815094, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.41904, - "f1": 0.40292348540107337, + "accuracy": 0.423046875, + "f1": 0.40627294178104983, "hf_subset": 
"default", "languages": [ "rus-Cyrl" ], - "main_score": 0.41904, + "main_score": 0.423046875, "scores_per_experiment": [ { - "accuracy": 0.411, - "f1": 0.3928318478049264 + "accuracy": 0.42529296875, + "f1": 0.4062613411708762 }, { - "accuracy": 0.4246, - "f1": 0.41875134731430447 + "accuracy": 0.4267578125, + "f1": 0.4211003062181492 }, { - "accuracy": 0.4316, - "f1": 0.4027451272640956 + "accuracy": 0.43505859375, + "f1": 0.40564165138267916 }, { - "accuracy": 0.447, - "f1": 0.425627940731658 + "accuracy": 0.4560546875, + "f1": 0.43623452643616645 }, { - "accuracy": 0.3944, - "f1": 0.38064524661195753 + "accuracy": 0.400390625, + "f1": 0.38445812042139876 }, { - "accuracy": 0.4076, - "f1": 0.3986492952787144 + "accuracy": 0.40478515625, + "f1": 0.3920306534479027 }, { - "accuracy": 0.4124, - "f1": 0.3960854031159166 + "accuracy": 0.419921875, + "f1": 0.40308357429464775 }, { - "accuracy": 0.432, - "f1": 0.42863585110751046 + "accuracy": 0.44580078125, + "f1": 0.4419482016465282 }, { - "accuracy": 0.3824, - "f1": 0.37616986314412426 + "accuracy": 0.36279296875, + "f1": 0.35565502536226984 }, { - "accuracy": 0.4474, - "f1": 0.4090929316375262 + "accuracy": 0.45361328125, + "f1": 0.41631601742988017 } ] } diff --git a/results/intfloat__multilingual-e5-small/HeadlineClassification.json b/results/intfloat__multilingual-e5-small/HeadlineClassification.json index 6b99331f3..c7e954b6b 100644 --- a/results/intfloat__multilingual-e5-small/HeadlineClassification.json +++ b/results/intfloat__multilingual-e5-small/HeadlineClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "2fe05ee6b5832cda29f2ef7aaad7b7fe6a3609eb", - "evaluation_time": 11.093019485473633, + "evaluation_time": 8.446706295013428, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.7409333333333333, - "f1": 0.7403291196935363, + "accuracy": 0.73740234375, + "f1": 0.7368256804848748, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.7409333333333333, + "main_score": 0.73740234375, "scores_per_experiment": [ { - "accuracy": 0.722, - "f1": 0.7214219292530725 + "accuracy": 0.72412109375, + "f1": 0.7238105970908896 }, { - "accuracy": 0.7549166666666667, - "f1": 0.7535662878680841 + "accuracy": 0.75732421875, + "f1": 0.756323973256969 }, { - "accuracy": 0.7399166666666667, - "f1": 0.7400112761607208 + "accuracy": 0.724609375, + "f1": 0.7244978411773731 }, { - "accuracy": 0.7433333333333333, - "f1": 0.7444121839162109 + "accuracy": 0.73779296875, + "f1": 0.7387013633644486 }, { - "accuracy": 0.7480833333333333, - "f1": 0.7504724205846119 + "accuracy": 0.7421875, + "f1": 0.7448734427343854 }, { - "accuracy": 0.7578333333333334, - "f1": 0.7581892120223918 + "accuracy": 0.75048828125, + "f1": 0.7511093727244752 }, { - "accuracy": 0.7254166666666667, - "f1": 0.7231835308563578 + "accuracy": 0.73095703125, + "f1": 0.7287251210809632 }, { - "accuracy": 0.7301666666666666, - "f1": 0.7287202972794837 + "accuracy": 0.72705078125, + "f1": 0.7253848114297723 }, { - "accuracy": 0.724, - "f1": 0.7198580630027918 + "accuracy": 0.7099609375, + "f1": 0.7049781360594113 }, { - "accuracy": 0.7636666666666667, - "f1": 0.7634559959916389 + "accuracy": 0.76953125, + "f1": 0.7698521459300599 } ] } diff --git a/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json b/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json index 33c7ee9d7..731144403 100644 --- a/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json +++ 
b/results/intfloat__multilingual-e5-small/InappropriatenessClassification.json @@ -1,69 +1,69 @@ { "dataset_revision": "601651fdc45ef243751676e62dd7a19f491c0285", - "evaluation_time": 13.447438716888428, + "evaluation_time": 8.770590543746948, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.58024, - "ap": 0.5476512898306003, - "f1": 0.5758618133958813, + "accuracy": 0.584423828125, + "ap": 0.5502987014659555, + "f1": 0.5800244669851191, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.58024, + "main_score": 0.584423828125, "scores_per_experiment": [ { - "accuracy": 0.5999, - "ap": 0.5623475279503105, - "f1": 0.5960600458104859 + "accuracy": 0.60546875, + "ap": 0.5672262643129771, + "f1": 0.6000676710991237 }, { - "accuracy": 0.6063, - "ap": 0.5631799041363394, - "f1": 0.6047161410584456 + "accuracy": 0.6103515625, + "ap": 0.5657792902317177, + "f1": 0.60819332566168 }, { - "accuracy": 0.6141, - "ap": 0.5738311420469193, - "f1": 0.6091889075801037 + "accuracy": 0.62158203125, + "ap": 0.5790063740974729, + "f1": 0.6181912435606457 }, { - "accuracy": 0.5248, - "ap": 0.5130003904724717, - "f1": 0.5247292607031631 + "accuracy": 0.53466796875, + "ap": 0.5185253811713456, + "f1": 0.5346589821285135 }, { - "accuracy": 0.5543, - "ap": 0.530097900419916, - "f1": 0.5542999955429999 + "accuracy": 0.56298828125, + "ap": 0.5352946496433584, + "f1": 0.5627771911172896 }, { - "accuracy": 0.5694, - "ap": 0.5384336124031008, - "f1": 0.5601522000051073 + "accuracy": 0.56396484375, + "ap": 0.5351492376464475, + "f1": 0.5544684003495333 }, { - "accuracy": 0.6176, - "ap": 0.5731581395348837, - "f1": 0.6174704908093684 + "accuracy": 0.6171875, + "ap": 0.57232666015625, + "f1": 0.6171875 }, { - "accuracy": 0.5503, - "ap": 0.5269225164635001, - "f1": 0.5287804936626321 + "accuracy": 0.56201171875, + "ap": 0.5337010983487338, + "f1": 0.5411185923736085 }, { - "accuracy": 0.5768, - "ap": 0.5442467882632831, - "f1": 0.5767918066893775 + "accuracy": 0.57568359375, + "ap": 0.543724924774323, + "f1": 0.5756098317068372 }, { - "accuracy": 0.5889, - "ap": 0.5512949766152779, - "f1": 0.5864287920971301 + "accuracy": 0.59033203125, + "ap": 0.5522531342769296, + "f1": 0.5879719318539599 } ] } diff --git a/results/intfloat__multilingual-e5-small/KinopoiskClassification.json b/results/intfloat__multilingual-e5-small/KinopoiskClassification.json index 258d27240..e17c2795a 100644 --- a/results/intfloat__multilingual-e5-small/KinopoiskClassification.json +++ b/results/intfloat__multilingual-e5-small/KinopoiskClassification.json @@ -1,6 +1,6 @@ { "dataset_revision": "5911f26666ac11af46cb9c6849d0dc80a378af24", - "evaluation_time": 7.126714468002319, + "evaluation_time": 6.863701581954956, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { diff --git a/results/intfloat__multilingual-e5-small/RuReviewsClassification.json b/results/intfloat__multilingual-e5-small/RuReviewsClassification.json index 39eee2760..807fa7300 100644 --- a/results/intfloat__multilingual-e5-small/RuReviewsClassification.json +++ b/results/intfloat__multilingual-e5-small/RuReviewsClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "f6d2c31f4dc6b88f468552750bfec05b4b41b05a", - "evaluation_time": 7.535195827484131, + "evaluation_time": 2.707639455795288, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.6060933333333334, - "f1": 0.597007474521683, + "accuracy": 0.6064453125, + "f1": 0.5959307702980192, "hf_subset": "default", 
"languages": [ "rus-Cyrl" ], - "main_score": 0.6060933333333334, + "main_score": 0.6064453125, "scores_per_experiment": [ { - "accuracy": 0.6346, - "f1": 0.6329801908707503 + "accuracy": 0.6318359375, + "f1": 0.6292213707533455 }, { - "accuracy": 0.5249333333333334, - "f1": 0.4981585587582864 + "accuracy": 0.5224609375, + "f1": 0.4947457821550861 }, { - "accuracy": 0.6071333333333333, - "f1": 0.6088541683884725 + "accuracy": 0.61572265625, + "f1": 0.6168475257585656 }, { - "accuracy": 0.6487333333333334, - "f1": 0.6501016672839444 + "accuracy": 0.64501953125, + "f1": 0.6442740771488549 }, { - "accuracy": 0.6554, - "f1": 0.6544128367544028 + "accuracy": 0.64599609375, + "f1": 0.6430137891119658 }, { - "accuracy": 0.5646666666666667, - "f1": 0.5515952394177405 + "accuracy": 0.5537109375, + "f1": 0.5380866809329515 }, { - "accuracy": 0.6053333333333333, - "f1": 0.5900342341506833 + "accuracy": 0.62060546875, + "f1": 0.6048385654790182 }, { - "accuracy": 0.5972, - "f1": 0.5817886490218223 + "accuracy": 0.59033203125, + "f1": 0.5721374912588382 }, { - "accuracy": 0.5809333333333333, - "f1": 0.5672444304738421 + "accuracy": 0.59423828125, + "f1": 0.5792890149320954 }, { - "accuracy": 0.642, - "f1": 0.6349047700968856 + "accuracy": 0.64453125, + "f1": 0.6368534054494702 } ] } diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json index 002b2d12e..b90fc13a6 100644 --- a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json +++ b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", - "evaluation_time": 9.047378063201904, + "evaluation_time": 7.974362373352051, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.5348484848484848, - "f1": 0.5210756834528681, + "accuracy": 0.535888671875, + "f1": 0.5210574042694371, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.5348484848484848, + "main_score": 0.535888671875, "scores_per_experiment": [ { - "accuracy": 0.5404040404040404, - "f1": 0.5244669347212237 + "accuracy": 0.5439453125, + "f1": 0.5263606964925389 }, { - "accuracy": 0.5382395382395382, - "f1": 0.5273913015496474 + "accuracy": 0.5361328125, + "f1": 0.5239255540910781 }, { - "accuracy": 0.5256132756132756, - "f1": 0.5152792170693732 + "accuracy": 0.52587890625, + "f1": 0.5147638083564116 }, { - "accuracy": 0.553030303030303, - "f1": 0.5435989736187034 + "accuracy": 0.56005859375, + "f1": 0.5487350557415427 }, { - "accuracy": 0.5393217893217893, - "f1": 0.529493005068726 + "accuracy": 0.541015625, + "f1": 0.5304532553514425 }, { - "accuracy": 0.5003607503607503, - "f1": 0.4830755796054487 + "accuracy": 0.50048828125, + "f1": 0.480416441703842 }, { - "accuracy": 0.5436507936507936, - "f1": 0.5250488962586415 + "accuracy": 0.54248046875, + "f1": 0.522217091817123 }, { - "accuracy": 0.5263347763347763, - "f1": 0.5093739702927322 + "accuracy": 0.52783203125, + "f1": 0.5113532709549421 }, { - "accuracy": 0.5429292929292929, - "f1": 0.5289516187196283 + "accuracy": 0.54296875, + "f1": 0.5294004464115111 }, { - "accuracy": 0.5386002886002886, - "f1": 0.5240773376245568 + "accuracy": 0.5380859375, + "f1": 0.5229484217739401 } ] } diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json index 
aeae193c8..ede89e152 100644 --- a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json +++ b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", - "evaluation_time": 9.497691631317139, + "evaluation_time": 7.700200796127319, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.40465983224603913, - "f1": 0.3813156482688752, + "accuracy": 0.403466796875, + "f1": 0.3804475350091884, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.40465983224603913, + "main_score": 0.403466796875, "scores_per_experiment": [ { - "accuracy": 0.4044734389561976, - "f1": 0.3791045524892834 + "accuracy": 0.40771484375, + "f1": 0.38158231676574966 }, { - "accuracy": 0.39577508543025786, - "f1": 0.3724397241737727 + "accuracy": 0.3974609375, + "f1": 0.3770214282257016 }, { - "accuracy": 0.40509474992233613, - "f1": 0.38530126616056976 + "accuracy": 0.4033203125, + "f1": 0.38432802673810523 }, { - "accuracy": 0.3979496738117428, - "f1": 0.36366224022921334 + "accuracy": 0.40087890625, + "f1": 0.36754207835016633 }, { - "accuracy": 0.4016775396085741, - "f1": 0.3656766512184062 + "accuracy": 0.39892578125, + "f1": 0.3613803385395889 }, { - "accuracy": 0.4113078595837217, - "f1": 0.3938519036434973 + "accuracy": 0.40869140625, + "f1": 0.3915561625326378 }, { - "accuracy": 0.4119291705498602, - "f1": 0.39099679411232896 + "accuracy": 0.4052734375, + "f1": 0.3858284706220626 }, { - "accuracy": 0.4200062131096614, - "f1": 0.4073591469425345 + "accuracy": 0.41845703125, + "f1": 0.4065895008800672 }, { - "accuracy": 0.39422180801491147, - "f1": 0.37126427433989134 + "accuracy": 0.38720703125, + "f1": 0.36261347966042223 }, { - "accuracy": 0.4041627834731283, - "f1": 0.38349992937925514 + "accuracy": 0.40673828125, + "f1": 0.38603354777738197 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json index 369e85f13..fa1b395dc 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "3765c0d1de6b7d264bc459433c45e5a75513839c", - "evaluation_time": 15.76297402381897, + "evaluation_time": 13.287531614303589, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.37874, - "f1": 0.3745824725441968, + "accuracy": 0.382373046875, + "f1": 0.37726350906971967, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.37874, + "main_score": 0.382373046875, "scores_per_experiment": [ { - "accuracy": 0.3482, - "f1": 0.3478295316844259 + "accuracy": 0.3623046875, + "f1": 0.3598446286149032 }, { - "accuracy": 0.3712, - "f1": 0.37478419559913767 + "accuracy": 0.3662109375, + "f1": 0.3702823326404931 }, { - "accuracy": 0.4204, - "f1": 0.4033108154167591 + "accuracy": 0.4248046875, + "f1": 0.40556367506133945 }, { - "accuracy": 0.3724, - "f1": 0.37528403563866586 + "accuracy": 0.3857421875, + "f1": 0.38948479339311637 }, { - "accuracy": 0.3786, - "f1": 0.37474862797257674 + "accuracy": 0.37109375, + "f1": 0.3644427003654772 }, { - "accuracy": 0.3346, - "f1": 0.3328824346726555 + "accuracy": 0.33349609375, + "f1": 0.33189604456149324 }, { - "accuracy": 0.3936, 
- "f1": 0.38690113762564987 + "accuracy": 0.4052734375, + "f1": 0.3967362773894577 }, { - "accuracy": 0.4156, - "f1": 0.4093053648237972 + "accuracy": 0.42041015625, + "f1": 0.4125367136089998 }, { - "accuracy": 0.3528, - "f1": 0.3468968846000202 + "accuracy": 0.3515625, + "f1": 0.34603044326497817 }, { - "accuracy": 0.4, - "f1": 0.3938816974082802 + "accuracy": 0.40283203125, + "f1": 0.3958174817969385 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json index 40d61425b..e7c31459e 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/HeadlineClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "2fe05ee6b5832cda29f2ef7aaad7b7fe6a3609eb", - "evaluation_time": 11.46289873123169, + "evaluation_time": 8.532874584197998, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.6805333333333333, - "f1": 0.6816775809403113, + "accuracy": 0.682958984375, + "f1": 0.6838045096273764, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.6805333333333333, + "main_score": 0.682958984375, "scores_per_experiment": [ { - "accuracy": 0.6300833333333333, - "f1": 0.6369181634277693 + "accuracy": 0.63427734375, + "f1": 0.6416444230621724 }, { - "accuracy": 0.6569166666666667, - "f1": 0.6556779102488896 + "accuracy": 0.66650390625, + "f1": 0.6642391694485621 }, { - "accuracy": 0.70575, - "f1": 0.7063579593355636 + "accuracy": 0.70361328125, + "f1": 0.7038968430371253 }, { - "accuracy": 0.7003333333333334, - "f1": 0.7022311853029809 + "accuracy": 0.7060546875, + "f1": 0.7077608462308818 }, { - "accuracy": 0.70625, - "f1": 0.708128015241555 + "accuracy": 0.703125, + "f1": 0.7059096969479263 }, { - "accuracy": 0.7210833333333333, - "f1": 0.7209020613658228 + "accuracy": 0.7353515625, + "f1": 0.7349423604779687 }, { - "accuracy": 0.6421666666666667, - "f1": 0.6417652955002949 + "accuracy": 0.638671875, + "f1": 0.6381514569844325 }, { - "accuracy": 0.6639166666666667, - "f1": 0.6654400013293077 + "accuracy": 0.673828125, + "f1": 0.67455579208203 }, { - "accuracy": 0.6651666666666667, - "f1": 0.6653355970756962 + "accuracy": 0.65478515625, + "f1": 0.6534103414038156 }, { - "accuracy": 0.7136666666666667, - "f1": 0.7140196205752338 + "accuracy": 0.71337890625, + "f1": 0.7135341665988495 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json index 2c8607810..216615a5c 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/InappropriatenessClassification.json @@ -1,69 +1,69 @@ { "dataset_revision": "601651fdc45ef243751676e62dd7a19f491c0285", - "evaluation_time": 11.479481220245361, + "evaluation_time": 8.756818294525146, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.5788, - "ap": 0.5478227138673264, - "f1": 0.574297812767316, + "accuracy": 0.581787109375, + "ap": 0.5500034660932301, + "f1": 0.5769633015450566, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.5788, + "main_score": 0.581787109375, 
"scores_per_experiment": [ { - "accuracy": 0.5935, - "ap": 0.5552890212932213, - "f1": 0.5934424273821416 + "accuracy": 0.59912109375, + "ap": 0.5593188310802619, + "f1": 0.5991164104233052 }, { - "accuracy": 0.5946, - "ap": 0.5569062258480034, - "f1": 0.5941252726839421 + "accuracy": 0.59130859375, + "ap": 0.5547076940283668, + "f1": 0.5906682907714251 }, { - "accuracy": 0.5575, - "ap": 0.5327556336321784, - "f1": 0.5541016795896397 + "accuracy": 0.564453125, + "ap": 0.5371161099137931, + "f1": 0.5619763927772294 }, { - "accuracy": 0.5911, - "ap": 0.553201862437765, - "f1": 0.5903670478550566 + "accuracy": 0.6044921875, + "ap": 0.562579424676525, + "f1": 0.6041747197526093 }, { - "accuracy": 0.6126, - "ap": 0.5709001381851682, - "f1": 0.6109154037687734 + "accuracy": 0.62158203125, + "ap": 0.5773341444672131, + "f1": 0.620507060141932 }, { - "accuracy": 0.4817, - "ap": 0.4912873073909637, - "f1": 0.4744940469138418 + "accuracy": 0.4775390625, + "ap": 0.4894218059501263, + "f1": 0.4707473529639076 }, { - "accuracy": 0.6019, - "ap": 0.5626143563244215, - "f1": 0.6006964952436292 + "accuracy": 0.61279296875, + "ap": 0.5702115298581654, + "f1": 0.6121863256651614 }, { - "accuracy": 0.5635, - "ap": 0.5348987193503045, - "f1": 0.5547353709683239 + "accuracy": 0.56494140625, + "ap": 0.5356959484690067, + "f1": 0.5543998052296427 }, { - "accuracy": 0.5764, - "ap": 0.5485345609065155, - "f1": 0.5553457268850318 + "accuracy": 0.56982421875, + "ap": 0.5441062744590239, + "f1": 0.5447100803619936 }, { - "accuracy": 0.6152, - "ap": 0.571839313304721, - "f1": 0.6147546563827785 + "accuracy": 0.61181640625, + "ap": 0.569542898029819, + "f1": 0.61114657736336 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json index 72e76f62c..ace616b4b 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/KinopoiskClassification.json @@ -1,6 +1,6 @@ { "dataset_revision": "5911f26666ac11af46cb9c6849d0dc80a378af24", - "evaluation_time": 4.948300123214722, + "evaluation_time": 4.645890235900879, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json index 41dbb4cd0..decaa726b 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuReviewsClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "f6d2c31f4dc6b88f468552750bfec05b4b41b05a", - "evaluation_time": 7.741372585296631, + "evaluation_time": 2.554316282272339, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.5871999999999999, - "f1": 0.5837261469604392, + "accuracy": 0.588818359375, + "f1": 0.5852138369263127, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.5871999999999999, + "main_score": 0.588818359375, "scores_per_experiment": [ { - "accuracy": 0.63, - "f1": 0.6303589229215918 + "accuracy": 0.61865234375, + "f1": 0.6184558882302665 }, { - "accuracy": 0.5772666666666667, - "f1": 0.5767187453017142 + "accuracy": 0.576171875, + "f1": 0.5756002105389183 }, { - 
"accuracy": 0.5995333333333334, - "f1": 0.6066554960243958 + "accuracy": 0.619140625, + "f1": 0.6259604480080455 }, { - "accuracy": 0.6422, - "f1": 0.6468765408412193 + "accuracy": 0.6337890625, + "f1": 0.6385496813880095 }, { - "accuracy": 0.6464666666666666, - "f1": 0.6480125690154023 + "accuracy": 0.65234375, + "f1": 0.6530946753496178 }, { - "accuracy": 0.5368666666666667, - "f1": 0.5347438669396679 + "accuracy": 0.537109375, + "f1": 0.5338678800515294 }, { - "accuracy": 0.5526666666666666, - "f1": 0.5275844379060841 + "accuracy": 0.56591796875, + "f1": 0.5394400447234527 }, { - "accuracy": 0.5902, - "f1": 0.5699928819977857 + "accuracy": 0.5830078125, + "f1": 0.5627666526242403 }, { - "accuracy": 0.47533333333333333, - "f1": 0.47710990793388125 + "accuracy": 0.4736328125, + "f1": 0.477180007252824 }, { - "accuracy": 0.6214666666666666, - "f1": 0.6192081007226498 + "accuracy": 0.62841796875, + "f1": 0.6272228810962228 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json index 961d7db8f..7bfadea31 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", - "evaluation_time": 6.399334907531738, + "evaluation_time": 5.351984977722168, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.5295454545454545, - "f1": 0.5267719307729761, + "accuracy": 0.53193359375, + "f1": 0.529038201409928, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.5295454545454545, + "main_score": 0.53193359375, "scores_per_experiment": [ { - "accuracy": 0.5523088023088023, - "f1": 0.5485584626187086 + "accuracy": 0.54931640625, + "f1": 0.5457219170201418 }, { - "accuracy": 0.5378787878787878, - "f1": 0.5383560398266264 + "accuracy": 0.54150390625, + "f1": 0.5413860959312798 }, { - "accuracy": 0.4935064935064935, - "f1": 0.490176752799251 + "accuracy": 0.49853515625, + "f1": 0.49509664689953514 }, { - "accuracy": 0.5555555555555556, - "f1": 0.5516142577765969 + "accuracy": 0.56591796875, + "f1": 0.5616241311852406 }, { - "accuracy": 0.538961038961039, - "f1": 0.539290641279128 + "accuracy": 0.53564453125, + "f1": 0.5370138411519967 }, { - "accuracy": 0.5191197691197691, - "f1": 0.5132684814984376 + "accuracy": 0.52197265625, + "f1": 0.516652507049508 }, { - "accuracy": 0.5086580086580087, - "f1": 0.5049038330254615 + "accuracy": 0.51416015625, + "f1": 0.5105101783568153 }, { - "accuracy": 0.5256132756132756, - "f1": 0.5202828911043906 + "accuracy": 0.52783203125, + "f1": 0.5218310772423697 }, { - "accuracy": 0.5248917748917749, - "f1": 0.5239214029959183 + "accuracy": 0.525390625, + "f1": 0.5234302601616807 }, { - "accuracy": 0.538961038961039, - "f1": 0.5373465448052426 + "accuracy": 0.5390625, + "f1": 0.5371153591007112 } ] } diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json index 5f8c62631..220d8b201 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json +++ 
b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClassification.json @@ -1,58 +1,58 @@ { "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", - "evaluation_time": 6.813362121582031, + "evaluation_time": 5.41755485534668, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { "test": [ { - "accuracy": 0.41556383970177074, - "f1": 0.4073756403133939, + "accuracy": 0.414111328125, + "f1": 0.40606541681576697, "hf_subset": "default", "languages": [ "rus-Cyrl" ], - "main_score": 0.41556383970177074, + "main_score": 0.414111328125, "scores_per_experiment": [ { - "accuracy": 0.42684063373718545, - "f1": 0.4157660008020526 + "accuracy": 0.4248046875, + "f1": 0.4148930015586425 }, { - "accuracy": 0.40726933830382106, - "f1": 0.4018935354837055 + "accuracy": 0.40234375, + "f1": 0.39761883708226003 }, { - "accuracy": 0.418452935694315, - "f1": 0.41188049861474574 + "accuracy": 0.4130859375, + "f1": 0.4064363080246256 }, { - "accuracy": 0.40726933830382106, - "f1": 0.39853740983349073 + "accuracy": 0.4072265625, + "f1": 0.3988079304959545 }, { - "accuracy": 0.41876359117738426, - "f1": 0.4081694037369186 + "accuracy": 0.408203125, + "f1": 0.3977161549429694 }, { - "accuracy": 0.4181422802112457, - "f1": 0.4097209556016289 + "accuracy": 0.42041015625, + "f1": 0.41313434945816946 }, { - "accuracy": 0.40602671637154397, - "f1": 0.39937285745788964 + "accuracy": 0.41015625, + "f1": 0.40344665651935563 }, { - "accuracy": 0.42684063373718545, - "f1": 0.4226815911158912 + "accuracy": 0.42431640625, + "f1": 0.4200394944876985 }, { - "accuracy": 0.4113078595837217, - "f1": 0.396924130188379 + "accuracy": 0.408203125, + "f1": 0.3923320809148424 }, { - "accuracy": 0.4147250698974837, - "f1": 0.40881002029923663 + "accuracy": 0.42236328125, + "f1": 0.4162293546731521 } ] } From e6dc362c31bec3f24c921d631f2c9c4d069dd573 Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Mon, 27 May 2024 21:50:10 +0300 Subject: [PATCH 4/8] update clustering tasks to fit size limit --- .../Clustering/rus/GeoreviewClusteringP2P.py | 10 +++--- .../rus/RuSciBenchGRNTIClusteringP2P.py | 21 +++++++---- .../rus/RuSciBenchOECDClusteringP2P.py | 21 +++++++---- .../GeoreviewClusteringP2P.json | 35 ++++++++++--------- .../RuSciBenchGRNTIClusteringP2P.json | 35 ++++++++++--------- .../RuSciBenchOECDClusteringP2P.json | 35 ++++++++++--------- .../GeoreviewClusteringP2P.json | 35 ++++++++++--------- .../RuSciBenchGRNTIClusteringP2P.json | 35 ++++++++++--------- .../RuSciBenchOECDClusteringP2P.json | 35 ++++++++++--------- 9 files changed, 143 insertions(+), 119 deletions(-) diff --git a/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py index eb3fd0c57..390d291c4 100644 --- a/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py @@ -2,15 +2,15 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskClustering import AbsTaskClustering +from ....abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast -class GeoreviewClusteringP2P(AbsTaskClustering): +class GeoreviewClusteringP2P(AbsTaskClusteringFast): metadata = TaskMetadata( name="GeoreviewClusteringP2P", dataset={ "path": "ai-forever/georeview-clustering-p2p", - "revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", + "revision": "97a313c8fc85b47f13f33e7e9a95c1ad888c7fec", }, description="Review clustering based on Yandex Georeview dataset", 
reference="https://github.com/yandex/geo-reviews-dataset-2023", @@ -29,6 +29,6 @@ class GeoreviewClusteringP2P(AbsTaskClustering): dialect=[], text_creation="found", bibtex_citation="""""", - n_samples={"test": 301510}, - avg_character_length={"test": 290.5}, + n_samples={"test": 2000}, + avg_character_length={"test": 384.5}, ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py index d309fc474..e8a89bd0b 100644 --- a/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py @@ -2,15 +2,16 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskClustering import AbsTaskClustering +from ....abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast -class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering): +class RuSciBenchGRNTIClusteringP2P(AbsTaskClusteringFast): metadata = TaskMetadata( name="RuSciBenchGRNTIClusteringP2P", dataset={ - "path": "ai-forever/ru-scibench-grnti-clustering-p2p", - "revision": "5add37c2d5028dda82cf115a659b56153580c203", + # here we use the same split for clustering + "path": "ai-forever/ru-scibench-grnti-classification", + "revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", }, description="Clustering of scientific papers (title+abstract) by rubric", reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", @@ -29,6 +30,14 @@ class RuSciBenchGRNTIClusteringP2P(AbsTaskClustering): dialect=[], text_creation="found", bibtex_citation="""""", - n_samples={"test": 31080}, - avg_character_length={"test": 863.3}, + n_samples={"test": 2772}, + avg_character_length={"test": 890.1}, ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"label": "labels", "text": "sentences"} + ) + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], label="labels" + ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py index 15b7b97c9..c7d8d293d 100644 --- a/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py @@ -2,15 +2,16 @@ from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskClustering import AbsTaskClustering +from ....abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast -class RuSciBenchOECDClusteringP2P(AbsTaskClustering): +class RuSciBenchOECDClusteringP2P(AbsTaskClusteringFast): metadata = TaskMetadata( name="RuSciBenchOECDClusteringP2P", dataset={ - "path": "ai-forever/ru-scibench-oecd-clustering-p2p", - "revision": "08475cf0f71cd474bdc3525ee49d8495a12a9a6a", + # here we use the same split for clustering + "path": "ai-forever/ru-scibench-oecd-classification", + "revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", }, description="Clustering of scientific papers (title+abstract) by rubric", reference="https://github.com/mlsa-iai-msu-lab/ru_sci_bench/", @@ -29,6 +30,14 @@ class RuSciBenchOECDClusteringP2P(AbsTaskClustering): dialect=[], text_creation="found", bibtex_citation="""""", - n_samples={"test": 30740}, - avg_character_length={"test": 838.7}, + n_samples={"test": 3219}, + avg_character_length={"test": 838.9}, ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"label": "labels", "text": "sentences"} + ) + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"], label="labels" + ) diff --git 
a/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json b/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json index ea797c4fd..768592856 100644 --- a/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json +++ b/results/intfloat__multilingual-e5-small/GeoreviewClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", - "evaluation_time": 159.50612330436707, + "dataset_revision": "97a313c8fc85b47f13f33e7e9a95c1ad888c7fec", + "evaluation_time": 5.706880569458008, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.5425637559995888, - "v_measure": 0.5425637559995888, - "v_measure_std": 0.08710098611855956, - "v_measures": [ - 0.6023455440259576, - 0.6599726508444369, - 0.46536913827747883, - 0.5202244461986688, - 0.42112427118783446, - 0.43658962026432985, - 0.4915792990438388, - 0.5356736921699828, - 0.6769466153847908, - 0.6158122825985685 - ] + "main_score": 0.6064478428133167, + "v_measure": 0.6064478428133167, + "v_measures": { + "Level 0": [ + 0.5869810746949777, + 0.626424324127219, + 0.6031611353092661, + 0.5895363788159923, + 0.6214719408318518, + 0.592721142566816, + 0.6071467727643062, + 0.6384264087542161, + 0.5942364240627622, + 0.6043728262057583 + ] + } } ] }, diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json index 9719a4d7e..4b23df1c3 100644 --- a/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json +++ b/results/intfloat__multilingual-e5-small/RuSciBenchGRNTIClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "5add37c2d5028dda82cf115a659b56153580c203", - "evaluation_time": 43.90299892425537, + "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", + "evaluation_time": 6.427574157714844, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.4907036398379492, - "v_measure": 0.4907036398379492, - "v_measure_std": 0.01442403310059541, - "v_measures": [ - 0.5111910016583134, - 0.4826920493259084, - 0.5037578196331924, - 0.46108371524279357, - 0.4830341704583758, - 0.4968990747062376, - 0.5026536055346332, - 0.47474077277521115, - 0.49751454520339666, - 0.4934696438414291 - ] + "main_score": 0.4833275608904005, + "v_measure": 0.4833275608904005, + "v_measures": { + "Level 0": [ + 0.4743824305693189, + 0.4826440165132136, + 0.4825720404988663, + 0.49359440629785456, + 0.48104798125243475, + 0.4851506944196132, + 0.478857987042417, + 0.4951041876148091, + 0.47311132482777524, + 0.48681053986770206 + ] + } } ] }, diff --git a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json index a4a3f770b..d977070c1 100644 --- a/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json +++ b/results/intfloat__multilingual-e5-small/RuSciBenchOECDClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "08475cf0f71cd474bdc3525ee49d8495a12a9a6a", - "evaluation_time": 41.33925414085388, + "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", + "evaluation_time": 5.347955942153931, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.40860295286491655, - "v_measure": 0.40860295286491655, - "v_measure_std": 
0.008542546238438057, - "v_measures": [ - 0.40474303948152135, - 0.4161826607911408, - 0.3916797573391887, - 0.3997524368597952, - 0.4197209726992979, - 0.4107211103601622, - 0.41239273911509505, - 0.40698654817604785, - 0.41984750311820723, - 0.40400276070870866 - ] + "main_score": 0.4307597888228038, + "v_measure": 0.4307597888228038, + "v_measures": { + "Level 0": [ + 0.4349157677245067, + 0.43791737498480904, + 0.42985262251362394, + 0.41283466606958064, + 0.4493598523433172, + 0.4162786883541447, + 0.4300393799747593, + 0.4235167220806759, + 0.43780119666677136, + 0.4350816175158491 + ] + } } ] }, diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json index c890de988..312793fb4 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/GeoreviewClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "e82bdbb7d767270d37c9b4ea88cb6475facfd656", - "evaluation_time": 147.46828031539917, + "dataset_revision": "97a313c8fc85b47f13f33e7e9a95c1ad888c7fec", + "evaluation_time": 5.206845998764038, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.4547073107289486, - "v_measure": 0.4547073107289486, - "v_measure_std": 0.062205229475387265, - "v_measures": [ - 0.48664109686626755, - 0.5408877316411709, - 0.4331384047503243, - 0.4636243738216654, - 0.3738562677328548, - 0.35238197464166343, - 0.4378906693865557, - 0.40749948572778594, - 0.5172189167028366, - 0.5339341860183607 - ] + "main_score": 0.5486119276656327, + "v_measure": 0.5486119276656327, + "v_measures": { + "Level 0": [ + 0.5130266102200091, + 0.5757537468998207, + 0.5438885942855619, + 0.5535288643537233, + 0.5564142741438273, + 0.5396622263390491, + 0.5550496396375509, + 0.5583978408096387, + 0.5455086873703175, + 0.5448887925968284 + ] + } } ] }, diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json index 1e0f4080e..d3ede1d0b 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchGRNTIClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "5add37c2d5028dda82cf115a659b56153580c203", - "evaluation_time": 27.22791576385498, + "dataset_revision": "673a610d6d3dd91a547a0d57ae1b56f37ebbf6a1", + "evaluation_time": 5.758132696151733, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.4707243048007701, - "v_measure": 0.4707243048007701, - "v_measure_std": 0.008993513565805002, - "v_measures": [ - 0.4568940516605606, - 0.4647838252635624, - 0.4747646295876639, - 0.46754763831760426, - 0.47791105336236367, - 0.47689738622690375, - 0.464306841618253, - 0.48971364051921895, - 0.4716642112374646, - 0.4627597702141062 - ] + "main_score": 0.489340827859812, + "v_measure": 0.489340827859812, + "v_measures": { + "Level 0": [ + 0.4825986464892499, + 0.4922392771563489, + 0.48999037336945284, + 0.4854264166733914, + 0.49003974620425955, + 0.5114125638769909, + 0.48750007697574893, + 0.4904222396564343, + 
0.4873198362197574, + 0.47645910197648483 + ] + } } ] }, diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json index b7b854d0d..99a63ea8f 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSciBenchOECDClusteringP2P.json @@ -1,6 +1,6 @@ { - "dataset_revision": "08475cf0f71cd474bdc3525ee49d8495a12a9a6a", - "evaluation_time": 26.567129611968994, + "dataset_revision": "26c88e99dcaba32bb45d0e1bfc21902337f6d471", + "evaluation_time": 3.9011802673339844, "kg_co2_emissions": null, "mteb_version": "1.11.6", "scores": { @@ -10,21 +10,22 @@ "languages": [ "rus-Cyrl" ], - "main_score": 0.4051600422450699, - "v_measure": 0.4051600422450699, - "v_measure_std": 0.011396762847751415, - "v_measures": [ - 0.40430177897104075, - 0.4013933520419551, - 0.4109438030061019, - 0.4083699017502154, - 0.4094881739139133, - 0.41200961686882875, - 0.3780387081159579, - 0.4070882848354379, - 0.3960421227057608, - 0.4239246802414878 - ] + "main_score": 0.4370636105917381, + "v_measure": 0.4370636105917381, + "v_measures": { + "Level 0": [ + 0.4404273826538474, + 0.4497330758156431, + 0.42263609746133335, + 0.4297491340963033, + 0.43323572364578233, + 0.44273130834121455, + 0.43823526802268087, + 0.43929108694021235, + 0.4492543317545968, + 0.4253426971857663 + ] + } } ] }, From b9ab5eb8750fd7d154627ea2a186f005659334ab Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Tue, 28 May 2024 10:15:09 +0300 Subject: [PATCH 5/8] remove mmarco dataset --- mteb/tasks/Retrieval/__init__.py | 1 - mteb/tasks/Retrieval/rus/MMarcoRetrieval.py | 47 ---------------- .../RuMMarcoRetrieval.json | 53 ------------------- .../RuMMarcoRetrieval.json | 53 ------------------- 4 files changed, 154 deletions(-) delete mode 100644 mteb/tasks/Retrieval/rus/MMarcoRetrieval.py delete mode 100644 results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json delete mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 69316c04f..fd20d9ab1 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -92,7 +92,6 @@ from .pol.SCIDOCSPLRetrieval import * from .pol.SciFactPLRetrieval import * from .pol.TRECCOVIDPLRetrieval import * -from .rus.MMarcoRetrieval import * from .rus.RiaNewsRetrieval import * from .rus.RuBQRetrieval import * from .slk.SlovakSumRetrieval import * diff --git a/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py b/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py deleted file mode 100644 index 317185960..000000000 --- a/mteb/tasks/Retrieval/rus/MMarcoRetrieval.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from mteb.abstasks.TaskMetadata import TaskMetadata - -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - - -class RuMMarcoRetrieval(AbsTaskRetrieval): - metadata = TaskMetadata( - name="RuMMarcoRetrieval", - dataset={ - "path": "ai-forever/ru-mmarco-retrieval", - "revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", - }, - description="mMARCO: A Multilingual Version of the MS MARCO Passage Ranking Dataset", - reference="https://arxiv.org/abs/2108.13897", - type="Retrieval", - category="s2p", - eval_splits=["dev"], - eval_langs=["rus-Cyrl"], - 
main_score="ndcg_at_10", - date=("2000-01-01", "2019-01-01"), - form=["written"], - domains=["Web"], - task_subtypes=["Question answering"], - license="apache-2.0", - socioeconomic_status="mixed", - annotations_creators="derived", - dialect=[], - text_creation="machine-translated", - bibtex_citation="""@misc{bonifacio2022mmarco, - title={mMARCO: A Multilingual Version of the MS MARCO Passage Ranking Dataset}, - author={Luiz Bonifacio - and Vitor Jeronymo - and Hugo Queiroz Abonizio - and Israel Campiotti - and Marzieh Fadaee - and Roberto Lotufo - and Rodrigo Nogueira}, - year={2022}, - eprint={2108.13897}, - archivePrefix={arXiv}, - primaryClass={cs.CL} - }""", - n_samples={"dev": 7437}, - avg_character_length={"dev": 385.9}, - ) diff --git a/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json b/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json deleted file mode 100644 index 742986080..000000000 --- a/results/intfloat__multilingual-e5-small/RuMMarcoRetrieval.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "dataset_revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", - "evaluation_time": 5528.388280630112, - "kg_co2_emissions": null, - "mteb_version": "1.11.6", - "scores": { - "dev": [ - { - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.28082, - "map_at_1": 0.14021, - "map_at_10": 0.22886, - "map_at_100": 0.24022, - "map_at_1000": 0.24096, - "map_at_20": 0.23568, - "map_at_3": 0.19892, - "map_at_5": 0.2155, - "mrr_at_1": 0.14435, - "mrr_at_10": 0.23293, - "mrr_at_100": 0.2441, - "mrr_at_1000": 0.24479, - "mrr_at_20": 0.23978, - "mrr_at_3": 0.20303, - "mrr_at_5": 0.21966, - "ndcg_at_1": 0.14435, - "ndcg_at_10": 0.28082, - "ndcg_at_100": 0.33788, - "ndcg_at_1000": 0.35838, - "ndcg_at_20": 0.30552, - "ndcg_at_3": 0.21925, - "ndcg_at_5": 0.24894, - "precision_at_1": 0.14435, - "precision_at_10": 0.04616, - "precision_at_100": 0.0075, - "precision_at_1000": 0.00093, - "precision_at_20": 0.02818, - "precision_at_3": 0.09466, - "precision_at_5": 0.07182, - "recall_at_1": 0.14021, - "recall_at_10": 0.44217, - "recall_at_100": 0.71052, - "recall_at_1000": 0.87136, - "recall_at_20": 0.53845, - "recall_at_3": 0.27413, - "recall_at_5": 0.34533 - } - ] - }, - "task_name": "RuMMarcoRetrieval" -} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json deleted file mode 100644 index c34f4fc94..000000000 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuMMarcoRetrieval.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "dataset_revision": "18d1c2b1ab2a7e8920614329e19ab4c513113d7e", - "evaluation_time": 4171.972459077835, - "kg_co2_emissions": null, - "mteb_version": "1.11.6", - "scores": { - "dev": [ - { - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.10022, - "map_at_1": 0.04642, - "map_at_10": 0.07956, - "map_at_100": 0.08654, - "map_at_1000": 0.08742, - "map_at_20": 0.08308, - "map_at_3": 0.06717, - "map_at_5": 0.07386, - "mrr_at_1": 0.04817, - "mrr_at_10": 0.08199, - "mrr_at_100": 0.08905, - "mrr_at_1000": 0.0899, - "mrr_at_20": 0.08558, - "mrr_at_3": 0.06943, - "mrr_at_5": 0.07616, - "ndcg_at_1": 0.04817, - "ndcg_at_10": 0.10022, - "ndcg_at_100": 0.13959, - "ndcg_at_1000": 0.1665, - "ndcg_at_20": 0.11301, - "ndcg_at_3": 0.0744, - "ndcg_at_5": 0.08635, - "precision_at_1": 0.04817, - "precision_at_10": 0.01716, - "precision_at_100": 
0.00378, - "precision_at_1000": 0.00061, - "precision_at_20": 0.01122, - "precision_at_3": 0.0324, - "precision_at_5": 0.0254, - "recall_at_1": 0.04642, - "recall_at_10": 0.16416, - "recall_at_100": 0.35889, - "recall_at_1000": 0.57588, - "recall_at_20": 0.21396, - "recall_at_3": 0.09307, - "recall_at_5": 0.12164 - } - ] - }, - "task_name": "RuMMarcoRetrieval" -} \ No newline at end of file From 411bf3293469a0048d8fa3f0211e5949e986774c Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Tue, 28 May 2024 14:59:53 +0300 Subject: [PATCH 6/8] minor changes --- .../rus/GeoreviewClassification.py | 6 +-- .../rus/HeadlineClassification.py | 4 +- .../rus/InappropriatenessClassification.py | 4 +- .../rus/RuReviewsClassification.py | 4 +- .../rus/RuSciBenchGRNTIClassification.py | 6 +-- .../rus/RuSciBenchOECDClassification.py | 6 +-- .../Clustering/rus/GeoreviewClusteringP2P.py | 2 +- .../rus/RuSciBenchGRNTIClusteringP2P.py | 11 +++-- .../rus/RuSciBenchOECDClusteringP2P.py | 11 +++-- mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py | 8 ++-- .../RuSTSBenchmarkSTS.json | 42 ------------------- .../RuSTSBenchmarkSTS.json | 42 ------------------- 12 files changed, 36 insertions(+), 110 deletions(-) diff --git a/mteb/tasks/Classification/rus/GeoreviewClassification.py b/mteb/tasks/Classification/rus/GeoreviewClassification.py index f91da2a29..79e08cc88 100644 --- a/mteb/tasks/Classification/rus/GeoreviewClassification.py +++ b/mteb/tasks/Classification/rus/GeoreviewClassification.py @@ -28,12 +28,12 @@ class GeoreviewClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], text_creation="found", - bibtex_citation="""""", - n_samples={"test": 5000}, + bibtex_citation="", + n_samples={"test": 2048}, avg_character_length={"test": 409.0}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py index 420098c7b..89ae6a62c 100644 --- a/mteb/tasks/Classification/rus/HeadlineClassification.py +++ b/mteb/tasks/Classification/rus/HeadlineClassification.py @@ -52,11 +52,11 @@ class HeadlineClassification(AbsTaskClassification): pages = "54--59", abstract = "The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. 
In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.", }""", - n_samples={"test": 12000}, + n_samples={"test": 2048}, avg_character_length={"test": 61.6}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py index c69846a97..8d035e8d4 100644 --- a/mteb/tasks/Classification/rus/InappropriatenessClassification.py +++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py @@ -56,11 +56,11 @@ class InappropriatenessClassification(AbsTaskClassification): pages = "26--36", abstract = "Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. 
We also release pre-trained classification models trained on this data.", }""", - n_samples={"test": 10000}, + n_samples={"test": 2048}, avg_character_length={"test": 97.7}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py index 32cc4de2b..4711a6608 100644 --- a/mteb/tasks/Classification/rus/RuReviewsClassification.py +++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py @@ -40,11 +40,11 @@ class RuReviewsClassification(AbsTaskClassification): ISSN={2378-1963}, month={July} }""", - n_samples={"test": 15000}, + n_samples={"test": 2048}, avg_character_length={"test": 133.2}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py index 96c4eb22f..e05d4569b 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchGRNTIClassification.py @@ -28,12 +28,12 @@ class RuSciBenchGRNTIClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], text_creation="found", - bibtex_citation="""""", - n_samples={"test": 2772}, + bibtex_citation="", + n_samples={"test": 2048}, avg_character_length={"test": 890.1}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py index e21e9f496..5bee76bab 100644 --- a/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py +++ b/mteb/tasks/Classification/rus/RuSciBenchOECDClassification.py @@ -28,12 +28,12 @@ class RuSciBenchOECDClassification(AbsTaskClassification): annotations_creators="derived", dialect=[], text_creation="found", - bibtex_citation="""""", - n_samples={"test": 3219}, + bibtex_citation="", + n_samples={"test": 2048}, avg_character_length={"test": 838.9}, ) def dataset_transform(self): self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] + self.dataset, seed=self.seed, n_samples=2048, splits=["test"] ) diff --git a/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py index 390d291c4..468bd5acf 100644 --- a/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/GeoreviewClusteringP2P.py @@ -28,7 +28,7 @@ class GeoreviewClusteringP2P(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], text_creation="found", - bibtex_citation="""""", + bibtex_citation="", n_samples={"test": 2000}, avg_character_length={"test": 384.5}, ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py index e8a89bd0b..9b655347a 100644 --- a/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/RuSciBenchGRNTIClusteringP2P.py @@ -29,8 +29,8 @@ class RuSciBenchGRNTIClusteringP2P(AbsTaskClusteringFast): annotations_creators="derived", 
dialect=[], text_creation="found", - bibtex_citation="""""", - n_samples={"test": 2772}, + bibtex_citation="", + n_samples={"test": 2048}, avg_character_length={"test": 890.1}, ) @@ -38,6 +38,11 @@ def dataset_transform(self): self.dataset = self.dataset.rename_columns( {"label": "labels", "text": "sentences"} ) + self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"], label="labels" + self.dataset, + seed=self.seed, + splits=["test"], + n_samples=2048, + label="labels", ) diff --git a/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py index c7d8d293d..dabdfa19b 100644 --- a/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py +++ b/mteb/tasks/Clustering/rus/RuSciBenchOECDClusteringP2P.py @@ -29,8 +29,8 @@ class RuSciBenchOECDClusteringP2P(AbsTaskClusteringFast): annotations_creators="derived", dialect=[], text_creation="found", - bibtex_citation="""""", - n_samples={"test": 3219}, + bibtex_citation="", + n_samples={"test": 2048}, avg_character_length={"test": 838.9}, ) @@ -38,6 +38,11 @@ def dataset_transform(self): self.dataset = self.dataset.rename_columns( {"label": "labels", "text": "sentences"} ) + self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"], label="labels" + self.dataset, + seed=self.seed, + splits=["test"], + n_samples=2048, + label="labels", ) diff --git a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py index 6da4acaed..1fd639961 100644 --- a/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/rus/RuSTSBenchmarkSTS.py @@ -17,7 +17,7 @@ class RuSTSBenchmarkSTS(AbsTaskSTS): reference="https://github.com/PhilipMay/stsb-multi-mt/", type="STS", category="s2s", - eval_splits=["validation", "test"], + eval_splits=["test"], eval_langs=["rus-Cyrl"], main_score="cosine_spearman", date=("2012-01-01", "2018-01-01"), @@ -29,9 +29,9 @@ class RuSTSBenchmarkSTS(AbsTaskSTS): annotations_creators="human-annotated", dialect=[], text_creation="machine-translated and verified", - bibtex_citation="""""", - n_samples={"validation": 1336, "test": 1264}, - avg_character_length={"validation": 65.4, "test": 54.2}, + bibtex_citation="", + n_samples={"test": 1264}, + avg_character_length={"test": 54.2}, ) @property diff --git a/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json b/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json index 7e7970225..07b3a390a 100644 --- a/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json +++ b/results/intfloat__multilingual-e5-small/RuSTSBenchmarkSTS.json @@ -24,48 +24,6 @@ "spearman": 0.7758844882898225 } } - ], - "train": [ - { - "cos_sim": { - "pearson": 0.7978802401575843, - "spearman": 0.7781758504177039 - }, - "euclidean": { - "pearson": 0.7786276295006407, - "spearman": 0.7781731374641703 - }, - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.7781758504177039, - "manhattan": { - "pearson": 0.7785232719634329, - "spearman": 0.7775080206948711 - } - } - ], - "validation": [ - { - "cos_sim": { - "pearson": 0.8230354096147491, - "spearman": 0.8244747741010346 - }, - "euclidean": { - "pearson": 0.8118130979247168, - "spearman": 0.8244747741010346 - }, - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.8244747741010346, - "manhattan": { - "pearson": 0.8114795850180648, - "spearman": 0.8241630998227286 - } - } ] }, "task_name": "RuSTSBenchmarkSTS" diff --git 
a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json index 239fc37b5..b0fc537c0 100644 --- a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/RuSTSBenchmarkSTS.json @@ -24,48 +24,6 @@ "spearman": 0.7725759998467114 } } - ], - "train": [ - { - "cos_sim": { - "pearson": 0.8167225483626231, - "spearman": 0.8004316378621577 - }, - "euclidean": { - "pearson": 0.814022124536146, - "spearman": 0.7983481288726039 - }, - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.8004316378621577, - "manhattan": { - "pearson": 0.81379986366986, - "spearman": 0.7983336550748988 - } - } - ], - "validation": [ - { - "cos_sim": { - "pearson": 0.8405088639041084, - "spearman": 0.8429697776337159 - }, - "euclidean": { - "pearson": 0.8170485377169829, - "spearman": 0.8225506394065449 - }, - "hf_subset": "default", - "languages": [ - "rus-Cyrl" - ], - "main_score": 0.8429697776337159, - "manhattan": { - "pearson": 0.815989675052089, - "spearman": 0.820892726176619 - } - } ] }, "task_name": "RuSTSBenchmarkSTS" From 28adb41649232beb1681631ab9454714e98fb79a Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Thu, 30 May 2024 11:30:22 +0300 Subject: [PATCH 7/8] add points --- docs/mmteb/points/815.jsonl | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/mmteb/points/815.jsonl diff --git a/docs/mmteb/points/815.jsonl b/docs/mmteb/points/815.jsonl new file mode 100644 index 000000000..4c33b5b22 --- /dev/null +++ b/docs/mmteb/points/815.jsonl @@ -0,0 +1,5 @@ +{"GitHub": "artemsnegirev", "New dataset": 8} +{"GitHub": "MariyaTikhonova", "New dataset": 6} +{"GitHub": "anpalmak2003", "New dataset": 6} +{"GitHub": "Alenush", "New dataset": 4} +{"GitHub": "ab1992ao", "New dataset": 4} \ No newline at end of file From 4791d80dd26bbcd4b7ce9eadb54919606eb92a23 Mon Sep 17 00:00:00 2001 From: Artem Snegirev Date: Thu, 30 May 2024 22:57:32 +0300 Subject: [PATCH 8/8] add list of tasks to benchmarks --- mteb/__init__.py | 2 ++ mteb/benchmarks.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/mteb/__init__.py b/mteb/__init__.py index 2f708068f..2dc7838b2 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -4,6 +4,7 @@ from mteb.benchmarks import ( MTEB_MAIN_EN, + MTEB_MAIN_RU, MTEB_RETRIEVAL_LAW, MTEB_RETRIEVAL_WITH_INSTRUCTIONS, ) @@ -15,6 +16,7 @@ __all__ = [ "MTEB_MAIN_EN", + "MTEB_MAIN_RU", "MTEB_RETRIEVAL_LAW", "MTEB_RETRIEVAL_WITH_INSTRUCTIONS", "TASKS_REGISTRY", diff --git a/mteb/benchmarks.py b/mteb/benchmarks.py index 4c5a9f2c9..022ccee98 100644 --- a/mteb/benchmarks.py +++ b/mteb/benchmarks.py @@ -68,6 +68,26 @@ "TwitterURLCorpus", ] +MTEB_MAIN_RU = [ + "GeoreviewClassification", + "GeoreviewClusteringP2P", + "HeadlineClassification", + "InappropriatenessClassification", + "KinopoiskClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "RiaNewsRetrieval", + "RuBQRetrieval", + "RuReviewsClassification", + "RuSciBenchGRNTIClassification", + "RuSciBenchGRNTIClusteringP2P", + "RuSciBenchOECDClassification", + "RuSciBenchOECDClusteringP2P", + "RuSTSBenchmarkSTS", + "STS22", + "TERRa", +] + MTEB_RETRIEVAL_WITH_INSTRUCTIONS = [ "Robust04InstructionRetrieval", "News21InstructionRetrieval",
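Note on the clustering numbers above: the "v_measure" / "v_measures" fields are V-measures of a clustering of the model's embeddings scored against the gold labels. A minimal sketch of that metric, assuming a plain KMeans clusterer and random stand-ins for the embeddings and the GRNTI rubric labels (the real AbsTaskClusteringFast pipeline may cluster differently):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import v_measure_score

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(2048, 384))  # stand-in for sentence embeddings
labels = rng.integers(0, 29, size=2048)    # stand-in for gold rubric labels

# Cluster the embeddings and score the assignment against the labels;
# V-measure is the harmonic mean of homogeneity and completeness.
preds = KMeans(n_clusters=29, n_init=10, random_state=0).fit_predict(embeddings)
print(v_measure_score(labels, preds))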
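PATCH 6/8 caps every task at 2048 stratified test examples via mteb's stratified_subsampling helper. A hedged stand-in for that pattern built only on the datasets library — the helper below illustrates the idea and is not mteb's actual implementation:

from datasets import DatasetDict, load_dataset


def stratified_subsample(
    ds: DatasetDict, seed: int, n_samples: int, split: str, label: str
) -> DatasetDict:
    # stratify_by_column requires a ClassLabel column, so cast first; the
    # label distribution of the full split is then preserved in the sample.
    ds = ds.class_encode_column(label)
    ds[split] = ds[split].train_test_split(
        test_size=n_samples, seed=seed, stratify_by_column=label
    )["test"]
    return ds


# Same source dataset the clustering tasks now reuse, with the column
# renames applied in their dataset_transform hooks.
ds = load_dataset("ai-forever/ru-scibench-grnti-classification")
ds = ds.rename_columns({"label": "labels", "text": "sentences"})
ds = stratified_subsample(ds, seed=42, n_samples=2048, split="test", label="labels")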
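With MTEB_MAIN_RU exported from the package root, the whole Russian suite can be run the same way as the English benchmark. A minimal sketch, assuming the standard MTEB runner accepts a list of task names as in the project README; the checkpoint and output folder mirror the results committed in this series:

from sentence_transformers import SentenceTransformer

from mteb import MTEB, MTEB_MAIN_RU

model = SentenceTransformer("intfloat/multilingual-e5-small")
evaluation = MTEB(tasks=MTEB_MAIN_RU)  # task names defined in mteb/benchmarks.py
evaluation.run(model, output_folder="results/intfloat__multilingual-e5-small")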