From 4ac934fb9004a7e0628a2c3142b2ad539030901a Mon Sep 17 00:00:00 2001 From: Matti Wiegmann Date: Wed, 10 Jul 2024 16:05:35 +0200 Subject: [PATCH 01/13] add msmarco v2.1 documents --- ir_datasets/datasets/msmarco_document_v2_1.py | 90 +++++++++++++++++++ ir_datasets/etc/downloads.json | 17 ++++ 2 files changed, 107 insertions(+) create mode 100644 ir_datasets/datasets/msmarco_document_v2_1.py diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py new file mode 100644 index 0000000..cb582ff --- /dev/null +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -0,0 +1,90 @@ +import contextlib +import gzip +import io +from pathlib import Path +import json +from typing import NamedTuple, Tuple, List +import tarfile +import ir_datasets +from ir_datasets.indices import PickleLz4FullStore +from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract +from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels +from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs +from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS +from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS +from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs, MsMarcoV2Document + +_logger = ir_datasets.log.easy() + +NAME = 'msmarco-document-v2.1' + + +class MsMarcoV21DocStore(ir_datasets.indices.Docstore): + def __init__(self, doc_cls, dlc, base_path): + super().__init__(doc_cls) + self.dlc = dlc + self.cache = None + self.base_path = base_path + + def built(self): + return False + + def build(self): + if self.cache: + return + self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path) + + def get(self, doc_id, field=None): + (string1, string2, string3, bundlenum, position) = doc_id.split("_") + assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc" + + with open( + f"{self.cache.path()}/msmarco_v2.1_doc_{bundlenum}.json", "rt", encoding="utf8" + ) as in_fh: + in_fh.seek(int(position)) + json_string = in_fh.readline() + document = json.loads(json_string) + + assert document["docid"] == doc_id + return MsMarcoV2Document( + document['docid'], + document['url'], + document['title'], + document['headings'], + document['body']) + + # raise KeyError(f'doc_id={doc_id} not found') + + +class MsMarcoV21Docs(MsMarcoV2Docs): + _fields = ["doc_id"] + def __init__(self, dlc): + super().__init__(dlc) + + def __iter__(): + pass + + def docs_store(self, field='doc_id'): + ds = MsMarcoV21DocStore(self, self._dlc, + ir_datasets.util.home_path() / NAME / "docs") + ds.build() + return ds + + +def _init(): + base_path = ir_datasets.util.home_path()/NAME + documentation = YamlDocumentation(f'docs/{NAME}.yaml') + dlc = DownloadConfig.context(NAME, base_path, dua=DUA) + subsets = {} + + collection = MsMarcoV21Docs(dlc['docs']) + for docs in collection.docs_iter(): + print(docs) + break + + ds = collection.docs_store() + document = ds.get("msmarco_v2.1_doc_12_0") + print(document) + +if __name__ == "__main__": + _init() \ No newline at end of file diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index 397646a..cc0fb44 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -4714,6 +4714,23 @@ } }, + "msmarco-document-v2.1": { + "docs": { + "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc.tar", + "size_hint": 30844989440, + "expected_md5_foo": "a5950665d6448d3dbaf7135645f1e074", + "cache_path": "msmarco_v2.1_doc.tar", + "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}} + }, + "docs-segmented": { + "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc_segmented.tar", + "size_hint": 26918768640, + "expected_md5_foo": "3799e7611efffd8daeb257e9ccca4d60", + "cache_path": "msmarco_v2.1_doc_segmented.tar", + "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}} + } + }, + "msmarco-passage": { "collectionandqueries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", From 9044fe4f2c159659a4981a78e63e3b43f2a15529 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Mon, 5 Aug 2024 13:35:38 +0200 Subject: [PATCH 02/13] Add unit tests for msmarco document 2.1 --- ir_datasets/datasets/__init__.py | 1 + ir_datasets/datasets/msmarco_document_v2_1.py | 15 ++--- ir_datasets/docs/msmarco-document-v2.1.yaml | 14 ++++ test/integration/msmarco_document_v2_1.py | 66 +++++++++++++++++++ 4 files changed, 86 insertions(+), 10 deletions(-) create mode 100644 ir_datasets/docs/msmarco-document-v2.1.yaml create mode 100644 test/integration/msmarco_document_v2_1.py diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index c5298d1..8bd90db 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -29,6 +29,7 @@ from . import mr_tydi from . import msmarco_document from . import msmarco_document_v2 +from . import msmarco_document_v2_1 from . import msmarco_passage from . import msmarco_passage_v2 from . import msmarco_qna diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index cb582ff..d2c1c2c 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -75,16 +75,11 @@ def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') dlc = DownloadConfig.context(NAME, base_path, dua=DUA) - subsets = {} - collection = MsMarcoV21Docs(dlc['docs']) - for docs in collection.docs_iter(): - print(docs) - break + subsets = {} - ds = collection.docs_store() - document = ds.get("msmarco_v2.1_doc_12_0") - print(document) + ir_datasets.registry.register(NAME, Dataset(collection, documentation('_'))) + + return collection, subsets -if __name__ == "__main__": - _init() \ No newline at end of file +collection, subsets = _init() diff --git a/ir_datasets/docs/msmarco-document-v2.1.yaml b/ir_datasets/docs/msmarco-document-v2.1.yaml new file mode 100644 index 0000000..c6654a8 --- /dev/null +++ b/ir_datasets/docs/msmarco-document-v2.1.yaml @@ -0,0 +1,14 @@ +_: + pretty_name: 'MSMARCO (document, version 2.1)' + desc: ' +

+Version 2.1 of the MS MARCO document ranking dataset used in TREC RAG 2024. +

+' + bibtex_ids: [] + diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py new file mode 100644 index 0000000..2c91fe5 --- /dev/null +++ b/test/integration/msmarco_document_v2_1.py @@ -0,0 +1,66 @@ +import re +import unittest +import ir_datasets +from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV2Document +from ir_datasets.formats import TrecQrel, GenericQuery +from .base import DatasetIntegrationTest + + +_logger = ir_datasets.log.easy() + + +class TestMSMarcoV21Docs(DatasetIntegrationTest): + def test_ms_marco_docs_iter_full(self): + self._test_docs('msmarco-document-v2.1', count=5371, items={ + 0: MsMarcoV2Document( + doc_id='msmarco_v2.1_doc_12_0', + title='Who Is Ringo Starr\'s Wife Barbara Bach and How Many Children Do They Have?', + url='https://answersafrica.com/ringo-starrs-wife-children.html', + headings=re.compile('.*Wife Barbara Bach.*'), + body=re.compile('^Who Is Ringo Starr\'s Wife Barbara Bach.*') + ), + 9: MsMarcoV2Document( + doc_id='msmarco_v2.1_doc_12_70974', + title='List of Robin Williams Movies and TV Shows From Best To Worst', + url='https://answersafrica.com/robin-williams-movies-tv-shows.html', + headings=re.compile('List of Robin Williams Movies and TV Shows.*'), + body=re.compile('List of Robin Williams Movies and TV Shows From Best To Worst\nList of Robin Williams Movies and TV Shows From Best To Worst*') + ), + 5370: MsMarcoV2Document( + doc_id='msmarco_v2.1_doc_12_48692010', + title='Warriors of Waterdeep 2.11.13 (Mod) latest', + url='https://apkdry.com/warriors-of-waterdeep-2-3-24-mod/', + headings=re.compile('^Warriors of Waterdeep 2.11.13 \(Mod\)\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nFeatures and Screenshots Warriors of Waterdeep Game for Android.*'), + body=re.compile('Warriors of Waterdeep 2.11.13 \(Mod\) latest\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nby Apkdry 3 weeks ago Games.*') + ), + }) + + def test_fast_ms_marco_docs_store(self): + docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store() + + doc = docs_store.get('msmarco_v2.1_doc_12_0') + self.assertEqual('msmarco_v2.1_doc_12_0', doc.doc_id) + + doc = docs_store.get('msmarco_v2.1_doc_12_48692010') + self.assertEqual('msmarco_v2.1_doc_12_48692010', doc.doc_id) + + def test_fast_docs_store_on_non_existing_documents(self): + docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store() + + with self.assertRaises(Exception) as context: + doc = docs_store.get('msmarco_v2.1_doc_12_111') + + self.assertTrue('Expecting value: line 1 column 1' in str(context.exception)) + + def test_fast_ms_marco_docs_iter(self): + # faster alternative to above + docs_iter = ir_datasets.load('msmarco-document-v2.1').docs_iter() + first_doc = docs_iter.__next__() + second_doc = docs_iter.__next__() + + self.assertEqual('msmarco_v2.1_doc_12_0', first_doc.doc_id) + self.assertEqual('msmarco_v2.1_doc_12_5689', second_doc.doc_id) + + +if __name__ == '__main__': + unittest.main() From ad48faa954ab92f64252d31d45f5859438685bef Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Mon, 5 Aug 2024 19:06:26 +0200 Subject: [PATCH 03/13] prepare integration of msmarco v2.1 segmented documents --- ir_datasets/datasets/__init__.py | 1 + ir_datasets/datasets/msmarco_document_v2_1.py | 7 +- .../msmarco_document_v2_1_segmented.py | 156 ++++++++++++++++++ test/integration/msmarco_document_v2_1.py | 8 +- .../msmarco_document_v2_1_segmented.py | 72 ++++++++ 5 files changed, 238 insertions(+), 6 deletions(-) create mode 100644 ir_datasets/datasets/msmarco_document_v2_1_segmented.py create mode 100644 test/integration/msmarco_document_v2_1_segmented.py diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 8bd90db..a5d5c59 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -30,6 +30,7 @@ from . import msmarco_document from . import msmarco_document_v2 from . import msmarco_document_v2_1 +from . import msmarco_document_v2_1_segmented from . import msmarco_passage from . import msmarco_passage_v2 from . import msmarco_qna diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index d2c1c2c..e73f061 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -19,6 +19,10 @@ NAME = 'msmarco-document-v2.1' +class MsMarcoV21Document(MsMarcoV2Document): + # Identical to V2 Document + pass + class MsMarcoV21DocStore(ir_datasets.indices.Docstore): def __init__(self, doc_cls, dlc, base_path): super().__init__(doc_cls) @@ -46,7 +50,7 @@ def get(self, doc_id, field=None): document = json.loads(json_string) assert document["docid"] == doc_id - return MsMarcoV2Document( + return MsMarcoV21Document( document['docid'], document['url'], document['title'], @@ -70,7 +74,6 @@ def docs_store(self, field='doc_id'): ds.build() return ds - def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py new file mode 100644 index 0000000..6c6b9f5 --- /dev/null +++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py @@ -0,0 +1,156 @@ +import contextlib +import gzip +import io +from pathlib import Path +import json +from typing import NamedTuple, Tuple, List +import tarfile +import ir_datasets +from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract +from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels +from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs +from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS +import os.path +import shutil + +_logger = ir_datasets.log.easy() + +NAME = 'msmarco-document-v2.1' + + +class MsMarcoV21SegmentedDocument(NamedTuple): + doc_id: str + url: str + title: str + headings: str + segment: str + start_char: int + end_char: int + def default_text(self): + """ + title + headings + segment + This is consistent with the MsMarcoV21Document that returns the full text alternative of this: title + headings + body + Please note that Anserini additionaly returns the url. I.e., anserini returns url + title + headings + segment + E.g., https://github.com/castorini/anserini/blob/b8ce19f56bc4e85056ef703322f76646804ec640/src/main/java/io/anserini/collection/MsMarcoV2DocCollection.java#L169 + """ + return f'{self.title} {self.headings} {self.segment}' + + +def ensure_file_is_extracted(file_name): + if os.path.isfile(file_name): + return + import tempfile + tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1] + + with gzip.open(file_name + '.gz', 'rb') as f_in: + with open(tmp_file, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + shutil.move(tmp_file, file_name) + + +class MsMarcoV21SegmentedDocStore(ir_datasets.indices.Docstore): + def __init__(self, doc_cls, dlc, base_path): + super().__init__(doc_cls) + self.dlc = dlc + self.cache = None + self.base_path = base_path + + def built(self): + return False + + def build(self): + if self.cache: + return + self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc_segmented"), self.base_path) + + for i in range(0, 59): + ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json") + + + def get(self, doc_id, field=None): + (string1, string2, string3, bundlenum, doc_position, position) = doc_id.split("_") + assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc" + + with open( + f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{bundlenum}.json", "rt", encoding="utf8" + ) as in_fh: + in_fh.seek(int(position)) + json_string = in_fh.readline() + document = json.loads(json_string) + + assert document["docid"] == doc_id + return MsMarcoV21SegmentedDocument( + document['docid'], + document['url'], + document['title'], + document['headings'], + document['segment'], + document['start_char'], + document['end_char'] + ) + + +class MsMarcoV21Docs(BaseDocs): + _fields = ["doc_id"] + def __init__(self, dlc): + super().__init__() + self._dlc = dlc + + @ir_datasets.util.use_docstore + def docs_iter(self): + with self._dlc.stream() as stream, \ + tarfile.open(fileobj=stream, mode='r|') as tarf: + for record in tarf: + if not record.name.endswith('.gz'): + continue + file = tarf.extractfile(record) + with gzip.open(file) as file: + for line in file: + data = json.loads(line) + yield MsMarcoV21SegmentedDocument( + data['docid'], + data['url'], + data['title'], + data['headings'], + data['segment'], + data['start_char'], + data['end_char'], + ) + + def docs_cls(self): + return MsMarcoV21SegmentedDocument + + def __iter__(): + pass + + def docs_store(self, field='doc_id'): + ds = MsMarcoV21SegmentedDocStore(self, self._dlc, + ir_datasets.util.home_path() / NAME / "docs-segmented") + ds.build() + return ds + + def docs_count(self): + if self.docs_store().built(): + return self.docs_store().count() + + def docs_namespace(self): + return NAME + + def docs_lang(self): + return 'en' + + +def _init(): + base_path = ir_datasets.util.home_path()/NAME + documentation = YamlDocumentation(f'docs/{NAME}.yaml') + dlc = DownloadConfig.context(NAME, base_path, dua=DUA) + collection = MsMarcoV21Docs(dlc['docs-segmented']) + subsets = {} + + ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_'))) + collection + print(collection.docs_iter().__next__()) + + return collection, subsets + +collection, subsets = _init() diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py index 2c91fe5..14c8493 100644 --- a/test/integration/msmarco_document_v2_1.py +++ b/test/integration/msmarco_document_v2_1.py @@ -1,7 +1,7 @@ import re import unittest import ir_datasets -from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV2Document +from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV21Document from ir_datasets.formats import TrecQrel, GenericQuery from .base import DatasetIntegrationTest @@ -12,21 +12,21 @@ class TestMSMarcoV21Docs(DatasetIntegrationTest): def test_ms_marco_docs_iter_full(self): self._test_docs('msmarco-document-v2.1', count=5371, items={ - 0: MsMarcoV2Document( + 0: MsMarcoV21Document( doc_id='msmarco_v2.1_doc_12_0', title='Who Is Ringo Starr\'s Wife Barbara Bach and How Many Children Do They Have?', url='https://answersafrica.com/ringo-starrs-wife-children.html', headings=re.compile('.*Wife Barbara Bach.*'), body=re.compile('^Who Is Ringo Starr\'s Wife Barbara Bach.*') ), - 9: MsMarcoV2Document( + 9: MsMarcoV21Document( doc_id='msmarco_v2.1_doc_12_70974', title='List of Robin Williams Movies and TV Shows From Best To Worst', url='https://answersafrica.com/robin-williams-movies-tv-shows.html', headings=re.compile('List of Robin Williams Movies and TV Shows.*'), body=re.compile('List of Robin Williams Movies and TV Shows From Best To Worst\nList of Robin Williams Movies and TV Shows From Best To Worst*') ), - 5370: MsMarcoV2Document( + 5370: MsMarcoV21Document( doc_id='msmarco_v2.1_doc_12_48692010', title='Warriors of Waterdeep 2.11.13 (Mod) latest', url='https://apkdry.com/warriors-of-waterdeep-2-3-24-mod/', diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py new file mode 100644 index 0000000..151ac90 --- /dev/null +++ b/test/integration/msmarco_document_v2_1_segmented.py @@ -0,0 +1,72 @@ +import re +import unittest +import ir_datasets +from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument +from ir_datasets.formats import TrecQrel, GenericQuery +from .base import DatasetIntegrationTest + + +_logger = ir_datasets.log.easy() + + +class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest): + def test_ms_marco_docs_iter_full(self): + self._test_docs('msmarco-document-v2.1/segmented', count=5371, items={ + 0: MsMarcoV21SegmentedDocument( + doc_id='msmarco_v2.1_doc_42_0#0_0', + title='How to Use Flip Tool in GIMP', + url='https://www.guidingtech.com/use-flip-tool-gimp/', + headings=re.compile('^How to Use Flip Tool in\\nGIMP\\n\\nHow to Use Flip Tool in GIMP.*'), + segment=re.compile('^How to Use Flip Tool in GIMP\\nHow to Use Flip Tool in GIMP\\nMehvish\\n06 Sep 2019.*'), + start_char=0, + end_char=800 + ), + 19: MsMarcoV21SegmentedDocument( + doc_id='msmarco_v2.1_doc_42_6080#2_22424', + title='How to Setup and Use FTP Server on Android', + url='https://www.guidingtech.com/use-ftp-server-file-transfer-android/', + headings=re.compile('How to Set\\u00adup and Use\\nFTP\nServ\\u00ader on Android\.*'), + segment=re.compile('^Also Read: Best Alternatives to Google Apps\\nIn this post.*'), + start_char=1032, + end_char=1959 + ), + 5370: MsMarcoV21SegmentedDocument( + doc_id='msmarco_v2.1_doc_42_3400697#6_9024928', + title='Can Guinea Pigs Eat Leaves? - Guinea Pig Tube', + url='https://www.guineapigtube.com/can-guinea-pigs-eat-leaves/', + headings=re.compile('^Can Guinea Pigs Eat Leaves\?\\nCan Guinea Pigs Eat Leaves\?.*'), + segment=re.compile('^They protect the body from free radical damage. The free radicals cause many health problems and also cause premature aging in guinea pigs.*'), + start_char=2954, + end_char=3767, + ), + }) + + def test_fast_ms_marco_docs_store(self): + docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store() + + doc = docs_store.get('msmarco_v2.1_doc_02_968#0_1561') + self.assertEqual('msmarco_v2.1_doc_02_968#0_1561', doc.doc_id) + + doc = docs_store.get('msmarco_v2.1_doc_03_0#3_5523') + self.assertEqual('msmarco_v2.1_doc_03_0#3_5523', doc.doc_id) + + def test_fast_docs_store_on_non_existing_documents(self): + docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store() + + with self.assertRaises(Exception) as context: + doc = docs_store.get('msmarco_v2.1_doc_02_968#0_156') + + self.assertTrue('Expecting value: line 1 column 1' in str(context.exception)) + + def test_fast_ms_marco_docs_iter(self): + # faster alternative to above + docs_iter = ir_datasets.load('msmarco-document-v2.1/segmented').docs_iter() + first_doc = docs_iter.__next__() + second_doc = docs_iter.__next__() + + self.assertEqual('msmarco_v2.1_doc_42_0#0_0', first_doc.doc_id) + self.assertEqual('msmarco_v2.1_doc_42_0#1_1311', second_doc.doc_id) + + +if __name__ == '__main__': + unittest.main() From 973751c7b4abcf9334c9989750a43e52b1839e76 Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Mon, 5 Aug 2024 22:44:53 +0200 Subject: [PATCH 04/13] add doc counts --- ir_datasets/datasets/msmarco_document_v2_1.py | 3 +++ ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 5 +---- test/integration/msmarco_document_v2_1.py | 5 +++++ test/integration/msmarco_document_v2_1_segmented.py | 6 ++++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index e73f061..6da5a41 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -74,6 +74,9 @@ def docs_store(self, field='doc_id'): ds.build() return ds + def docs_count(self): + return 10960555 + def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py index 6c6b9f5..7c04612 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py +++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py @@ -130,8 +130,7 @@ def docs_store(self, field='doc_id'): return ds def docs_count(self): - if self.docs_store().built(): - return self.docs_store().count() + return 113520750 def docs_namespace(self): return NAME @@ -148,8 +147,6 @@ def _init(): subsets = {} ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_'))) - collection - print(collection.docs_iter().__next__()) return collection, subsets diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py index 14c8493..0dcfe24 100644 --- a/test/integration/msmarco_document_v2_1.py +++ b/test/integration/msmarco_document_v2_1.py @@ -61,6 +61,11 @@ def test_fast_ms_marco_docs_iter(self): self.assertEqual('msmarco_v2.1_doc_12_0', first_doc.doc_id) self.assertEqual('msmarco_v2.1_doc_12_5689', second_doc.doc_id) + def test_fast_docs_count(self): + expected = 10960555 + actual = ir_datasets.load('msmarco-document-v2.1').docs_count() + + self.assertEqual(expected, actual) if __name__ == '__main__': unittest.main() diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py index 151ac90..f0ca5cb 100644 --- a/test/integration/msmarco_document_v2_1_segmented.py +++ b/test/integration/msmarco_document_v2_1_segmented.py @@ -67,6 +67,12 @@ def test_fast_ms_marco_docs_iter(self): self.assertEqual('msmarco_v2.1_doc_42_0#0_0', first_doc.doc_id) self.assertEqual('msmarco_v2.1_doc_42_0#1_1311', second_doc.doc_id) + def test_fast_docs_count(self): + expected = 113520750 + actual = ir_datasets.load('msmarco-document-v2.1/segmented').docs_count() + + self.assertEqual(expected, actual) + if __name__ == '__main__': unittest.main() From ae3e7780beb91fd050754af312606903af953abe Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Wed, 7 Aug 2024 07:27:53 +0200 Subject: [PATCH 05/13] add extraction of msmarco v2.1 docs --- ir_datasets/datasets/msmarco_document_v2_1.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index 6da5a41..ad71a60 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -1,6 +1,8 @@ import contextlib import gzip import io +import os +import shutil from pathlib import Path import json from typing import NamedTuple, Tuple, List @@ -23,6 +25,17 @@ class MsMarcoV21Document(MsMarcoV2Document): # Identical to V2 Document pass +def ensure_file_is_extracted(file_name): + if os.path.isfile(file_name): + return + import tempfile + tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1] + + with gzip.open(file_name + '.gz', 'rb') as f_in: + with open(tmp_file, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + shutil.move(tmp_file, file_name) + class MsMarcoV21DocStore(ir_datasets.indices.Docstore): def __init__(self, doc_cls, dlc, base_path): super().__init__(doc_cls) @@ -37,6 +50,9 @@ def build(self): if self.cache: return self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path) + for i in range(0, 59): + ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json") + def get(self, doc_id, field=None): (string1, string2, string3, bundlenum, position) = doc_id.split("_") From 8d1fbb8a8500f9113f3435d2726c9af5b79dcc9c Mon Sep 17 00:00:00 2001 From: Maik Froebe Date: Thu, 8 Aug 2024 13:45:16 +0200 Subject: [PATCH 06/13] add trec 2024 rag queries --- ir_datasets/datasets/msmarco_document_v2_1.py | 7 +++++++ ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 6 ++++++ ir_datasets/etc/downloads.json | 7 ++++++- test/integration/msmarco_document_v2_1.py | 8 ++++++++ test/integration/msmarco_document_v2_1_segmented.py | 7 +++++++ 5 files changed, 34 insertions(+), 1 deletion(-) diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index ad71a60..a8fe444 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -100,7 +100,14 @@ def _init(): collection = MsMarcoV21Docs(dlc['docs']) subsets = {} + subsets['trec-rag-2024'] = Dataset( + collection, + TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'), + ) + ir_datasets.registry.register(NAME, Dataset(collection, documentation('_'))) + for s in sorted(subsets): + ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s))) return collection, subsets diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py index 7c04612..215c8ac 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py +++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py @@ -145,8 +145,14 @@ def _init(): dlc = DownloadConfig.context(NAME, base_path, dua=DUA) collection = MsMarcoV21Docs(dlc['docs-segmented']) subsets = {} + subsets['trec-rag-2024'] = Dataset( + collection, + TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'), + ) ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_'))) + for s in sorted(subsets): + ir_datasets.registry.register(f'{NAME}/segmented/{s}', Dataset(subsets[s], documentation(s))) return collection, subsets diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index cc0fb44..4882b18 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -4728,9 +4728,14 @@ "expected_md5_foo": "3799e7611efffd8daeb257e9ccca4d60", "cache_path": "msmarco_v2.1_doc_segmented.tar", "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}} + }, + "rag-2024-test-topics": { + "url": "https://trec-rag.github.io/assets/txt/topics.rag24.test.txt", + "size_hint": 19517, + "expected_md5": "5bd6c8fa0e1300233fe139bae8288d09", + "cache_path": "trec-rag-2024-topics-test.txt" } }, - "msmarco-passage": { "collectionandqueries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py index 0dcfe24..4c6bbb3 100644 --- a/test/integration/msmarco_document_v2_1.py +++ b/test/integration/msmarco_document_v2_1.py @@ -67,5 +67,13 @@ def test_fast_docs_count(self): self.assertEqual(expected, actual) + def test_fast_queries(self): + self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={ + 0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'), + 9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'), + 300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'), + }) + + if __name__ == '__main__': unittest.main() diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py index f0ca5cb..bd1fc6a 100644 --- a/test/integration/msmarco_document_v2_1_segmented.py +++ b/test/integration/msmarco_document_v2_1_segmented.py @@ -73,6 +73,13 @@ def test_fast_docs_count(self): self.assertEqual(expected, actual) + def test_fast_queries(self): + self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={ + 0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'), + 9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'), + 300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'), + }) + if __name__ == '__main__': unittest.main() From 05eb55b7d43e4ef383f1ae86df8688ccab6c5a66 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sat, 10 Aug 2024 15:11:55 +0100 Subject: [PATCH 07/13] fix error in configuration --- ir_datasets/datasets/msmarco_document_v2_1.py | 2 +- ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index a8fe444..3b33fed 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -49,7 +49,7 @@ def built(self): def build(self): if self.cache: return - self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path) + self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc") for i in range(0, 59): ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json") diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py index 215c8ac..fa048d1 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py +++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py @@ -61,7 +61,7 @@ def built(self): def build(self): if self.cache: return - self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc_segmented"), self.base_path) + self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc_segmented") for i in range(0, 59): ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json") From f548ead514e742531cb2a40b6f8fb0e94f670925 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 11 Aug 2024 17:40:17 +0100 Subject: [PATCH 08/13] wip by making use of v2 classes --- ir_datasets/datasets/__init__.py | 2 +- ir_datasets/datasets/msmarco_document_v2.py | 25 +-- ir_datasets/datasets/msmarco_document_v2_1.py | 97 +---------- .../msmarco_document_v2_1_segmented.py | 159 ------------------ ir_datasets/datasets/msmarco_passage_v2.py | 63 ++++--- ir_datasets/datasets/msmarco_segment_v2_1.py | 72 ++++++++ ir_datasets/etc/downloads.json | 16 +- test/integration/msmarco_document_v2_1.py | 64 +------ .../msmarco_document_v2_1_segmented.py | 85 ---------- test/integration/msmarco_segment_v2_1.py | 30 ++++ 10 files changed, 180 insertions(+), 433 deletions(-) delete mode 100644 ir_datasets/datasets/msmarco_document_v2_1_segmented.py create mode 100644 ir_datasets/datasets/msmarco_segment_v2_1.py delete mode 100644 test/integration/msmarco_document_v2_1_segmented.py create mode 100644 test/integration/msmarco_segment_v2_1.py diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index a5d5c59..77b35d0 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -30,7 +30,7 @@ from . import msmarco_document from . import msmarco_document_v2 from . import msmarco_document_v2_1 -from . import msmarco_document_v2_1_segmented +from . import msmarco_segment_v2_1 from . import msmarco_passage from . import msmarco_passage_v2 from . import msmarco_qna diff --git a/ir_datasets/datasets/msmarco_document_v2.py b/ir_datasets/datasets/msmarco_document_v2.py index 39b9658..d3ae586 100644 --- a/ir_datasets/datasets/msmarco_document_v2.py +++ b/ir_datasets/datasets/msmarco_document_v2.py @@ -36,17 +36,21 @@ def default_text(self): class MsMarcoV2Docs(BaseDocs): - def __init__(self, dlc): + def __init__(self, dlc, docid_prefix='msmarco_doc_', docstore_size_hint=66500029281, name=NAME): super().__init__() self._dlc = dlc + self._docid_prefix = docid_prefix + self._docstore_size_hint = docstore_size_hint + self._name = name @ir_datasets.util.use_docstore def docs_iter(self): - with self._dlc.stream() as stream, \ - tarfile.open(fileobj=stream, mode='r|') as tarf: - for record in tarf: - if not record.name.endswith('.gz'): - continue + with tarfile.open(self._dlc.path(), mode='r:') as tarf: + # since there's no compression, it's fast to scan all records and sort them. + # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this + # addressed that problem. + records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name) + for record in records: file = tarf.extractfile(record) with gzip.open(file) as file: for line in file: @@ -84,18 +88,17 @@ def docs_store(self, field='doc_id'): data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], - key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure - size_hint=66500029281, - count_hint=ir_datasets.util.count_hint(NAME), + key_field_prefix=self._docid_prefix, # cut down on storage by removing prefix in lookup structure + size_hint=self._docstore_size_hint, + count_hint=ir_datasets.util.count_hint(self._name), ) - # return MsMArcoV2DocStore(self) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): - return NAME + return self._name def docs_lang(self): return 'en' diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index 3b33fed..c292108 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -1,103 +1,20 @@ -import contextlib -import gzip -import io -import os -import shutil -from pathlib import Path -import json -from typing import NamedTuple, Tuple, List -import tarfile import ir_datasets -from ir_datasets.indices import PickleLz4FullStore -from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract -from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels -from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs -from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS -from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS -from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs, MsMarcoV2Document +from ir_datasets.util import DownloadConfig +from ir_datasets.datasets.base import Dataset, YamlDocumentation +from ir_datasets.formats import TsvQueries +from ir_datasets.datasets.msmarco_passage import DUA +from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs _logger = ir_datasets.log.easy() NAME = 'msmarco-document-v2.1' - -class MsMarcoV21Document(MsMarcoV2Document): - # Identical to V2 Document - pass - -def ensure_file_is_extracted(file_name): - if os.path.isfile(file_name): - return - import tempfile - tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1] - - with gzip.open(file_name + '.gz', 'rb') as f_in: - with open(tmp_file, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - shutil.move(tmp_file, file_name) - -class MsMarcoV21DocStore(ir_datasets.indices.Docstore): - def __init__(self, doc_cls, dlc, base_path): - super().__init__(doc_cls) - self.dlc = dlc - self.cache = None - self.base_path = base_path - - def built(self): - return False - - def build(self): - if self.cache: - return - self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc") - for i in range(0, 59): - ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json") - - - def get(self, doc_id, field=None): - (string1, string2, string3, bundlenum, position) = doc_id.split("_") - assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc" - - with open( - f"{self.cache.path()}/msmarco_v2.1_doc_{bundlenum}.json", "rt", encoding="utf8" - ) as in_fh: - in_fh.seek(int(position)) - json_string = in_fh.readline() - document = json.loads(json_string) - - assert document["docid"] == doc_id - return MsMarcoV21Document( - document['docid'], - document['url'], - document['title'], - document['headings'], - document['body']) - - # raise KeyError(f'doc_id={doc_id} not found') - - -class MsMarcoV21Docs(MsMarcoV2Docs): - _fields = ["doc_id"] - def __init__(self, dlc): - super().__init__(dlc) - - def __iter__(): - pass - - def docs_store(self, field='doc_id'): - ds = MsMarcoV21DocStore(self, self._dlc, - ir_datasets.util.home_path() / NAME / "docs") - ds.build() - return ds - - def docs_count(self): - return 10960555 - def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') dlc = DownloadConfig.context(NAME, base_path, dua=DUA) - collection = MsMarcoV21Docs(dlc['docs']) + # we can re-use MsMarcoV2Docs, just with a few modifications directly + collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=0, name=NAME) subsets = {} subsets['trec-rag-2024'] = Dataset( diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py deleted file mode 100644 index fa048d1..0000000 --- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py +++ /dev/null @@ -1,159 +0,0 @@ -import contextlib -import gzip -import io -from pathlib import Path -import json -from typing import NamedTuple, Tuple, List -import tarfile -import ir_datasets -from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract -from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels -from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs -from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS -import os.path -import shutil - -_logger = ir_datasets.log.easy() - -NAME = 'msmarco-document-v2.1' - - -class MsMarcoV21SegmentedDocument(NamedTuple): - doc_id: str - url: str - title: str - headings: str - segment: str - start_char: int - end_char: int - def default_text(self): - """ - title + headings + segment - This is consistent with the MsMarcoV21Document that returns the full text alternative of this: title + headings + body - Please note that Anserini additionaly returns the url. I.e., anserini returns url + title + headings + segment - E.g., https://github.com/castorini/anserini/blob/b8ce19f56bc4e85056ef703322f76646804ec640/src/main/java/io/anserini/collection/MsMarcoV2DocCollection.java#L169 - """ - return f'{self.title} {self.headings} {self.segment}' - - -def ensure_file_is_extracted(file_name): - if os.path.isfile(file_name): - return - import tempfile - tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1] - - with gzip.open(file_name + '.gz', 'rb') as f_in: - with open(tmp_file, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - shutil.move(tmp_file, file_name) - - -class MsMarcoV21SegmentedDocStore(ir_datasets.indices.Docstore): - def __init__(self, doc_cls, dlc, base_path): - super().__init__(doc_cls) - self.dlc = dlc - self.cache = None - self.base_path = base_path - - def built(self): - return False - - def build(self): - if self.cache: - return - self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc_segmented") - - for i in range(0, 59): - ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json") - - - def get(self, doc_id, field=None): - (string1, string2, string3, bundlenum, doc_position, position) = doc_id.split("_") - assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc" - - with open( - f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{bundlenum}.json", "rt", encoding="utf8" - ) as in_fh: - in_fh.seek(int(position)) - json_string = in_fh.readline() - document = json.loads(json_string) - - assert document["docid"] == doc_id - return MsMarcoV21SegmentedDocument( - document['docid'], - document['url'], - document['title'], - document['headings'], - document['segment'], - document['start_char'], - document['end_char'] - ) - - -class MsMarcoV21Docs(BaseDocs): - _fields = ["doc_id"] - def __init__(self, dlc): - super().__init__() - self._dlc = dlc - - @ir_datasets.util.use_docstore - def docs_iter(self): - with self._dlc.stream() as stream, \ - tarfile.open(fileobj=stream, mode='r|') as tarf: - for record in tarf: - if not record.name.endswith('.gz'): - continue - file = tarf.extractfile(record) - with gzip.open(file) as file: - for line in file: - data = json.loads(line) - yield MsMarcoV21SegmentedDocument( - data['docid'], - data['url'], - data['title'], - data['headings'], - data['segment'], - data['start_char'], - data['end_char'], - ) - - def docs_cls(self): - return MsMarcoV21SegmentedDocument - - def __iter__(): - pass - - def docs_store(self, field='doc_id'): - ds = MsMarcoV21SegmentedDocStore(self, self._dlc, - ir_datasets.util.home_path() / NAME / "docs-segmented") - ds.build() - return ds - - def docs_count(self): - return 113520750 - - def docs_namespace(self): - return NAME - - def docs_lang(self): - return 'en' - - -def _init(): - base_path = ir_datasets.util.home_path()/NAME - documentation = YamlDocumentation(f'docs/{NAME}.yaml') - dlc = DownloadConfig.context(NAME, base_path, dua=DUA) - collection = MsMarcoV21Docs(dlc['docs-segmented']) - subsets = {} - subsets['trec-rag-2024'] = Dataset( - collection, - TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'), - ) - - ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_'))) - for s in sorted(subsets): - ir_datasets.registry.register(f'{NAME}/segmented/{s}', Dataset(subsets[s], documentation(s))) - - return collection, subsets - -collection, subsets = _init() diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py index ea43d37..7dc572e 100644 --- a/ir_datasets/datasets/msmarco_passage_v2.py +++ b/ir_datasets/datasets/msmarco_passage_v2.py @@ -46,11 +46,22 @@ def parse_msmarco_passage(line): data['docid']) +def passage_bundle_pos_from_key(key): + (string1, string2, bundlenum, position) = key.split('_') + assert string1 == 'msmarco' and string2 == 'passage' + return f'msmarco_passage_{bundlenum}', position + class MsMarcoV2Passages(BaseDocs): - def __init__(self, dlc, pos_dlc=None): + def __init__(self, dlc, pos_dlc=None, cls=MsMarcoV2Passage, parse_passage=parse_msmarco_passage, name=NAME, docstore_size_hint=60880127751, bundle_pos_from_key=passage_bundle_pos_from_key, count=138_364_198): super().__init__() self._dlc = dlc self._pos_dlc = pos_dlc + self._cls = cls + self._parse_passage = parse_passage + self._name = name + self._docstore_size_hint = docstore_size_hint + self._bundle_pos_from_key = bundle_pos_from_key + self._count = count @ir_datasets.util.use_docstore def docs_iter(self): @@ -59,30 +70,31 @@ def docs_iter(self): # files are used (i.e., no filtering is applied) yield from self.docs_store() else: - with self._dlc.stream() as stream, \ - tarfile.open(fileobj=stream, mode='r|') as tarf: - for record in tarf: - if not record.name.endswith('.gz'): - continue + with tarfile.open(self._dlc.path(), mode='r:') as tarf: + # since there's no compression, it's fast to scan all records and sort them. + # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this + # addressed that problem. + records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name) + for record in records: file = tarf.extractfile(record) with gzip.open(file) as file: for line in file: - yield parse_msmarco_passage(line) + yield self._parse_passage(line) def docs_cls(self): - return MsMarcoV2Passage + return self._cls def docs_store(self, field='doc_id'): assert field == 'doc_id' # Unlike for msmarco-document-v2, using the docstore actually hurts performance. - return MsMarcoV2DocStore(self) + return MsMarcoV2DocStore(self, size_hint=self._docstore_size_hint, count=self._count) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): - return NAME + return self._name def docs_lang(self): return 'en' @@ -92,7 +104,7 @@ def docs_path(self, force=True): class MsMarcoV2DocStore(ir_datasets.indices.Docstore): - def __init__(self, docs_handler): + def __init__(self, docs_handler, size_hint=60880127751, count=138_364_198): super().__init__(docs_handler.docs_cls(), 'doc_id') self.np = ir_datasets.lazy_libs.numpy() self.docs_handler = docs_handler @@ -101,7 +113,8 @@ def __init__(self, docs_handler): self.base_path = docs_handler.docs_path(force=False) + '.extracted' if not os.path.exists(self.base_path): os.makedirs(self.base_path) - self.size_hint = 60880127751 + self.size_hint = size_hint + self._count = count def get_many_iter(self, keys): self.build() @@ -110,20 +123,19 @@ def get_many_iter(self, keys): for key in keys: if not key.count('_') == 3: continue - (string1, string2, bundlenum, position) = key.split('_') - assert string1 == 'msmarco' and string2 == 'passage' + bundlenum, position = self.docs_handler._bundle_pos_from_key(key) if bundlenum not in bundles: bundles[bundlenum] = [] bundles[bundlenum].append(int(position)) for bundlenum, positions in bundles.items(): positions = sorted(positions) - file = f'{self.base_path}/msmarco_passage_{bundlenum}' + file = f'{self.base_path}/{bundlenum}' if not os.path.exists(file): # invalid doc_id -- doesn't point to a real bundle continue if self.docs_handler._pos_dlc is not None: # check the positions are valid for these doc_ids -- only return valid ones - mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'msmarco_passage_{bundlenum}.pos'), dtype=' Date: Sun, 11 Aug 2024 19:07:44 +0100 Subject: [PATCH 09/13] more wip --- ir_datasets/datasets/msmarco_document_v2_1.py | 2 +- ir_datasets/datasets/msmarco_passage_v2.py | 7 ++++--- ir_datasets/datasets/msmarco_segment_v2_1.py | 14 ++++++++++---- test/integration/msmarco_segment_v2_1.py | 13 ++++++------- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py index c292108..2b054c0 100644 --- a/ir_datasets/datasets/msmarco_document_v2_1.py +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -14,7 +14,7 @@ def _init(): documentation = YamlDocumentation(f'docs/{NAME}.yaml') dlc = DownloadConfig.context(NAME, base_path, dua=DUA) # we can re-use MsMarcoV2Docs, just with a few modifications directly - collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=0, name=NAME) + collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=59680176084, name=NAME) subsets = {} subsets['trec-rag-2024'] = Dataset( diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py index 7dc572e..e7c3de4 100644 --- a/ir_datasets/datasets/msmarco_passage_v2.py +++ b/ir_datasets/datasets/msmarco_passage_v2.py @@ -121,9 +121,10 @@ def get_many_iter(self, keys): # adapted from bundles = {} for key in keys: - if not key.count('_') == 3: + try: + bundlenum, position = self.docs_handler._bundle_pos_from_key(key) + except: continue - bundlenum, position = self.docs_handler._bundle_pos_from_key(key) if bundlenum not in bundles: bundles[bundlenum] = [] bundles[bundlenum].append(int(position)) @@ -227,7 +228,7 @@ def __next__(self): pos = self.current_pos_mmap[self.slice.start - self.current_file_start_idx] self.current_file.seek(pos) self.next_index = self.slice.start - result = parse_msmarco_passage(self.current_file.readline()) + result = self.docstore.docs_handler._parse_passage(self.current_file.readline()) self.next_index += 1 self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step) return result diff --git a/ir_datasets/datasets/msmarco_segment_v2_1.py b/ir_datasets/datasets/msmarco_segment_v2_1.py index 4574aa8..a3a4d91 100644 --- a/ir_datasets/datasets/msmarco_segment_v2_1.py +++ b/ir_datasets/datasets/msmarco_segment_v2_1.py @@ -20,6 +20,8 @@ class MsMarcoV21SegmentedDoc(NamedTuple): segment: str start_char: int end_char: int + msmarco_document_id: str + msmarco_document_segment_idx: int def default_text(self): """ title + headings + segment @@ -32,6 +34,8 @@ def default_text(self): def parse_msmarco_segment(line): data = json.loads(line) + msmarco_document_id, segment_info = data['docid'].split('#') + segment_idx, segment_file_offset = segment_info.split('_') return MsMarcoV21SegmentedDoc( data['docid'], data['url'], @@ -39,15 +43,17 @@ def parse_msmarco_segment(line): data['headings'], data['segment'], data['start_char'], - data['end_char'] + data['end_char'], + msmarco_document_id, + int(segment_idx), ) -def passage_bundle_pos_from_key(key): +def segment_bundle_pos_from_key(key): # key like: msmarco_v2.1_doc_00_0#4_5974 first, second = key.split('#') (string1, string2, string3, bundle, doc_pos) = first.split('_') - (segment_num, segment_pos) = first.split('_') + (segment_num, segment_pos) = second.split('_') assert string1 == 'msmarco' and string2 == 'v2.1' and string3 == 'doc' return f'msmarco_v2.1_doc_segmented_{bundle}.json', segment_pos @@ -56,7 +62,7 @@ def _init(): base_path = ir_datasets.util.home_path()/NAME documentation = YamlDocumentation(f'docs/{NAME}.yaml') dlc = DownloadConfig.context(NAME, base_path, dua=DUA) - collection = MsMarcoV2Passages(dlc['docs'], cls=MsMarcoV21SegmentedDoc, parse_passage=parse_msmarco_segment, name=NAME, bundle_pos_from_key=passage_bundle_pos_from_key, count=113_520_750) + collection = MsMarcoV2Passages(dlc['docs'], cls=MsMarcoV21SegmentedDoc, parse_passage=parse_msmarco_segment, name=NAME, bundle_pos_from_key=segment_bundle_pos_from_key, count=113_520_750, docstore_size_hint=205178702472) subsets = {} subsets['trec-rag-2024'] = Dataset( collection, diff --git a/test/integration/msmarco_segment_v2_1.py b/test/integration/msmarco_segment_v2_1.py index 8b4d07c..2fe48bf 100644 --- a/test/integration/msmarco_segment_v2_1.py +++ b/test/integration/msmarco_segment_v2_1.py @@ -1,7 +1,7 @@ import re import unittest import ir_datasets -from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument +from ir_datasets.datasets.msmarco_segment_v2_1 import MsMarcoV21SegmentedDoc from ir_datasets.formats import TrecQrel, GenericQuery from .base import DatasetIntegrationTest @@ -11,12 +11,11 @@ class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest): def test_docs(self): - self._build_test_docs('msmarco-segment-v2.1') - # self._test_docs('msmarco-segment-v2.1', count=113520750, items={ - # 0: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#0_0', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile("^How to Use Flip Tool in GIMP\nHow to Use Flip Tool in GIMP\nMehvish\n06 Sep 2019\nAt times, the powerful.{600}re is a guide on how to flip an image in GIMP\\. There are two methods to do it\\. Let's check them out\\.$", flags=48), 0, 800), - # 9: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#9_9963', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile('^Flip a Layer\nTo do so, follow these steps: Step 1: Open the image in GIMP\\. Step 2: Click on the Laye.{309} click on the Image option present in the top bar and select Transform followed by your flip choice\\.$', flags=48), 2862, 3372), - # 113520749: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_04_1869956217#8_3169040836', 'http://www.city-data.com/city/Sedgwick-Kansas.html', 'Sedgwick, Kansas (KS 67135) profile: population, maps, real estate, averages, homes, statistics, relocation, travel, jobs, hospitals, schools, crime, moving, houses, news, sex offenders', re.compile('^Sedgwick, Kansas\nSedgwick, Kansas\nLoading data\\.\\.\\.\nCrime rates in Sedgwick by year\nType\n2007\n2011\n201.{1539}ing System \\(NFIRS\\) incidents\nSedgwick compared to Kansas state average:\nOther pages you might like:\n$', flags=48), re.compile('^79\\.8 \\(low, U\\.S\\. average is 100\\)\nSedgwick, KS residents, houses, and apartments details\nPercentage of.{7764}house built \\- Built 1939 or earlier \\(%\\) Average household size Household density \\(households per squ$', flags=48), 2037, 10000), - # }) + self._test_docs('msmarco-segment-v2.1', count=113520750, items={ + 0: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_0#0_0', 'http://0-60.reviews/0-60-times/', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', '0-60 Times\n0-60 Times', re.compile('^0\\-60 Times \\- 0\\-60 \\| 0 to 60 Times \\& 1/4 Mile Times \\| Zero to 60 Car Reviews\n0\\-60 Times\nThere are man.{1078}used as the standard in the United States, where the rest of the world prefers the 0\\-100 km version\\.$', flags=48), 0, 1278, 'msmarco_v2.1_doc_00_0', 0), + 9: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_4810#2_16701', 'http://0-www.worldcat.org.novacat.nova.edu/identities/lccn-n79036869/', 'Ethel Percy Andrus Gerontology Center [WorldCat Identities]', re.compile('^Ethel Percy Andrus Gerontology Center\nEthel Percy Andrus Gerontology Center\nAndrus \\(Ethel Percy\\) Ger.{409}niversity of Southern California Los Angeles, Calif Ethel Percy Andrus Gerontology Center\nLanguages\n$', flags=48), re.compile('^submitted to U\\.S\\. Department of Health, Education, and Welfare, Public Health Service, Health Resea.{2311}e questionnaires used and the data derived from them, and how the data were collected and analyzed\\.$', flags=48), 2265, 4776, 'msmarco_v2.1_doc_00_4810', 2), + 113520749: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_59_964287870#4_2159633396', 'https://zzzzbov.com/blag/shortcut-to-zoom', 'Shortcut to Zoom › zzzzBov.com', 'Shortcut to Zoom\nShortcut to Zoom\nBatch File\nShortcut\nTrying it out\n', re.compile('^When it asks "What would you like to name the shortcut\\?", type the name of the meeting \\(i\\.e\\. "Standu.{333}hat adding even a few of these to my start menu will help reduce just a bit more friction in my day\\.$', flags=48), 1963, 2497, 'msmarco_v2.1_doc_59_964287870', 4), + }) def test_queries(self): self._test_queries('msmarco-segment-v2.1/trec-rag-2024', count=301, items={ From 966c1ba9a248feb187ae0d5c69c041cc7b8b51aa Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 11 Aug 2024 21:28:00 +0100 Subject: [PATCH 10/13] metadata --- ir_datasets/etc/metadata.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 60bd7c8..69756b0 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -495,6 +495,7 @@ "mr-tydi/th/train": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 3319}, "qrels": {"count": 3360, "fields": {"relevance": {"counts_by_value": {"1": 3360}}}}}, "msmarco-document": {"docs": {"count": 3213835, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}}, "msmarco-document-v2": {"docs": {"count": 11959635, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, + "msmarco-document-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-document-v2.1"}, "queries": {"count": 301}}, "msmarco-document-v2/anchor-text": {"docs": {"count": 4821244, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, "msmarco-document-v2/dev1": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 4552}, "qrels": {"count": 4702, "fields": {"relevance": {"counts_by_value": {"1": 4702}}}}, "scoreddocs": {"count": 455200}}, "msmarco-document-v2/dev2": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 5000}, "qrels": {"count": 5178, "fields": {"relevance": {"counts_by_value": {"1": 5178}}}}, "scoreddocs": {"count": 500000}}, @@ -561,6 +562,8 @@ "msmarco-qna/dev": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101093}, "qrels": {"count": 1008985, "fields": {"relevance": {"counts_by_value": {"0": 949712, "1": 59273}}}}, "scoreddocs": {"count": 1008985}}, "msmarco-qna/eval": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101092}, "scoreddocs": {"count": 1008943}}, "msmarco-qna/train": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 808731}, "qrels": {"count": 8069749, "fields": {"relevance": {"counts_by_value": {"1": 532761, "0": 7536988}}}}, "scoreddocs": {"count": 8069749}}, + "msmarco-segment-v2.1": {"docs": {"count": 113520750, "fields": {"doc_id": {"max_len": 45, "common_prefix": "msmarco_v2.1_doc_"}}}}, + "msmarco-segment-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-segment-v2.1"}, "queries": {"count": 301}}, "natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}}, "natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}}, "natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}}, From 21790f2e56b0e7f9d886ebe27d5bf4efaddb4f46 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 11 Aug 2024 21:38:35 +0100 Subject: [PATCH 11/13] msmarco-document-v2.1 metadata --- ir_datasets/etc/metadata.json | 1 + 1 file changed, 1 insertion(+) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 69756b0..2e18294 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -495,6 +495,7 @@ "mr-tydi/th/train": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 3319}, "qrels": {"count": 3360, "fields": {"relevance": {"counts_by_value": {"1": 3360}}}}}, "msmarco-document": {"docs": {"count": 3213835, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}}, "msmarco-document-v2": {"docs": {"count": 11959635, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, + "msmarco-document-v2.1": {"docs": {"count": 10960555, "fields": {"doc_id": {"max_len": 30, "common_prefix": "msmarco_v2.1_doc_"}}}}, "msmarco-document-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-document-v2.1"}, "queries": {"count": 301}}, "msmarco-document-v2/anchor-text": {"docs": {"count": 4821244, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}}, "msmarco-document-v2/dev1": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 4552}, "qrels": {"count": 4702, "fields": {"relevance": {"counts_by_value": {"1": 4702}}}}, "scoreddocs": {"count": 455200}}, From 96d3c9fc4cdd715612d3b5719a47a3d047532b3c Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Mon, 12 Aug 2024 08:37:40 +0100 Subject: [PATCH 12/13] missing metadata from trec-cast --- ir_datasets/etc/metadata.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 2e18294..87ee0f5 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -640,6 +640,20 @@ "trec-cast/v1/2019/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 173}, "qrels": {"_ref": "trec-cast/v1/2019"}, "scoreddocs": {"count": 173000}}, "trec-cast/v1/2020": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 216}, "qrels": {"count": 40451, "fields": {"relevance": {"counts_by_value": {"1": 2697, "0": 33781, "2": 1834, "3": 1408, "4": 731}}}}}, "trec-cast/v1/2020/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 208}, "qrels": {"_ref": "trec-cast/v1/2020"}}, + "trec-cast/v2/2021": {"docs": {"_ref": "trec-cast/v2"}, "queries": {"count": 239}, "qrels": {"count": 19334, "fields": {"relevance": {"counts_by_value": {"0": 13829, "4": 716, "3": 1007, "2": 1710, "1": 2072}}}}}, + "trec-cast/v2/kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, + "trec-cast/v2/kilt/passages": {"docs": {"count": 17124025, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, + "trec-cast/v2/kilt/segmented": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, + "trec-cast/v2/msmarco": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, + "trec-cast/v2/msmarco/passages": {"docs": {"count": 19092817, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_D"}}}}, + "trec-cast/v2/msmarco/segmented": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, + "trec-cast/v3/2022": {"docs": {"_ref": "trec-cast/v3"}, "queries": {"count": 408}, "qrels": {"count": 42196, "fields": {"relevance": {"counts_by_value": {"0": 29868, "1": 5063, "3": 2129, "2": 3297, "4": 1839}}}}}, + "trec-cast/v3/kilt": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, + "trec-cast/v3/kilt/passages": {"docs": {"count": 17111488, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, + "trec-cast/v3/kilt/segmented": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, + "trec-cast/v3/msmarco": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, + "trec-cast/v3/msmarco/passages": {"docs": {"count": 86326322, "fields": {"doc_id": {"max_len": 24, "common_prefix": "MARCO_"}}}}, + "trec-cast/v3/msmarco/segmented": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, "trec-fair": {}, "trec-fair-2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair-2021/eval": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}}, From 24a983d51b04f11a11c2f654dab3c275905c67a0 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Mon, 12 Aug 2024 13:05:06 +0100 Subject: [PATCH 13/13] more metadata --- ir_datasets/etc/metadata.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json index 87ee0f5..bf89499 100644 --- a/ir_datasets/etc/metadata.json +++ b/ir_datasets/etc/metadata.json @@ -640,6 +640,7 @@ "trec-cast/v1/2019/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 173}, "qrels": {"_ref": "trec-cast/v1/2019"}, "scoreddocs": {"count": 173000}}, "trec-cast/v1/2020": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 216}, "qrels": {"count": 40451, "fields": {"relevance": {"counts_by_value": {"1": 2697, "0": 33781, "2": 1834, "3": 1408, "4": 731}}}}}, "trec-cast/v1/2020/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 208}, "qrels": {"_ref": "trec-cast/v1/2020"}}, + "trec-cast/v2": {"docs": {"count": 9680029, "fields": {"doc_id": {"max_len": 41, "common_prefix": ""}}}}, "trec-cast/v2/2021": {"docs": {"_ref": "trec-cast/v2"}, "queries": {"count": 239}, "qrels": {"count": 19334, "fields": {"relevance": {"counts_by_value": {"0": 13829, "4": 716, "3": 1007, "2": 1710, "1": 2072}}}}}, "trec-cast/v2/kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, "trec-cast/v2/kilt/passages": {"docs": {"count": 17124025, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, @@ -647,6 +648,11 @@ "trec-cast/v2/msmarco": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, "trec-cast/v2/msmarco/passages": {"docs": {"count": 19092817, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_D"}}}}, "trec-cast/v2/msmarco/segmented": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}}, + "trec-cast/v2/passages": {"docs": {"count": 39254641, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}}, + "trec-cast/v2/wapo": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, + "trec-cast/v2/wapo/passages": {"docs": {"count": 3037799, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}}, + "trec-cast/v2/wapo/segmented": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, + "trec-cast/v3": {"docs": {"count": 106400940, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}}, "trec-cast/v3/2022": {"docs": {"_ref": "trec-cast/v3"}, "queries": {"count": 408}, "qrels": {"count": 42196, "fields": {"relevance": {"counts_by_value": {"0": 29868, "1": 5063, "3": 2129, "2": 3297, "4": 1839}}}}}, "trec-cast/v3/kilt": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}}, "trec-cast/v3/kilt/passages": {"docs": {"count": 17111488, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}}, @@ -654,6 +660,9 @@ "trec-cast/v3/msmarco": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, "trec-cast/v3/msmarco/passages": {"docs": {"count": 86326322, "fields": {"doc_id": {"max_len": 24, "common_prefix": "MARCO_"}}}}, "trec-cast/v3/msmarco/segmented": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}}, + "trec-cast/v3/wapo": {"docs": {"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, + "trec-cast/v3/wapo/passages": {"docs": {"count": 2963130, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}}, + "trec-cast/v3/wapo/segmented": {"docs": {"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}}, "trec-fair": {}, "trec-fair-2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}}, "trec-fair-2021/eval": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}},