From 4ac934fb9004a7e0628a2c3142b2ad539030901a Mon Sep 17 00:00:00 2001
From: Matti Wiegmann <matti.wiegmann@uni-weimar.de>
Date: Wed, 10 Jul 2024 16:05:35 +0200
Subject: [PATCH 01/13] add msmarco v2.1 documents

---
 ir_datasets/datasets/msmarco_document_v2_1.py | 90 +++++++++++++++++++
 ir_datasets/etc/downloads.json                | 17 ++++
 2 files changed, 107 insertions(+)
 create mode 100644 ir_datasets/datasets/msmarco_document_v2_1.py

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
new file mode 100644
index 0000000..cb582ff
--- /dev/null
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -0,0 +1,90 @@
+import contextlib
+import gzip
+import io
+from pathlib import Path
+import json
+from typing import NamedTuple, Tuple, List
+import tarfile
+import ir_datasets
+from ir_datasets.indices import PickleLz4FullStore
+from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract
+from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
+from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
+from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
+from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS
+from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs, MsMarcoV2Document
+
+_logger = ir_datasets.log.easy()
+
+NAME = 'msmarco-document-v2.1'
+
+
+class MsMarcoV21DocStore(ir_datasets.indices.Docstore):    
+    def __init__(self, doc_cls, dlc, base_path):
+        super().__init__(doc_cls)
+        self.dlc = dlc
+        self.cache = None
+        self.base_path = base_path
+
+    def built(self):
+        return False
+
+    def build(self):
+        if self.cache:
+            return
+        self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path)
+
+    def get(self, doc_id, field=None):
+        (string1, string2, string3, bundlenum, position) = doc_id.split("_")
+        assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc"
+
+        with open(
+            f"{self.cache.path()}/msmarco_v2.1_doc_{bundlenum}.json", "rt", encoding="utf8"
+        ) as in_fh:
+            in_fh.seek(int(position))
+            json_string = in_fh.readline()
+            document = json.loads(json_string)
+
+            assert document["docid"] == doc_id
+            return MsMarcoV2Document(
+                document['docid'],
+                document['url'],
+                document['title'],
+                document['headings'],
+                document['body'])
+        
+        # raise KeyError(f'doc_id={doc_id} not found')
+
+
+class MsMarcoV21Docs(MsMarcoV2Docs):
+    _fields = ["doc_id"]
+    def __init__(self, dlc):
+        super().__init__(dlc)
+
+    def __iter__():
+        pass
+
+    def docs_store(self, field='doc_id'):
+        ds = MsMarcoV21DocStore(self, self._dlc, 
+        ir_datasets.util.home_path() / NAME / "docs")
+        ds.build()
+        return ds
+
+
+def _init():
+    base_path = ir_datasets.util.home_path()/NAME
+    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
+    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
+    subsets = {}
+    
+    collection = MsMarcoV21Docs(dlc['docs'])
+    for docs in collection.docs_iter():
+        print(docs)
+        break
+
+    ds = collection.docs_store()
+    document = ds.get("msmarco_v2.1_doc_12_0")
+    print(document)
+
+if __name__ == "__main__":
+    _init()
\ No newline at end of file
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index 397646a..cc0fb44 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -4714,6 +4714,23 @@
     }
   },
   
+  "msmarco-document-v2.1": {
+    "docs": {
+      "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc.tar",
+      "size_hint": 30844989440,
+      "expected_md5_foo": "a5950665d6448d3dbaf7135645f1e074",
+      "cache_path": "msmarco_v2.1_doc.tar",
+      "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}}
+    },
+    "docs-segmented": {
+      "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc_segmented.tar",
+      "size_hint": 26918768640,
+      "expected_md5_foo": "3799e7611efffd8daeb257e9ccca4d60",
+      "cache_path": "msmarco_v2.1_doc_segmented.tar",
+      "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}}
+    }
+  },
+
   "msmarco-passage": {
     "collectionandqueries": {
       "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",

From 9044fe4f2c159659a4981a78e63e3b43f2a15529 Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Mon, 5 Aug 2024 13:35:38 +0200
Subject: [PATCH 02/13] Add unit tests for msmarco document 2.1

---
 ir_datasets/datasets/__init__.py              |  1 +
 ir_datasets/datasets/msmarco_document_v2_1.py | 15 ++---
 ir_datasets/docs/msmarco-document-v2.1.yaml   | 14 ++++
 test/integration/msmarco_document_v2_1.py     | 66 +++++++++++++++++++
 4 files changed, 86 insertions(+), 10 deletions(-)
 create mode 100644 ir_datasets/docs/msmarco-document-v2.1.yaml
 create mode 100644 test/integration/msmarco_document_v2_1.py

diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index c5298d1..8bd90db 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -29,6 +29,7 @@
 from . import mr_tydi
 from . import msmarco_document
 from . import msmarco_document_v2
+from . import msmarco_document_v2_1
 from . import msmarco_passage
 from . import msmarco_passage_v2
 from . import msmarco_qna
diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index cb582ff..d2c1c2c 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -75,16 +75,11 @@ def _init():
     base_path = ir_datasets.util.home_path()/NAME
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
-    subsets = {}
-    
     collection = MsMarcoV21Docs(dlc['docs'])
-    for docs in collection.docs_iter():
-        print(docs)
-        break
+    subsets = {}
 
-    ds = collection.docs_store()
-    document = ds.get("msmarco_v2.1_doc_12_0")
-    print(document)
+    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
+    
+    return collection, subsets
 
-if __name__ == "__main__":
-    _init()
\ No newline at end of file
+collection, subsets = _init()
diff --git a/ir_datasets/docs/msmarco-document-v2.1.yaml b/ir_datasets/docs/msmarco-document-v2.1.yaml
new file mode 100644
index 0000000..c6654a8
--- /dev/null
+++ b/ir_datasets/docs/msmarco-document-v2.1.yaml
@@ -0,0 +1,14 @@
+_:
+  pretty_name: 'MSMARCO (document, version 2.1)'
+  desc: '
+  <p>
+Version 2.1 of the MS MARCO document ranking dataset used in TREC RAG 2024.
+</p>
+<ul>
+  <li>Version 1 of dataset: <a class="ds-ref">msmarco-document</a></li>
+  <li>Documents: Text extracted from web pages</li>
+  <li>Queries: Natural language questions (from query log)</li>
+  <li> TODO: add paper describing the dataset.</li>
+</ul>'
+  bibtex_ids: []
+
diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py
new file mode 100644
index 0000000..2c91fe5
--- /dev/null
+++ b/test/integration/msmarco_document_v2_1.py
@@ -0,0 +1,66 @@
+import re
+import unittest
+import ir_datasets
+from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV2Document
+from ir_datasets.formats import TrecQrel, GenericQuery
+from .base import DatasetIntegrationTest
+
+
+_logger = ir_datasets.log.easy()
+
+
+class TestMSMarcoV21Docs(DatasetIntegrationTest):
+    def test_ms_marco_docs_iter_full(self):
+        self._test_docs('msmarco-document-v2.1', count=5371, items={
+            0: MsMarcoV2Document(
+                doc_id='msmarco_v2.1_doc_12_0',
+                title='Who Is Ringo Starr\'s Wife Barbara Bach and How Many Children Do They Have?',
+                url='https://answersafrica.com/ringo-starrs-wife-children.html',
+                headings=re.compile('.*Wife Barbara Bach.*'),
+                body=re.compile('^Who Is Ringo Starr\'s Wife Barbara Bach.*')
+            ),
+            9: MsMarcoV2Document(
+                doc_id='msmarco_v2.1_doc_12_70974',
+                title='List of Robin Williams Movies and TV Shows From Best To Worst',
+                url='https://answersafrica.com/robin-williams-movies-tv-shows.html',
+                headings=re.compile('List of Robin Williams Movies and TV Shows.*'),
+                body=re.compile('List of Robin Williams Movies and TV Shows From Best To Worst\nList of Robin Williams Movies and TV Shows From Best To Worst*')
+            ),
+            5370: MsMarcoV2Document(
+                doc_id='msmarco_v2.1_doc_12_48692010',
+                title='Warriors of Waterdeep 2.11.13 (Mod) latest',
+                url='https://apkdry.com/warriors-of-waterdeep-2-3-24-mod/',
+                headings=re.compile('^Warriors of Waterdeep 2.11.13 \(Mod\)\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nFeatures and Screenshots Warriors of Waterdeep Game for Android.*'),
+                body=re.compile('Warriors of Waterdeep 2.11.13 \(Mod\) latest\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nby Apkdry 3 weeks ago Games.*')
+            ),
+        })
+
+    def test_fast_ms_marco_docs_store(self):
+        docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store()
+
+        doc = docs_store.get('msmarco_v2.1_doc_12_0')
+        self.assertEqual('msmarco_v2.1_doc_12_0', doc.doc_id)
+
+        doc = docs_store.get('msmarco_v2.1_doc_12_48692010')
+        self.assertEqual('msmarco_v2.1_doc_12_48692010', doc.doc_id)
+
+    def test_fast_docs_store_on_non_existing_documents(self):
+        docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store()
+
+        with self.assertRaises(Exception) as context:
+            doc = docs_store.get('msmarco_v2.1_doc_12_111')
+
+        self.assertTrue('Expecting value: line 1 column 1' in str(context.exception))
+
+    def test_fast_ms_marco_docs_iter(self):
+        # faster alternative to above
+        docs_iter = ir_datasets.load('msmarco-document-v2.1').docs_iter()
+        first_doc = docs_iter.__next__()
+        second_doc = docs_iter.__next__()
+
+        self.assertEqual('msmarco_v2.1_doc_12_0', first_doc.doc_id)
+        self.assertEqual('msmarco_v2.1_doc_12_5689', second_doc.doc_id)
+
+
+if __name__ == '__main__':
+    unittest.main()

From ad48faa954ab92f64252d31d45f5859438685bef Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Mon, 5 Aug 2024 19:06:26 +0200
Subject: [PATCH 03/13] prepare integration of msmarco v2.1 segmented documents

---
 ir_datasets/datasets/__init__.py              |   1 +
 ir_datasets/datasets/msmarco_document_v2_1.py |   7 +-
 .../msmarco_document_v2_1_segmented.py        | 156 ++++++++++++++++++
 test/integration/msmarco_document_v2_1.py     |   8 +-
 .../msmarco_document_v2_1_segmented.py        |  72 ++++++++
 5 files changed, 238 insertions(+), 6 deletions(-)
 create mode 100644 ir_datasets/datasets/msmarco_document_v2_1_segmented.py
 create mode 100644 test/integration/msmarco_document_v2_1_segmented.py

diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index 8bd90db..a5d5c59 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -30,6 +30,7 @@
 from . import msmarco_document
 from . import msmarco_document_v2
 from . import msmarco_document_v2_1
+from . import msmarco_document_v2_1_segmented
 from . import msmarco_passage
 from . import msmarco_passage_v2
 from . import msmarco_qna
diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index d2c1c2c..e73f061 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -19,6 +19,10 @@
 NAME = 'msmarco-document-v2.1'
 
 
+class MsMarcoV21Document(MsMarcoV2Document):
+    # Identical to V2 Document
+    pass
+
 class MsMarcoV21DocStore(ir_datasets.indices.Docstore):    
     def __init__(self, doc_cls, dlc, base_path):
         super().__init__(doc_cls)
@@ -46,7 +50,7 @@ def get(self, doc_id, field=None):
             document = json.loads(json_string)
 
             assert document["docid"] == doc_id
-            return MsMarcoV2Document(
+            return MsMarcoV21Document(
                 document['docid'],
                 document['url'],
                 document['title'],
@@ -70,7 +74,6 @@ def docs_store(self, field='doc_id'):
         ds.build()
         return ds
 
-
 def _init():
     base_path = ir_datasets.util.home_path()/NAME
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
new file mode 100644
index 0000000..6c6b9f5
--- /dev/null
+++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
@@ -0,0 +1,156 @@
+import contextlib
+import gzip
+import io
+from pathlib import Path
+import json
+from typing import NamedTuple, Tuple, List
+import tarfile
+import ir_datasets
+from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract
+from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
+from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
+from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
+import os.path
+import shutil
+
+_logger = ir_datasets.log.easy()
+
+NAME = 'msmarco-document-v2.1'
+
+
+class MsMarcoV21SegmentedDocument(NamedTuple):
+    doc_id: str
+    url: str
+    title: str
+    headings: str
+    segment: str
+    start_char: int
+    end_char: int
+    def default_text(self):
+        """
+        title + headings + segment
+        This is consistent with the MsMarcoV21Document that returns the full text alternative of this: title + headings + body
+        Please note that Anserini additionally returns the url, i.e., Anserini returns url + title + headings + segment
+        E.g., https://github.com/castorini/anserini/blob/b8ce19f56bc4e85056ef703322f76646804ec640/src/main/java/io/anserini/collection/MsMarcoV2DocCollection.java#L169
+        """
+        return f'{self.title} {self.headings} {self.segment}'
+
+
+def ensure_file_is_extracted(file_name):
+    if os.path.isfile(file_name):
+        return
+    import tempfile
+    tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1]
+    
+    with gzip.open(file_name + '.gz', 'rb') as f_in:
+        with open(tmp_file, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    shutil.move(tmp_file, file_name)
+
+
+class MsMarcoV21SegmentedDocStore(ir_datasets.indices.Docstore):    
+    def __init__(self, doc_cls, dlc, base_path):
+        super().__init__(doc_cls)
+        self.dlc = dlc
+        self.cache = None
+        self.base_path = base_path
+
+    def built(self):
+        return False
+
+    def build(self):
+        if self.cache:
+            return
+        self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc_segmented"), self.base_path)
+
+        for i in range(0, 59):
+            ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json")
+
+
+    def get(self, doc_id, field=None):
+        (string1, string2, string3, bundlenum, doc_position, position) = doc_id.split("_")
+        assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc"
+
+        with open(
+            f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{bundlenum}.json", "rt", encoding="utf8"
+        ) as in_fh:
+            in_fh.seek(int(position))
+            json_string = in_fh.readline()
+            document = json.loads(json_string)
+
+            assert document["docid"] == doc_id
+            return MsMarcoV21SegmentedDocument(
+                document['docid'],
+                document['url'],
+                document['title'],
+                document['headings'],
+                document['segment'],
+                document['start_char'],
+                document['end_char']
+            )
+        
+
+class MsMarcoV21Docs(BaseDocs):
+    _fields = ["doc_id"]
+    def __init__(self, dlc):
+        super().__init__()
+        self._dlc = dlc
+
+    @ir_datasets.util.use_docstore
+    def docs_iter(self):
+        with self._dlc.stream() as stream, \
+             tarfile.open(fileobj=stream, mode='r|') as tarf:
+            for record in tarf:
+                if not record.name.endswith('.gz'):
+                    continue
+                file = tarf.extractfile(record)
+                with gzip.open(file) as file:
+                    for line in file:
+                        data = json.loads(line)
+                        yield MsMarcoV21SegmentedDocument(
+                            data['docid'],
+                            data['url'],
+                            data['title'],
+                            data['headings'],
+                            data['segment'],
+                            data['start_char'],
+                            data['end_char'],
+                        )
+
+    def docs_cls(self):
+        return MsMarcoV21SegmentedDocument
+
+    def __iter__():
+        pass
+
+    def docs_store(self, field='doc_id'):
+        ds = MsMarcoV21SegmentedDocStore(self, self._dlc, 
+        ir_datasets.util.home_path() / NAME / "docs-segmented")
+        ds.build()
+        return ds
+
+    def docs_count(self):
+        if self.docs_store().built():
+            return self.docs_store().count()
+
+    def docs_namespace(self):
+        return NAME
+
+    def docs_lang(self):
+        return 'en'
+
+
+def _init():
+    base_path = ir_datasets.util.home_path()/NAME
+    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
+    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
+    collection = MsMarcoV21Docs(dlc['docs-segmented'])
+    subsets = {}
+
+    ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_')))
+    collection
+    print(collection.docs_iter().__next__())
+    
+    return collection, subsets
+
+collection, subsets = _init()
diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py
index 2c91fe5..14c8493 100644
--- a/test/integration/msmarco_document_v2_1.py
+++ b/test/integration/msmarco_document_v2_1.py
@@ -1,7 +1,7 @@
 import re
 import unittest
 import ir_datasets
-from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV2Document
+from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV21Document
 from ir_datasets.formats import TrecQrel, GenericQuery
 from .base import DatasetIntegrationTest
 
@@ -12,21 +12,21 @@
 class TestMSMarcoV21Docs(DatasetIntegrationTest):
     def test_ms_marco_docs_iter_full(self):
         self._test_docs('msmarco-document-v2.1', count=5371, items={
-            0: MsMarcoV2Document(
+            0: MsMarcoV21Document(
                 doc_id='msmarco_v2.1_doc_12_0',
                 title='Who Is Ringo Starr\'s Wife Barbara Bach and How Many Children Do They Have?',
                 url='https://answersafrica.com/ringo-starrs-wife-children.html',
                 headings=re.compile('.*Wife Barbara Bach.*'),
                 body=re.compile('^Who Is Ringo Starr\'s Wife Barbara Bach.*')
             ),
-            9: MsMarcoV2Document(
+            9: MsMarcoV21Document(
                 doc_id='msmarco_v2.1_doc_12_70974',
                 title='List of Robin Williams Movies and TV Shows From Best To Worst',
                 url='https://answersafrica.com/robin-williams-movies-tv-shows.html',
                 headings=re.compile('List of Robin Williams Movies and TV Shows.*'),
                 body=re.compile('List of Robin Williams Movies and TV Shows From Best To Worst\nList of Robin Williams Movies and TV Shows From Best To Worst*')
             ),
-            5370: MsMarcoV2Document(
+            5370: MsMarcoV21Document(
                 doc_id='msmarco_v2.1_doc_12_48692010',
                 title='Warriors of Waterdeep 2.11.13 (Mod) latest',
                 url='https://apkdry.com/warriors-of-waterdeep-2-3-24-mod/',
diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py
new file mode 100644
index 0000000..151ac90
--- /dev/null
+++ b/test/integration/msmarco_document_v2_1_segmented.py
@@ -0,0 +1,72 @@
+import re
+import unittest
+import ir_datasets
+from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument
+from ir_datasets.formats import TrecQrel, GenericQuery
+from .base import DatasetIntegrationTest
+
+
+_logger = ir_datasets.log.easy()
+
+
+class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest):
+    def test_ms_marco_docs_iter_full(self):
+        self._test_docs('msmarco-document-v2.1/segmented', count=5371, items={
+            0: MsMarcoV21SegmentedDocument(
+                doc_id='msmarco_v2.1_doc_42_0#0_0',
+                title='How to Use Flip Tool in GIMP',
+                url='https://www.guidingtech.com/use-flip-tool-gimp/',
+                headings=re.compile('^How to Use Flip Tool in\\nGIMP\\n\\nHow to Use Flip Tool in GIMP.*'),
+                segment=re.compile('^How to Use Flip Tool in GIMP\\nHow to Use Flip Tool in GIMP\\nMehvish\\n06 Sep 2019.*'),
+                start_char=0,
+                end_char=800
+            ),
+           19: MsMarcoV21SegmentedDocument(
+                doc_id='msmarco_v2.1_doc_42_6080#2_22424',
+                title='How to Setup and Use FTP Server on Android',
+                url='https://www.guidingtech.com/use-ftp-server-file-transfer-android/',
+                headings=re.compile('How to Set\\u00adup and Use\\nFTP\nServ\\u00ader on Android\.*'),
+                segment=re.compile('^Also Read: Best Alternatives to Google Apps\\nIn this post.*'),
+                start_char=1032,
+                end_char=1959
+            ),
+            5370: MsMarcoV21SegmentedDocument(
+                doc_id='msmarco_v2.1_doc_42_3400697#6_9024928',
+                title='Can Guinea Pigs Eat Leaves? - Guinea Pig Tube',
+                url='https://www.guineapigtube.com/can-guinea-pigs-eat-leaves/',
+                headings=re.compile('^Can Guinea Pigs Eat Leaves\?\\nCan Guinea Pigs Eat Leaves\?.*'),
+                segment=re.compile('^They protect the body from free radical damage. The free radicals cause many health problems and also cause premature aging in guinea pigs.*'),
+                start_char=2954,
+                end_char=3767,
+            ),
+        })
+
+    def test_fast_ms_marco_docs_store(self):
+        docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store()
+
+        doc = docs_store.get('msmarco_v2.1_doc_02_968#0_1561')
+        self.assertEqual('msmarco_v2.1_doc_02_968#0_1561', doc.doc_id)
+
+        doc = docs_store.get('msmarco_v2.1_doc_03_0#3_5523')
+        self.assertEqual('msmarco_v2.1_doc_03_0#3_5523', doc.doc_id)
+
+    def test_fast_docs_store_on_non_existing_documents(self):
+        docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store()
+
+        with self.assertRaises(Exception) as context:
+            doc = docs_store.get('msmarco_v2.1_doc_02_968#0_156')
+
+        self.assertTrue('Expecting value: line 1 column 1' in str(context.exception))
+
+    def test_fast_ms_marco_docs_iter(self):
+        # faster alternative to above
+        docs_iter = ir_datasets.load('msmarco-document-v2.1/segmented').docs_iter()
+        first_doc = docs_iter.__next__()
+        second_doc = docs_iter.__next__()
+
+        self.assertEqual('msmarco_v2.1_doc_42_0#0_0', first_doc.doc_id)
+        self.assertEqual('msmarco_v2.1_doc_42_0#1_1311', second_doc.doc_id)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 973751c7b4abcf9334c9989750a43e52b1839e76 Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Mon, 5 Aug 2024 22:44:53 +0200
Subject: [PATCH 04/13] add doc counts

---
 ir_datasets/datasets/msmarco_document_v2_1.py           | 3 +++
 ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 5 +----
 test/integration/msmarco_document_v2_1.py               | 5 +++++
 test/integration/msmarco_document_v2_1_segmented.py     | 6 ++++++
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index e73f061..6da5a41 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -74,6 +74,9 @@ def docs_store(self, field='doc_id'):
         ds.build()
         return ds
 
+    def docs_count(self):
+        return 10960555
+
 def _init():
     base_path = ir_datasets.util.home_path()/NAME
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
index 6c6b9f5..7c04612 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
@@ -130,8 +130,7 @@ def docs_store(self, field='doc_id'):
         return ds
 
     def docs_count(self):
-        if self.docs_store().built():
-            return self.docs_store().count()
+        return 113520750
 
     def docs_namespace(self):
         return NAME
@@ -148,8 +147,6 @@ def _init():
     subsets = {}
 
     ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_')))
-    collection
-    print(collection.docs_iter().__next__())
     
     return collection, subsets
 
diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py
index 14c8493..0dcfe24 100644
--- a/test/integration/msmarco_document_v2_1.py
+++ b/test/integration/msmarco_document_v2_1.py
@@ -61,6 +61,11 @@ def test_fast_ms_marco_docs_iter(self):
         self.assertEqual('msmarco_v2.1_doc_12_0', first_doc.doc_id)
         self.assertEqual('msmarco_v2.1_doc_12_5689', second_doc.doc_id)
 
+    def test_fast_docs_count(self):
+        expected = 10960555
+        actual = ir_datasets.load('msmarco-document-v2.1').docs_count()
+
+        self.assertEqual(expected, actual)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py
index 151ac90..f0ca5cb 100644
--- a/test/integration/msmarco_document_v2_1_segmented.py
+++ b/test/integration/msmarco_document_v2_1_segmented.py
@@ -67,6 +67,12 @@ def test_fast_ms_marco_docs_iter(self):
         self.assertEqual('msmarco_v2.1_doc_42_0#0_0', first_doc.doc_id)
         self.assertEqual('msmarco_v2.1_doc_42_0#1_1311', second_doc.doc_id)
 
+    def test_fast_docs_count(self):
+        expected = 113520750
+        actual = ir_datasets.load('msmarco-document-v2.1/segmented').docs_count()
+
+        self.assertEqual(expected, actual)
+
 
 if __name__ == '__main__':
     unittest.main()

From ae3e7780beb91fd050754af312606903af953abe Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Wed, 7 Aug 2024 07:27:53 +0200
Subject: [PATCH 05/13] add extraction of msmarco v2.1 docs

---
 ir_datasets/datasets/msmarco_document_v2_1.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index 6da5a41..ad71a60 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -1,6 +1,8 @@
 import contextlib
 import gzip
 import io
+import os
+import shutil
 from pathlib import Path
 import json
 from typing import NamedTuple, Tuple, List
@@ -23,6 +25,17 @@ class MsMarcoV21Document(MsMarcoV2Document):
     # Identical to V2 Document
     pass
 
+def ensure_file_is_extracted(file_name):
+    if os.path.isfile(file_name):
+        return
+    import tempfile
+    tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1]
+
+    with gzip.open(file_name + '.gz', 'rb') as f_in:
+        with open(tmp_file, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    shutil.move(tmp_file, file_name)
+
 class MsMarcoV21DocStore(ir_datasets.indices.Docstore):    
     def __init__(self, doc_cls, dlc, base_path):
         super().__init__(doc_cls)
@@ -37,6 +50,9 @@ def build(self):
         if self.cache:
             return
         self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path)
+        for i in range(0, 59):
+            ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json")
+
 
     def get(self, doc_id, field=None):
         (string1, string2, string3, bundlenum, position) = doc_id.split("_")

From 8d1fbb8a8500f9113f3435d2726c9af5b79dcc9c Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Thu, 8 Aug 2024 13:45:16 +0200
Subject: [PATCH 06/13] add trec 2024 rag queries

---
 ir_datasets/datasets/msmarco_document_v2_1.py           | 7 +++++++
 ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 6 ++++++
 ir_datasets/etc/downloads.json                          | 7 ++++++-
 test/integration/msmarco_document_v2_1.py               | 8 ++++++++
 test/integration/msmarco_document_v2_1_segmented.py     | 7 +++++++
 5 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index ad71a60..a8fe444 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -100,7 +100,14 @@ def _init():
     collection = MsMarcoV21Docs(dlc['docs'])
     subsets = {}
 
+    subsets['trec-rag-2024'] = Dataset(
+        collection,
+        TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'),
+    )
+
     ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
+    for s in sorted(subsets):
+        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
     
     return collection, subsets
 
diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
index 7c04612..215c8ac 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
@@ -145,8 +145,14 @@ def _init():
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
     collection = MsMarcoV21Docs(dlc['docs-segmented'])
     subsets = {}
+    subsets['trec-rag-2024'] = Dataset(
+        collection,
+        TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'),
+    )
 
     ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_')))
+    for s in sorted(subsets):
+        ir_datasets.registry.register(f'{NAME}/segmented/{s}', Dataset(subsets[s], documentation(s)))
     
     return collection, subsets
 
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index cc0fb44..4882b18 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -4728,9 +4728,14 @@
       "expected_md5_foo": "3799e7611efffd8daeb257e9ccca4d60",
       "cache_path": "msmarco_v2.1_doc_segmented.tar",
       "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}}
+    },
+    "rag-2024-test-topics": {
+      "url": "https://trec-rag.github.io/assets/txt/topics.rag24.test.txt",
+      "size_hint": 19517,
+      "expected_md5": "5bd6c8fa0e1300233fe139bae8288d09",
+      "cache_path": "trec-rag-2024-topics-test.txt"
     }
   },
-
   "msmarco-passage": {
     "collectionandqueries": {
       "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py
index 0dcfe24..4c6bbb3 100644
--- a/test/integration/msmarco_document_v2_1.py
+++ b/test/integration/msmarco_document_v2_1.py
@@ -67,5 +67,13 @@ def test_fast_docs_count(self):
 
         self.assertEqual(expected, actual)
 
+    def test_fast_queries(self):
+        self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={
+            0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'),
+            9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'),
+            300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'),
+        })
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py
index f0ca5cb..bd1fc6a 100644
--- a/test/integration/msmarco_document_v2_1_segmented.py
+++ b/test/integration/msmarco_document_v2_1_segmented.py
@@ -73,6 +73,13 @@ def test_fast_docs_count(self):
 
         self.assertEqual(expected, actual)
 
+    def test_fast_queries(self):
+        self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={
+            0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'),
+            9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'),
+            300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'),
+        })
+
 
 if __name__ == '__main__':
     unittest.main()

From 05eb55b7d43e4ef383f1ae86df8688ccab6c5a66 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Sat, 10 Aug 2024 15:11:55 +0100
Subject: [PATCH 07/13] fix error in configuration

---
 ir_datasets/datasets/msmarco_document_v2_1.py           | 2 +-
 ir_datasets/datasets/msmarco_document_v2_1_segmented.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index a8fe444..3b33fed 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -49,7 +49,7 @@ def built(self):
     def build(self):
         if self.cache:
             return
-        self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc"), self.base_path)
+        self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc")
         for i in range(0, 59):
             ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json")
 
diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
index 215c8ac..fa048d1 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
@@ -61,7 +61,7 @@ def built(self):
     def build(self):
         if self.cache:
             return
-        self.cache = Cache(TarExtractAll(self.dlc, "msmarco_v2.1_doc_segmented"), self.base_path)
+        self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc_segmented")
 
         for i in range(0, 59):
             ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json")

From f548ead514e742531cb2a40b6f8fb0e94f670925 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Sun, 11 Aug 2024 17:40:17 +0100
Subject: [PATCH 08/13] wip by making use of v2 classes

---
 ir_datasets/datasets/__init__.py              |   2 +-
 ir_datasets/datasets/msmarco_document_v2.py   |  25 +--
 ir_datasets/datasets/msmarco_document_v2_1.py |  97 +----------
 .../msmarco_document_v2_1_segmented.py        | 159 ------------------
 ir_datasets/datasets/msmarco_passage_v2.py    |  63 ++++---
 ir_datasets/datasets/msmarco_segment_v2_1.py  |  72 ++++++++
 ir_datasets/etc/downloads.json                |  16 +-
 test/integration/msmarco_document_v2_1.py     |  64 +------
 .../msmarco_document_v2_1_segmented.py        |  85 ----------
 test/integration/msmarco_segment_v2_1.py      |  30 ++++
 10 files changed, 180 insertions(+), 433 deletions(-)
 delete mode 100644 ir_datasets/datasets/msmarco_document_v2_1_segmented.py
 create mode 100644 ir_datasets/datasets/msmarco_segment_v2_1.py
 delete mode 100644 test/integration/msmarco_document_v2_1_segmented.py
 create mode 100644 test/integration/msmarco_segment_v2_1.py

diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index a5d5c59..77b35d0 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -30,7 +30,7 @@
 from . import msmarco_document
 from . import msmarco_document_v2
 from . import msmarco_document_v2_1
-from . import msmarco_document_v2_1_segmented
+from . import msmarco_segment_v2_1
 from . import msmarco_passage
 from . import msmarco_passage_v2
 from . import msmarco_qna
diff --git a/ir_datasets/datasets/msmarco_document_v2.py b/ir_datasets/datasets/msmarco_document_v2.py
index 39b9658..d3ae586 100644
--- a/ir_datasets/datasets/msmarco_document_v2.py
+++ b/ir_datasets/datasets/msmarco_document_v2.py
@@ -36,17 +36,21 @@ def default_text(self):
 
 
 class MsMarcoV2Docs(BaseDocs):
-    def __init__(self, dlc):
+    def __init__(self, dlc, docid_prefix='msmarco_doc_', docstore_size_hint=66500029281, name=NAME):
         super().__init__()
         self._dlc = dlc
+        self._docid_prefix = docid_prefix
+        self._docstore_size_hint = docstore_size_hint
+        self._name = name
 
     @ir_datasets.util.use_docstore
     def docs_iter(self):
-        with self._dlc.stream() as stream, \
-             tarfile.open(fileobj=stream, mode='r|') as tarf:
-            for record in tarf:
-                if not record.name.endswith('.gz'):
-                    continue
+        with tarfile.open(self._dlc.path(), mode='r:') as tarf:
+            # since there's no compression, it's fast to scan all records and sort them.
+            # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this
+            # addressed that problem.
+            records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name)
+            for record in records:
                 file = tarf.extractfile(record)
                 with gzip.open(file) as file:
                     for line in file:
@@ -84,18 +88,17 @@ def docs_store(self, field='doc_id'):
             data_cls=self.docs_cls(),
             lookup_field=field,
             index_fields=['doc_id'],
-            key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure
-            size_hint=66500029281,
-            count_hint=ir_datasets.util.count_hint(NAME),
+            key_field_prefix=self._docid_prefix, # cut down on storage by removing prefix in lookup structure
+            size_hint=self._docstore_size_hint,
+            count_hint=ir_datasets.util.count_hint(self._name),
         )
-        # return MsMArcoV2DocStore(self)
 
     def docs_count(self):
         if self.docs_store().built():
             return self.docs_store().count()
 
     def docs_namespace(self):
-        return NAME
+        return self._name
 
     def docs_lang(self):
         return 'en'
diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index 3b33fed..c292108 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -1,103 +1,20 @@
-import contextlib
-import gzip
-import io
-import os
-import shutil
-from pathlib import Path
-import json
-from typing import NamedTuple, Tuple, List
-import tarfile
 import ir_datasets
-from ir_datasets.indices import PickleLz4FullStore
-from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract
-from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
-from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
-from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
-from ir_datasets.datasets.msmarco_document import TREC_DL_QRELS_DEFS
-from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs, MsMarcoV2Document
+from ir_datasets.util import DownloadConfig
+from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.formats import TsvQueries
+from ir_datasets.datasets.msmarco_passage import DUA
+from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs
 
 _logger = ir_datasets.log.easy()
 
 NAME = 'msmarco-document-v2.1'
 
-
-class MsMarcoV21Document(MsMarcoV2Document):
-    # Identical to V2 Document
-    pass
-
-def ensure_file_is_extracted(file_name):
-    if os.path.isfile(file_name):
-        return
-    import tempfile
-    tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1]
-
-    with gzip.open(file_name + '.gz', 'rb') as f_in:
-        with open(tmp_file, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    shutil.move(tmp_file, file_name)
-
-class MsMarcoV21DocStore(ir_datasets.indices.Docstore):    
-    def __init__(self, doc_cls, dlc, base_path):
-        super().__init__(doc_cls)
-        self.dlc = dlc
-        self.cache = None
-        self.base_path = base_path
-
-    def built(self):
-        return False
-
-    def build(self):
-        if self.cache:
-            return
-        self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc")
-        for i in range(0, 59):
-            ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_{i:02d}.json")
-
-
-    def get(self, doc_id, field=None):
-        (string1, string2, string3, bundlenum, position) = doc_id.split("_")
-        assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc"
-
-        with open(
-            f"{self.cache.path()}/msmarco_v2.1_doc_{bundlenum}.json", "rt", encoding="utf8"
-        ) as in_fh:
-            in_fh.seek(int(position))
-            json_string = in_fh.readline()
-            document = json.loads(json_string)
-
-            assert document["docid"] == doc_id
-            return MsMarcoV21Document(
-                document['docid'],
-                document['url'],
-                document['title'],
-                document['headings'],
-                document['body'])
-        
-        # raise KeyError(f'doc_id={doc_id} not found')
-
-
-class MsMarcoV21Docs(MsMarcoV2Docs):
-    _fields = ["doc_id"]
-    def __init__(self, dlc):
-        super().__init__(dlc)
-
-    def __iter__():
-        pass
-
-    def docs_store(self, field='doc_id'):
-        ds = MsMarcoV21DocStore(self, self._dlc, 
-        ir_datasets.util.home_path() / NAME / "docs")
-        ds.build()
-        return ds
-
-    def docs_count(self):
-        return 10960555
-
 def _init():
     base_path = ir_datasets.util.home_path()/NAME
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
-    collection = MsMarcoV21Docs(dlc['docs'])
+    # we can re-use MsMarcoV2Docs, just with a few modifications directly
+    collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=0, name=NAME)
     subsets = {}
 
     subsets['trec-rag-2024'] = Dataset(
diff --git a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py b/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
deleted file mode 100644
index fa048d1..0000000
--- a/ir_datasets/datasets/msmarco_document_v2_1_segmented.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import contextlib
-import gzip
-import io
-from pathlib import Path
-import json
-from typing import NamedTuple, Tuple, List
-import tarfile
-import ir_datasets
-from ir_datasets.util import Cache, DownloadConfig, GzipExtract, Lazy, Migrator, TarExtractAll, TarExtract
-from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries, FilteredScoredDocs, FilteredQrels
-from ir_datasets.formats import TsvQueries, TrecQrels, TrecScoredDocs, BaseDocs
-from ir_datasets.datasets.msmarco_passage import DUA, DL_HARD_QIDS_BYFOLD, DL_HARD_QIDS
-import os.path
-import shutil
-
-_logger = ir_datasets.log.easy()
-
-NAME = 'msmarco-document-v2.1'
-
-
-class MsMarcoV21SegmentedDocument(NamedTuple):
-    doc_id: str
-    url: str
-    title: str
-    headings: str
-    segment: str
-    start_char: int
-    end_char: int
-    def default_text(self):
-        """
-        title + headings + segment
-        This is consistent with the MsMarcoV21Document that returns the full text alternative of this: title + headings + body
-        Please note that Anserini additionaly returns the url. I.e., anserini returns url + title + headings + segment
-        E.g., https://github.com/castorini/anserini/blob/b8ce19f56bc4e85056ef703322f76646804ec640/src/main/java/io/anserini/collection/MsMarcoV2DocCollection.java#L169
-        """
-        return f'{self.title} {self.headings} {self.segment}'
-
-
-def ensure_file_is_extracted(file_name):
-    if os.path.isfile(file_name):
-        return
-    import tempfile
-    tmp_file = Path(tempfile.mkdtemp()) / file_name.split('/')[-1]
-    
-    with gzip.open(file_name + '.gz', 'rb') as f_in:
-        with open(tmp_file, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    shutil.move(tmp_file, file_name)
-
-
-class MsMarcoV21SegmentedDocStore(ir_datasets.indices.Docstore):    
-    def __init__(self, doc_cls, dlc, base_path):
-        super().__init__(doc_cls)
-        self.dlc = dlc
-        self.cache = None
-        self.base_path = base_path
-
-    def built(self):
-        return False
-
-    def build(self):
-        if self.cache:
-            return
-        self.cache = TarExtractAll(self.dlc, self.base_path/"msmarco_v2.1_doc_segmented")
-
-        for i in range(0, 59):
-            ensure_file_is_extracted(f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{i:02d}.json")
-
-
-    def get(self, doc_id, field=None):
-        (string1, string2, string3, bundlenum, doc_position, position) = doc_id.split("_")
-        assert string1 == "msmarco" and string2 == "v2.1" and string3 == "doc"
-
-        with open(
-            f"{self.cache.path()}/msmarco_v2.1_doc_segmented_{bundlenum}.json", "rt", encoding="utf8"
-        ) as in_fh:
-            in_fh.seek(int(position))
-            json_string = in_fh.readline()
-            document = json.loads(json_string)
-
-            assert document["docid"] == doc_id
-            return MsMarcoV21SegmentedDocument(
-                document['docid'],
-                document['url'],
-                document['title'],
-                document['headings'],
-                document['segment'],
-                document['start_char'],
-                document['end_char']
-            )
-        
-
-class MsMarcoV21Docs(BaseDocs):
-    _fields = ["doc_id"]
-    def __init__(self, dlc):
-        super().__init__()
-        self._dlc = dlc
-
-    @ir_datasets.util.use_docstore
-    def docs_iter(self):
-        with self._dlc.stream() as stream, \
-             tarfile.open(fileobj=stream, mode='r|') as tarf:
-            for record in tarf:
-                if not record.name.endswith('.gz'):
-                    continue
-                file = tarf.extractfile(record)
-                with gzip.open(file) as file:
-                    for line in file:
-                        data = json.loads(line)
-                        yield MsMarcoV21SegmentedDocument(
-                            data['docid'],
-                            data['url'],
-                            data['title'],
-                            data['headings'],
-                            data['segment'],
-                            data['start_char'],
-                            data['end_char'],
-                        )
-
-    def docs_cls(self):
-        return MsMarcoV21SegmentedDocument
-
-    def __iter__():
-        pass
-
-    def docs_store(self, field='doc_id'):
-        ds = MsMarcoV21SegmentedDocStore(self, self._dlc, 
-        ir_datasets.util.home_path() / NAME / "docs-segmented")
-        ds.build()
-        return ds
-
-    def docs_count(self):
-        return 113520750
-
-    def docs_namespace(self):
-        return NAME
-
-    def docs_lang(self):
-        return 'en'
-
-
-def _init():
-    base_path = ir_datasets.util.home_path()/NAME
-    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
-    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
-    collection = MsMarcoV21Docs(dlc['docs-segmented'])
-    subsets = {}
-    subsets['trec-rag-2024'] = Dataset(
-        collection,
-        TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'),
-    )
-
-    ir_datasets.registry.register(NAME + '/segmented', Dataset(collection, documentation('_')))
-    for s in sorted(subsets):
-        ir_datasets.registry.register(f'{NAME}/segmented/{s}', Dataset(subsets[s], documentation(s)))
-    
-    return collection, subsets
-
-collection, subsets = _init()
diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py
index ea43d37..7dc572e 100644
--- a/ir_datasets/datasets/msmarco_passage_v2.py
+++ b/ir_datasets/datasets/msmarco_passage_v2.py
@@ -46,11 +46,22 @@ def parse_msmarco_passage(line):
         data['docid'])
 
 
+def passage_bundle_pos_from_key(key):
+    (string1, string2, bundlenum, position) = key.split('_')
+    assert string1 == 'msmarco' and string2 == 'passage'
+    return f'msmarco_passage_{bundlenum}', position
+
 class MsMarcoV2Passages(BaseDocs):
-    def __init__(self, dlc, pos_dlc=None):
+    def __init__(self, dlc, pos_dlc=None, cls=MsMarcoV2Passage, parse_passage=parse_msmarco_passage, name=NAME, docstore_size_hint=60880127751, bundle_pos_from_key=passage_bundle_pos_from_key, count=138_364_198):
         super().__init__()
         self._dlc = dlc
         self._pos_dlc = pos_dlc
+        self._cls = cls
+        self._parse_passage = parse_passage
+        self._name = name
+        self._docstore_size_hint = docstore_size_hint
+        self._bundle_pos_from_key = bundle_pos_from_key
+        self._count = count
 
     @ir_datasets.util.use_docstore
     def docs_iter(self):
@@ -59,30 +70,31 @@ def docs_iter(self):
             # files are used (i.e., no filtering is applied)
             yield from self.docs_store()
         else:
-            with self._dlc.stream() as stream, \
-                 tarfile.open(fileobj=stream, mode='r|') as tarf:
-                for record in tarf:
-                    if not record.name.endswith('.gz'):
-                        continue
+            with tarfile.open(self._dlc.path(), mode='r:') as tarf:
+                # since there's no compression, it's fast to scan all records and sort them.
+                # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this
+                # addressed that problem.
+                records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name)
+                for record in records:
                     file = tarf.extractfile(record)
                     with gzip.open(file) as file:
                         for line in file:
-                            yield parse_msmarco_passage(line)
+                            yield self._parse_passage(line)
 
     def docs_cls(self):
-        return MsMarcoV2Passage
+        return self._cls
 
     def docs_store(self, field='doc_id'):
         assert field == 'doc_id'
         # Unlike for msmarco-document-v2, using the docstore actually hurts performance.
-        return MsMarcoV2DocStore(self)
+        return MsMarcoV2DocStore(self, size_hint=self._docstore_size_hint, count=self._count)
 
     def docs_count(self):
         if self.docs_store().built():
             return self.docs_store().count()
 
     def docs_namespace(self):
-        return NAME
+        return self._name
 
     def docs_lang(self):
         return 'en'
@@ -92,7 +104,7 @@ def docs_path(self, force=True):
 
 
 class MsMarcoV2DocStore(ir_datasets.indices.Docstore):
-    def __init__(self, docs_handler):
+    def __init__(self, docs_handler, size_hint=60880127751, count=138_364_198):
         super().__init__(docs_handler.docs_cls(), 'doc_id')
         self.np = ir_datasets.lazy_libs.numpy()
         self.docs_handler = docs_handler
@@ -101,7 +113,8 @@ def __init__(self, docs_handler):
         self.base_path = docs_handler.docs_path(force=False) + '.extracted'
         if not os.path.exists(self.base_path):
             os.makedirs(self.base_path)
-        self.size_hint = 60880127751
+        self.size_hint = size_hint
+        self._count = count
 
     def get_many_iter(self, keys):
         self.build()
@@ -110,20 +123,19 @@ def get_many_iter(self, keys):
         for key in keys:
             if not key.count('_') == 3:
                 continue
-            (string1, string2, bundlenum, position) = key.split('_')
-            assert string1 == 'msmarco' and string2 == 'passage'
+            bundlenum, position = self.docs_handler._bundle_pos_from_key(key)
             if bundlenum not in bundles:
                 bundles[bundlenum] = []
             bundles[bundlenum].append(int(position))
         for bundlenum, positions in bundles.items():
             positions = sorted(positions)
-            file = f'{self.base_path}/msmarco_passage_{bundlenum}'
+            file = f'{self.base_path}/{bundlenum}'
             if not os.path.exists(file):
                 # invalid doc_id -- doesn't point to a real bundle
                 continue
             if self.docs_handler._pos_dlc is not None:
                 # check the positions are valid for these doc_ids -- only return valid ones
-                mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'msmarco_passage_{bundlenum}.pos'), dtype='<u4')
+                mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'{bundlenum}.pos'), dtype='<u4')
                 positions = self.np.array(positions, dtype='<u4')
                 positions = positions[self.np.isin(positions, mmp)].tolist()
                 del mmp
@@ -131,7 +143,7 @@ def get_many_iter(self, keys):
                 for position in positions:
                     in_fh.seek(position)
                     try:
-                        yield parse_msmarco_passage(in_fh.readline())
+                        yield self.docs_handler._parse_passage(in_fh.readline())
                     except json.JSONDecodeError:
                         # invalid doc_id -- pointed to a wrong position
                         pass
@@ -141,12 +153,9 @@ def build(self):
             return
         np = ir_datasets.lazy_libs.numpy()
         ir_datasets.util.check_disk_free(self.base_path, self.size_hint)
-        with _logger.pbar_raw('extracting source documents', total=70, unit='file') as pbar, \
-             self.dlc.stream() as stream, \
-             tarfile.open(fileobj=stream, mode='r|') as tarf:
-            for record in tarf:
-                if not record.name.endswith('.gz'):
-                    continue
+        with tarfile.open(self.dlc.path(), mode='r:') as tarf:
+            records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name)
+            for record in _logger.pbar(records, desc='extracting source documents'):
                 file = tarf.extractfile(record)
                 fname = record.name.split('/')[-1][:-len('.gz')]
                 positions = []
@@ -158,7 +167,6 @@ def build(self):
                 # keep track of the positions for efficient slicing
                 with open(os.path.join(self.base_path, f'{fname}.pos'), 'wb') as posout:
                     posout.write(np.array(positions, dtype='<u4').tobytes())
-                pbar.update(1)
         (Path(self.base_path) / '_built').touch()
 
     def built(self):
@@ -169,14 +177,15 @@ def __iter__(self):
         return MsMarcoV2PassageIter(self, slice(0, self.count()))
 
     def _iter_source_files(self):
-        for i in range(70):
-            yield os.path.join(self.base_path, f'msmarco_passage_{i:02d}')
+        for path in sorted(os.listdir(self.base_path)):
+            if path.startswith('msmarco_') and not path.endswith('.pos'):
+                yield os.path.join(self.base_path, path)
 
     def count(self):
         if self.docs_handler._pos_dlc is not None:
             base_path = self.pos_dlc.path()
             return sum(os.path.getsize(os.path.join(base_path, f)) for f in os.listdir(base_path)) // 4
-        return 138_364_198
+        return self._count
 
 
 class MsMarcoV2PassageIter:
diff --git a/ir_datasets/datasets/msmarco_segment_v2_1.py b/ir_datasets/datasets/msmarco_segment_v2_1.py
new file mode 100644
index 0000000..4574aa8
--- /dev/null
+++ b/ir_datasets/datasets/msmarco_segment_v2_1.py
@@ -0,0 +1,72 @@
+import json
+from typing import NamedTuple
+import ir_datasets
+from ir_datasets.util import DownloadConfig
+from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.formats import TsvQueries
+from ir_datasets.datasets.msmarco_passage import DUA
+from ir_datasets.datasets.msmarco_passage_v2 import MsMarcoV2Passages
+
+_logger = ir_datasets.log.easy()
+
+NAME = 'msmarco-segment-v2.1'
+
+
+class MsMarcoV21SegmentedDoc(NamedTuple):
+    doc_id: str
+    url: str
+    title: str
+    headings: str
+    segment: str
+    start_char: int
+    end_char: int
+    def default_text(self):
+        """
+        title + headings + segment
+        This is consistent with MsMarcoV2Document (re-used for the v2.1 documents), which returns the full-text alternative of this: title + headings + body
+        Please note that Anserini additionally returns the url, i.e., Anserini returns url + title + headings + segment
+        E.g., https://github.com/castorini/anserini/blob/b8ce19f56bc4e85056ef703322f76646804ec640/src/main/java/io/anserini/collection/MsMarcoV2DocCollection.java#L169
+        """
+        return f'{self.title} {self.headings} {self.segment}'
+
+
+def parse_msmarco_segment(line):
+    data = json.loads(line)
+    return MsMarcoV21SegmentedDoc(
+        data['docid'],
+        data['url'],
+        data['title'],
+        data['headings'],
+        data['segment'],
+        data['start_char'],
+        data['end_char']
+    )
+
+
+def passage_bundle_pos_from_key(key):
+    # key like: msmarco_v2.1_doc_00_0#4_5974 -> ('msmarco_v2.1_doc_segmented_00.json', '5974')
+    first, second = key.split('#')
+    (string1, string2, string3, bundle, doc_pos) = first.split('_')
+    (segment_num, segment_pos) = second.split('_')  # segment number/position follow the '#'
+    assert string1 == 'msmarco' and string2 == 'v2.1' and string3 == 'doc'
+    return f'msmarco_v2.1_doc_segmented_{bundle}.json', segment_pos
+
+
+def _init():
+    base_path = ir_datasets.util.home_path()/NAME
+    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
+    dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
+    collection = MsMarcoV2Passages(dlc['docs'], cls=MsMarcoV21SegmentedDoc, parse_passage=parse_msmarco_segment, name=NAME, bundle_pos_from_key=passage_bundle_pos_from_key, count=113_520_750)
+    subsets = {}
+    subsets['trec-rag-2024'] = Dataset(
+        collection,
+        TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'),
+    )
+
+    ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
+    for s in sorted(subsets):
+        ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
+    
+    return collection, subsets
+
+collection, subsets = _init()
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index 4882b18..968ad11 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -4718,14 +4718,23 @@
     "docs": {
       "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc.tar",
       "size_hint": 30844989440,
-      "expected_md5_foo": "a5950665d6448d3dbaf7135645f1e074",
+      "expected_md5": "a5950665d6448d3dbaf7135645f1e074",
       "cache_path": "msmarco_v2.1_doc.tar",
       "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}}
     },
-    "docs-segmented": {
+    "rag-2024-test-topics": {
+      "url": "https://trec-rag.github.io/assets/txt/topics.rag24.test.txt",
+      "size_hint": 19517,
+      "expected_md5": "5bd6c8fa0e1300233fe139bae8288d09",
+      "cache_path": "trec-rag-2024-topics-test.txt"
+    }
+  },
+
+  "msmarco-segment-v2.1": {
+    "docs": {
       "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc_segmented.tar",
       "size_hint": 26918768640,
-      "expected_md5_foo": "3799e7611efffd8daeb257e9ccca4d60",
+      "expected_md5": "3799e7611efffd8daeb257e9ccca4d60",
       "cache_path": "msmarco_v2.1_doc_segmented.tar",
       "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}}
     },
@@ -4736,6 +4745,7 @@
       "cache_path": "trec-rag-2024-topics-test.txt"
     }
   },
+
   "msmarco-passage": {
     "collectionandqueries": {
       "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py
index 4c6bbb3..b2680c8 100644
--- a/test/integration/msmarco_document_v2_1.py
+++ b/test/integration/msmarco_document_v2_1.py
@@ -1,7 +1,7 @@
 import re
 import unittest
 import ir_datasets
-from ir_datasets.datasets.msmarco_document_v2_1 import MsMarcoV21Document
+from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Document
 from ir_datasets.formats import TrecQrel, GenericQuery
 from .base import DatasetIntegrationTest
 
@@ -10,64 +10,14 @@
 
 
 class TestMSMarcoV21Docs(DatasetIntegrationTest):
-    def test_ms_marco_docs_iter_full(self):
-        self._test_docs('msmarco-document-v2.1', count=5371, items={
-            0: MsMarcoV21Document(
-                doc_id='msmarco_v2.1_doc_12_0',
-                title='Who Is Ringo Starr\'s Wife Barbara Bach and How Many Children Do They Have?',
-                url='https://answersafrica.com/ringo-starrs-wife-children.html',
-                headings=re.compile('.*Wife Barbara Bach.*'),
-                body=re.compile('^Who Is Ringo Starr\'s Wife Barbara Bach.*')
-            ),
-            9: MsMarcoV21Document(
-                doc_id='msmarco_v2.1_doc_12_70974',
-                title='List of Robin Williams Movies and TV Shows From Best To Worst',
-                url='https://answersafrica.com/robin-williams-movies-tv-shows.html',
-                headings=re.compile('List of Robin Williams Movies and TV Shows.*'),
-                body=re.compile('List of Robin Williams Movies and TV Shows From Best To Worst\nList of Robin Williams Movies and TV Shows From Best To Worst*')
-            ),
-            5370: MsMarcoV21Document(
-                doc_id='msmarco_v2.1_doc_12_48692010',
-                title='Warriors of Waterdeep 2.11.13 (Mod) latest',
-                url='https://apkdry.com/warriors-of-waterdeep-2-3-24-mod/',
-                headings=re.compile('^Warriors of Waterdeep 2.11.13 \(Mod\)\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nFeatures and Screenshots Warriors of Waterdeep Game for Android.*'),
-                body=re.compile('Warriors of Waterdeep 2.11.13 \(Mod\) latest\\nWarriors of Waterdeep 2.11.13 \(Mod\)\\nby Apkdry 3 weeks ago Games.*')
-            ),
+    def test_docs(self):
+        self._test_docs('msmarco-document-v2.1', count=10960555, items={
+            0: MsMarcoV2Document('msmarco_v2.1_doc_00_0', 'http://0-60.reviews/0-60-times/', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', '0-60 Times\n0-60 Times', re.compile('^0\\-60 Times \\- 0\\-60 \\| 0 to 60 Times \\& 1/4 Mile Times \\| Zero to 60 Car Reviews\n0\\-60 Times\nThere are man.{4332} biggest touted numbers for vehicles, and easier for people to relate to than horsepower and torque\\.$', flags=48)),
+            9: MsMarcoV2Document('msmarco_v2.1_doc_00_110582', 'http://003.clayton.k12.ga.us/', 'Home - Morrow High School', 'Morrow High\nMorrow High', re.compile("^Home \\- Morrow High School\nMore Options\nSelect a School\nDISTRICT\nCCPS\nElementary\nAnderson Elementary\n.{4959}oks Site\nMs\\. Cavazos' Site\nMr\\. Holbrook's Site\nMs\\. Hunt's Site\nMs\\. Lamarre's Site\nMr\\. McClain's Site$", flags=48)),
+            10960554: MsMarcoV2Document('msmarco_v2.1_doc_59_964287870', 'https://zzzzbov.com/blag/shortcut-to-zoom', 'Shortcut to Zoom › zzzzBov.com', 'Shortcut to Zoom\nShortcut to Zoom\nBatch File\nShortcut\nTrying it out\n', re.compile('^Shortcut to Zoom › zzzzBov\\.com\n07 \\- Apr \\- 2020\nShortcut to Zoom\nI use Chrome on Windows as my primar.{2297}hat adding even a few of these to my start menu will help reduce just a bit more friction in my day\\.$', flags=48)),
         })
 
-    def test_fast_ms_marco_docs_store(self):
-        docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store()
-
-        doc = docs_store.get('msmarco_v2.1_doc_12_0')
-        self.assertEqual('msmarco_v2.1_doc_12_0', doc.doc_id)
-
-        doc = docs_store.get('msmarco_v2.1_doc_12_48692010')
-        self.assertEqual('msmarco_v2.1_doc_12_48692010', doc.doc_id)
-
-    def test_fast_docs_store_on_non_existing_documents(self):
-        docs_store = ir_datasets.load('msmarco-document-v2.1').docs_store()
-
-        with self.assertRaises(Exception) as context:
-            doc = docs_store.get('msmarco_v2.1_doc_12_111')
-
-        self.assertTrue('Expecting value: line 1 column 1' in str(context.exception))
-
-    def test_fast_ms_marco_docs_iter(self):
-        # faster alternative to above
-        docs_iter = ir_datasets.load('msmarco-document-v2.1').docs_iter()
-        first_doc = docs_iter.__next__()
-        second_doc = docs_iter.__next__()
-
-        self.assertEqual('msmarco_v2.1_doc_12_0', first_doc.doc_id)
-        self.assertEqual('msmarco_v2.1_doc_12_5689', second_doc.doc_id)
-
-    def test_fast_docs_count(self):
-        expected = 10960555
-        actual = ir_datasets.load('msmarco-document-v2.1').docs_count()
-
-        self.assertEqual(expected, actual)
-
-    def test_fast_queries(self):
+    def test_queries(self):
         self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={
             0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'),
             9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'),
diff --git a/test/integration/msmarco_document_v2_1_segmented.py b/test/integration/msmarco_document_v2_1_segmented.py
deleted file mode 100644
index bd1fc6a..0000000
--- a/test/integration/msmarco_document_v2_1_segmented.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import re
-import unittest
-import ir_datasets
-from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument
-from ir_datasets.formats import TrecQrel, GenericQuery
-from .base import DatasetIntegrationTest
-
-
-_logger = ir_datasets.log.easy()
-
-
-class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest):
-    def test_ms_marco_docs_iter_full(self):
-        self._test_docs('msmarco-document-v2.1/segmented', count=5371, items={
-            0: MsMarcoV21SegmentedDocument(
-                doc_id='msmarco_v2.1_doc_42_0#0_0',
-                title='How to Use Flip Tool in GIMP',
-                url='https://www.guidingtech.com/use-flip-tool-gimp/',
-                headings=re.compile('^How to Use Flip Tool in\\nGIMP\\n\\nHow to Use Flip Tool in GIMP.*'),
-                segment=re.compile('^How to Use Flip Tool in GIMP\\nHow to Use Flip Tool in GIMP\\nMehvish\\n06 Sep 2019.*'),
-                start_char=0,
-                end_char=800
-            ),
-           19: MsMarcoV21SegmentedDocument(
-                doc_id='msmarco_v2.1_doc_42_6080#2_22424',
-                title='How to Setup and Use FTP Server on Android',
-                url='https://www.guidingtech.com/use-ftp-server-file-transfer-android/',
-                headings=re.compile('How to Set\\u00adup and Use\\nFTP\nServ\\u00ader on Android\.*'),
-                segment=re.compile('^Also Read: Best Alternatives to Google Apps\\nIn this post.*'),
-                start_char=1032,
-                end_char=1959
-            ),
-            5370: MsMarcoV21SegmentedDocument(
-                doc_id='msmarco_v2.1_doc_42_3400697#6_9024928',
-                title='Can Guinea Pigs Eat Leaves? - Guinea Pig Tube',
-                url='https://www.guineapigtube.com/can-guinea-pigs-eat-leaves/',
-                headings=re.compile('^Can Guinea Pigs Eat Leaves\?\\nCan Guinea Pigs Eat Leaves\?.*'),
-                segment=re.compile('^They protect the body from free radical damage. The free radicals cause many health problems and also cause premature aging in guinea pigs.*'),
-                start_char=2954,
-                end_char=3767,
-            ),
-        })
-
-    def test_fast_ms_marco_docs_store(self):
-        docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store()
-
-        doc = docs_store.get('msmarco_v2.1_doc_02_968#0_1561')
-        self.assertEqual('msmarco_v2.1_doc_02_968#0_1561', doc.doc_id)
-
-        doc = docs_store.get('msmarco_v2.1_doc_03_0#3_5523')
-        self.assertEqual('msmarco_v2.1_doc_03_0#3_5523', doc.doc_id)
-
-    def test_fast_docs_store_on_non_existing_documents(self):
-        docs_store = ir_datasets.load('msmarco-document-v2.1/segmented').docs_store()
-
-        with self.assertRaises(Exception) as context:
-            doc = docs_store.get('msmarco_v2.1_doc_02_968#0_156')
-
-        self.assertTrue('Expecting value: line 1 column 1' in str(context.exception))
-
-    def test_fast_ms_marco_docs_iter(self):
-        # faster alternative to above
-        docs_iter = ir_datasets.load('msmarco-document-v2.1/segmented').docs_iter()
-        first_doc = docs_iter.__next__()
-        second_doc = docs_iter.__next__()
-
-        self.assertEqual('msmarco_v2.1_doc_42_0#0_0', first_doc.doc_id)
-        self.assertEqual('msmarco_v2.1_doc_42_0#1_1311', second_doc.doc_id)
-
-    def test_fast_docs_count(self):
-        expected = 113520750
-        actual = ir_datasets.load('msmarco-document-v2.1/segmented').docs_count()
-
-        self.assertEqual(expected, actual)
-
-    def test_fast_queries(self):
-        self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={
-            0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'),
-            9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'),
-            300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'),
-        })
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/integration/msmarco_segment_v2_1.py b/test/integration/msmarco_segment_v2_1.py
new file mode 100644
index 0000000..8b4d07c
--- /dev/null
+++ b/test/integration/msmarco_segment_v2_1.py
@@ -0,0 +1,30 @@
+import re
+import unittest
+import ir_datasets
+from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument
+from ir_datasets.formats import TrecQrel, GenericQuery
+from .base import DatasetIntegrationTest
+
+
+_logger = ir_datasets.log.easy()
+
+
+class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest):
+    def test_docs(self):
+        self._build_test_docs('msmarco-segment-v2.1')
+        # self._test_docs('msmarco-segment-v2.1', count=113520750, items={
+        #     0: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#0_0', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile("^How to Use Flip Tool in GIMP\nHow to Use Flip Tool in GIMP\nMehvish\n06 Sep 2019\nAt times, the powerful.{600}re is a guide on how to flip an image in GIMP\\. There are two methods to do it\\. Let's check them out\\.$", flags=48), 0, 800),
+        #     9: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#9_9963', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile('^Flip a Layer\nTo do so, follow these steps: Step 1: Open the image in GIMP\\. Step 2: Click on the Laye.{309} click on the Image option present in the top bar and select Transform followed by your flip choice\\.$', flags=48), 2862, 3372),
+        #     113520749: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_04_1869956217#8_3169040836', 'http://www.city-data.com/city/Sedgwick-Kansas.html', 'Sedgwick, Kansas (KS 67135) profile: population, maps, real estate, averages, homes, statistics, relocation, travel, jobs, hospitals, schools, crime, moving, houses, news, sex offenders', re.compile('^Sedgwick, Kansas\nSedgwick, Kansas\nLoading data\\.\\.\\.\nCrime rates in Sedgwick by year\nType\n2007\n2011\n201.{1539}ing System \\(NFIRS\\) incidents\nSedgwick compared to Kansas state average:\nOther pages you might like:\n$', flags=48), re.compile('^79\\.8 \\(low, U\\.S\\. average is 100\\)\nSedgwick, KS residents, houses, and apartments details\nPercentage of.{7764}house built \\- Built 1939 or earlier \\(%\\) Average household size Household density \\(households per squ$', flags=48), 2037, 10000),
+        # })
+
+    def test_queries(self):
+        self._test_queries('msmarco-segment-v2.1/trec-rag-2024', count=301, items={
+            0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'),
+            9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'),
+            300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'),
+        })
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1dbf781628122ce9d1be06dbaf82129d6da6d031 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Sun, 11 Aug 2024 19:07:44 +0100
Subject: [PATCH 09/13] more wip

---
 ir_datasets/datasets/msmarco_document_v2_1.py |  2 +-
 ir_datasets/datasets/msmarco_passage_v2.py    |  7 ++++---
 ir_datasets/datasets/msmarco_segment_v2_1.py  | 14 ++++++++++----
 test/integration/msmarco_segment_v2_1.py      | 13 ++++++-------
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
index c292108..2b054c0 100644
--- a/ir_datasets/datasets/msmarco_document_v2_1.py
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -14,7 +14,7 @@ def _init():
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
     # we can re-use MsMarcoV2Docs, just with a few modifications directly
-    collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=0, name=NAME)
+    collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=59680176084, name=NAME)
     subsets = {}
 
     subsets['trec-rag-2024'] = Dataset(
diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py
index 7dc572e..e7c3de4 100644
--- a/ir_datasets/datasets/msmarco_passage_v2.py
+++ b/ir_datasets/datasets/msmarco_passage_v2.py
@@ -121,9 +121,10 @@ def get_many_iter(self, keys):
         # adapted from <https://microsoft.github.io/msmarco/TREC-Deep-Learning.html>
         bundles = {}
         for key in keys:
-            if not key.count('_') == 3:
+            try:
+                bundlenum, position = self.docs_handler._bundle_pos_from_key(key)
+            except:
                 continue
-            bundlenum, position = self.docs_handler._bundle_pos_from_key(key)
             if bundlenum not in bundles:
                 bundles[bundlenum] = []
             bundles[bundlenum].append(int(position))
@@ -227,7 +228,7 @@ def __next__(self):
                 pos = self.current_pos_mmap[self.slice.start - self.current_file_start_idx]
                 self.current_file.seek(pos)
                 self.next_index = self.slice.start
-        result = parse_msmarco_passage(self.current_file.readline())
+        result = self.docstore.docs_handler._parse_passage(self.current_file.readline())
         self.next_index += 1
         self.slice = slice(self.slice.start + (self.slice.step or 1), self.slice.stop, self.slice.step)
         return result
diff --git a/ir_datasets/datasets/msmarco_segment_v2_1.py b/ir_datasets/datasets/msmarco_segment_v2_1.py
index 4574aa8..a3a4d91 100644
--- a/ir_datasets/datasets/msmarco_segment_v2_1.py
+++ b/ir_datasets/datasets/msmarco_segment_v2_1.py
@@ -20,6 +20,8 @@ class MsMarcoV21SegmentedDoc(NamedTuple):
     segment: str
     start_char: int
     end_char: int
+    msmarco_document_id: str
+    msmarco_document_segment_idx: int
     def default_text(self):
         """
         title + headings + segment
@@ -32,6 +34,8 @@ def default_text(self):
 
 def parse_msmarco_segment(line):
     data = json.loads(line)
+    msmarco_document_id, segment_info = data['docid'].split('#')
+    segment_idx, segment_file_offset = segment_info.split('_')
     return MsMarcoV21SegmentedDoc(
         data['docid'],
         data['url'],
@@ -39,15 +43,17 @@ def parse_msmarco_segment(line):
         data['headings'],
         data['segment'],
         data['start_char'],
-        data['end_char']
+        data['end_char'],
+        msmarco_document_id,
+        int(segment_idx),
     )
 
 
-def passage_bundle_pos_from_key(key):
+def segment_bundle_pos_from_key(key):
     # key like: msmarco_v2.1_doc_00_0#4_5974
     first, second = key.split('#')
     (string1, string2, string3, bundle, doc_pos) = first.split('_')
-    (segment_num, segment_pos) = first.split('_')
+    (segment_num, segment_pos) = second.split('_')
     assert string1 == 'msmarco' and string2 == 'v2.1' and string3 == 'doc'
     return f'msmarco_v2.1_doc_segmented_{bundle}.json', segment_pos
 
@@ -56,7 +62,7 @@ def _init():
     base_path = ir_datasets.util.home_path()/NAME
     documentation = YamlDocumentation(f'docs/{NAME}.yaml')
     dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
-    collection = MsMarcoV2Passages(dlc['docs'], cls=MsMarcoV21SegmentedDoc, parse_passage=parse_msmarco_segment, name=NAME, bundle_pos_from_key=passage_bundle_pos_from_key, count=113_520_750)
+    collection = MsMarcoV2Passages(dlc['docs'], cls=MsMarcoV21SegmentedDoc, parse_passage=parse_msmarco_segment, name=NAME, bundle_pos_from_key=segment_bundle_pos_from_key, count=113_520_750, docstore_size_hint=205178702472)
     subsets = {}
     subsets['trec-rag-2024'] = Dataset(
         collection,
diff --git a/test/integration/msmarco_segment_v2_1.py b/test/integration/msmarco_segment_v2_1.py
index 8b4d07c..2fe48bf 100644
--- a/test/integration/msmarco_segment_v2_1.py
+++ b/test/integration/msmarco_segment_v2_1.py
@@ -1,7 +1,7 @@
 import re
 import unittest
 import ir_datasets
-from ir_datasets.datasets.msmarco_document_v2_1_segmented import MsMarcoV21SegmentedDocument
+from ir_datasets.datasets.msmarco_segment_v2_1 import MsMarcoV21SegmentedDoc
 from ir_datasets.formats import TrecQrel, GenericQuery
 from .base import DatasetIntegrationTest
 
@@ -11,12 +11,11 @@
 
 class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest):
     def test_docs(self):
-        self._build_test_docs('msmarco-segment-v2.1')
-        # self._test_docs('msmarco-segment-v2.1', count=113520750, items={
-        #     0: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#0_0', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile("^How to Use Flip Tool in GIMP\nHow to Use Flip Tool in GIMP\nMehvish\n06 Sep 2019\nAt times, the powerful.{600}re is a guide on how to flip an image in GIMP\\. There are two methods to do it\\. Let's check them out\\.$", flags=48), 0, 800),
-        #     9: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_42_0#9_9963', 'https://www.guidingtech.com/use-flip-tool-gimp/', 'How to Use Flip Tool in GIMP', re.compile('^How to Use Flip Tool in\nGIMP\n\nHow to Use Flip Tool in GIMP\nMehvish\n1\\. Using the Built\\-In Flip Tool\nF.{85}ol\nFlip a Layer\nFlip All Layers in GIMP\nBonus Tip: Create Mirror Effect in GIMP\nMagic of the Mirror\n$', flags=48), re.compile('^Flip a Layer\nTo do so, follow these steps: Step 1: Open the image in GIMP\\. Step 2: Click on the Laye.{309} click on the Image option present in the top bar and select Transform followed by your flip choice\\.$', flags=48), 2862, 3372),
-        #     113520749: MsMarcoV21SegmentedDocument('msmarco_v2.1_doc_04_1869956217#8_3169040836', 'http://www.city-data.com/city/Sedgwick-Kansas.html', 'Sedgwick, Kansas (KS 67135) profile: population, maps, real estate, averages, homes, statistics, relocation, travel, jobs, hospitals, schools, crime, moving, houses, news, sex offenders', re.compile('^Sedgwick, Kansas\nSedgwick, Kansas\nLoading data\\.\\.\\.\nCrime rates in Sedgwick by year\nType\n2007\n2011\n201.{1539}ing System \\(NFIRS\\) incidents\nSedgwick compared to Kansas state average:\nOther pages you might like:\n$', flags=48), re.compile('^79\\.8 \\(low, U\\.S\\. average is 100\\)\nSedgwick, KS residents, houses, and apartments details\nPercentage of.{7764}house built \\- Built 1939 or earlier \\(%\\) Average household size Household density \\(households per squ$', flags=48), 2037, 10000),
-        # })
+        self._test_docs('msmarco-segment-v2.1', count=113520750, items={
+            0: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_0#0_0', 'http://0-60.reviews/0-60-times/', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', '0-60 Times\n0-60 Times', re.compile('^0\\-60 Times \\- 0\\-60 \\| 0 to 60 Times \\& 1/4 Mile Times \\| Zero to 60 Car Reviews\n0\\-60 Times\nThere are man.{1078}used as the standard in the United States, where the rest of the world prefers the 0\\-100 km version\\.$', flags=48), 0, 1278, 'msmarco_v2.1_doc_00_0', 0),
+            9: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_4810#2_16701', 'http://0-www.worldcat.org.novacat.nova.edu/identities/lccn-n79036869/', 'Ethel Percy Andrus Gerontology Center [WorldCat Identities]', re.compile('^Ethel Percy Andrus Gerontology Center\nEthel Percy Andrus Gerontology Center\nAndrus \\(Ethel Percy\\) Ger.{409}niversity of Southern California Los Angeles, Calif Ethel Percy Andrus Gerontology Center\nLanguages\n$', flags=48), re.compile('^submitted to U\\.S\\. Department  of Health, Education, and Welfare, Public Health Service, Health Resea.{2311}e questionnaires used and the data derived from them, and how the data were collected and  analyzed\\.$', flags=48), 2265, 4776, 'msmarco_v2.1_doc_00_4810', 2),
+            113520749: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_59_964287870#4_2159633396', 'https://zzzzbov.com/blag/shortcut-to-zoom', 'Shortcut to Zoom › zzzzBov.com', 'Shortcut to Zoom\nShortcut to Zoom\nBatch File\nShortcut\nTrying it out\n', re.compile('^When it asks "What would you like to name the shortcut\\?", type the name of the meeting \\(i\\.e\\. "Standu.{333}hat adding even a few of these to my start menu will help reduce just a bit more friction in my day\\.$', flags=48), 1963, 2497, 'msmarco_v2.1_doc_59_964287870', 4),
+        })
 
     def test_queries(self):
         self._test_queries('msmarco-segment-v2.1/trec-rag-2024', count=301, items={

From 966c1ba9a248feb187ae0d5c69c041cc7b8b51aa Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Sun, 11 Aug 2024 21:28:00 +0100
Subject: [PATCH 10/13] metadata

---
 ir_datasets/etc/metadata.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
index 60bd7c8..69756b0 100644
--- a/ir_datasets/etc/metadata.json
+++ b/ir_datasets/etc/metadata.json
@@ -495,6 +495,7 @@
   "mr-tydi/th/train": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 3319}, "qrels": {"count": 3360, "fields": {"relevance": {"counts_by_value": {"1": 3360}}}}},
   "msmarco-document": {"docs": {"count": 3213835, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}},
   "msmarco-document-v2": {"docs": {"count": 11959635, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}},
+  "msmarco-document-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-document-v2.1"}, "queries": {"count": 301}},
   "msmarco-document-v2/anchor-text": {"docs": {"count": 4821244, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}},
   "msmarco-document-v2/dev1": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 4552}, "qrels": {"count": 4702, "fields": {"relevance": {"counts_by_value": {"1": 4702}}}}, "scoreddocs": {"count": 455200}},
   "msmarco-document-v2/dev2": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 5000}, "qrels": {"count": 5178, "fields": {"relevance": {"counts_by_value": {"1": 5178}}}}, "scoreddocs": {"count": 500000}},
@@ -561,6 +562,8 @@
   "msmarco-qna/dev": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101093}, "qrels": {"count": 1008985, "fields": {"relevance": {"counts_by_value": {"0": 949712, "1": 59273}}}}, "scoreddocs": {"count": 1008985}},
   "msmarco-qna/eval": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 101092}, "scoreddocs": {"count": 1008943}},
   "msmarco-qna/train": {"docs": {"_ref": "msmarco-qna"}, "queries": {"count": 808731}, "qrels": {"count": 8069749, "fields": {"relevance": {"counts_by_value": {"1": 532761, "0": 7536988}}}}, "scoreddocs": {"count": 8069749}},
+  "msmarco-segment-v2.1": {"docs": {"count": 113520750, "fields": {"doc_id": {"max_len": 45, "common_prefix": "msmarco_v2.1_doc_"}}}},
+  "msmarco-segment-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-segment-v2.1"}, "queries": {"count": 301}},
   "natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}},
   "natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}},
   "natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}},

From 21790f2e56b0e7f9d886ebe27d5bf4efaddb4f46 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Sun, 11 Aug 2024 21:38:35 +0100
Subject: [PATCH 11/13] msmarco-document-v2.1 metadata

---
 ir_datasets/etc/metadata.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
index 69756b0..2e18294 100644
--- a/ir_datasets/etc/metadata.json
+++ b/ir_datasets/etc/metadata.json
@@ -495,6 +495,7 @@
   "mr-tydi/th/train": {"docs": {"_ref": "mr-tydi/th"}, "queries": {"count": 3319}, "qrels": {"count": 3360, "fields": {"relevance": {"counts_by_value": {"1": 3360}}}}},
   "msmarco-document": {"docs": {"count": 3213835, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}},
   "msmarco-document-v2": {"docs": {"count": 11959635, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}},
+  "msmarco-document-v2.1": {"docs": {"count": 10960555, "fields": {"doc_id": {"max_len": 30, "common_prefix": "msmarco_v2.1_doc_"}}}},
   "msmarco-document-v2.1/trec-rag-2024": {"docs": {"_ref": "msmarco-document-v2.1"}, "queries": {"count": 301}},
   "msmarco-document-v2/anchor-text": {"docs": {"count": 4821244, "fields": {"doc_id": {"max_len": 25, "common_prefix": "msmarco_doc_"}}}},
   "msmarco-document-v2/dev1": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 4552}, "qrels": {"count": 4702, "fields": {"relevance": {"counts_by_value": {"1": 4702}}}}, "scoreddocs": {"count": 455200}},

From 96d3c9fc4cdd715612d3b5719a47a3d047532b3c Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Mon, 12 Aug 2024 08:37:40 +0100
Subject: [PATCH 12/13] missing metadata from trec-cast

---
 ir_datasets/etc/metadata.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
index 2e18294..87ee0f5 100644
--- a/ir_datasets/etc/metadata.json
+++ b/ir_datasets/etc/metadata.json
@@ -640,6 +640,20 @@
   "trec-cast/v1/2019/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 173}, "qrels": {"_ref": "trec-cast/v1/2019"}, "scoreddocs": {"count": 173000}},
   "trec-cast/v1/2020": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 216}, "qrels": {"count": 40451, "fields": {"relevance": {"counts_by_value": {"1": 2697, "0": 33781, "2": 1834, "3": 1408, "4": 731}}}}},
   "trec-cast/v1/2020/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 208}, "qrels": {"_ref": "trec-cast/v1/2020"}},
+  "trec-cast/v2/2021": {"docs": {"_ref": "trec-cast/v2"}, "queries": {"count": 239}, "qrels": {"count": 19334, "fields": {"relevance": {"counts_by_value": {"0": 13829, "4": 716, "3": 1007, "2": 1710, "1": 2072}}}}},
+  "trec-cast/v2/kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
+  "trec-cast/v2/kilt/passages": {"docs": {"count": 17124025, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}},
+  "trec-cast/v2/kilt/segmented": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
+  "trec-cast/v2/msmarco": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}},
+  "trec-cast/v2/msmarco/passages": {"docs": {"count": 19092817, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_D"}}}},
+  "trec-cast/v2/msmarco/segmented": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}},
+  "trec-cast/v3/2022": {"docs": {"_ref": "trec-cast/v3"}, "queries": {"count": 408}, "qrels": {"count": 42196, "fields": {"relevance": {"counts_by_value": {"0": 29868, "1": 5063, "3": 2129, "2": 3297, "4": 1839}}}}},
+  "trec-cast/v3/kilt": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
+  "trec-cast/v3/kilt/passages": {"docs": {"count": 17111488, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}},
+  "trec-cast/v3/kilt/segmented": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
+  "trec-cast/v3/msmarco": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}},
+  "trec-cast/v3/msmarco/passages": {"docs": {"count": 86326322, "fields": {"doc_id": {"max_len": 24, "common_prefix": "MARCO_"}}}},
+  "trec-cast/v3/msmarco/segmented": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}},
   "trec-fair": {},
   "trec-fair-2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "trec-fair-2021/eval": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}},

From 24a983d51b04f11a11c2f654dab3c275905c67a0 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney <sean.macavaney@gmail.com>
Date: Mon, 12 Aug 2024 13:05:06 +0100
Subject: [PATCH 13/13] more metadata

---
 ir_datasets/etc/metadata.json | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
index 87ee0f5..bf89499 100644
--- a/ir_datasets/etc/metadata.json
+++ b/ir_datasets/etc/metadata.json
@@ -640,6 +640,7 @@
   "trec-cast/v1/2019/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 173}, "qrels": {"_ref": "trec-cast/v1/2019"}, "scoreddocs": {"count": 173000}},
   "trec-cast/v1/2020": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 216}, "qrels": {"count": 40451, "fields": {"relevance": {"counts_by_value": {"1": 2697, "0": 33781, "2": 1834, "3": 1408, "4": 731}}}}},
   "trec-cast/v1/2020/judged": {"docs": {"_ref": "trec-cast/v1"}, "queries": {"count": 208}, "qrels": {"_ref": "trec-cast/v1/2020"}},
+  "trec-cast/v2": {"docs": {"count": 9680029, "fields": {"doc_id": {"max_len": 41, "common_prefix": ""}}}},
   "trec-cast/v2/2021": {"docs": {"_ref": "trec-cast/v2"}, "queries": {"count": 239}, "qrels": {"count": 19334, "fields": {"relevance": {"counts_by_value": {"0": 13829, "4": 716, "3": 1007, "2": 1710, "1": 2072}}}}},
   "trec-cast/v2/kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
   "trec-cast/v2/kilt/passages": {"docs": {"count": 17124025, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}},
@@ -647,6 +648,11 @@
   "trec-cast/v2/msmarco": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}},
   "trec-cast/v2/msmarco/passages": {"docs": {"count": 19092817, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_D"}}}},
   "trec-cast/v2/msmarco/segmented": {"docs": {"count": 3051991, "fields": {"doc_id": {"max_len": 14, "common_prefix": "MARCO_D"}}}},
+  "trec-cast/v2/passages": {"docs": {"count": 39254641, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}},
+  "trec-cast/v2/wapo": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}},
+  "trec-cast/v2/wapo/passages": {"docs": {"count": 3037799, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}},
+  "trec-cast/v2/wapo/segmented": {"docs": {"count": 724508, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}},
+  "trec-cast/v3": {"docs": {"count": 106400940, "fields": {"doc_id": {"max_len": 45, "common_prefix": ""}}}},
   "trec-cast/v3/2022": {"docs": {"_ref": "trec-cast/v3"}, "queries": {"count": 408}, "qrels": {"count": 42196, "fields": {"relevance": {"counts_by_value": {"0": 29868, "1": 5063, "3": 2129, "2": 3297, "4": 1839}}}}},
   "trec-cast/v3/kilt": {"docs": {"count": 5903219, "fields": {"doc_id": {"max_len": 13, "common_prefix": "KILT_"}}}},
   "trec-cast/v3/kilt/passages": {"docs": {"count": 17111488, "fields": {"doc_id": {"max_len": 17, "common_prefix": "KILT_"}}}},
@@ -654,6 +660,9 @@
   "trec-cast/v3/msmarco": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}},
   "trec-cast/v3/msmarco/passages": {"docs": {"count": 86326322, "fields": {"doc_id": {"max_len": 24, "common_prefix": "MARCO_"}}}},
   "trec-cast/v3/msmarco/segmented": {"docs": {"count": 10965836, "fields": {"doc_id": {"max_len": 19, "common_prefix": "MARCO_"}}}},
+  "trec-cast/v3/wapo": {"docs": {"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}},
+  "trec-cast/v3/wapo/passages": {"docs": {"count": 2963130, "fields": {"doc_id": {"max_len": 45, "common_prefix": "WAPO_"}}}},
+  "trec-cast/v3/wapo/segmented": {"docs": {"count": 713594, "fields": {"doc_id": {"max_len": 41, "common_prefix": "WAPO_"}}}},
   "trec-fair": {},
   "trec-fair-2021": {"docs": {"count": 6280328, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "trec-fair-2021/eval": {"docs": {"_ref": "trec-fair-2021"}, "queries": {"count": 49}, "qrels": {"count": 13757, "fields": {"relevance": {"counts_by_value": {"1": 13757}}}}},