add more details for trec-tot 2024

allenai · mam10eks · Sep 21, 2024 · Sep 22, 2024 · Sep 22, 2024 · Sep 22, 2024
commit 8eb267bdf1e3538c8f2bf4e9e50575d6a72f43f2
diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py
@@ -22,6 +22,20 @@ def default_text(self):
         """
         return self.page_title + ' ' + self.text
 
+class TipOfTheTongueDoc2024(NamedTuple):
+    doc_id: str
+    title: str
+    wikidata_id: str
+    text: str
+    sections: Dict[str, str]
+
+    def default_text(self):
+        """
+        We use the title and text of the TipOfTheTongueDoc2024, joined by a single space, as default_text; the wikidata_id and sections fields are not included.
+        """
+        return self.title + ' ' + self.text
+
+
 
 class TipOfTheTongueQuery(NamedTuple):
     query_id: str
@@ -68,12 +82,20 @@ def _init():
 
     main_dlc = dlc['2024']
 
-    docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'TREC-TOT-2024/corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')
+    docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc2024, lang='en')
     subsets['2024'] = Dataset(
         docs_2024_handler,
         documentation('2024'),
     )
     ir_datasets.registry.register(f'{NAME}/2024', subsets['2024'])
+    for s in ['test']:
+        subsets[f'2024/{s}'] = Dataset(
+            docs_2024_handler,
+            JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'TREC-TOT/{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery, mapping=QUERY_MAP, lang='en'),
+            documentation(f'2024/{s}'),
+        )
+        ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}'])
+
 
     return base, subsets
 

diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
@@ -5949,9 +5949,14 @@
       "cache_path": "trec-tot.zip"
     },
     "2024": {
-       "url": "https://zenodo.org/api/records/11185090/files-archive",
-      "expected_md5": "ea48b27fd3a1b90b1b9b8bf98351f7f9",
-      "cache_path": "trec-tot-2024.zip"
+      "url": "https://zenodo.org/records/13370657/files/corpus.jsonl.zip?download=1",
+      "expected_md5": "4ea86770817e46a06fea5c94f596409c",
+      "cache_path": "trec-tot-2024-corpus.zip"
+    },
+    "2024-test": {
+      "url": "https://zenodo.org/records/13370657/files/test-2024.zip?download=1",
+      "expected_md5": "3d0a4d83957ee6a1398afefbc96162fa",
+      "cache_path": "trec-tot-2024-queries.zip"
     }
   },
 

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
@@ -659,6 +659,7 @@
   "trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}},
   "trec-tot": {},
   "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
+  "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
   "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
   "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},

diff --git a/test/integration/trec_tot_2024.py b/test/integration/trec_tot_2024.py
@@ -5,11 +5,12 @@
 from test.integration.base import DatasetIntegrationTest
 import ir_datasets
 
+print(ir_datasets.load('trec-tot/2024').docs_count())
 print(len([i for i in ir_datasets.load('trec-tot/2024').docs_iter()]))
 
 class TestTipOfTheTongue(DatasetIntegrationTest):
     def test_tip_of_the_tongue_docs(self):
-         self._test_docs('trec-tot/2024', count=231852, items={})
+         self._test_docs('trec-tot/2024', count=3185450, items={})
 
     def test_test_tip_of_the_tongue_qrels_train(self):
         #self._test_qrels('trec-tot/2024/test', count=150, items={