From a0b6f60987bf39584b83f413140d1d673be402d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maik=20Fr=C3=B6be?= <maik.froebe@uni-jena.de>
Date: Sat, 21 Sep 2024 07:47:59 +0200
Subject: [PATCH 1/3] prepare download for TREC tot 2024

---
 ir_datasets/datasets/trec_tot.py  | 12 +++++++++++-
 ir_datasets/docs/trec-tot.yaml    | 14 ++++++++++++++
 ir_datasets/etc/downloads.json    |  7 ++++++-
 test/integration/trec_tot_2024.py | 25 +++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 test/integration/trec_tot_2024.py
diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py
index f7c8af67..0b66751f 100644
--- a/ir_datasets/datasets/trec_tot.py
+++ b/ir_datasets/datasets/trec_tot.py
@@ -44,7 +44,7 @@ def _init():
     dlc = DownloadConfig.context(NAME, base_path)
     subsets = {}
 
-    main_dlc = dlc['main']
+    main_dlc = dlc['2023']
     base = Dataset(
         documentation('_'),
     )
@@ -55,6 +55,7 @@ def _init():
         docs_2023_handler,
         documentation('2023'),
     )
+
     ir_datasets.registry.register(f'{NAME}/2023', subsets['2023'])
     for s in ['train', 'dev']:
         subsets[f'2023/{s}'] = Dataset(
@@ -65,6 +66,15 @@ def _init():
         )
         ir_datasets.registry.register(f'{NAME}/2023/{s}', subsets[f'2023/{s}'])
 
+    main_dlc = dlc['2024']
+
+    docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'TREC-TOT-2024/corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')
+    subsets['2024'] = Dataset(
+        docs_2024_handler,
+        documentation('2024'),
+    )
+    ir_datasets.registry.register(f'{NAME}/2024', subsets['2024'])
+
     return base, subsets
 
 
diff --git a/ir_datasets/docs/trec-tot.yaml b/ir_datasets/docs/trec-tot.yaml
index 1633c546..196fed5a 100644
--- a/ir_datasets/docs/trec-tot.yaml
+++ b/ir_datasets/docs/trec-tot.yaml
@@ -26,3 +26,17 @@ Train query set for TREC 2023 tip-of-the-tongue search track.
 Dev query set for TREC 2023 tip-of-the-tongue search track.
 </p>
 '
+
+2024:
+  desc: '
+<p>
+Corpus for the TREC 2024 tip-of-the-tongue search track.
+</p>
+'
+
+2024/test:
+  desc: '
+<p>
+Test query set for TREC 2024 tip-of-the-tongue search track.
+</p>
+'
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index 397646a6..9038a867 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -5943,10 +5943,15 @@
   },
 
   "trec-tot": {
-    "main": {
+    "2023": {
       "url": "https://surfdrive.surf.nl/files/index.php/s/FaEK4xc6Xp2JcAJ/download",
       "expected_md5": "f84fe82cb80e3ee1072576c8d6c4a417",
       "cache_path": "trec-tot.zip"
+    },
+    "2024": {
+       "url": "https://zenodo.org/api/records/11185090/files-archive",
+      "expected_md5": "ea48b27fd3a1b90b1b9b8bf98351f7f9",
+      "cache_path": "trec-tot-2024.zip"
     }
   },
   
diff --git a/test/integration/trec_tot_2024.py b/test/integration/trec_tot_2024.py
new file mode 100644
index 00000000..d296d264
--- /dev/null
+++ b/test/integration/trec_tot_2024.py
@@ -0,0 +1,25 @@
+import re
+import unittest
+from ir_datasets.formats import TrecQrel
+from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc, TipOfTheTongueQuery
+from test.integration.base import DatasetIntegrationTest
+import ir_datasets
+
+print(len([i for i in ir_datasets.load('trec-tot/2024').docs_iter()]))
+
+class TestTipOfTheTongue(DatasetIntegrationTest):
+    def test_tip_of_the_tongue_docs(self):
+         self._test_docs('trec-tot/2024', count=231852, items={})
+
+    def test_test_tip_of_the_tongue_qrels_train(self):
+        #self._test_qrels('trec-tot/2024/test', count=150, items={
+        #    0: TrecQrel('763', '16742289', 1, '0'),
+        #    9: TrecQrel('293', '142456', 1, '0'),
+        #    149: TrecQrel('828', '30672517', 1, '0'),
+        #})
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
+

From 8eb267bdf1e3538c8f2bf4e9e50575d6a72f43f2 Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Sun, 22 Sep 2024 03:13:07 +0200
Subject: [PATCH 2/3] add more details for trec-tot 2024

---
 ir_datasets/datasets/trec_tot.py  | 24 +++++++++++++++++++++++-
 ir_datasets/etc/downloads.json    | 11 ++++++++---
 ir_datasets/etc/metadata.json     |  1 +
 test/integration/trec_tot_2024.py |  3 ++-
 4 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py
index 0b66751f..a04f9009 100644
--- a/ir_datasets/datasets/trec_tot.py
+++ b/ir_datasets/datasets/trec_tot.py
@@ -22,6 +22,20 @@ def default_text(self):
         """
         return self.page_title + ' ' + self.text
 
+class TipOfTheTongueDoc2024(NamedTuple):
+    doc_id: str
+    title: str
+    wikidata_id: str
+    text: str
+    sections: Dict[str, str]
+
+    def default_text(self):
+        """
+        We use the title and text of the TipOfTheTongueDoc2024 as default_text because that is everything available for users who want to respond to such an information need.
+        """
+        return self.title + ' ' + self.text
+
+
 
 class TipOfTheTongueQuery(NamedTuple):
     query_id: str
@@ -68,12 +82,20 @@ def _init():
 
     main_dlc = dlc['2024']
 
-    docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'TREC-TOT-2024/corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc, lang='en')
+    docs_2024_handler = JsonlDocs(Cache(ZipExtract(main_dlc, 'corpus.jsonl'), base_path/'2024/corpus.jsonl'), doc_cls=TipOfTheTongueDoc2024, lang='en')
     subsets['2024'] = Dataset(
         docs_2024_handler,
         documentation('2024'),
     )
     ir_datasets.registry.register(f'{NAME}/2024', subsets['2024'])
+    for s in ['test']:
+        subsets[f'2024/{s}'] = Dataset(
+            docs_2024_handler,
+            JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'TREC-TOT/{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery, mapping=QUERY_MAP, lang='en'),
+            documentation(f'2024/{s}'),
+        )
+        ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}'])
+
 
     return base, subsets
 
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
index 9038a867..3e59d1d8 100644
--- a/ir_datasets/etc/downloads.json
+++ b/ir_datasets/etc/downloads.json
@@ -5949,9 +5949,14 @@
       "cache_path": "trec-tot.zip"
     },
     "2024": {
-       "url": "https://zenodo.org/api/records/11185090/files-archive",
-      "expected_md5": "ea48b27fd3a1b90b1b9b8bf98351f7f9",
-      "cache_path": "trec-tot-2024.zip"
+      "url": "https://zenodo.org/records/13370657/files/corpus.jsonl.zip?download=1",
+      "expected_md5": "4ea86770817e46a06fea5c94f596409c",
+      "cache_path": "trec-tot-2024-corpus.zip"
+    },
+    "2024-test": {
+      "url": "https://zenodo.org/records/13370657/files/test-2024.zip?download=1",
+      "expected_md5": "3d0a4d83957ee6a1398afefbc96162fa",
+      "cache_path": "trec-tot-2024-queries.zip"
     }
   },
   
diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
index 60bd7c82..c291aed1 100644
--- a/ir_datasets/etc/metadata.json
+++ b/ir_datasets/etc/metadata.json
@@ -659,6 +659,7 @@
   "trec-spanish/trec4": {"docs": {"_ref": "trec-spanish"}, "queries": {"count": 25}, "qrels": {"count": 13109, "fields": {"relevance": {"counts_by_value": {"1": 2202, "0": 10907}}}}},
   "trec-tot": {},
   "trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
+  "trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
   "trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
   "trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
   "tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
diff --git a/test/integration/trec_tot_2024.py b/test/integration/trec_tot_2024.py
index d296d264..73fc6e26 100644
--- a/test/integration/trec_tot_2024.py
+++ b/test/integration/trec_tot_2024.py
@@ -5,11 +5,12 @@
 from test.integration.base import DatasetIntegrationTest
 import ir_datasets
 
+print(ir_datasets.load('trec-tot/2024').docs_count())
 print(len([i for i in ir_datasets.load('trec-tot/2024').docs_iter()]))
 
 class TestTipOfTheTongue(DatasetIntegrationTest):
     def test_tip_of_the_tongue_docs(self):
-         self._test_docs('trec-tot/2024', count=231852, items={})
+         self._test_docs('trec-tot/2024', count=3185450, items={})
 
     def test_test_tip_of_the_tongue_qrels_train(self):
         #self._test_qrels('trec-tot/2024/test', count=150, items={

From a6dd8dbbd196f8fd74e9811de8ef48660cffac3a Mon Sep 17 00:00:00 2001
From: Maik Froebe <maik.froebe@uni-jena.de>
Date: Sun, 22 Sep 2024 07:16:47 +0200
Subject: [PATCH 3/3] Add code for trec tot 2024

---
 ir_datasets/datasets/trec_tot.py  |  8 +++++++-
 test/integration/trec_tot_2024.py | 22 ++++++++++------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/ir_datasets/datasets/trec_tot.py b/ir_datasets/datasets/trec_tot.py
index a04f9009..8dfb5458 100644
--- a/ir_datasets/datasets/trec_tot.py
+++ b/ir_datasets/datasets/trec_tot.py
@@ -35,6 +35,12 @@ def default_text(self):
         """
         return self.title + ' ' + self.text
 
+class TipOfTheTongueQuery2024(NamedTuple):
+    query_id: str
+    query: str
+
+    def default_text(self):
+        return self.query
 
 
 class TipOfTheTongueQuery(NamedTuple):
@@ -91,7 +97,7 @@ def _init():
     for s in ['test']:
         subsets[f'2024/{s}'] = Dataset(
             docs_2024_handler,
-            JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'TREC-TOT/{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery, mapping=QUERY_MAP, lang='en'),
+            JsonlQueries(Cache(ZipExtract(dlc[f'2024-{s}'], f'{s}-2024/queries.jsonl'), base_path/f'2024/{s}-2024/queries.jsonl'), query_cls=TipOfTheTongueQuery2024, lang='en'),
             documentation(f'2024/{s}'),
         )
         ir_datasets.registry.register(f'{NAME}/2024/{s}', subsets[f'2024/{s}'])
diff --git a/test/integration/trec_tot_2024.py b/test/integration/trec_tot_2024.py
index 73fc6e26..5a4a43c2 100644
--- a/test/integration/trec_tot_2024.py
+++ b/test/integration/trec_tot_2024.py
@@ -1,25 +1,23 @@
 import re
 import unittest
 from ir_datasets.formats import TrecQrel
-from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc, TipOfTheTongueQuery
+from ir_datasets.datasets.trec_tot import TipOfTheTongueDoc2024, TipOfTheTongueQuery2024
 from test.integration.base import DatasetIntegrationTest
 import ir_datasets
 
-print(ir_datasets.load('trec-tot/2024').docs_count())
-print(len([i for i in ir_datasets.load('trec-tot/2024').docs_iter()]))
 
 class TestTipOfTheTongue(DatasetIntegrationTest):
     def test_tip_of_the_tongue_docs(self):
-         self._test_docs('trec-tot/2024', count=3185450, items={})
-
-    def test_test_tip_of_the_tongue_qrels_train(self):
-        #self._test_qrels('trec-tot/2024/test', count=150, items={
-        #    0: TrecQrel('763', '16742289', 1, '0'),
-        #    9: TrecQrel('293', '142456', 1, '0'),
-        #    149: TrecQrel('828', '30672517', 1, '0'),
-        #})
-        pass
+        self._test_docs('trec-tot/2024', count=3185450, items={
+            0: TipOfTheTongueDoc2024("846", "Museum of Work", "Q6941060", re.compile("^The Museum of Work .*"), [{"start": 0, "end": 798, "section": "Abstract"}, {"start": 798, "end": 1620, "section": "Overview"}, {"start": 1620, "end": 3095, "section": "Exhibitions"}, {"start": 3095, "end": 3371, "section": "The history of Alva"}, {"start": 3371, "end": 3824, "section": "Industriland"}, {"start": 3824, "end": 4371, "section": "Framtidsland (Future country)"}, {"start": 4371, "end": 4761, "section": "EWK \u2014 The Center for Political Illustration Art"}]),
+            1091: TipOfTheTongueDoc2024("9764", "Emma Goldman", "Q79969", re.compile("Emma Goldman \\(June 27, 1869 .*"),[{"start": 0, "end": 2752, "section": "Abstract"}, {"start": 2752, "end": 45613, "section": "Biography"}, {"start": 45613, "end": 47371, "section": "Family"}, {"start": 47371, "end": 50317, "section": "Adolescence"}, {"start": 50317, "end": 52433, "section": "Rochester, New York"}, {"start": 52433, "end": 54427, "section": "Most and Berkman"}, {"start": 54427, "end": 57448, "section": "Homestead plot"}, {"start": 57448, "end": 60672, "section": "\"Inciting to riot\""}, {"start": 60672, "end": 63288, "section": "McKinley assassination"}, {"start": 63288, "end": 66975, "section": "''Mother Earth'' and Berkman's release"}, {"start": 66975, "end": 69914, "section": "Reitman, essays, and birth control"}, {"start": 69914, "end": 73788, "section": "World War I"}, {"start": 73788, "end": 76344, "section": "Deportation"}, {"start": 76344, "end": 79375, "section": "Russia"}, {"start": 79375, "end": 83782, "section": "England, Canada, and France"}, {"start": 83782, "end": 86917, "section": "Spanish Civil War"}, {"start": 86917, "end": 87430, "section": "Final years"}, {"start": 87430, "end": 88493, "section": "Death"}, {"start": 88493, "end": 101764, "section": "Philosophy"}, {"start": 101764, "end": 106976, "section": "Anarchism"}, {"start": 106976, "end": 109922, "section": "Tactical uses of violence"}, {"start": 109922, "end": 111036, "section": "Capitalism and labor"}, {"start": 111036, "end": 114245, "section": "State"}, {"start": 114245, "end": 116281, "section": "Feminism and sexuality"}, {"start": 116281, "end": 117248, "section": "Atheism"}, {"start": 117248, "end": 120736, "section": "Legacy"}, {"start": 120736, "end": 120977, "section": "Works"}])
+        })
 
+    def test_tip_of_the_tongue_queries(self):
+        self._test_queries('trec-tot/2024/test', count=600, items={
+            0: TipOfTheTongueQuery2024("2001", re.compile("^I remember this old building I used to pass by in the heart of a bustling financial district, a place where the air always seemed thick.*")),
+            599: TipOfTheTongueQuery2024("2600", re.compile("^Okay, this is a vague one .\n So I know this is going to be.*"))
+        })
 
 if __name__ == '__main__':
     unittest.main()