Move all perf tests to memmap

softwaredoug · Nov 28, 2024 · 29270ea · 29270ea
1 parent 9548dbb
commit 29270ea
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 64 deletions.
diff --git a/test/msmarco_utils.py b/test/msmarco_utils.py
@@ -28,6 +28,8 @@ def msmarco_exists():
 def csv_col_iter(col_no, msmarco_unzipped_path=None, num_docs=None):
     if msmarco_unzipped_path is None:
         msmarco_unzipped_path = msmarco_gz_path()[0:-3]
+    if not pathlib.Path(msmarco_unzipped_path).exists():
+        raise FileNotFoundError(f"File {msmarco_unzipped_path} not found")
     with open(msmarco_unzipped_path, "rt") as f:
         csv_reader = csv.reader(f, delimiter="\t")
         for idx, row in enumerate(csv_reader):

diff --git a/test/test_msmarco.py b/test/test_msmarco.py
@@ -68,7 +68,32 @@ def msmarco100k_memmap():
 
 @pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
 @pytest.fixture(scope="session")
-def msmarco_memmap():
+def msmarco1m():
+    # msmarco1m_raw = pd.read_pickle(msmarco1m_raw_path())
+    msmarco_path = 'data/msmarco1m_memmap.pkl'
+    msmarco1m_path = pathlib.Path(msmarco_path)
+
+    if not msmarco1m_path.exists():
+        body_iter = csv_col_iter(3, num_docs=1000000)
+        title_iter = csv_col_iter(2, num_docs=1000000)
+        df = pd.DataFrame()
+        print("Indexing body")
+        df['body_ws'] = SearchArray.index(body_iter, truncate=True, tokenizer=ws_tokenizer,
+                                          workers=2, data_dir='data/')
+        print("Indexing title")
+        df['title_ws'] = SearchArray.index(title_iter, truncate=True, tokenizer=ws_tokenizer,
+                                           data_dir='data/')
+        print(" Done!... Saving")
+        # Save to pickle
+        df.to_pickle(msmarco1m_path)
+        return df
+    else:
+        return pd.read_pickle(msmarco1m_path)
+
+
+@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
+@pytest.fixture(scope="session")
+def msmarco():
     msmarco100k_raw = pd.read_pickle(msmarco100k_raw_path())
     msmarco_path = 'data/msmarco100k_memmap.pkl'
     msmarco100k_path = pathlib.Path(msmarco_path)
@@ -114,62 +139,9 @@ def msmarco100k():
         return pd.read_pickle(msmarco_path)
 
 
-@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
-@pytest.fixture(scope="session")
-def msmarco1m():
-    msmarco1m_raw = pd.read_pickle(msmarco1m_raw_path())
-    msmarco_path = 'data/msmarco1m.pkl'
-    msmarco1m_path = pathlib.Path(msmarco_path)
-
-    if not msmarco1m_path.exists():
-        print("Indexing 1m docs...")
-        msmarco = msmarco1m_raw
-        msmarco['title'].fillna('', inplace=True)
-        msmarco['body'].fillna('', inplace=True)
-        print(" Index Title")
-        msmarco["title_ws"] = SearchArray.index(msmarco["title"], tokenizer=ws_punc_tokenizer)
-        print(" Index Body")
-        msmarco["body_ws"] = SearchArray.index(msmarco["body"], tokenizer=ws_punc_tokenizer)
-
-        print(" DONE!... Saving")
-        msmarco.to_pickle(msmarco_path)
-        return msmarco
-    else:
-        print("Loading idxed pkl docs...")
-        msmarco = pd.read_pickle(msmarco_path)
-        return msmarco
-
-
 @pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
 @pytest.fixture(scope="session")
 def msmarco_all():
-    msmarco_path_str = 'data/msmarco_all.pkl'
-    msmarco_path = pathlib.Path(msmarco_path_str)
-
-    # 43 / 13
-
-    if not msmarco_path.exists():
-        body_iter = csv_col_iter(3)
-        title_iter = csv_col_iter(2)
-        df = pd.DataFrame()
-        print("Indexing body")
-        df['body_idx'] = SearchArray.index(body_iter, truncate=True, tokenizer=ws_tokenizer,
-                                           workers=2)
-        print("Indexing title")
-        df['title_idx'] = SearchArray.index(title_iter, truncate=True, tokenizer=ws_tokenizer)
-        # Save to pickle
-        df.to_pickle(msmarco_path_str)
-        return df
-    else:
-        print("Loading idxed pkl docs...")
-        msmarco = pd.read_pickle(msmarco_path_str)
-        print(f"Loaded msmarco -- {len(msmarco)} -- {msmarco['body_idx'].array.memory_usage() / 1024 ** 2:.2f} MB | {msmarco['title_idx'].array.memory_usage() / 1024 ** 2:.2f} MB")
-        return msmarco
-
-
-@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
-@pytest.fixture(scope="session")
-def msmarco_all_memmap():
     msmarco_path_str = 'data/msmarco_all_memmap.pkl'
     msmarco_path = pathlib.Path(msmarco_path_str)
 
@@ -305,16 +277,7 @@ def test_msmarco100k_phrase_stress(phrase_search, msmarco100k):
 
 @pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
 @pytest.mark.parametrize("phrase_search", ["what is", "what is the", "what is the purpose", "what is the purpose of", "what is the purpose of cats", "star trek", "star trek the next generation", "what what what", "the purpose"])
-def test_msmarco_all_phrase_memmap(phrase_search, msmarco_all_memmap, benchmark):
-    profiler = Profiler(benchmark)
-    phrase_search = phrase_search.split()
-    print(f"STARTING {phrase_search}")
-    profiler.run(msmarco_all_memmap['body_idx'].array.score, phrase_search)
-
-
-@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
-@pytest.mark.parametrize("phrase_search", ["what is", "what is the", "what is the purpose", "what is the purpose of", "what is the purpose of cats", "star trek", "star trek the next generation", "what what what", "the purpose"])
-def test_msmarco_all_phrase(phrase_search, msmarco_all, benchmark):
+def test_msmarco_all_phrase_memmap(phrase_search, msmarco_all, benchmark):
     profiler = Profiler(benchmark)
     phrase_search = phrase_search.split()
     print(f"STARTING {phrase_search}")