Skip to content

Commit

Permalink
Move all perf tests to memmap
Browse files Browse the repository at this point in the history
  • Loading branch information
softwaredoug committed Nov 28, 2024
1 parent 9548dbb commit 29270ea
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 64 deletions.
2 changes: 2 additions & 0 deletions test/msmarco_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def msmarco_exists():
def csv_col_iter(col_no, msmarco_unzipped_path=None, num_docs=None):
if msmarco_unzipped_path is None:
msmarco_unzipped_path = msmarco_gz_path()[0:-3]
if not pathlib.Path(msmarco_unzipped_path).exists():
raise FileNotFoundError(f"File {msmarco_unzipped_path} not found")
with open(msmarco_unzipped_path, "rt") as f:
csv_reader = csv.reader(f, delimiter="\t")
for idx, row in enumerate(csv_reader):
Expand Down
91 changes: 27 additions & 64 deletions test/test_msmarco.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,32 @@ def msmarco100k_memmap():

@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.fixture(scope="session")
def msmarco_memmap():
def msmarco1m():
    """Return a DataFrame of indexed MSMARCO columns for the first 1M docs.

    The indexed frame is cached as a pickle under ``data/``. If that pickle
    is present it is loaded and returned as-is; otherwise the body and title
    columns are streamed from the MSMARCO TSV via ``csv_col_iter``, indexed
    with ``SearchArray.index`` and the result is pickled for later runs.

    Returns:
        pandas.DataFrame with ``body_ws`` and ``title_ws`` columns.
    """
    cache_file = pathlib.Path('data/msmarco1m_memmap.pkl')

    # Cache hit: a previous run already built and saved the indexed frame.
    if cache_file.exists():
        return pd.read_pickle(cache_file)

    doc_limit = 1000000
    # Column 3 feeds the body index, column 2 the title index.
    bodies = csv_col_iter(3, num_docs=doc_limit)
    titles = csv_col_iter(2, num_docs=doc_limit)
    indexed = pd.DataFrame()

    print("Indexing body")
    # NOTE(review): data_dir presumably tells SearchArray where to keep its
    # on-disk (memmap) data -- confirm against SearchArray.index.
    indexed['body_ws'] = SearchArray.index(
        bodies,
        truncate=True,
        tokenizer=ws_tokenizer,
        workers=2,
        data_dir='data/',
    )

    print("Indexing title")
    indexed['title_ws'] = SearchArray.index(
        titles,
        truncate=True,
        tokenizer=ws_tokenizer,
        data_dir='data/',
    )

    print(" Done!... Saving")
    indexed.to_pickle(cache_file)
    return indexed


@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.fixture(scope="session")
def msmarco():
msmarco100k_raw = pd.read_pickle(msmarco100k_raw_path())
msmarco_path = 'data/msmarco100k_memmap.pkl'
msmarco100k_path = pathlib.Path(msmarco_path)
Expand Down Expand Up @@ -114,62 +139,9 @@ def msmarco100k():
return pd.read_pickle(msmarco_path)


# NOTE(review): pytest documents that marks applied to fixture functions have
# no effect -- confirm this skipif is actually needed here.
@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.fixture(scope="session")
def msmarco1m():
    """Return the 1M-document MSMARCO frame with indexed title/body columns.

    The indexed frame is cached at ``data/msmarco1m.pkl``. When the cache
    exists it is loaded directly. Otherwise the raw 1M pickle is read,
    missing titles/bodies are replaced with empty strings, both columns are
    indexed with ``SearchArray.index`` and the result is pickled.

    Returns:
        pandas.DataFrame containing the raw columns plus ``title_ws`` and
        ``body_ws``.
    """
    msmarco1m_path = pathlib.Path('data/msmarco1m.pkl')

    if msmarco1m_path.exists():
        print("Loading idxed pkl docs...")
        return pd.read_pickle(msmarco1m_path)

    print("Indexing 1m docs...")
    # Only read the raw pickle when we actually have to (re)build the index;
    # on a cache hit it would be loaded and immediately thrown away.
    msmarco = pd.read_pickle(msmarco1m_raw_path())

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the column selection: the in-place form operates on an intermediate
    # object, which pandas warns about and which does not update the frame
    # under Copy-on-Write.
    msmarco['title'] = msmarco['title'].fillna('')
    msmarco['body'] = msmarco['body'].fillna('')

    print(" Index Title")
    msmarco["title_ws"] = SearchArray.index(msmarco["title"], tokenizer=ws_punc_tokenizer)
    print(" Index Body")
    msmarco["body_ws"] = SearchArray.index(msmarco["body"], tokenizer=ws_punc_tokenizer)

    print(" DONE!... Saving")
    msmarco.to_pickle(msmarco1m_path)
    return msmarco


@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.fixture(scope="session")
def msmarco_all():
    """Return a DataFrame of indexed MSMARCO body/title columns (no doc limit).

    The frame is cached at ``data/msmarco_all.pkl``. On a cache hit the
    pickle is loaded and a one-line summary (row count and the reported
    memory usage of each index, in MB) is printed. On a miss, the body and
    title columns are streamed through ``csv_col_iter``, indexed with
    ``SearchArray.index`` and the frame is pickled.

    Returns:
        pandas.DataFrame with ``body_idx`` and ``title_idx`` columns.
    """
    pickle_file = 'data/msmarco_all.pkl'

    # 43 / 13

    if pathlib.Path(pickle_file).exists():
        print("Loading idxed pkl docs...")
        msmarco = pd.read_pickle(pickle_file)
        print(f"Loaded msmarco -- {len(msmarco)} -- {msmarco['body_idx'].array.memory_usage() / 1024 ** 2:.2f} MB | {msmarco['title_idx'].array.memory_usage() / 1024 ** 2:.2f} MB")
        return msmarco

    # Cache miss: column 3 feeds the body index, column 2 the title index.
    bodies = csv_col_iter(3)
    titles = csv_col_iter(2)
    indexed = pd.DataFrame()

    print("Indexing body")
    indexed['body_idx'] = SearchArray.index(
        bodies,
        truncate=True,
        tokenizer=ws_tokenizer,
        workers=2,
    )

    print("Indexing title")
    indexed['title_idx'] = SearchArray.index(
        titles,
        truncate=True,
        tokenizer=ws_tokenizer,
    )

    # Persist so later sessions take the cache-hit path above.
    indexed.to_pickle(pickle_file)
    return indexed


@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.fixture(scope="session")
def msmarco_all_memmap():
msmarco_path_str = 'data/msmarco_all_memmap.pkl'
msmarco_path = pathlib.Path(msmarco_path_str)

Expand Down Expand Up @@ -305,16 +277,7 @@ def test_msmarco100k_phrase_stress(phrase_search, msmarco100k):

@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.mark.parametrize(
    "phrase_search",
    [
        "what is",
        "what is the",
        "what is the purpose",
        "what is the purpose of",
        "what is the purpose of cats",
        "star trek",
        "star trek the next generation",
        "what what what",
        "the purpose",
    ],
)
def test_msmarco_all_phrase_memmap(phrase_search, msmarco_all_memmap, benchmark):
    """Profile scoring of one phrase query against the ``body_idx`` column.

    The phrase is split on whitespace into a list of terms, which is passed
    to the body index's ``score`` method under the benchmark profiler.
    """
    terms = phrase_search.split()
    runner = Profiler(benchmark)
    print(f"STARTING {terms}")
    body_index = msmarco_all_memmap['body_idx'].array
    runner.run(body_index.score, terms)


@pytest.mark.skipif(not profile_enabled, reason="Profiling disabled")
@pytest.mark.parametrize("phrase_search", ["what is", "what is the", "what is the purpose", "what is the purpose of", "what is the purpose of cats", "star trek", "star trek the next generation", "what what what", "the purpose"])
def test_msmarco_all_phrase(phrase_search, msmarco_all, benchmark):
def test_msmarco_all_phrase_memmap(phrase_search, msmarco_all, benchmark):
profiler = Profiler(benchmark)
phrase_search = phrase_search.split()
print(f"STARTING {phrase_search}")
Expand Down

0 comments on commit 29270ea

Please sign in to comment.