sourmash-bio · luizirber · Jun 29, 2024
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
@@ -32,3 +32,22 @@ jobs:
         with:
           run: "cd src/core && cargo codspeed run"
           token: ${{ secrets.CODSPEED_TOKEN }}
+
+  benchmarks-python:  # runs the Python benchmarks through the CodSpeed action
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # v3 is the deprecated Node 16-based major
+      - uses: actions/setup-python@v5  # v3 is the deprecated Node 16-based major
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies  # only pip and tox are installed in this step
+        run: |
+          python -m pip install --upgrade pip
+          pip install tox
+
+      - name: Run benchmarks  # the action wraps the tox "codspeed" environment
+        uses: CodSpeedHQ/action@v2
+        with:
+          token: ${{ secrets.CODSPEED_TOKEN }}
+          run: tox -e codspeed
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -33,74 +33,6 @@ def load_sequences():
     return sequences
 
 
-class TimeMinHashSuite:
-    def setup(self):
-        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
-        self.protein_mh = MinHash(
-            MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False
-        )
-        self.sequences = load_sequences()
-
-        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
-        for seq in self.sequences:
-            self.populated_mh.add_sequence(seq)
-
-    def time_add_sequence(self):
-        mh = self.mh
-        sequences = self.sequences
-        for seq in sequences:
-            mh.add_sequence(seq)
-
-    def time_add_protein(self):
-        mh = self.protein_mh
-        sequences = self.sequences
-        for seq in sequences:
-            mh.add_protein(seq)
-
-    def time_get_mins(self):
-        mh = self.populated_mh
-        for i in range(GET_MINS_RANGE):
-            mh.get_mins()
-
-    def time_add_hash(self):
-        mh = self.mh
-        for i in range(ADD_HASH_RANGE):
-            mh.add_hash(i)
-
-    def time_add_many(self):
-        mh = self.mh
-        mh.add_many(list(range(ADD_MANY_RANGE)))
-
-    def time_similarity(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(SIMILARITY_TIMES):
-            mh.similarity(other_mh)
-
-    def time_count_common(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(COUNT_COMMON_TIMES):
-            mh.count_common(other_mh)
-
-    def time_merge(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(MERGE_TIMES):
-            mh.merge(other_mh)
-
-    def time_copy(self):
-        mh = self.populated_mh
-        for i in range(COPY_TIMES):
-            mh.__copy__()
-
-    def time_concat(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(CONCAT_TIMES):
-            mh += other_mh
-
-
 class PeakmemMinHashSuite:
     def setup(self):
         self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
@@ -134,33 +66,6 @@ def peakmem_add_many(self):
 ####################
 
 
-class TimeMinAbundanceSuite(TimeMinHashSuite):
-    def setup(self):
-        TimeMinHashSuite.setup(self)
-        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
-
-        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
-        for seq in self.sequences:
-            self.populated_mh.add_sequence(seq)
-
-    def time_get_mins_abundance(self):
-        mh = self.populated_mh
-        for i in range(GET_MINS_RANGE):
-            mh.get_mins(with_abundance=True)
-
-    def time_set_abundances(self):
-        mh = self.mh
-        mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(SET_ABUNDANCES_RANGE):
-            mh.set_abundances(mins)
-
-    def time_set_abundances_noclear(self):
-        mh = self.mh
-        mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(SET_ABUNDANCES_RANGE):
-            mh.set_abundances(mins, clear=False)
-
-
 class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
     def setup(self):
         PeakmemMinHashSuite.setup(self)
@@ -170,35 +75,6 @@ def setup(self):
 ####################
 
 
-class TimeZipStorageSuite:
-    def setup(self):
-        import zipfile
-
-        self.zipfile = NamedTemporaryFile()
-
-        with zipfile.ZipFile(
-            self.zipfile, mode="w", compression=zipfile.ZIP_STORED
-        ) as storage:
-            for i in range(ZIP_STORAGE_WRITE):
-                # just so we have lots of entries
-                storage.writestr(str(i), b"0")
-            # one big-ish entry
-            storage.writestr("sig1", b"9" * 1_000_000)
-
-    def time_load_from_zipstorage(self):
-        with ZipStorage(self.zipfile.name) as storage:
-            for i in range(ZIP_STORAGE_LOAD):
-                storage.load("sig1")
-
-    def time_load_small_from_zipstorage(self):
-        with ZipStorage(self.zipfile.name) as storage:
-            for i in range(ZIP_STORAGE_LOAD):
-                storage.load("99999")
-
-    def teardown(self):
-        self.zipfile.close()
-
-
 class PeakmemZipStorageSuite:
     def setup(self):
         import zipfile

diff --git a/pyproject.toml b/pyproject.toml
@@ -103,6 +103,7 @@ test = [
   "pytest>=6.2.4,<8.4.0",
   "pytest-cov>=4,<6.0",
   "pytest-xdist>=3.1",
+  "pytest-benchmark>=4.0",
   "pyyaml>=6,<7",
   "diff-cover>=7.3",
   "covdefaults>=2.2.2",

diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
@@ -0,0 +1,209 @@
+"""Benchmarks for MinHash and ZipStorage using the `benchmark` fixture."""
+
+import random
+import zipfile
+from tempfile import NamedTemporaryFile
+
+import pytest
+
+from sourmash.sbt_storage import ZipStorage
+from sourmash.minhash import MinHash
+
+# Shape of the random input built by load_sequences().
+RANDOM_SEQ_SIZE = 3000
+RANDOM_SEQ_NUMBER = 300
+
+MINHASH_NUM = 500
+MINHASH_K = 21
+
+# NOTE(review): only ADD_HASH_RANGE, ADD_MANY_RANGE and the ZIP_STORAGE_*
+# values are read below; the remaining counts are kept so the names stay
+# defined for anything that imports them -- confirm before deleting.
+GET_MINS_RANGE = 500
+ADD_HASH_RANGE = 10_000
+ADD_MANY_RANGE = 1000
+SIMILARITY_TIMES = 500
+COUNT_COMMON_TIMES = 500
+MERGE_TIMES = 500
+COPY_TIMES = 500
+CONCAT_TIMES = 500
+SET_ABUNDANCES_RANGE = 500
+ZIP_STORAGE_WRITE = 100_000
+ZIP_STORAGE_LOAD = 20
+
+
+def load_sequences():
+    """Build 10 random strings over A/C/G/T, each RANDOM_SEQ_NUMBER long.
+
+    Every string is sampled without replacement from a pool that holds
+    RANDOM_SEQ_SIZE copies of each letter. `random` is not seeded in this
+    module, so the strings differ from run to run.
+    """
+    pool = ["A", "C", "G", "T"] * RANDOM_SEQ_SIZE
+    return ["".join(random.sample(pool, RANDOM_SEQ_NUMBER)) for _ in range(10)]
+
+
+@pytest.fixture
+def mh():
+    """Fresh MinHash without abundance tracking."""
+    return MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+
+
+@pytest.fixture
+def mh_protein():
+    """Fresh MinHash built with is_protein=True, without abundance tracking."""
+    return MinHash(MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False)
+
+
+@pytest.fixture
+def sequences():
+    """Random input strings produced by load_sequences()."""
+    return load_sequences()
+
+
+@pytest.fixture
+def populated_mh(sequences):
+    """MinHash without abundance tracking, filled from `sequences`."""
+    populated = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+    for seq in sequences:
+        populated.add_sequence(seq)
+    return populated
+
+
+@pytest.fixture
+def mh_abund():
+    """Fresh MinHash with abundance tracking enabled."""
+    return MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+
+
+@pytest.fixture
+def populated_mh_abund(sequences):
+    """MinHash with abundance tracking, filled from `sequences`."""
+    populated = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+    for seq in sequences:
+        populated.add_sequence(seq)
+    return populated
+
+
+def test_add_sequence(benchmark, mh, sequences):
+    """Time adding every input sequence to one MinHash."""
+
+    @benchmark
+    def bench():
+        for seq in sequences:
+            mh.add_sequence(seq)
+
+
+def test_add_protein(benchmark, mh_protein, sequences):
+    """Time add_protein() over every input sequence on the protein MinHash."""
+
+    @benchmark
+    def bench():
+        for seq in sequences:
+            mh_protein.add_protein(seq)
+
+
+def test_get_mins(benchmark, populated_mh):
+    """Time get_mins() on a populated MinHash."""
+    benchmark(populated_mh.get_mins)
+
+
+def test_add_hash(benchmark, mh):
+    """Time a run of ADD_HASH_RANGE consecutive add_hash() calls."""
+
+    @benchmark
+    def bench():
+        for i in range(ADD_HASH_RANGE):
+            mh.add_hash(i)
+
+
+def test_add_many(benchmark, mh):
+    """Time add_many() with a list of ADD_MANY_RANGE hashes."""
+    benchmark(mh.add_many, list(range(ADD_MANY_RANGE)))
+
+
+def test_similarity(benchmark, mh, populated_mh):
+    """Time similarity() of `mh` against the populated MinHash."""
+    benchmark(mh.similarity, populated_mh)
+
+
+def test_count_common(benchmark, mh, populated_mh):
+    """Time count_common() of `mh` against the populated MinHash."""
+    benchmark(mh.count_common, populated_mh)
+
+
+def test_merge(benchmark, mh, populated_mh):
+    """Time merging the populated MinHash into `mh`."""
+    benchmark(mh.merge, populated_mh)
+
+
+def test_copy(benchmark, populated_mh):
+    """Time __copy__() on a populated MinHash."""
+    benchmark(populated_mh.__copy__)
+
+
+def test_concat(benchmark, mh, populated_mh):
+    """Time in-place addition of the populated MinHash via __iadd__."""
+    benchmark(mh.__iadd__, populated_mh)
+
+
+####################
+
+
+def test_get_mins_abundance(benchmark, populated_mh_abund):
+    """Time get_mins(with_abundance=True) on a populated MinHash."""
+    benchmark(populated_mh_abund.get_mins, with_abundance=True)
+
+
+def test_set_abundances(benchmark, mh_abund, populated_mh_abund):
+    """Time set_abundances() fed with get_mins(with_abundance=True) output."""
+    mins = populated_mh_abund.get_mins(with_abundance=True)
+    benchmark(mh_abund.set_abundances, mins)
+
+
+def test_set_abundances_noclear(benchmark, mh_abund, populated_mh_abund):
+    """Time set_abundances() with the same input, passing clear=False."""
+    mins = populated_mh_abund.get_mins(with_abundance=True)
+    benchmark(mh_abund.set_abundances, mins, clear=False)
+
+
+####################
+
+
+@pytest.fixture
+def zipstore():
+    """Yield an open temporary file containing an uncompressed zip archive.
+
+    The archive holds ZIP_STORAGE_WRITE one-byte entries named "0", "1", ...
+    and one 1,000,000-byte entry named "sig1". The temporary file is closed
+    when the fixture is torn down, or if building the archive fails.
+    """
+    with NamedTemporaryFile() as zf:
+        with zipfile.ZipFile(zf, mode="w", compression=zipfile.ZIP_STORED) as storage:
+            for i in range(ZIP_STORAGE_WRITE):
+                # just so we have lots of entries
+                storage.writestr(str(i), b"0")
+            # one big-ish entry
+            storage.writestr("sig1", b"9" * 1_000_000)
+
+        yield zf
+
+
+def test_load_from_zipstorage(benchmark, zipstore):
+    """Time ZIP_STORAGE_LOAD loads of the large "sig1" entry."""
+
+    @benchmark
+    def bench():
+        with ZipStorage(zipstore.name) as storage:
+            for _ in range(ZIP_STORAGE_LOAD):
+                storage.load("sig1")
+
+
+def test_load_small_from_zipstorage(benchmark, zipstore):
+    """Time ZIP_STORAGE_LOAD loads of the one-byte entry "99999"."""
+
+    @benchmark
+    def bench():
+        with ZipStorage(zipstore.name) as storage:
+            for _ in range(ZIP_STORAGE_LOAD):
+                storage.load("99999")