From 0c7806d080a02142083daa51d092b9663c7c90cf Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 13:11:31 +0100
Subject: [PATCH 01/10] fix tf, download mirdata idx by default

---
 compiam/__init__.py | 4 +++-
 pyproject.toml      | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/compiam/__init__.py b/compiam/__init__.py
index 35d13052..5a795d57 100644
--- a/compiam/__init__.py
+++ b/compiam/__init__.py
@@ -71,9 +71,11 @@ def load_dataset(dataset_name, data_home=None, version="default"):
     """
     if dataset_name not in datasets_list:
         raise ValueError("Invalid dataset {}".format(dataset_name))
-    return mirdata.initialize(
+    dataloader =  mirdata.initialize(
         dataset_name=dataset_name, data_home=data_home, version=version
     )
+    dataloader.download(["index"])  # Download index file
+    return dataloader
 
 
 def load_corpora(tradition, token=None):
diff --git a/pyproject.toml b/pyproject.toml
index 80d9218b..05e61490 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,8 +65,8 @@ docs = [
 ]
 tensorflow = [
     "keras<3.0.0",
-    "tensorflow>=2.12.0,<2.16",
-]
+    "tensorflow==2.15.0",
+]  # Fixing tf versions to avoid issues
 torch = [
     "torch==1.13.0",
     "torchaudio",

From d2e65efd26d7396ca34108c970802af516a4b8c9 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 16:35:28 +0100
Subject: [PATCH 02/10] fix resampling and model loading minor errors

---
 .../melody/pattern/sancara_search/__init__.py | 13 ++++++++---
 .../ftaresnet_carnatic/__init__.py            |  5 ++++-
 compiam/melody/pitch_extraction/melodia.py    |  6 +++--
 .../raga_recognition/deepsrgm/__init__.py     | 22 +++++++++++--------
 .../tonic_identification/tonic_multipitch.py  |  8 +++++--
 .../mixer_model/__init__.py                   |  6 ++++-
 .../dhrupad_bandish_segmentation/__init__.py  | 12 ++++++----
 compiam/utils/__init__.py                     | 19 +++++++++++++++-
 tests/melody/test_deepsrgm.py                 |  4 ++++
 tests/melody/test_essentia_extractors.py      |  8 +++++++
 10 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py
index f3ee2f61..39f88206 100644
--- a/compiam/melody/pattern/sancara_search/__init__.py
+++ b/compiam/melody/pattern/sancara_search/__init__.py
@@ -180,9 +180,16 @@ def load_model(self, model_path, conf_path, spec_path):
             setattr(self, tp, v)
 
         self.model = self._build_model()
-        self.model.load_state_dict(
-            torch.load(model_path, weights_only=True), strict=False
-        )
+        try:
+            self.model.load_state_dict(
+                torch.load(model_path, weights_only=True, map_location=self.device),
+                strict=False
+            )
+        except:
+            self.model.load_state_dict(
+                torch.load(model_path, map_location=self.device),
+                strict=False,
+            )
         self.trained = True
 
     def download_model(self, model_path=None, force_overwrite=False):
diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
index eb34be79..68c094aa 100644
--- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
+++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
@@ -80,7 +80,10 @@ def load_model(self, model_path):
         """Load pre-trained model weights."""
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
-        self.model.load_state_dict(torch.load(model_path, weights_only=True))
+        try: # Loading model weights
+            self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
+        except:
+            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
         self.model_path = model_path
         self.trained = True
 
diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py
index 247a85f9..9bebb128 100644
--- a/compiam/melody/pitch_extraction/melodia.py
+++ b/compiam/melody/pitch_extraction/melodia.py
@@ -4,7 +4,7 @@
 
 from compiam.utils.pitch import normalisation, resampling
 from compiam.io import write_csv
-from compiam.utils import get_logger
+from compiam.utils import get_logger, stereo_to_mono
 
 logger = get_logger(__name__)
 
@@ -87,12 +87,14 @@ def extract(self, input_data, input_sr=44100, out_step=None):
                 filename=input_data, sampleRate=self.sample_rate
             )()
         elif isinstance(input_data, np.ndarray):
+            input_data = stereo_to_mono(input_data)
+            # Resample to self.sample_rate, then apply the equal-loudness filter
             logger.warning(
                 f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
             )
             resample_audio = estd.Resample(
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
-            )()
+            )
             input_data = resample_audio(input_data)
             audio = estd.EqualLoudness(signal=input_data)()
         else:
diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py
index 2323fde3..1d79ea7c 100644
--- a/compiam/melody/raga_recognition/deepsrgm/__init__.py
+++ b/compiam/melody/raga_recognition/deepsrgm/__init__.py
@@ -1,4 +1,5 @@
 import os
+import librosa
 
 import numpy as np
 
@@ -8,7 +9,7 @@
     ModelNotTrainedError,
     DatasetNotLoadedError,
 )
-from compiam.utils import get_logger, WORKDIR
+from compiam.utils import get_logger, stereo_to_mono, WORKDIR
 from compiam.utils.download import download_remote_model
 
 logger = get_logger(__name__)
@@ -122,7 +123,10 @@ def load_model(self, model_path, rnn="lstm"):
             self.model = self._build_model(rnn="gru")
 
         self.model_path = model_path
-        weights = torch.load(model_path, weights_only=True, map_location=self.device)
+        try:
+            weights = torch.load(model_path, weights_only=True, map_location=self.device)
+        except:
+            weights = torch.load(model_path, map_location=self.device)
         new_weights = weights.copy()
         keys_to_fix = [
             ".weight_ih_l0",
@@ -232,20 +236,20 @@ def get_features(
                     "Install compIAM with essentia support: pip install 'compiam[essentia]'"
                 )
 
+            # Loading and resampling audio
             if isinstance(input_data, str):
                 if not os.path.exists(input_data):
                     raise FileNotFoundError("Target audio not found.")
-                audio = estd.MonoLoader(
-                    filename=input_data, sampleRate=self.sample_rate
-                )()
+                audio, _ = librosa.load(input_data, sr=self.sample_rate)
             elif isinstance(input_data, np.ndarray):
+                input_data = stereo_to_mono(input_data)
                 logger.warning(
-                    "Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
+                    f"Resampling... (input sampling rate is assumed {input_sr}Hz, \
+                        make sure this is correct and change input_sr otherwise)"
                 )
-                resampling = estd.Resample(
-                    inputSampleRate=input_sr, outputSampleRate=self.sample_rate
+                audio = librosa.resample(
+                    input_data, orig_sr=input_sr, target_sr=self.sample_rate
                 )
-                audio = resampling(input_data)
             else:
                 raise ValueError("Input must be path to audio signal or an audio array")
 
diff --git a/compiam/melody/tonic_identification/tonic_multipitch.py b/compiam/melody/tonic_identification/tonic_multipitch.py
index 49a6d429..2c3ed099 100644
--- a/compiam/melody/tonic_identification/tonic_multipitch.py
+++ b/compiam/melody/tonic_identification/tonic_multipitch.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from compiam.utils import get_logger
+from compiam.utils import get_logger, stereo_to_mono
 
 logger = get_logger(__name__)
 
@@ -65,12 +65,16 @@ def extract(self, input_data, input_sr=44100):
                 raise FileNotFoundError("Target audio not found.")
             audio = estd.MonoLoader(filename=input_data, sampleRate=self.sample_rate)()
         elif isinstance(input_data, np.ndarray):
+            if len(input_data.shape) == 2:
+                input_data = stereo_to_mono(input_data)
+            if len(input_data.shape) > 2:
+                raise ValueError("Input must be an unbatched audio signal")
             logger.warning(
                 f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
             )
             resampling = estd.Resample(
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
-            )()
+            )
             audio = resampling(input_data)
         else:
             raise ValueError("Input must be path to audio signal or an audio array")
diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py
index b44f748e..b6d74ef1 100644
--- a/compiam/separation/music_source_separation/mixer_model/__init__.py
+++ b/compiam/separation/music_source_separation/mixer_model/__init__.py
@@ -83,7 +83,11 @@ def _build_model(self):
     def load_model(self, model_path):
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
-        self.model.load_state_dict(torch.load(model_path, weights_only=True))
+        try:
+            weights = torch.load(model_path, weights_only=True, map_location=self.device)
+        except:
+            weights = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(weights)
         self.model_path = model_path
         self.trained = True
 
diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
index f67c7347..9debf911 100644
--- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
+++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
@@ -176,16 +176,20 @@ def load_model(self, model_path):
             self.download_model(model_path)
 
         self.model = self._build_model()
-        self.model.load_state_dict(
-            torch.load(model_path, weights_only=True, map_location=self.device)
-        )
+        try:
+            self.model.load_state_dict(
+                torch.load(model_path, weights_only=True, map_location=self.device)
+            )
+        except:
+            self.model.load_state_dict(
+                torch.load(model_path, map_location=self.device)
+            )
         self.model.eval()
         self.loaded_model_path = model_path
         self.trained = True
 
     def download_model(self, model_path=None, force_overwrite=False):
         """Download pre-trained model."""
-        print("modelpathhh", model_path)
         download_path = (
             os.sep + os.path.join(*model_path.split(os.sep)[:-4])
             if model_path is not None
diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py
index 8a06ac3b..fa49733c 100644
--- a/compiam/utils/__init__.py
+++ b/compiam/utils/__init__.py
@@ -4,7 +4,6 @@
 import pathlib
 import pickle
 import difflib
-import librosa
 
 import IPython.display as ipd
 import numpy as np
@@ -172,3 +171,21 @@ def add_center_to_mask(mask):
                 num_one = 0
                 indices = []
     return mask
+
+
+def stereo_to_mono(audio):
+    """Downmix a mono/stereo numpy audio array (either channel order) to 1-D mono."""
+    if len(audio.shape) == 2:
+        # Put channels first
+        if audio.shape[0] > audio.shape[1]:
+            audio = audio.T
+        # If stereo, average the channels
+        if audio.shape[0] == 2:
+            audio = np.mean(audio, axis=0)
+        if audio.shape[0] == 1:
+            audio = np.squeeze(audio, axis=0)
+        if audio.shape[0] > 2:
+            raise ValueError("Expected mono or stereo audio, got multi-channel audio")
+    if len(audio.shape) > 2:
+        raise ValueError("Input must be an unbatched audio signal")
+    return audio
diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py
index 69c2769e..e01dc73f 100644
--- a/tests/melody/test_deepsrgm.py
+++ b/tests/melody/test_deepsrgm.py
@@ -43,6 +43,10 @@ def _get_features():
         feat = deepsrgm.get_features(
             os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
         )
+    feat_1 = deepsrgm.get_features(np.zeros(44100))
+    feat_2 = deepsrgm.get_features(
+        os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
+    )
 
 
 @pytest.mark.torch
diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py
index ef75a61f..b2db2904 100644
--- a/tests/melody/test_essentia_extractors.py
+++ b/tests/melody/test_essentia_extractors.py
@@ -15,6 +15,9 @@ def _predict_normalized_pitch():
     pitch = melodia.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
+    pitch_2 = melodia.extract(np.zeros(44100))
+    pitch_3 = melodia.extract(np.zeros(2, 44100))  # Testing input array
+    pitch_4 = melodia.extract(np.zeros(44100, 2))  # Testing input array
 
     assert isinstance(pitch, np.ndarray)
     assert np.shape(pitch) == (699, 2)
@@ -67,6 +70,11 @@ def _predict_normalized_pitch():
     tonic = tonic_multipitch.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
+    tonic_2 = tonic_multipitch.extract(np.zeros(44100))  # Testing input array
+    tonic_3 = tonic_multipitch.extract(np.zeros(2, 44100))  # Testing input array
+    tonic_4 = tonic_multipitch.extract(np.zeros(44100, 2))  # Testing input array
+
+
 
     assert isinstance(tonic, float)
     assert tonic == 157.64892578125

From 16fb6c31c096357abdd26ef0601f6d5f492f6d07 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 16:52:46 +0100
Subject: [PATCH 03/10] fix in eqloudness input

---
 compiam/melody/pitch_extraction/melodia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py
index 9bebb128..449c694a 100644
--- a/compiam/melody/pitch_extraction/melodia.py
+++ b/compiam/melody/pitch_extraction/melodia.py
@@ -96,7 +96,7 @@ def extract(self, input_data, input_sr=44100, out_step=None):
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
             )
             input_data = resample_audio(input_data)
-            audio = estd.EqualLoudness(signal=input_data)()
+            audio = estd.EqualLoudness(sampleRate=self.sample_rate)(audio)
         else:
             raise ValueError("Input must be path to audio signal or an audio array")
 

From 4d4e5b8f276fcd2b0a3f2ba6ad5e0e54c51056c9 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 17:09:53 +0100
Subject: [PATCH 04/10] fix wrongly defined variable

---
 compiam/melody/pitch_extraction/melodia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py
index 449c694a..6d27cb62 100644
--- a/compiam/melody/pitch_extraction/melodia.py
+++ b/compiam/melody/pitch_extraction/melodia.py
@@ -96,7 +96,7 @@ def extract(self, input_data, input_sr=44100, out_step=None):
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
             )
             input_data = resample_audio(input_data)
-            audio = estd.EqualLoudness(sampleRate=self.sample_rate)(audio)
+            audio = estd.EqualLoudness(sampleRate=self.sample_rate)(input_data)
         else:
             raise ValueError("Input must be path to audio signal or an audio array")
 

From ac31c0cf7fa83443c85d640829e0d23aa1eb0cc3 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 17:34:43 +0100
Subject: [PATCH 05/10] fixing tests

---
 .github/environment-ci.yml                       |  4 ++--
 .../cold_diff_sep/__init__.py                    |  9 +++++++++
 tests/melody/test_deepsrgm.py                    |  7 +++++--
 tests/melody/test_essentia_extractors.py         | 16 ++++++++--------
 4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/.github/environment-ci.yml b/.github/environment-ci.yml
index d3c24541..5c0caeda 100644
--- a/.github/environment-ci.yml
+++ b/.github/environment-ci.yml
@@ -27,8 +27,8 @@ dependencies:
   - pip:
     - "keras<3.0.0"
     - "tensorflow>=2.12.0,<2.16"
-    - "torch==2.0.0"
-    - "torchaudio==2.0.1"
+    - "torch==1.13"
+    - "torchaudio"
     - "essentia"
     - "soundfile>=0.12.1"
     - "opencv-python~=4.6.0"
diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py
index 6c57b0c1..8cc92c06 100644
--- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py
+++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py
@@ -94,6 +94,7 @@ def separate(
         clusters=5,
         scheduler=4,
         chunk_size=3,
+        normalize_input=True,
         gpu="-1",
     ):
         """Separate singing voice from mixture.
@@ -103,6 +104,8 @@ def separate(
             relevant if the input is an array of data instead of a filepath.
         :param clusters: Number of clusters to use to build the separation masks.
         :param scheduler: Scheduler factor to weight the clusters to be more or less restirctive with the interferences.
+        :param chunk_size: Length, in seconds, of the chunks used to process the audio signal.
+        :param normalize_input: If True, standardize the mixture (zero mean, unit std) before separation.
         :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
         :return: Singing voice signal.
         """
@@ -153,6 +156,12 @@ def separate(
                 f"Downsampling to mono... your audio is stereo, \
                     and the model is trained on mono audio."
             )
+
+        if normalize_input:
+            # Normalizing audio for better performance overall
+            mean = tf.reduce_mean(mixture, keepdims=True)
+            std = tf.math.reduce_std(mixture, keepdims=True)
+            mixture = (mixture - mean) / (1e-6 + std)
             
         output_voc = np.zeros(mixture.shape)
         hopsized_chunk = int((chunk_size * self.sample_rate) / 2)
diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py
index e01dc73f..0d47655d 100644
--- a/tests/melody/test_deepsrgm.py
+++ b/tests/melody/test_deepsrgm.py
@@ -43,8 +43,11 @@ def _get_features():
         feat = deepsrgm.get_features(
             os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
         )
-    feat_1 = deepsrgm.get_features(np.zeros(44100))
-    feat_2 = deepsrgm.get_features(
+    feat_1 = deepsrgm.get_features(np.zeros([44100]))
+    feat_2 = deepsrgm.get_features(np.zeros([1, 44100]))
+    feat_3 = deepsrgm.get_features(np.zeros([2, 44100]))
+    feat_3 = deepsrgm.get_features(np.zeros([44100, 2]))
+    feat_4 = deepsrgm.get_features(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
 
diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py
index b2db2904..622c969a 100644
--- a/tests/melody/test_essentia_extractors.py
+++ b/tests/melody/test_essentia_extractors.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import librosa
 
 import numpy as np
 
@@ -15,9 +16,9 @@ def _predict_normalized_pitch():
     pitch = melodia.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
-    pitch_2 = melodia.extract(np.zeros(44100))
-    pitch_3 = melodia.extract(np.zeros(2, 44100))  # Testing input array
-    pitch_4 = melodia.extract(np.zeros(44100, 2))  # Testing input array
+    pitch_2 = melodia.extract(np.zeros([44100]))
+    pitch_3 = melodia.extract(np.zeros([2, 44100]))  # Testing input array
+    pitch_4 = melodia.extract(np.zeros([44100, 2]))  # Testing input array
 
     assert isinstance(pitch, np.ndarray)
     assert np.shape(pitch) == (699, 2)
@@ -70,11 +71,10 @@ def _predict_normalized_pitch():
     tonic = tonic_multipitch.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
-    tonic_2 = tonic_multipitch.extract(np.zeros(44100))  # Testing input array
-    tonic_3 = tonic_multipitch.extract(np.zeros(2, 44100))  # Testing input array
-    tonic_4 = tonic_multipitch.extract(np.zeros(44100, 2))  # Testing input array
-
-
+    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    tonic_2 = tonic_multipitch.extract(audio)  # Testing input array
+    tonic_3 = tonic_multipitch.extract(np.stack([audio, audio]))  # Testing input array
+    tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T)  # Testing input array
 
     assert isinstance(tonic, float)
     assert tonic == 157.64892578125

From 07afc5c829a320b9a5890a936bbd9d32826456e4 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 17:39:27 +0100
Subject: [PATCH 06/10] bump to new torch version

---
 .github/environment-ci.yml | 4 ++--
 pyproject.toml             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/environment-ci.yml b/.github/environment-ci.yml
index 5c0caeda..d3c24541 100644
--- a/.github/environment-ci.yml
+++ b/.github/environment-ci.yml
@@ -27,8 +27,8 @@ dependencies:
   - pip:
     - "keras<3.0.0"
     - "tensorflow>=2.12.0,<2.16"
-    - "torch==1.13"
-    - "torchaudio"
+    - "torch==2.0.0"
+    - "torchaudio==2.0.1"
     - "essentia"
     - "soundfile>=0.12.1"
     - "opencv-python~=4.6.0"
diff --git a/pyproject.toml b/pyproject.toml
index 05e61490..f5961a9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,8 +68,8 @@ tensorflow = [
     "tensorflow==2.15.0",
 ]  # Fixing tf versions to avoid issues
 torch = [
-    "torch==1.13.0",
-    "torchaudio",
+    "torch==2.0.0",
+    "torchaudio==2.0.1",
 ]
 essentia = [
     "essentia",

From 47996159ca87a299c75233ec956388f114fa7c39 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 17:58:14 +0100
Subject: [PATCH 07/10] fix stereo to mono function

---
 compiam/utils/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py
index fa49733c..ef971b90 100644
--- a/compiam/utils/__init__.py
+++ b/compiam/utils/__init__.py
@@ -179,13 +179,13 @@ def stereo_to_mono(audio):
         # Put channels first
         if audio.shape[0] > audio.shape[1]:
             audio = audio.T
+            if audio.shape[0] > 2:
+                raise ValueError("Expected mono or stereo audio, got multi-channel audio")
         # If stereo, average the channels
         if audio.shape[0] == 2:
             audio = np.mean(audio, axis=0)
         if audio.shape[0] == 1:
             audio = np.squeeze(audio, axis=0)
-        if audio.shape[0] > 2:
-            raise ValueError("Expected mono or stereo audio, got multi-channel audio")
     if len(audio.shape) > 2:
         raise ValueError("Input must be an unbatched audio signal")
     return audio

From 9ba8da0a8abee3530d81b86bf10d68ce88bd12f2 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 18:21:25 +0100
Subject: [PATCH 08/10] tonic does not find peaks

---
 compiam/melody/pattern/sancara_search/__init__.py             | 2 ++
 .../melody/pitch_extraction/ftaresnet_carnatic/__init__.py    | 4 +++-
 .../music_source_separation/mixer_model/__init__.py           | 2 ++
 .../segmentation/dhrupad_bandish_segmentation/__init__.py     | 2 ++
 tests/melody/test_essentia_extractors.py                      | 4 ----
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py
index 39f88206..b1537920 100644
--- a/compiam/melody/pattern/sancara_search/__init__.py
+++ b/compiam/melody/pattern/sancara_search/__init__.py
@@ -180,6 +180,8 @@ def load_model(self, model_path, conf_path, spec_path):
             setattr(self, tp, v)
 
         self.model = self._build_model()
+        ## Ensuring we can load the model with different torch versions:
+        ## -- try weights_only=True first, fall back to a plain torch.load
         try:
             self.model.load_state_dict(
                 torch.load(model_path, weights_only=True, map_location=self.device),
diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
index 68c094aa..9a5ccac7 100644
--- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
+++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
@@ -80,7 +80,9 @@ def load_model(self, model_path):
         """Load pre-trained model weights."""
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
-        try: # Loading model weights
+        ## Ensuring we can load the model with different torch versions:
+        ## -- try weights_only=True first, fall back to a plain torch.load
+        try:
             self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
         except:
             self.model.load_state_dict(torch.load(model_path, map_location=self.device))
diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py
index b6d74ef1..c3fc2c59 100644
--- a/compiam/separation/music_source_separation/mixer_model/__init__.py
+++ b/compiam/separation/music_source_separation/mixer_model/__init__.py
@@ -83,6 +83,8 @@ def _build_model(self):
     def load_model(self, model_path):
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
+        ## Ensuring we can load the model with different torch versions:
+        ## -- try weights_only=True first, fall back to a plain torch.load
         try:
             weights = torch.load(model_path, weights_only=True, map_location=self.device)
         except:
diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
index 9debf911..81cd7d9c 100644
--- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
+++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
@@ -176,6 +176,8 @@ def load_model(self, model_path):
             self.download_model(model_path)
 
         self.model = self._build_model()
+        ## Ensuring we can load the model with different torch versions:
+        ## -- try weights_only=True first, fall back to a plain torch.load
         try:
             self.model.load_state_dict(
                 torch.load(model_path, weights_only=True, map_location=self.device)
diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py
index 622c969a..36c602ea 100644
--- a/tests/melody/test_essentia_extractors.py
+++ b/tests/melody/test_essentia_extractors.py
@@ -71,10 +71,6 @@ def _predict_normalized_pitch():
     tonic = tonic_multipitch.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
-    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
-    tonic_2 = tonic_multipitch.extract(audio)  # Testing input array
-    tonic_3 = tonic_multipitch.extract(np.stack([audio, audio]))  # Testing input array
-    tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T)  # Testing input array
 
     assert isinstance(tonic, float)
     assert tonic == 157.64892578125

From 2ddab849d611b30206a651075d0ee3ca72b56d59 Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Thu, 28 Nov 2024 18:23:01 +0100
Subject: [PATCH 09/10] removed wrong tests

---
 tests/melody/test_deepsrgm.py            | 9 +++++----
 tests/melody/test_essentia_extractors.py | 4 ++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py
index 0d47655d..7eb00dbe 100644
--- a/tests/melody/test_deepsrgm.py
+++ b/tests/melody/test_deepsrgm.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import librosa
 
 import numpy as np
 
@@ -43,10 +44,10 @@ def _get_features():
         feat = deepsrgm.get_features(
             os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
         )
-    feat_1 = deepsrgm.get_features(np.zeros([44100]))
-    feat_2 = deepsrgm.get_features(np.zeros([1, 44100]))
-    feat_3 = deepsrgm.get_features(np.zeros([2, 44100]))
-    feat_3 = deepsrgm.get_features(np.zeros([44100, 2]))
+    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    feat_1 = deepsrgm.get_features(audio)
+    feat_2 = deepsrgm.get_features(np.stack([audio, audio]))
+    feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T)
     feat_4 = deepsrgm.get_features(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py
index 36c602ea..622c969a 100644
--- a/tests/melody/test_essentia_extractors.py
+++ b/tests/melody/test_essentia_extractors.py
@@ -71,6 +71,10 @@ def _predict_normalized_pitch():
     tonic = tonic_multipitch.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
+    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    tonic_2 = tonic_multipitch.extract(audio)  # Testing input array
+    tonic_3 = tonic_multipitch.extract(np.stack([audio, audio]))  # Testing input array
+    tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T)  # Testing input array
 
     assert isinstance(tonic, float)
     assert tonic == 157.64892578125

From 30a3e3af51b602c55f673a2523cdc2773206e75b Mon Sep 17 00:00:00 2001
From: genisplaja <genis.plaja@upf.edu>
Date: Fri, 29 Nov 2024 13:22:54 +0100
Subject: [PATCH 10/10] make audio testing longer for deepsrgm

---
 tests/melody/test_deepsrgm.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py
index 7eb00dbe..51f9a523 100644
--- a/tests/melody/test_deepsrgm.py
+++ b/tests/melody/test_deepsrgm.py
@@ -45,12 +45,10 @@ def _get_features():
             os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
         )
     audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    audio = np.tile(audio, 9)
     feat_1 = deepsrgm.get_features(audio)
     feat_2 = deepsrgm.get_features(np.stack([audio, audio]))
     feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T)
-    feat_4 = deepsrgm.get_features(
-        os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
-    )
 
 
 @pytest.mark.torch