MTG · thomasgnuttall · Nov 29, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/compiam/__init__.py b/compiam/__init__.py
@@ -71,9 +71,11 @@ def load_dataset(dataset_name, data_home=None, version="default"):
     """
     if dataset_name not in datasets_list:
         raise ValueError("Invalid dataset {}".format(dataset_name))
-    return mirdata.initialize(
+    dataloader = mirdata.initialize(
         dataset_name=dataset_name, data_home=data_home, version=version
     )
+    dataloader.download(["index"])  # Download index file
+    return dataloader
 
 
 def load_corpora(tradition, token=None):

diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py
@@ -180,9 +180,16 @@ def load_model(self, model_path, conf_path, spec_path):
             setattr(self, tp, v)
 
         self.model = self._build_model()
-        self.model.load_state_dict(
-            torch.load(model_path, weights_only=True), strict=False
-        )
+        # torch versions differ in their handling of ``weights_only``, so retry
+        # with a plain load if the restricted one fails.
+        # NOTE: the fallback may unpickle arbitrary objects; only load trusted files.
+        try:
+            weights = torch.load(
+                model_path, weights_only=True, map_location=self.device
+            )
+        except Exception:
+            weights = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(weights, strict=False)
         self.trained = True
 
     def download_model(self, model_path=None, force_overwrite=False):

diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py
@@ -80,7 +80,16 @@ def load_model(self, model_path):
         """Load pre-trained model weights."""
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
-        self.model.load_state_dict(torch.load(model_path, weights_only=True))
+        # torch versions differ in their handling of ``weights_only``, so retry
+        # with a plain load if the restricted one fails.
+        # NOTE: the fallback may unpickle arbitrary objects; only load trusted files.
+        try:
+            weights = torch.load(
+                model_path, weights_only=True, map_location=self.device
+            )
+        except Exception:
+            weights = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(weights)
         self.model_path = model_path
         self.trained = True
 

diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py
@@ -4,7 +4,7 @@
 
 from compiam.utils.pitch import normalisation, resampling
 from compiam.io import write_csv
-from compiam.utils import get_logger
+from compiam.utils import get_logger, stereo_to_mono
 
 logger = get_logger(__name__)
 
@@ -87,14 +87,16 @@ def extract(self, input_data, input_sr=44100, out_step=None):
                 filename=input_data, sampleRate=self.sample_rate
             )()
         elif isinstance(input_data, np.ndarray):
+            input_data = stereo_to_mono(input_data)
+            # Resample first; the equal-loudness filter is applied afterwards
             logger.warning(
                 f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
             )
             resample_audio = estd.Resample(
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
-            )()
+            )
             input_data = resample_audio(input_data)
-            audio = estd.EqualLoudness(signal=input_data)()
+            audio = estd.EqualLoudness(sampleRate=self.sample_rate)(input_data)
         else:
             raise ValueError("Input must be path to audio signal or an audio array")
 

diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py
@@ -1,4 +1,5 @@
 import os
+import librosa
 
 import numpy as np
 
@@ -8,7 +9,7 @@
     ModelNotTrainedError,
     DatasetNotLoadedError,
 )
-from compiam.utils import get_logger, WORKDIR
+from compiam.utils import get_logger, stereo_to_mono, WORKDIR
 from compiam.utils.download import download_remote_model
 
 logger = get_logger(__name__)
@@ -122,7 +123,10 @@ def load_model(self, model_path, rnn="lstm"):
             self.model = self._build_model(rnn="gru")
 
         self.model_path = model_path
-        weights = torch.load(model_path, weights_only=True, map_location=self.device)
+        try:
+            weights = torch.load(model_path, weights_only=True, map_location=self.device)
+        except Exception:  # presumably a torch version that rejects ``weights_only``
+            weights = torch.load(model_path, map_location=self.device)
         new_weights = weights.copy()
         keys_to_fix = [
             ".weight_ih_l0",
@@ -232,20 +236,20 @@ def get_features(
                     "Install compIAM with essentia support: pip install 'compiam[essentia]'"
                 )
 
+            # Loading and resampling audio
             if isinstance(input_data, str):
                 if not os.path.exists(input_data):
                     raise FileNotFoundError("Target audio not found.")
-                audio = estd.MonoLoader(
-                    filename=input_data, sampleRate=self.sample_rate
-                )()
+                audio, _ = librosa.load(input_data, sr=self.sample_rate)
             elif isinstance(input_data, np.ndarray):
+                input_data = stereo_to_mono(input_data)
                 logger.warning(
-                    "Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
+                    f"Resampling... (input sampling rate is assumed {input_sr}Hz, "
+                    "make sure this is correct and change input_sr otherwise)"
                 )
-                resampling = estd.Resample(
-                    inputSampleRate=input_sr, outputSampleRate=self.sample_rate
+                audio = librosa.resample(
+                    input_data, orig_sr=input_sr, target_sr=self.sample_rate
                 )
-                audio = resampling(input_data)
             else:
                 raise ValueError("Input must be path to audio signal or an audio array")
 

diff --git a/compiam/melody/tonic_identification/tonic_multipitch.py b/compiam/melody/tonic_identification/tonic_multipitch.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from compiam.utils import get_logger
+from compiam.utils import get_logger, stereo_to_mono
 
 logger = get_logger(__name__)
 
@@ -65,12 +65,14 @@ def extract(self, input_data, input_sr=44100):
                 raise FileNotFoundError("Target audio not found.")
             audio = estd.MonoLoader(filename=input_data, sampleRate=self.sample_rate)()
         elif isinstance(input_data, np.ndarray):
+            # stereo_to_mono downmixes 2-D input and rejects batched (>2-D) arrays
+            input_data = stereo_to_mono(input_data)
             logger.warning(
                 f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
             )
             resampling = estd.Resample(
                 inputSampleRate=input_sr, outputSampleRate=self.sample_rate
-            )()
+            )
             audio = resampling(input_data)
         else:
             raise ValueError("Input must be path to audio signal or an audio array")

diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py
@@ -83,7 +83,16 @@ def _build_model(self):
     def load_model(self, model_path):
         if not os.path.exists(model_path):
             self.download_model(model_path)  # Downloading model weights
-        self.model.load_state_dict(torch.load(model_path, weights_only=True))
+        # torch versions differ in their handling of ``weights_only``, so retry
+        # with a plain load if the restricted one fails.
+        # NOTE: the fallback may unpickle arbitrary objects; only load trusted files.
+        try:
+            weights = torch.load(
+                model_path, weights_only=True, map_location=self.device
+            )
+        except Exception:
+            weights = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(weights)
         self.model_path = model_path
         self.trained = True
 

diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py
@@ -94,6 +94,7 @@ def separate(
         clusters=5,
         scheduler=4,
         chunk_size=3,
         gpu="-1",
+        normalize_input=True,
     ):
         """Separate singing voice from mixture.
@@ -103,6 +104,8 @@ def separate(
             relevant if the input is an array of data instead of a filepath.
         :param clusters: Number of clusters to use to build the separation masks.
         :param scheduler: Scheduler factor to weight the clusters to be more or less restirctive with the interferences.
+        :param chunk_size: Size of the chunks to process the audio signal.
+        :param normalize_input: Normalize the input audio signal.
         :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
         :return: Singing voice signal.
         """
@@ -153,6 +156,12 @@ def separate(
                 f"Downsampling to mono... your audio is stereo, \
                     and the model is trained on mono audio."
             )
+
+        if normalize_input:
+            # Normalizing audio for better performance overall
+            mean = tf.reduce_mean(mixture, keepdims=True)
+            std = tf.math.reduce_std(mixture, keepdims=True)
+            mixture = (mixture - mean) / (1e-6 + std)
 
         output_voc = np.zeros(mixture.shape)
         hopsized_chunk = int((chunk_size * self.sample_rate) / 2)

diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py
@@ -176,16 +176,22 @@ def load_model(self, model_path):
             self.download_model(model_path)
 
         self.model = self._build_model()
-        self.model.load_state_dict(
-            torch.load(model_path, weights_only=True, map_location=self.device)
-        )
+        # torch versions differ in their handling of ``weights_only``, so retry
+        # with a plain load if the restricted one fails.
+        # NOTE: the fallback may unpickle arbitrary objects; only load trusted files.
+        try:
+            weights = torch.load(
+                model_path, weights_only=True, map_location=self.device
+            )
+        except Exception:
+            weights = torch.load(model_path, map_location=self.device)
+        self.model.load_state_dict(weights)
         self.model.eval()
         self.loaded_model_path = model_path
         self.trained = True
 
     def download_model(self, model_path=None, force_overwrite=False):
         """Download pre-trained model."""
-        print("modelpathhh", model_path)
         download_path = (
             os.sep + os.path.join(*model_path.split(os.sep)[:-4])
             if model_path is not None

diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py
@@ -4,7 +4,6 @@
 import pathlib
 import pickle
 import difflib
-import librosa
 
 import IPython.display as ipd
 import numpy as np
@@ -172,3 +171,21 @@ def add_center_to_mask(mask):
                 num_one = 0
                 indices = []
     return mask
+
+
+def stereo_to_mono(audio):
+    """Downmix a numpy audio array to a 1-D mono signal.
+
+    :param audio: 1-D signal, or 2-D (channels, samples) / (samples, channels).
+    :raises ValueError: if audio has more than 2 dimensions or 2 channels.
+    :return: 1-D array (stereo channels averaged, single channel squeezed).
+    """
+    if audio.ndim > 2:
+        raise ValueError("Input must be an unbatched audio signal")
+    if audio.ndim == 2:
+        # The longer axis is taken to be time, so move channels to axis 0
+        audio = audio.T if audio.shape[0] > audio.shape[1] else audio
+        if audio.shape[0] > 2:
+            raise ValueError("Expected mono or stereo audio, got multi-channel audio")
+        audio = audio[0] if audio.shape[0] == 1 else np.mean(audio, axis=0)
+    return audio
diff --git a/pyproject.toml b/pyproject.toml
@@ -65,11 +65,11 @@ docs = [
 ]
 tensorflow = [
     "keras<3.0.0",
-    "tensorflow>=2.12.0,<2.16",
-]
+    "tensorflow==2.15.0",
+]  # Fixing tf versions to avoid issues
 torch = [
-    "torch==1.13.0",
-    "torchaudio",
+    "torch==2.0.0",
+    "torchaudio==2.0.1",
 ]
 essentia = [
     "essentia",

diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import librosa
 
 import numpy as np
 
@@ -43,6 +44,11 @@ def _get_features():
         feat = deepsrgm.get_features(
             os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
         )
+    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    audio = np.tile(audio, 9)
+    feat_1 = deepsrgm.get_features(audio)
+    feat_2 = deepsrgm.get_features(np.stack([audio, audio]))
+    feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T)
 
 
 @pytest.mark.torch

diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py
@@ -1,5 +1,6 @@
 import os
 import pytest
+import librosa
 
 import numpy as np
 
@@ -15,6 +16,9 @@ def _predict_normalized_pitch():
     pitch = melodia.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
+    pitch_2 = melodia.extract(np.zeros([44100]))
+    pitch_3 = melodia.extract(np.zeros([2, 44100]))  # Testing input array
+    pitch_4 = melodia.extract(np.zeros([44100, 2]))  # Testing input array
 
     assert isinstance(pitch, np.ndarray)
     assert np.shape(pitch) == (699, 2)
@@ -67,6 +71,10 @@ def _predict_normalized_pitch():
     tonic = tonic_multipitch.extract(
         os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
     )
+    audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
+    tonic_2 = tonic_multipitch.extract(audio)  # Testing input array
+    tonic_3 = tonic_multipitch.extract(np.stack([audio, audio]))  # Testing input array
+    tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T)  # Testing input array
 
     assert isinstance(tonic, float)
     assert tonic == 157.64892578125