From 0c7806d080a02142083daa51d092b9663c7c90cf Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 13:11:31 +0100 Subject: [PATCH 01/10] fix tf, download mirdata idx by default --- compiam/__init__.py | 4 +++- pyproject.toml | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/compiam/__init__.py b/compiam/__init__.py index 35d13052..5a795d57 100644 --- a/compiam/__init__.py +++ b/compiam/__init__.py @@ -71,9 +71,11 @@ def load_dataset(dataset_name, data_home=None, version="default"): """ if dataset_name not in datasets_list: raise ValueError("Invalid dataset {}".format(dataset_name)) - return mirdata.initialize( + dataloader = mirdata.initialize( dataset_name=dataset_name, data_home=data_home, version=version ) + dataloader.download(["index"]) # Download index file + return dataloader def load_corpora(tradition, token=None): diff --git a/pyproject.toml b/pyproject.toml index 80d9218b..05e61490 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,8 +65,8 @@ docs = [ ] tensorflow = [ "keras<3.0.0", - "tensorflow>=2.12.0,<2.16", -] + "tensorflow==2.15.0", +] # Fixing tf versions to avoid issues torch = [ "torch==1.13.0", "torchaudio", From d2e65efd26d7396ca34108c970802af516a4b8c9 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 16:35:28 +0100 Subject: [PATCH 02/10] fix resampling and model loading minor errors --- .../melody/pattern/sancara_search/__init__.py | 13 ++++++++--- .../ftaresnet_carnatic/__init__.py | 5 ++++- compiam/melody/pitch_extraction/melodia.py | 6 +++-- .../raga_recognition/deepsrgm/__init__.py | 22 +++++++++++-------- .../tonic_identification/tonic_multipitch.py | 8 +++++-- .../mixer_model/__init__.py | 6 ++++- .../dhrupad_bandish_segmentation/__init__.py | 12 ++++++---- compiam/utils/__init__.py | 19 +++++++++++++++- tests/melody/test_deepsrgm.py | 4 ++++ tests/melody/test_essentia_extractors.py | 8 +++++++ 10 files changed, 80 insertions(+), 23 deletions(-) diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py index f3ee2f61..39f88206 100644 --- a/compiam/melody/pattern/sancara_search/__init__.py +++ b/compiam/melody/pattern/sancara_search/__init__.py @@ -180,9 +180,16 @@ def load_model(self, model_path, conf_path, spec_path): setattr(self, tp, v) self.model = self._build_model() - self.model.load_state_dict( - torch.load(model_path, weights_only=True), strict=False - ) + try: + self.model.load_state_dict( + torch.load(model_path, weights_only=True, map_location=self.device), + strict=False + ) + except: + self.model.load_state_dict( + torch.load(model_path, map_location=self.device), + strict=False, + ) self.trained = True def download_model(self, model_path=None, force_overwrite=False): diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py index eb34be79..68c094aa 100644 --- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py @@ -80,7 +80,10 @@ def load_model(self, model_path): """Load pre-trained model weights.""" if not os.path.exists(model_path): self.download_model(model_path) # Downloading model weights - self.model.load_state_dict(torch.load(model_path, weights_only=True)) + try: # Loading model weights + self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device)) + except: + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model_path = model_path self.trained = True diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py index 247a85f9..9bebb128 100644 --- a/compiam/melody/pitch_extraction/melodia.py +++ b/compiam/melody/pitch_extraction/melodia.py @@ -4,7 +4,7 @@ from compiam.utils.pitch import normalisation, resampling from compiam.io import write_csv -from compiam.utils import get_logger +from compiam.utils import get_logger, stereo_to_mono logger = get_logger(__name__) @@ -87,12 +87,14 @@ def extract(self, input_data, input_sr=44100, out_step=None): filename=input_data, sampleRate=self.sample_rate )() elif isinstance(input_data, np.ndarray): + input_data = stereo_to_mono(input_data) + # Apply Eqloudness filter logger.warning( f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)" ) resample_audio = estd.Resample( inputSampleRate=input_sr, outputSampleRate=self.sample_rate - )() + ) input_data = resample_audio(input_data) audio = estd.EqualLoudness(signal=input_data)() else: diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py index 2323fde3..1d79ea7c 100644 --- a/compiam/melody/raga_recognition/deepsrgm/__init__.py +++ b/compiam/melody/raga_recognition/deepsrgm/__init__.py @@ -1,4 +1,5 @@ import os +import librosa import numpy as np @@ -8,7 +9,7 @@ ModelNotTrainedError, DatasetNotLoadedError, ) -from compiam.utils import get_logger, WORKDIR +from compiam.utils import get_logger, stereo_to_mono, WORKDIR from compiam.utils.download import download_remote_model logger = get_logger(__name__) @@ -122,7 +123,10 @@ def load_model(self, model_path, rnn="lstm"): self.model = self._build_model(rnn="gru") self.model_path = model_path - weights = torch.load(model_path, weights_only=True, map_location=self.device) + try: + weights = torch.load(model_path, weights_only=True, map_location=self.device) + except: + weights = torch.load(model_path, map_location=self.device) new_weights = weights.copy() keys_to_fix = [ ".weight_ih_l0", @@ -232,20 +236,20 @@ def get_features( "Install compIAM with essentia support: pip install 'compiam[essentia]'" ) + # Loading and resampling audio if isinstance(input_data, str): if not os.path.exists(input_data): raise FileNotFoundError("Target audio not found.") - audio = estd.MonoLoader( - filename=input_data, sampleRate=self.sample_rate - )() + audio, _ = librosa.load(input_data, sr=self.sample_rate) elif isinstance(input_data, np.ndarray): + input_data = stereo_to_mono(input_data) logger.warning( - "Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)" + f"Resampling... (input sampling rate is assumed {input_sr}Hz, \ + make sure this is correct and change input_sr otherwise)" ) - resampling = estd.Resample( - inputSampleRate=input_sr, outputSampleRate=self.sample_rate + audio = librosa.resample( + input_data, orig_sr=input_sr, target_sr=self.sample_rate ) - audio = resampling(input_data) else: raise ValueError("Input must be path to audio signal or an audio array") diff --git a/compiam/melody/tonic_identification/tonic_multipitch.py b/compiam/melody/tonic_identification/tonic_multipitch.py index 49a6d429..2c3ed099 100644 --- a/compiam/melody/tonic_identification/tonic_multipitch.py +++ b/compiam/melody/tonic_identification/tonic_multipitch.py @@ -2,7 +2,7 @@ import numpy as np -from compiam.utils import get_logger +from compiam.utils import get_logger, stereo_to_mono logger = get_logger(__name__) @@ -65,12 +65,16 @@ def extract(self, input_data, input_sr=44100): raise FileNotFoundError("Target audio not found.") audio = estd.MonoLoader(filename=input_data, sampleRate=self.sample_rate)() elif isinstance(input_data, np.ndarray): + if len(input_data.shape) == 2: + input_data = stereo_to_mono(input_data) + if len(input_data.shape) > 2: + raise ValueError("Input must be an unbatched audio signal") logger.warning( f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)" ) resampling = estd.Resample( inputSampleRate=input_sr, outputSampleRate=self.sample_rate - )() + ) audio = resampling(input_data) else: raise ValueError("Input must be path to audio signal or an audio array") diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index b44f748e..b6d74ef1 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -83,7 +83,11 @@ def _build_model(self): def load_model(self, model_path): if not os.path.exists(model_path): self.download_model(model_path) # Downloading model weights - self.model.load_state_dict(torch.load(model_path, weights_only=True)) + try: + weights = torch.load(model_path, weights_only=True, map_location=self.device) + except: + weights = torch.load(model_path, map_location=self.device) + self.model.load_state_dict(weights) self.model_path = model_path self.trained = True diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py index f67c7347..9debf911 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py @@ -176,16 +176,20 @@ def load_model(self, model_path): self.download_model(model_path) self.model = self._build_model() - self.model.load_state_dict( - torch.load(model_path, weights_only=True, map_location=self.device) - ) + try: + self.model.load_state_dict( + torch.load(model_path, weights_only=True, map_location=self.device) + ) + except: + self.model.load_state_dict( + torch.load(model_path, map_location=self.device) + ) self.model.eval() self.loaded_model_path = model_path self.trained = True def download_model(self, model_path=None, force_overwrite=False): """Download pre-trained model.""" - print("modelpathhh", model_path) download_path = ( os.sep + os.path.join(*model_path.split(os.sep)[:-4]) if model_path is not None diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py index 8a06ac3b..fa49733c 100644 --- a/compiam/utils/__init__.py +++ b/compiam/utils/__init__.py @@ -4,7 +4,6 @@ import pathlib import pickle import difflib -import librosa import IPython.display as ipd import numpy as np @@ -172,3 +171,21 @@ def add_center_to_mask(mask): num_one = 0 indices = [] return mask + + +def stereo_to_mono(audio): + """Assuming numpy array as input""" + if len(audio.shape) == 2: + # Put channels first + if audio.shape[0] > audio.shape[1]: + audio = audio.T + # If stereo, average the channels + if audio.shape[0] == 2: + audio = np.mean(audio, axis=0) + if audio.shape[0] == 1: + audio = np.squeeze(audio, axis=0) + if audio.shape[0] > 2: + raise ValueError("Expected mono or stereo audio, got multi-channel audio") + if len(audio.shape) > 2: + raise ValueError("Input must be an unbatched audio signal") + return audio diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 69c2769e..e01dc73f 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -43,6 +43,10 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) + feat_1 = deepsrgm.get_features(np.zeros(44100)) + feat_2 = deepsrgm.get_features( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") + ) @pytest.mark.torch diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index ef75a61f..b2db2904 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -15,6 +15,9 @@ def _predict_normalized_pitch(): pitch = melodia.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) + pitch_2 = melodia.extract(np.zeros(44100)) + pitch_3 = melodia.extract(np.zeros(2, 44100)) # Testing input array + pitch_4 = melodia.extract(np.zeros(44100, 2)) # Testing input array assert isinstance(pitch, np.ndarray) assert np.shape(pitch) == (699, 2) @@ -67,6 +70,11 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) + tonic_2 = tonic_multipitch.extract(np.zeros(44100)) # Testing input array + tonic_3 = tonic_multipitch.extract(np.zeros(2, 44100)) # Testing input array + tonic_4 = tonic_multipitch.extract(np.zeros(44100, 2)) # Testing input array + + assert isinstance(tonic, float) assert tonic == 157.64892578125 From 16fb6c31c096357abdd26ef0601f6d5f492f6d07 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 16:52:46 +0100 Subject: [PATCH 03/10] fix in eqloudness input --- compiam/melody/pitch_extraction/melodia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py index 9bebb128..449c694a 100644 --- a/compiam/melody/pitch_extraction/melodia.py +++ b/compiam/melody/pitch_extraction/melodia.py @@ -96,7 +96,7 @@ def extract(self, input_data, input_sr=44100, out_step=None): inputSampleRate=input_sr, outputSampleRate=self.sample_rate ) input_data = resample_audio(input_data) - audio = estd.EqualLoudness(signal=input_data)() + audio = estd.EqualLoudness(sampleRate=self.sample_rate)(audio) else: raise ValueError("Input must be path to audio signal or an audio array") From 4d4e5b8f276fcd2b0a3f2ba6ad5e0e54c51056c9 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 17:09:53 +0100 Subject: [PATCH 04/10] fix wrongly defined variable --- compiam/melody/pitch_extraction/melodia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiam/melody/pitch_extraction/melodia.py b/compiam/melody/pitch_extraction/melodia.py index 449c694a..6d27cb62 100644 --- a/compiam/melody/pitch_extraction/melodia.py +++ b/compiam/melody/pitch_extraction/melodia.py @@ -96,7 +96,7 @@ def extract(self, input_data, input_sr=44100, out_step=None): inputSampleRate=input_sr, outputSampleRate=self.sample_rate ) input_data = resample_audio(input_data) - audio = estd.EqualLoudness(sampleRate=self.sample_rate)(audio) + audio = estd.EqualLoudness(sampleRate=self.sample_rate)(input_data) else: raise ValueError("Input must be path to audio signal or an audio array") From ac31c0cf7fa83443c85d640829e0d23aa1eb0cc3 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 17:34:43 +0100 Subject: [PATCH 05/10] fixing tests --- .github/environment-ci.yml | 4 ++-- .../cold_diff_sep/__init__.py | 9 +++++++++ tests/melody/test_deepsrgm.py | 7 +++++-- tests/melody/test_essentia_extractors.py | 16 ++++++++-------- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/.github/environment-ci.yml b/.github/environment-ci.yml index d3c24541..5c0caeda 100644 --- a/.github/environment-ci.yml +++ b/.github/environment-ci.yml @@ -27,8 +27,8 @@ dependencies: - pip: - "keras<3.0.0" - "tensorflow>=2.12.0,<2.16" - - "torch==2.0.0" - - "torchaudio==2.0.1" + - "torch==1.13" + - "torchaudio" - "essentia" - "soundfile>=0.12.1" - "opencv-python~=4.6.0" diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 6c57b0c1..8cc92c06 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -94,6 +94,7 @@ def separate( clusters=5, scheduler=4, chunk_size=3, + normalize_input=True, gpu="-1", ): """Separate singing voice from mixture. @@ -103,6 +104,8 @@ def separate( relevant if the input is an array of data instead of a filepath. :param clusters: Number of clusters to use to build the separation masks. :param scheduler: Scheduler factor to weight the clusters to be more or less restirctive with the interferences. + :param chunk_size: Size of the chunks to process the audio signal. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice signal. """ @@ -153,6 +156,12 @@ def separate( f"Downsampling to mono... your audio is stereo, \ and the model is trained on mono audio." ) + + if normalize_input: + # Normalizing audio for better performance overall + mean = tf.reduce_mean(mixture, keepdims=True) + std = tf.math.reduce_std(mixture, keepdims=True) + mixture = (mixture - mean) / (1e-6 + std) output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index e01dc73f..0d47655d 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -43,8 +43,11 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - feat_1 = deepsrgm.get_features(np.zeros(44100)) - feat_2 = deepsrgm.get_features( + feat_1 = deepsrgm.get_features(np.zeros([44100])) + feat_2 = deepsrgm.get_features(np.zeros([1, 44100])) + feat_3 = deepsrgm.get_features(np.zeros([2, 44100])) + feat_3 = deepsrgm.get_features(np.zeros([44100, 2])) + feat_4 = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index b2db2904..622c969a 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -1,5 +1,6 @@ import os import pytest +import librosa import numpy as np @@ -15,9 +16,9 @@ def _predict_normalized_pitch(): pitch = melodia.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - pitch_2 = melodia.extract(np.zeros(44100)) - pitch_3 = melodia.extract(np.zeros(2, 44100)) # Testing input array - pitch_4 = melodia.extract(np.zeros(44100, 2)) # Testing input array + pitch_2 = melodia.extract(np.zeros([44100])) + pitch_3 = melodia.extract(np.zeros([2, 44100])) # Testing input array + pitch_4 = melodia.extract(np.zeros([44100, 2])) # Testing input array assert isinstance(pitch, np.ndarray) assert np.shape(pitch) == (699, 2) @@ -70,11 +71,10 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - tonic_2 = tonic_multipitch.extract(np.zeros(44100)) # Testing input array - tonic_3 = tonic_multipitch.extract(np.zeros(2, 44100)) # Testing input array - tonic_4 = tonic_multipitch.extract(np.zeros(44100, 2)) # Testing input array - - + audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + tonic_2 = tonic_multipitch.extract(audio) # Testing input array + tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array + tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 From 07afc5c829a320b9a5890a936bbd9d32826456e4 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 17:39:27 +0100 Subject: [PATCH 06/10] bump to new torch version --- .github/environment-ci.yml | 4 ++-- pyproject.toml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/environment-ci.yml b/.github/environment-ci.yml index 5c0caeda..d3c24541 100644 --- a/.github/environment-ci.yml +++ b/.github/environment-ci.yml @@ -27,8 +27,8 @@ dependencies: - pip: - "keras<3.0.0" - "tensorflow>=2.12.0,<2.16" - - "torch==1.13" - - "torchaudio" + - "torch==2.0.0" + - "torchaudio==2.0.1" - "essentia" - "soundfile>=0.12.1" - "opencv-python~=4.6.0" diff --git a/pyproject.toml b/pyproject.toml index 05e61490..f5961a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,8 +68,8 @@ tensorflow = [ "tensorflow==2.15.0", ] # Fixing tf versions to avoid issues torch = [ - "torch==1.13.0", - "torchaudio", + "torch==2.0.0", + "torchaudio==2.0.1", ] essentia = [ "essentia", From 47996159ca87a299c75233ec956388f114fa7c39 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 17:58:14 +0100 Subject: [PATCH 07/10] fix stereo to mono function --- compiam/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py index fa49733c..ef971b90 100644 --- a/compiam/utils/__init__.py +++ b/compiam/utils/__init__.py @@ -179,13 +179,13 @@ def stereo_to_mono(audio): # Put channels first if audio.shape[0] > audio.shape[1]: audio = audio.T + if audio.shape[0] > 2: + raise ValueError("Expected mono or stereo audio, got multi-channel audio") # If stereo, average the channels if audio.shape[0] == 2: audio = np.mean(audio, axis=0) if audio.shape[0] == 1: audio = np.squeeze(audio, axis=0) - if audio.shape[0] > 2: - raise ValueError("Expected mono or stereo audio, got multi-channel audio") if len(audio.shape) > 2: raise ValueError("Input must be an unbatched audio signal") return audio From 9ba8da0a8abee3530d81b86bf10d68ce88bd12f2 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 18:21:25 +0100 Subject: [PATCH 08/10] tonic does not find peaks --- compiam/melody/pattern/sancara_search/__init__.py | 2 ++ .../melody/pitch_extraction/ftaresnet_carnatic/__init__.py | 4 +++- .../music_source_separation/mixer_model/__init__.py | 2 ++ .../segmentation/dhrupad_bandish_segmentation/__init__.py | 2 ++ tests/melody/test_essentia_extractors.py | 4 ---- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py index 39f88206..b1537920 100644 --- a/compiam/melody/pattern/sancara_search/__init__.py +++ b/compiam/melody/pattern/sancara_search/__init__.py @@ -180,6 +180,8 @@ def load_model(self, model_path, conf_path, spec_path): setattr(self, tp, v) self.model = self._build_model() + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) try: self.model.load_state_dict( torch.load(model_path, weights_only=True, map_location=self.device), diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py index 68c094aa..9a5ccac7 100644 --- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py @@ -80,7 +80,9 @@ def load_model(self, model_path): """Load pre-trained model weights.""" if not os.path.exists(model_path): self.download_model(model_path) # Downloading model weights - try: # Loading model weights + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) + try: self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device)) except: self.model.load_state_dict(torch.load(model_path, map_location=self.device)) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index b6d74ef1..c3fc2c59 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -83,6 +83,8 @@ def _build_model(self): def load_model(self, model_path): if not os.path.exists(model_path): self.download_model(model_path) # Downloading model weights + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) try: weights = torch.load(model_path, weights_only=True, map_location=self.device) except: diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py index 9debf911..81cd7d9c 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py @@ -176,6 +176,8 @@ def load_model(self, model_path): self.download_model(model_path) self.model = self._build_model() + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) try: self.model.load_state_dict( torch.load(model_path, weights_only=True, map_location=self.device) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index 622c969a..36c602ea 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -71,10 +71,6 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] - tonic_2 = tonic_multipitch.extract(audio) # Testing input array - tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array - tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 From 2ddab849d611b30206a651075d0ee3ca72b56d59 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 28 Nov 2024 18:23:01 +0100 Subject: [PATCH 09/10] removed wrong tests --- tests/melody/test_deepsrgm.py | 9 +++++---- tests/melody/test_essentia_extractors.py | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 0d47655d..7eb00dbe 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -1,5 +1,6 @@ import os import pytest +import librosa import numpy as np @@ -43,10 +44,10 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - feat_1 = deepsrgm.get_features(np.zeros([44100])) - feat_2 = deepsrgm.get_features(np.zeros([1, 44100])) - feat_3 = deepsrgm.get_features(np.zeros([2, 44100])) - feat_3 = deepsrgm.get_features(np.zeros([44100, 2])) + audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + feat_1 = deepsrgm.get_features(audio) + feat_2 = deepsrgm.get_features(np.stack([audio, audio])) + feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T) feat_4 = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index 36c602ea..622c969a 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -71,6 +71,10 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) + audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + tonic_2 = tonic_multipitch.extract(audio) # Testing input array + tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array + tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 From 30a3e3af51b602c55f673a2523cdc2773206e75b Mon Sep 17 00:00:00 2001 From: genisplaja Date: Fri, 29 Nov 2024 13:22:54 +0100 Subject: [PATCH 10/10] make audio testing longer for deepsrgm --- tests/melody/test_deepsrgm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 7eb00dbe..51f9a523 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -45,12 +45,10 @@ def _get_features(): os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = np.tile(audio, 9) feat_1 = deepsrgm.get_features(audio) feat_2 = deepsrgm.get_features(np.stack([audio, audio])) feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T) - feat_4 = deepsrgm.get_features( - os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") - ) @pytest.mark.torch