Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements and dependency fixes #107

Merged
merged 10 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion compiam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,11 @@ def load_dataset(dataset_name, data_home=None, version="default"):
"""
if dataset_name not in datasets_list:
raise ValueError("Invalid dataset {}".format(dataset_name))
return mirdata.initialize(
dataloader = mirdata.initialize(
dataset_name=dataset_name, data_home=data_home, version=version
)
dataloader.download(["index"]) # Download index file
return dataloader


def load_corpora(tradition, token=None):
Expand Down
15 changes: 12 additions & 3 deletions compiam/melody/pattern/sancara_search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,18 @@ def load_model(self, model_path, conf_path, spec_path):
setattr(self, tp, v)

self.model = self._build_model()
self.model.load_state_dict(
torch.load(model_path, weights_only=True), strict=False
)
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device),
strict=False
)
except:
self.model.load_state_dict(
torch.load(model_path, map_location=self.device),
strict=False,
)
self.trained = True

def download_model(self, model_path=None, force_overwrite=False):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,12 @@ def load_model(self, model_path):
"""Load pre-trained model weights."""
if not os.path.exists(model_path):
self.download_model(model_path) # Downloading model weights
self.model.load_state_dict(torch.load(model_path, weights_only=True))
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
except:
self.model.load_state_dict(torch.load(model_path, map_location=self.device))
self.model_path = model_path
self.trained = True

Expand Down
8 changes: 5 additions & 3 deletions compiam/melody/pitch_extraction/melodia.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from compiam.utils.pitch import normalisation, resampling
from compiam.io import write_csv
from compiam.utils import get_logger
from compiam.utils import get_logger, stereo_to_mono

logger = get_logger(__name__)

Expand Down Expand Up @@ -87,14 +87,16 @@ def extract(self, input_data, input_sr=44100, out_step=None):
filename=input_data, sampleRate=self.sample_rate
)()
elif isinstance(input_data, np.ndarray):
input_data = stereo_to_mono(input_data)
# Apply Eqloudness filter
logger.warning(
f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
)
resample_audio = estd.Resample(
inputSampleRate=input_sr, outputSampleRate=self.sample_rate
)()
)
input_data = resample_audio(input_data)
audio = estd.EqualLoudness(signal=input_data)()
audio = estd.EqualLoudness(sampleRate=self.sample_rate)(input_data)
else:
raise ValueError("Input must be path to audio signal or an audio array")

Expand Down
22 changes: 13 additions & 9 deletions compiam/melody/raga_recognition/deepsrgm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import librosa

import numpy as np

Expand All @@ -8,7 +9,7 @@
ModelNotTrainedError,
DatasetNotLoadedError,
)
from compiam.utils import get_logger, WORKDIR
from compiam.utils import get_logger, stereo_to_mono, WORKDIR
from compiam.utils.download import download_remote_model

logger = get_logger(__name__)
Expand Down Expand Up @@ -122,7 +123,10 @@ def load_model(self, model_path, rnn="lstm"):
self.model = self._build_model(rnn="gru")

self.model_path = model_path
weights = torch.load(model_path, weights_only=True, map_location=self.device)
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
except:
weights = torch.load(model_path, map_location=self.device)
new_weights = weights.copy()
keys_to_fix = [
".weight_ih_l0",
Expand Down Expand Up @@ -232,20 +236,20 @@ def get_features(
"Install compIAM with essentia support: pip install 'compiam[essentia]'"
)

# Loading and resampling audio
if isinstance(input_data, str):
if not os.path.exists(input_data):
raise FileNotFoundError("Target audio not found.")
audio = estd.MonoLoader(
filename=input_data, sampleRate=self.sample_rate
)()
audio, _ = librosa.load(input_data, sr=self.sample_rate)
elif isinstance(input_data, np.ndarray):
input_data = stereo_to_mono(input_data)
logger.warning(
"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
f"Resampling... (input sampling rate is assumed {input_sr}Hz, \
make sure this is correct and change input_sr otherwise)"
)
resampling = estd.Resample(
inputSampleRate=input_sr, outputSampleRate=self.sample_rate
audio = librosa.resample(
input_data, orig_sr=input_sr, target_sr=self.sample_rate
)
audio = resampling(input_data)
else:
raise ValueError("Input must be path to audio signal or an audio array")

Expand Down
8 changes: 6 additions & 2 deletions compiam/melody/tonic_identification/tonic_multipitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np

from compiam.utils import get_logger
from compiam.utils import get_logger, stereo_to_mono

logger = get_logger(__name__)

Expand Down Expand Up @@ -65,12 +65,16 @@ def extract(self, input_data, input_sr=44100):
raise FileNotFoundError("Target audio not found.")
audio = estd.MonoLoader(filename=input_data, sampleRate=self.sample_rate)()
elif isinstance(input_data, np.ndarray):
if len(input_data.shape) == 2:
input_data = stereo_to_mono(input_data)
if len(input_data.shape) > 2:
raise ValueError("Input must be an unbatched audio signal")
logger.warning(
f"Resampling... (input sampling rate is {input_sr}Hz, make sure this is correct)"
)
resampling = estd.Resample(
inputSampleRate=input_sr, outputSampleRate=self.sample_rate
)()
)
audio = resampling(input_data)
else:
raise ValueError("Input must be path to audio signal or an audio array")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,13 @@ def _build_model(self):
def load_model(self, model_path):
if not os.path.exists(model_path):
self.download_model(model_path) # Downloading model weights
self.model.load_state_dict(torch.load(model_path, weights_only=True))
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
except:
weights = torch.load(model_path, map_location=self.device)
self.model.load_state_dict(weights)
self.model_path = model_path
self.trained = True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def separate(
clusters=5,
scheduler=4,
chunk_size=3,
normalize_input=True,
gpu="-1",
):
"""Separate singing voice from mixture.
Expand All @@ -103,6 +104,8 @@ def separate(
relevant if the input is an array of data instead of a filepath.
:param clusters: Number of clusters to use to build the separation masks.
:param scheduler: Scheduler factor to weight the clusters to be more or less restrictive with the interferences.
:param chunk_size: Size of the chunks to process the audio signal.
:param normalize_input: Normalize the input audio signal.
:param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
:return: Singing voice signal.
"""
Expand Down Expand Up @@ -153,6 +156,12 @@ def separate(
f"Downsampling to mono... your audio is stereo, \
and the model is trained on mono audio."
)

if normalize_input:
# Normalizing audio for better performance overall
mean = tf.reduce_mean(mixture, keepdims=True)
std = tf.math.reduce_std(mixture, keepdims=True)
mixture = (mixture - mean) / (1e-6 + std)

output_voc = np.zeros(mixture.shape)
hopsized_chunk = int((chunk_size * self.sample_rate) / 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,16 +176,22 @@ def load_model(self, model_path):
self.download_model(model_path)

self.model = self._build_model()
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device)
)
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device)
)
except:
self.model.load_state_dict(
torch.load(model_path, map_location=self.device)
)
self.model.eval()
self.loaded_model_path = model_path
self.trained = True

def download_model(self, model_path=None, force_overwrite=False):
"""Download pre-trained model."""
print("modelpathhh", model_path)
download_path = (
os.sep + os.path.join(*model_path.split(os.sep)[:-4])
if model_path is not None
Expand Down
19 changes: 18 additions & 1 deletion compiam/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pathlib
import pickle
import difflib
import librosa

import IPython.display as ipd
import numpy as np
Expand Down Expand Up @@ -172,3 +171,21 @@ def add_center_to_mask(mask):
num_one = 0
indices = []
return mask


def stereo_to_mono(audio):
    """Collapse a mono or stereo audio signal into a 1-D mono signal.

    The input may be a numpy array or any array-like (e.g. nested lists);
    it is converted with ``np.asarray`` before being inspected.

    For 2-D input the channel layout is guessed from the shape: when the
    first axis is longer than the second, the array is transposed so that
    channels come first. Square 2-D inputs are left as they are, i.e. rows
    are treated as channels.

    :param audio: 1-D signal, or 2-D signal with one or two channels, in
        either (channels, samples) or (samples, channels) layout.
    :return: numpy array holding the mono signal. Stereo input is reduced
        by averaging both channels; a single-channel 2-D input has its
        channel axis removed; 1-D input is returned unchanged.
    :raises ValueError: if the input has more than two dimensions, or if a
        2-D input has more than two channels.
    """
    audio = np.asarray(audio)

    if audio.ndim > 2:
        raise ValueError("Input must be an unbatched audio signal")

    if audio.ndim == 2:
        # Put channels first: the longer axis is assumed to be time
        if audio.shape[0] > audio.shape[1]:
            audio = audio.T

        # Read the channel count once, so that the branches below are
        # mutually exclusive and never re-inspect the averaged 1-D result
        n_channels = audio.shape[0]
        if n_channels > 2:
            raise ValueError("Expected mono or stereo audio, got multi-channel audio")
        if n_channels == 2:
            # If stereo, average the channels
            audio = np.mean(audio, axis=0)
        elif n_channels == 1:
            # Mono stored as 2-D: drop the singleton channel axis
            audio = np.squeeze(audio, axis=0)

    return audio
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ docs = [
]
tensorflow = [
"keras<3.0.0",
"tensorflow>=2.12.0,<2.16",
]
"tensorflow==2.15.0",
] # Fixing tf versions to avoid issues
torch = [
"torch==1.13.0",
"torchaudio",
"torch==2.0.0",
"torchaudio==2.0.1",
]
essentia = [
"essentia",
Expand Down
6 changes: 6 additions & 0 deletions tests/melody/test_deepsrgm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pytest
import librosa

import numpy as np

Expand Down Expand Up @@ -43,6 +44,11 @@ def _get_features():
feat = deepsrgm.get_features(
os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
)
audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
audio = np.tile(audio, 9)
feat_1 = deepsrgm.get_features(audio)
feat_2 = deepsrgm.get_features(np.stack([audio, audio]))
feat_3 = deepsrgm.get_features(np.stack([audio, audio]).T)


@pytest.mark.torch
Expand Down
8 changes: 8 additions & 0 deletions tests/melody/test_essentia_extractors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pytest
import librosa

import numpy as np

Expand All @@ -15,6 +16,9 @@ def _predict_normalized_pitch():
pitch = melodia.extract(
os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
)
pitch_2 = melodia.extract(np.zeros([44100]))
pitch_3 = melodia.extract(np.zeros([2, 44100])) # Testing input array
pitch_4 = melodia.extract(np.zeros([44100, 2])) # Testing input array

assert isinstance(pitch, np.ndarray)
assert np.shape(pitch) == (699, 2)
Expand Down Expand Up @@ -67,6 +71,10 @@ def _predict_normalized_pitch():
tonic = tonic_multipitch.extract(
os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav")
)
audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0]
tonic_2 = tonic_multipitch.extract(audio) # Testing input array
tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array
tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array

assert isinstance(tonic, float)
assert tonic == 157.64892578125
Expand Down
Loading