-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
158 lines (127 loc) · 6.48 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import glob
import os
import math
import re
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import wavio
# Lower edge (in Hz) of the frequency band kept by compute_spectrogram.
# Spectrogram rows whose frequency is strictly below this value are dropped.
MIN_RELEVANT_FREQUENCY = 0
# Upper edge (in Hz) of the frequency band kept by compute_spectrogram.
# The bound is exclusive: rows whose frequency is at or above this value are
# dropped, because the cut index comes from np.searchsorted with its defaults.
MAX_RELEVANT_FREQUENCY = 12500
def get_spectrograms(dir_path, prefix, num_freq_bins, num_time_samples, num_time_bins):
    '''Computes and processes spectrograms of all matching .wav files in the target directory.

    Files are matched with the glob pattern "<prefix>*.wav" inside dir_path and
    visited in sorted (alphabetical) order; the returned indices refer to
    positions in that sorted list.

    Args:
        dir_path (string) : The path of the directory of .wav files
        prefix (string) : A filename prefix of all the desired .wav files in the directory
        num_freq_bins (int) : The number of desired frequency bins in the processed binned spectrograms
        num_time_samples (int) : The desired length (in time samples) to either clip or right-pad each .wav file recording
        num_time_bins (int) : The number of desired time bins in the processed binned spectrograms
    Returns:
        A tuple (valid_indices, error_indices, spectrograms):
            valid_indices : indices of the files whose spectrograms were computed
            error_indices : indices of the files that could not be read as .wav
            spectrograms : the computed spectrograms, in the same order as valid_indices
        All three lists are empty when no file matches.
    '''
    error_indices = []
    valid_indices = []
    spectrograms = []
    audio_files = sorted(glob.glob(os.path.join(dir_path, '%s*.wav' % prefix)))
    for i, audio_file_name in enumerate(audio_files):
        try:
            wav_obj = wavio.read(audio_file_name)
        except Exception:
            # Unreadable/corrupt file: record it and move on. Exception (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            error_indices.append(i)
            continue
        sample_rate = wav_obj.rate
        # Keep only the first channel: after the transpose, row 0 holds channel 0
        # (assumes wavio's (num_samples, num_channels) data layout). For a mono
        # recording this simply drops the singleton channel dimension.
        audio_data = np.transpose(wav_obj.data)[0, :]
        spectrogram = process_audio(audio_data, sample_rate, num_freq_bins, num_time_samples, num_time_bins)
        valid_indices.append(i)
        spectrograms.append(spectrogram)
    return (valid_indices, error_indices, spectrograms)
def process_audio(audio_data, sample_rate, num_freq_bins, num_time_samples, num_time_bins, check_valid=True):
    '''Builds a binned spectrogram from a raw (unclipped, unpadded) one-channel audio array.

    The signal is first forced to exactly num_time_samples samples: longer
    signals are clipped, shorter ones are right-padded with zeros.

    Args:
        audio_data (numpy.array): Array for a raw audio signal (one channel only)
        sample_rate (int) : The number of samples per second of the audio signal.
        num_freq_bins (int) : The number of desired frequency bins in the processed binned spectrogram
        num_time_samples (int) : The desired length (in time samples) to either clip or right-pad the audio signal array
        num_time_bins (int) : The number of desired time bins in the processed binned spectrogram
        check_valid (boolean) : When true, a spectrogram whose mean value is below 1 triggers
            debug output: the mean is printed and both the frequency-binned and the fully
            binned spectrograms are plotted.
    Returns:
        A numpy.array representing the processed and binned spectrogram
    '''
    # Zero buffer of the target length; copying a prefix of the signal into it
    # clips long inputs and right-pads short ones in one step.
    kept_samples = min(audio_data.shape[0], num_time_samples)
    fixed_length_audio = np.zeros(num_time_samples, dtype=audio_data.dtype)
    fixed_length_audio[:kept_samples] = audio_data[:kept_samples]

    full_binned, freq_binned = compute_spectrogram(fixed_length_audio, sample_rate, num_freq_bins, num_time_bins)

    if not check_valid:
        return full_binned

    # Debugging aid for suspicious (near-silent) spectrograms that slip through.
    mean_value = np.mean(full_binned)
    if mean_value < 1:
        print(mean_value)
        for image in (freq_binned, full_binned):
            plt.imshow(image)
            plt.colorbar()
            plt.show()
    return full_binned
def compute_spectrogram(audio_data, sample_rate, num_freq_bins, num_time_bins):
    '''Computes and processes a spectrogram directly from an audio signal.

    The raw spectrogram is restricted to the band
    MIN_RELEVANT_FREQUENCY <= f < MAX_RELEVANT_FREQUENCY, then binned on the
    frequency dimension and finally on the time dimension.

    Args:
        audio_data (numpy.array): Array for a raw audio signal (one channel only)
        sample_rate (int) : The number of samples per second of the audio signal.
        num_freq_bins (int) : The number of desired frequency bins in the processed binned spectrogram
        num_time_bins (int) : The number of desired time bins in the processed binned spectrogram
    Returns:
        A numpy.array representing the fully processed and binned spectrogram
        A numpy.array for the processed spectrogram only binned on the frequency dimension (for debugging purposes)
    '''
    # The spectrogram's first dimension is frequency, its second is time.
    freqs, _times, power = signal.spectrogram(audio_data, sample_rate, scaling='spectrum', return_onesided=True)
    # freqs is ascending, so searchsorted yields the first row at/above each
    # bound; the slice below therefore keeps the lower bound and excludes the
    # upper one.
    min_relevant_freq_idx = np.searchsorted(freqs, MIN_RELEVANT_FREQUENCY)
    max_relevant_freq_idx = np.searchsorted(freqs, MAX_RELEVANT_FREQUENCY)
    trimmed_spectrogram = power[min_relevant_freq_idx:max_relevant_freq_idx, :]
    binned_freq_spectrogram = bin_spectrogram_freq(trimmed_spectrogram, num_freq_bins)
    fully_binned_spectrogram = bin_spectrogram_time(binned_freq_spectrogram, num_time_bins)
    return fully_binned_spectrogram, binned_freq_spectrogram
def bin_spectrogram_freq(spectrogram, num_freq_bins):
    '''Bins a spectrogram on its frequency dimension (dimension 0).

    Thin wrapper that delegates to __bin_matrix_dimension.

    Args:
        spectrogram (numpy.array) : The unbinned spectrogram
        num_freq_bins (int) : The number of desired frequency bins in the processed binned spectrogram
    Returns:
        The spectrogram binned on its frequency dimension.
    '''
    freq_dimension = 0
    binned = __bin_matrix_dimension(spectrogram, freq_dimension, num_freq_bins)
    return binned
def bin_spectrogram_time(spectrogram, num_time_bins):
    '''Bins a spectrogram on its time dimension (dimension 1).

    Thin wrapper that delegates to __bin_matrix_dimension.

    Args:
        spectrogram (numpy.array) : The unbinned spectrogram
        num_time_bins (int) : The number of desired time bins in the processed binned spectrogram
    Returns:
        The spectrogram binned on its time dimension.
    '''
    time_dimension = 1
    binned = __bin_matrix_dimension(spectrogram, time_dimension, num_time_bins)
    return binned
def __bin_matrix_dimension(m, dimension, num_bins):
    '''Bins a 2-D matrix on a specified dimension by summing equal-width groups.

    The chosen dimension is split into num_bins consecutive groups of
    floor(size / num_bins) entries each and every group is summed. Entries
    left over at the end of the dimension (size % num_bins of them) are
    ignored. The other dimension is left untouched, so the result has shape
    (num_bins, m.shape[1]) for dimension 0 and (m.shape[0], num_bins) for
    dimension 1. If num_bins exceeds the size of the dimension, every bin is
    empty and the result is all zeros.

    Args:
        m (numpy.array) : The original 2-D matrix
        dimension (int) : The dimension to bin (0 or 1)
        num_bins (int) : The desired number of bins for the specified dimension
    Returns:
        A float numpy.array of the matrix binned on the specified dimension.
    Raises:
        ValueError: If dimension is not 0 or 1, or num_bins is less than 1.
    '''
    if dimension not in (0, 1):
        raise ValueError('dimension must be 0 or 1, got %r' % (dimension,))
    if num_bins < 1:
        raise ValueError('num_bins must be at least 1, got %r' % (num_bins,))
    m = np.asarray(m)
    bin_size = m.shape[dimension] // num_bins
    # Always work with the binned dimension as the column axis: transpose on
    # the way in for dimension 0 and transpose back on the way out. Slicing
    # columns unconditionally is only correct for dimension 1.
    columns_view = m if dimension == 1 else m.T
    binned_matrix = np.zeros((columns_view.shape[0], num_bins))
    for b in range(num_bins):
        min_bin_idx = b * bin_size
        max_bin_idx = min_bin_idx + bin_size
        binned_matrix[:, b] = np.sum(columns_view[:, min_bin_idx:max_bin_idx], axis=1)
    return binned_matrix if dimension == 1 else binned_matrix.T