"""
This script contains supporting function for the data processing.
It is used in several other scripts:
for generating bvh files, aligning sequences and calculation of speech features
"""
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Acoustic signal processing
import scipy
import scipy.io.wavfile as wav
# Pitch extraction with REAPER
import pyreaper
from pydub import AudioSegment
from python_speech_features import mfcc
from alt_prosody import compute_prosody

MFCC_INPUTS = 26  # How many features we will store for each MFCC vector
WINDOW_LENGTH = 0.1  # Spectrogram window length in seconds

def create_bvh(filename, prediction, frame_time):
    """
    Create a BVH file
    Args:
        filename: file to which the motion should be written in BVH format
        prediction: motion sequence to be written into the file
        frame_time: time between frames in seconds, as a string
    Returns:
        nothing, writes the motion to the file
    """
    with open('hformat.txt', 'r') as ftemp:
        hformat = ftemp.readlines()
    with open(filename, 'w') as fo:
        prediction = np.squeeze(prediction)
        print("output vector shape: " + str(prediction.shape))
        offset = [0, 60, 0]
        offset_line = "\tOFFSET " + " ".join("{:.6f}".format(x) for x in offset) + '\n'
        fo.write("HIERARCHY\n")
        fo.write("ROOT Hips\n")
        fo.write("{\n")
        fo.write(offset_line)
        fo.writelines(hformat)
        fo.write("MOTION\n")
        fo.write("Frames: " + str(len(prediction)) + '\n')
        fo.write("Frame Time: " + frame_time + "\n")
        for row in prediction:
            # Zero out the root (hip) position channels and pad with zeros for the leg joints
            row[0:3] = 0
            legs = np.zeros(24)
            row = np.concatenate((row, legs))
            label_line = " ".join("{:.6f}".format(x) for x in row) + " "
            fo.write(label_line + '\n')
        print("bvh generated")

def shorten(arr1, arr2):
    """ Cut two arrays to the length of the shorter one """
    min_len = min(len(arr1), len(arr2))
    arr1 = arr1[:min_len]
    arr2 = arr2[:min_len]
    return arr1, arr2

def shorten3(arr1, arr2, arr3):
    """ Cut three arrays to the length of the shortest one """
    min_len = min(len(arr1), len(arr2), len(arr3))
    arr1 = arr1[:min_len]
    arr2 = arr2[:min_len]
    arr3 = arr3[:min_len]
    return arr1, arr2, arr3

def average(arr, n):
    """ Replace every "n" values by their average
    Args:
        arr: input array
        n: number of elements to average on
    Returns:
        resulting array
    """
    end = n * int(len(arr) / n)
    return np.mean(arr[:end].reshape(-1, n), 1)

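# Example (sanity check): average(np.arange(6), 2) returns array([0.5, 2.5, 4.5]);
# any tail shorter than n is dropped.
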
def calculate_mfcc(audio_filename):
    """
    Calculate MFCC features for the audio in a given file
    Args:
        audio_filename: file name of the audio
    Returns:
        feature_vectors: MFCC feature vectors for the given audio file
    """
    fs, audio = wav.read(audio_filename)
    # Convert stereo audio to mono
    if len(audio.shape) == 2:
        audio = (audio[:, 0] + audio[:, 1]) / 2
    # Calculate MFCC features with the window frame they were designed for
    input_vectors = mfcc(audio, winlen=0.02, winstep=0.01, samplerate=fs, numcep=MFCC_INPUTS)
    input_vectors = [average(input_vectors[:, i], 5) for i in range(MFCC_INPUTS)]
    feature_vectors = np.transpose(input_vectors)
    return feature_vectors

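# Note: with winstep=0.01 s and averaging over 5 consecutive frames, the
# returned MFCC features amount to one vector per 50 ms of audio (20 fps).
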
def get_energy_level(sound, win_len):
    """ Calculate energy signal of an audio object
    Args:
        sound: AudioSegment object with the audio signal
        win_len: length of the window for the energy calculations
    Returns:
        energy: the energy of the signal
    """
    loudness = []
    length = len(sound) - win_len
    # Split the signal into short chunks and get the energy of each of them
    for i in range(0, length, win_len):
        current_segment = sound[i:i + win_len]
        loudness.append(current_segment.rms)
    # Append a zero placeholder for the last segment, which was not covered by the loop
    loudness.append(0)
    energy = np.array(loudness)
    return energy

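# Usage sketch (hypothetical file name; pydub slices AudioSegment objects in
# milliseconds, so win_len is in ms):
#   sound = AudioSegment.from_file("speech.wav", format="wav")
#   energy = get_energy_level(sound, win_len=5)  # RMS per 5 ms chunk
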
def derivative(x, f):
    """ Calculate the numerical derivative (finite differences) of a 1d array
    Args:
        x: input space x
        f: function values f(x)
    Returns:
        der: numerical derivative of f wrt x
    """
    x = 1000 * x  # from seconds to milliseconds
    # Step size used to normalize the differences
    dx = x[1] - x[0]
    cf = np.convolve(f, [1, -1]) / dx
    # Remove unstable values
    der = cf[:-1].copy()
    der[0] = 0
    return der

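# Note: np.convolve(f, [1, -1]) (default "full" mode) has length len(f) + 1 and
# holds the backward differences f[i] - f[i-1]; dropping the last element and
# zeroing der[0] keeps the output aligned with f and of the same length.
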
def calculate_pitch(audio_filename):
    """ Calculate F0 contour of a given speech file
    Args:
        audio_filename: address of a speech file
    Returns:
        F0 contour on a log scale and a flag indicating whether F0 existed
    """
    fs, audio = wav.read(audio_filename)
    # Convert stereo audio to mono
    if len(audio.shape) == 2:
        audio = ((audio[:, 0] + audio[:, 1]) / 2).astype(np.int16)
    plot = False
    pm_times, pm, f0_times, f0, corr = pyreaper.reaper(audio, fs=fs, minf0=80, maxf0=250)
    # Remove unstable values
    f0 = f0[1:-1].copy()
    # Get an indication of whether F0 exists
    f0[f0 == -1] = np.nan
    F0_exists = 1 - np.isnan(f0).astype(int)
    # Interpolate pitch values
    ts = pd.Series(f0, index=range(f0.shape[0]))
    ts = ts.interpolate(method='linear', downcast='infer')
    f0 = ts.values
    nans = np.isnan(f0).tolist()
    # Extrapolate at the beginning
    if False in nans:
        first_value = nans.index(False)
        for time in range(first_value):
            f0[time] = f0[first_value]
        # Extrapolate at the end
        if True in nans[first_value:]:
            last_value = len(nans) - 1 - nans[::-1].index(False)
            for time in range(last_value + 1, len(nans)):
                f0[time] = f0[last_value]
    if plot:
        plt.plot(f0, linewidth=3, label="F0")
        plt.title("F0 results")
        plt.show()
    # Convert to the log scale
    F0_contour = np.log2(f0 + 1)
    return F0_contour, F0_exists

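# Usage sketch (hypothetical file name):
#   contour, voiced = calculate_pitch("speech_16k.wav")
#   # contour: log2(F0 + 1) per frame; voiced: 1 where REAPER found a pitch
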
def extract_prosodic_features(audio_filename):
    """
    Extract prosodic features
    Args:
        audio_filename: file name for the audio to be used
    Returns:
        pros_feature: energy, energy_der, pitch, pitch_der
        (the fifth feature, pitch_ind, is currently disabled)
    """
    WINDOW_LENGTH = 5  # ms
    # Read audio from file
    sound = AudioSegment.from_file(audio_filename, format="wav")
    # Alternative prosodic features
    pitch, energy = compute_prosody(audio_filename, WINDOW_LENGTH / 1000)
    duration = len(sound) / 1000
    t = np.arange(0, duration, WINDOW_LENGTH / 1000)
    energy_der = derivative(t, energy)
    pitch_der = derivative(t, pitch)
    # Average everything in order to match the frequency
    energy = average(energy, 10)
    energy_der = average(energy_der, 10)
    pitch = average(pitch, 10)
    pitch_der = average(pitch_der, 10)
    # Cut them to the same size
    min_size = min(len(energy), len(energy_der), len(pitch), len(pitch_der))
    energy = energy[:min_size]
    energy_der = energy_der[:min_size]
    pitch = pitch[:min_size]
    pitch_der = pitch_der[:min_size]
    # Stack them all together
    pros_feature = np.stack((energy, energy_der, pitch, pitch_der))  # , pitch_ind))
    # And reshape
    pros_feature = np.transpose(pros_feature)
    return pros_feature

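# Note: a 5 ms analysis window averaged over 10 frames gives one prosodic
# feature vector per 50 ms (20 fps), matching the MFCC rate above.
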
def calculate_spectrogram(audio_filename):
    """ Calculate spectrogram for the audio file
    Args:
        audio_filename: audio file name
    Returns:
        log spectrogram values
    """
    DIM = 64
    audio, sample_rate = librosa.load(audio_filename)
    # Convert stereo audio to mono
    if len(audio.shape) == 2:
        audio = (audio[:, 0] + audio[:, 1]) / 2
    spectr = librosa.feature.melspectrogram(y=audio, sr=sample_rate,  # window=scipy.signal.hanning,
                                            hop_length=int(WINDOW_LENGTH * sample_rate / 2),
                                            fmax=7500, fmin=100, n_mels=DIM)
    # Shift into the log scale
    eps = 1e-10
    log_spectr = np.log(abs(spectr) + eps)
    return np.transpose(log_spectr)

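# Note: with WINDOW_LENGTH = 0.1 s the hop length is 0.05 s, so the spectrogram
# is likewise produced at 20 frames per second.
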
if __name__ == "__main__":
Debug=1
if Debug:
audio_filename = "/home/taras//Documents/Datasets/SpeechToMotion/" \
"Japanese/speech/audio1099_16k.wav"
feature = extract_prosodic_features(audio_filename)
else:
if False:
DATA_DIR = "/home/taras/Documents/Datasets/SpeechToMotion/Japanese/TheLAtest/"
DATA_FILE = pd.read_csv(DATA_DIR + '/gg-dev.csv')
X = np.array([])
Y = np.array([])
whole_f0 = []
for i in range(len(DATA_FILE)):
f0 = calculate_pitch(DATA_FILE['wav_filename'][i])
whole_f0 = np.append(whole_f0, f0, axis=0)
print(f0.shape)
print(np.shape(whole_f0))
hist, _ = np.histogram(whole_f0, bins=50)
plt.plot(hist, linewidth=3, label="Hist")
plt.show()