import numpy as np
import librosa
from scipy.signal import lfilter
from plot import (
plot_onset_envelope_strength,
plot_auto_c,
plot_dynamic_programming,
plot_estimated_vs_annotation,
)
def mel_db(sr, stft, n_fft, hop_length, n_mels=40):
    abs_stft = np.abs(stft)
# create a Mel spectrogram with 40 Mel bands
mel = librosa.feature.melspectrogram(S=abs_stft**2, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
# convert power spectrogram (amplitude squared) to dB
mel_db = librosa.power_to_db(mel)
return mel_db
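
# Shape sanity sketch (not part of the original file): for a 32 ms window at
# sr = 8000 the STFT has n_fft/2 + 1 = 129 frequency bins, which mel_db folds
# down to n_mels rows.
def demo_mel_db_shape(sr=8000):
    y = np.zeros(sr)  # one second of silence, just to exercise the shapes
    n_fft, hop_length = int(0.032 * sr), int(0.004 * sr)
    stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    return mel_db(sr, stft, n_fft, hop_length).shape  # (40, n_frames)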
# onset strength envelope
def onset_strength_envelope(y, sr, window_sec=0.032, hop_sec=0.004, show=False):
# calculate the STFT with a 32ms window and 4ms hop size
n_fft = int(window_sec * sr) # window seconds to samples
hop_length = int(hop_sec * sr) # hop seconds to samples
stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
# apply mel scale with 40 bands
mel = mel_db(sr, stft, n_fft, hop_length)
# apply first order difference
mel_dif = np.diff(mel, axis=1)
# half wave rectify
mel_half_rec = np.maximum(0, mel_dif)
# sum across bands
    onset_strength = np.sum(mel_half_rec, axis=0)
# remove DC (0 Hz) component
onset_strength = lfilter([1.0, -1.0], [1.0, -0.99], onset_strength, axis=-1)
    # normalise by the standard deviation (epsilon avoids division by zero)
    # helps find values with significant difference from the mean
    onset_strength_norm = onset_strength / (np.std(onset_strength) + 1e-10)
    if show:
plot_onset_envelope_strength(y, sr, onset_strength_norm, hop_sec)
# plot_mel_spectrogram(mel, sr)
return onset_strength_norm
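
# A small reference sketch (not part of the original pipeline): with the 4 ms hop
# used throughout this file the envelope runs at sr / hop_length frames per second
# (250 fps at sr = 8000), so a lag of k envelope frames spans k * hop_sec seconds.
def envelope_frame_rate(sr, hop_sec=0.004):
    # frames per second of the onset strength envelope
    return sr / int(hop_sec * sr)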
# Apply perceptual weighting based on tempo tapping data
def perceptual_weighting(tau, tau_0=0.5, sigma_tau=0.9):  # tau_0 (seconds) and sigma_tau (octaves) values from Ellis 2007
# avoid div by zero error
epsilon = 1e-10
tau = np.maximum(tau, epsilon)
    # create Gaussian window based on Ellis 2007
log_2_tau_over_tau_0 = np.log2(tau / tau_0)
W_t = np.exp(-0.5 * (log_2_tau_over_tau_0 / sigma_tau) ** 2)
return W_t
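
# Sanity-check sketch (illustrative, not from Ellis 2007): the window peaks at
# tau_0 = 0.5 s (120 BPM) and decays symmetrically in log2 lag, e.g.
# perceptual_weighting(np.array([0.25, 0.5, 1.0])) is roughly [0.54, 1.0, 0.54].
def check_perceptual_weighting():
    w = perceptual_weighting(np.array([0.25, 0.5, 1.0]))
    assert abs(w[1] - 1.0) < 1e-9 and abs(w[0] - w[2]) < 1e-9
    return w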
def tempo_multiples(tps):
    # only search the first 1/3 of the TPS; with a 5-second lag range this is plenty
    tps_range = round(len(tps) / 3)
    tps_2 = np.zeros(tps_range)
    tps_3 = np.zeros(tps_range)
    # start at t=1 so the (2t - 1) and (3t - 1) indices cannot wrap around to the end of the array
    for t in range(1, tps_range):
        # equations (7) and (8) - Ellis 2007
        tps_2[t] = tps[t] + (0.5 * tps[2 * t]) + (0.25 * tps[(2 * t) - 1]) + (0.25 * tps[(2 * t) + 1])
        tps_3[t] = tps[t] + (0.33 * tps[3 * t]) + (0.33 * tps[(3 * t) - 1]) + (0.33 * tps[(3 * t) + 1])
duple = np.max(tps_2)
triple = np.max(tps_3)
    # "Whichever sequence contains the larger value determines whether the tempo is considered
    # duple or triple, respectively, and the location of the largest value is treated as the
    # faster target tempo, with one-half or one-third of that tempo, respectively, as the adjacent
    # metrical level." - Ellis 2007
if duple > triple:
faster_tempo_frame = np.argmax(tps_2)
slower_tempo_frame = np.argmax(tps_2) * 2
return faster_tempo_frame, slower_tempo_frame
else:
faster_tempo_frame = np.argmax(tps_3)
slower_tempo_frame = np.argmax(tps_3) * 3
return faster_tempo_frame, slower_tempo_frame
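
# Worked example (a sketch, not in the original): a synthetic TPS with a peak at
# lag 50 and a weaker duple harmonic at lag 100. The duple resampling reinforces
# lag 50, so tempo_multiples should return (50, 100) as the faster/slower levels.
def demo_tempo_multiples():
    tps = np.zeros(400)
    tps[50] = 1.0
    tps[100] = 0.8
    return tempo_multiples(tps)  # expected: (50, 100)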
def estimate_tempo(odf, sr, hop_sec=0.004, max_lag_s=5, show=False):
hop_length = int(hop_sec * sr)
    # only correlate over a reasonable lag range
    max_size = int(max_lag_s * sr / hop_length)  # librosa expects an integer lag count
# auto correlate onset strength
auto_c = librosa.autocorrelate(odf, max_size=max_size)
# weighting function needs lag in seconds
lags_seconds = np.arange(len(auto_c)) * (hop_length / sr)
    # apply perceptual weighting to get tempo period strengths
tps = perceptual_weighting(lags_seconds) * auto_c
    # (a simpler alternative without resampling: selected_tempo_frame = np.argmax(tps))
    # calculate the tempo multiples:
    # "two further functions are calculated by resampling TPS to one-half and one-third,
    # respectively, of its original length, adding this to the original TPS,
    # then choosing the largest peak across both these sequences" - Ellis 2007
faster_tempo_frame, slower_tempo_frame = tempo_multiples(tps)
faster_tempo_peak = tps[faster_tempo_frame]
slower_tempo_peak = tps[slower_tempo_frame]
    # "Relative weights of the two levels are again taken from the relative peak heights
    # at the two period estimates in the original TPS. This approach finds the tempo that
    # maximizes the sum of the TPS values at both metrical levels" - Ellis 2007
faster_tempo_weight = faster_tempo_peak / (faster_tempo_peak + slower_tempo_peak)
slower_tempo_weight = slower_tempo_peak / (faster_tempo_peak + slower_tempo_peak)
    # select whichever metrical level carries the larger weight
    if slower_tempo_weight > faster_tempo_weight:
        selected_tempo_frame = slower_tempo_frame
    else:
        selected_tempo_frame = faster_tempo_frame
# plot results
    if show:
        print("faster_tempo_peak: {}, slower_tempo_peak: {}".format(faster_tempo_peak, slower_tempo_peak))
        print("faster_tempo weight: {}, slower_tempo weight: {}".format(faster_tempo_weight, slower_tempo_weight))
        print("weight ratio: {}".format(abs(faster_tempo_weight - slower_tempo_weight)))
plot_auto_c(auto_c, tps, faster_tempo_frame, slower_tempo_frame, selected_tempo_frame, sr, hop_length)
return selected_tempo_frame
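
# Convenience sketch (not in the original): convert the selected lag, measured in
# envelope frames, into a tempo in BPM under the hop size assumed above.
def lag_frames_to_bpm(lag_frames, sr, hop_sec=0.004):
    period_s = lag_frames * int(hop_sec * sr) / sr  # lag in seconds
    return 60.0 / period_s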
# ported from the Ellis 2007 MATLAB code
def estimate_beats(ose, tempo_estimate, sr, alpha=680, hop_sec=0.004, show=False):
# initialize backlink and cumulative score arrays
backlink = -np.ones(len(ose), dtype=int)
c_score = ose.copy()
# define search range for previous beat based on the period
prev_range = np.arange(-2*tempo_estimate, -round(tempo_estimate/2), dtype=int)
# calculate transition cost using a log-gaussian window over the search range
cost = -alpha * np.abs(np.log(prev_range / -tempo_estimate) ** 2)
# set up the dynamic programming loop bounds
loop_start = max(-prev_range[0], 0)
loop_end = len(ose)
    # use the loop to fill in backlink and cumulative score
for i in range(loop_start, loop_end):
timerange = i + prev_range
# ensure timerange indices are within bounds
valid_timerange = timerange[(timerange >= 0) & (timerange < len(ose))]
        # calculate score candidates and find the best predecessor beat
        score_candidates = cost[:len(valid_timerange)] + c_score[valid_timerange]
        max_score_index = np.argmax(score_candidates)
        max_score = score_candidates[max_score_index]
# update cumulative score and backlink
c_score[i] = max_score + ose[i]
backlink[i] = valid_timerange[max_score_index]
# start backtrace from the highest cumulative score
beats = [np.argmax(c_score)]
# backtrace to find all predecessors
while backlink[beats[0]] > 0:
beats.insert(0, backlink[beats[0]])
hop_length = int(hop_sec * sr)
times = librosa.frames_to_time(beats, sr=sr, hop_length=hop_length)
    if show:
        plot_dynamic_programming(c_score, backlink, ose, beats)
return times
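
# Rough consistency check (a sketch, not in the original): the tempo implied by
# the median inter-beat interval should sit close to the tempo estimate.
def median_ibi_bpm(beat_times):
    # tempo implied by the median gap between successive beat times
    return 60.0 / np.median(np.diff(beat_times))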
# The whole system should be called via this function
def beat_track(audio_file, name="", genre="", annotations=[], show=False):
    hop_sec = 0.004
    resample_rate = 8000
# load audio
y, sr = librosa.load(audio_file, sr=resample_rate)
ose = onset_strength_envelope(y, sr, hop_sec=hop_sec, show=show)
tempo_estimate = estimate_tempo(ose, sr, hop_sec=hop_sec, show=show)
    beats_estimates = estimate_beats(ose, tempo_estimate, sr, hop_sec=hop_sec, show=show)  # the plot here can take over a minute
    if show and len(annotations) > 0:
# plot estimated vs reference
plot_estimated_vs_annotation(y, sr, beats_estimates, annotations, name, genre)
return beats_estimates
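
# Optional evaluation sketch, assuming the mir_eval package is installed (it is
# not imported above and the tracker itself does not depend on it).
def evaluate_beats(reference_beats, estimated_beats):
    import mir_eval  # assumed optional dependency for this sketch
    # F-measure with mir_eval's default 70 ms tolerance window
    return mir_eval.beat.f_measure(np.asarray(reference_beats), np.asarray(estimated_beats))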
if __name__ == "__main__":
    audio_file = "sample_audio.mp3"
    beats = beat_track(audio_file)
    print(beats)