"""
audfprint_analyze.py
Class to do the analysis of wave files into hash constellations.
2014-09-20 Dan Ellis [email protected]
"""
from __future__ import print_function
import os
import numpy as np
import scipy.signal
# For reading/writing hashes to file
import struct
# For glob2hashtable, localtester
import glob
import time
# For utility, glob2hashtable
import hash_table
import librosa
import audio_read
################ Globals ################
# Special extension indicating precomputed fingerprint
PRECOMPEXT = '.afpt'
# A different precomputed fingerprint is just the peaks
PRECOMPPKEXT = '.afpk'
def locmax(vec, indices=False):
""" Return a boolean vector of which points in vec are local maxima.
End points are peaks if larger than single neighbors.
if indices=True, return the indices of the True values instead
of the boolean vector.
"""
# vec[-1]-1 means last value can be a peak
#nbr = np.greater_equal(np.r_[vec, vec[-1]-1], np.r_[vec[0], vec])
# the np.r_ was killing us, so try an optimization...
nbr = np.zeros(len(vec)+1, dtype=bool)
nbr[0] = True
nbr[1:-1] = np.greater_equal(vec[1:], vec[:-1])
maxmask = (nbr[:-1] & ~nbr[1:])
if indices:
return np.nonzero(maxmask)[0]
else:
return maxmask
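# Illustrative example: locmax(np.array([0, 2, 1, 3, 3, 1])) returns
# [False, True, False, False, True, False]: index 1 is a local max, and the
# plateau at indices 3-4 reports its last point (index 4) as the max.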
# Constants for Analyzer
# DENSITY controls the density of landmarks found (approx DENSITY per sec)
DENSITY = 20.0
# OVERSAMP > 1 tries to generate extra landmarks by decaying faster
OVERSAMP = 1
## 512 pt FFT @ 11025 Hz, 50% hop
#t_win = 0.0464
#t_hop = 0.0232
# Just specify n_fft
N_FFT = 512
N_HOP = 256
# spectrogram enhancement
HPF_POLE = 0.98
# Globals defining packing of landmarks into hashes
F1_BITS = 8
DF_BITS = 6
DT_BITS = 6
# derived constants
B1_MASK = (1 << F1_BITS) - 1
B1_SHIFT = DF_BITS + DT_BITS
DF_MASK = (1 << DF_BITS) - 1
DF_SHIFT = DT_BITS
DT_MASK = (1 << DT_BITS) - 1
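# The packed hash therefore uses 20 bits of a 32-bit int, laid out as:
#   hash = (bin1 & 0xFF) << 12 | (dbin & 0x3F) << 6 | (dt & 0x3F)
# i.e. 8 bits of start bin, 6 bits of (two's-complement) bin difference,
# and 6 bits of time difference.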
def landmarks2hashes(landmarks):
"""Convert a list of (time, bin1, bin2, dtime) landmarks
into a list of (time, hash) pairs where the hash combines
the three remaining values.
"""
landmarks = np.array(landmarks)
hashes = np.zeros((landmarks.shape[0], 2), dtype=np.int32)
hashes[:, 0] = landmarks[:, 0]
hashes[:, 1] = (((landmarks[:, 1] & B1_MASK) << B1_SHIFT)
| (((landmarks[:, 2] - landmarks[:, 1]) & DF_MASK)
<< DF_SHIFT)
| (landmarks[:, 3] & DT_MASK))
return hashes
def hashes2landmarks(hashes):
"""Convert the mashed-up landmarks in hashes back into a list
of (time, bin1, bin2, dtime) tuples.
"""
landmarks = []
for time_, hash_ in hashes:
dtime = hash_ & DT_MASK
bin1 = (hash_ >> B1_SHIFT) & B1_MASK
dbin = (hash_ >> DF_SHIFT) & DF_MASK
# Sign extend frequency difference
if dbin >= (1 << (DF_BITS-1)):
dbin -= (1 << DF_BITS)
landmarks.append((time_, bin1, bin1+dbin, dtime))
return landmarks
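# Illustrative round trip: landmarks2hashes([(5, 80, 60, 10)]) stores the
# bin difference -20 in 6 bits as 44; hashes2landmarks sign-extends it back
# and recovers [(5, 80, 60, 10)].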
class Analyzer(object):
""" A class to wrap up all the parameters associated with
the analysis of soundfiles into fingerprints """
# Parameters
# optimization: cache pre-calculated Gaussian profile
__sp_width = None
__sp_len = None
__sp_vals = []
def __init__(self, density=DENSITY):
self.density = density
self.target_sr = 11025
self.n_fft = N_FFT
self.n_hop = N_HOP
self.shifts = 1
        # how wide to spread peaks
self.f_sd = 30.0
# Maximum number of local maxima to keep per frame
self.maxpksperframe = 5
# Limit the num of pairs we'll make from each peak (Fanout)
self.maxpairsperpeak = 3
# Values controlling peaks2landmarks
# +/- 31 bins in freq (LIMITED TO -32..31 IN LANDMARK2HASH)
self.targetdf = 31
# min time separation (traditionally 1, upped 2014-08-04)
self.mindt = 2
# max lookahead in time (LIMITED TO <64 IN LANDMARK2HASH)
self.targetdt = 63
        # duration of most recently-read soundfile (set by wavfile2peaks)
self.soundfiledur = 0.0
# .. and total amount of sound processed
self.soundfiletotaldur = 0.0
# .. and count of files
self.soundfilecount = 0
# Control behavior on file reading error
self.fail_on_error = True
def spreadpeaksinvector(self, vector, width=4.0):
""" Create a blurred version of vector, where each of the local maxes
is spread by a gaussian with SD <width>.
"""
npts = len(vector)
peaks = locmax(vector, indices=True)
return self.spreadpeaks(zip(peaks, vector[peaks]),
npoints=npts, width=width)
def spreadpeaks(self, peaks, npoints=None, width=4.0, base=None):
""" Generate a vector consisting of the max of a set of Gaussian bumps
:params:
peaks : list
list of (index, value) pairs giving the center point and height
of each gaussian
npoints : int
the length of the output vector (needed if base not provided)
            width : float
                the standard deviation (in bins) of the Gaussians to lay down at each point
base : np.array
optional initial lower bound to place Gaussians above
:returns:
vector : np.array(npoints)
the maximum across all the scaled Gaussians
"""
if base is None:
vec = np.zeros(npoints)
else:
npoints = len(base)
vec = np.copy(base)
#binvals = np.arange(len(vec))
#for pos, val in peaks:
# vec = np.maximum(vec, val*np.exp(-0.5*(((binvals - pos)
# /float(width))**2)))
if width != self.__sp_width or npoints != self.__sp_len:
# Need to calculate new vector
self.__sp_width = width
self.__sp_len = npoints
self.__sp_vals = np.exp(-0.5*((np.arange(-npoints, npoints+1)
/ float(width))**2))
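        # __sp_vals samples a unit-height Gaussian at offsets -npoints..npoints
        # (its peak is at index npoints), so indexing it at
        # np.arange(npoints) + npoints - pos below re-centers the peak on pos.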
# Now the actual function
for pos, val in peaks:
vec = np.maximum(vec, val*self.__sp_vals[np.arange(npoints)
+ npoints - pos])
return vec
def _decaying_threshold_fwd_prune(self, sgram, a_dec):
""" forward pass of findpeaks
initial threshold envelope based on peaks in first 10 frames
"""
(srows, scols) = np.shape(sgram)
sthresh = self.spreadpeaksinvector(
np.max(sgram[:, :np.minimum(10, scols)], axis=1), self.f_sd
)
## Store sthresh at each column, for debug
#thr = np.zeros((srows, scols))
peaks = np.zeros((srows, scols))
# optimization of mask update
__sp_pts = len(sthresh)
__sp_v = self.__sp_vals
for col in range(scols):
s_col = sgram[:, col]
# Find local magnitude peaks that are above threshold
sdmaxposs = np.nonzero(locmax(s_col) * (s_col > sthresh))[0]
# Work down list of peaks in order of their absolute value
# above threshold
valspeaks = sorted(zip(s_col[sdmaxposs], sdmaxposs), reverse=True)
for val, peakpos in valspeaks[:self.maxpksperframe]:
# What we actually want
#sthresh = spreadpeaks([(peakpos, s_col[peakpos])],
# base=sthresh, width=f_sd)
# Optimization - inline the core function within spreadpeaks
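                # __sp_v[(__sp_pts - peakpos):(2*__sp_pts - peakpos)] is that
                # same precomputed Gaussian table, re-centered on peakpos.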
sthresh = np.maximum(sthresh,
val*__sp_v[(__sp_pts - peakpos):
(2*__sp_pts - peakpos)])
peaks[peakpos, col] = 1
sthresh *= a_dec
return peaks
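    # The forward pass above and the backward pass below together implement
    # a two-sided decaying-threshold peak picker: a spectrogram cell survives
    # only if it stands above the decayed masking envelope of the large peaks
    # both before and after it in time.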
def _decaying_threshold_bwd_prune_peaks(self, sgram, peaks, a_dec):
""" backwards pass of findpeaks """
scols = np.shape(sgram)[1]
# Backwards filter to prune peaks
sthresh = self.spreadpeaksinvector(sgram[:, -1], self.f_sd)
for col in range(scols, 0, -1):
pkposs = np.nonzero(peaks[:, col-1])[0]
peakvals = sgram[pkposs, col-1]
for val, peakpos in sorted(zip(peakvals, pkposs), reverse=True):
if val >= sthresh[peakpos]:
# Setup the threshold
sthresh = self.spreadpeaks([(peakpos, val)], base=sthresh,
width=self.f_sd)
                    # Delete any following peak (the threshold should catch it, but be sure)
if col < scols:
peaks[peakpos, col] = 0
else:
# delete the peak
peaks[peakpos, col-1] = 0
sthresh = a_dec*sthresh
return peaks
def find_peaks(self, d, sr):
""" Find the local peaks in the spectrogram as basis for fingerprints.
Returns a list of (time_frame, freq_bin) pairs.
:params:
d - np.array of float
Input waveform as 1D vector
sr - int
Sampling rate of d (not used)
:returns:
pklist - list of (int, int)
Ordered list of landmark peaks found in STFT. First value of
each pair is the time index (in STFT frames, i.e., units of
n_hop/sr secs), second is the FFT bin (in units of sr/n_fft
Hz).
"""
if len(d) == 0:
return []
# masking envelope decay constant
a_dec = (1.0 - 0.01*(self.density*np.sqrt(self.n_hop/352.8)/35.0)) \
**(1.0/OVERSAMP)
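        # (With the defaults density=20, n_hop=256, OVERSAMP=1, a_dec comes
        # out to roughly 0.995, i.e. the envelope decays ~0.5% per frame.)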
# Take spectrogram
mywin = np.hanning(self.n_fft+2)[1:-1]
sgram = np.abs(librosa.stft(d, n_fft=self.n_fft,
hop_length=self.n_hop,
window=mywin))
sgrammax = np.max(sgram)
if sgrammax > 0.0:
sgram = np.log(np.maximum(sgram, np.max(sgram)/1e6))
sgram = sgram - np.mean(sgram)
else:
# The sgram is identically zero, i.e., the input signal was identically
# zero. Not good, but let's let it through for now.
print("find_peaks: Warning: input signal is identically zero.")
# High-pass filter onset emphasis
# [:-1,] discards top bin (nyquist) of sgram so bins fit in 8 bits
        sgram = np.array([scipy.signal.lfilter([1, -1],
                                               [1, -HPF_POLE ** (1.0 / OVERSAMP)],
                                               s_row)
                          for s_row in sgram])[:-1,]
# Prune to keep only local maxima in spectrum that appear above an online,
# decaying threshold
peaks = self._decaying_threshold_fwd_prune(sgram, a_dec)
# Further prune these peaks working backwards in time, to remove small peaks
# that are closely followed by a large peak
peaks = self._decaying_threshold_bwd_prune_peaks(sgram, peaks, a_dec)
# build a list of peaks we ended up with
scols = np.shape(sgram)[1]
pklist = []
        for col in range(scols):
            for bin in np.nonzero(peaks[:, col])[0]:
                pklist.append((col, bin))
return pklist
def peaks2landmarks(self, pklist):
""" Take a list of local peaks in spectrogram
and form them into pairs as landmarks.
pklist is a column-sorted list of (col, bin) pairs as created
by findpeaks().
Return a list of (col, peak, peak2, col2-col) landmark descriptors.
"""
# Form pairs of peaks into landmarks
landmarks = []
if len(pklist) > 0:
# Find column of the final peak in the list
scols = pklist[-1][0] + 1
# Convert (col, bin) list into peaks_at[col] lists
            peaks_at = [[] for col in range(scols)]
for (col, bin) in pklist:
peaks_at[col].append(bin)
# Build list of landmarks <starttime F1 endtime F2>
            for col in range(scols):
for peak in peaks_at[col]:
pairsthispeak = 0
                    for col2 in range(col + self.mindt,
                                      min(scols, col + self.targetdt)):
if pairsthispeak < self.maxpairsperpeak:
for peak2 in peaks_at[col2]:
if abs(peak2-peak) < self.targetdf:
#and abs(peak2-peak) + abs(col2-col) > 2 ):
if pairsthispeak < self.maxpairsperpeak:
# We have a pair!
landmarks.append((col, peak,
peak2, col2-col))
pairsthispeak += 1
return landmarks
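    # Illustrative: with mindt=2 and targetdf=31, the peak list
    # [(0, 10), (3, 12), (4, 40)] pairs into landmarks
    # [(0, 10, 12, 3), (0, 10, 40, 4)]; (3, 12) never pairs with (4, 40)
    # because they are only one frame apart.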
def wavfile2peaks(self, filename, shifts=None):
""" Read a soundfile and return its landmark peaks as a
list of (time, bin) pairs. If specified, resample to sr first.
shifts > 1 causes hashes to be extracted from multiple shifts of
waveform, to reduce frame effects. """
ext = os.path.splitext(filename)[1]
if ext == PRECOMPPKEXT:
# short-circuit - precomputed fingerprint file
peaks = peaks_load(filename)
dur = np.max(peaks, axis=0)[0]*self.n_hop/float(self.target_sr)
else:
try:
#[d, sr] = librosa.load(filename, sr=self.target_sr)
d, sr = audio_read.audio_read(filename, sr=self.target_sr, channels=1)
            except Exception:  # e.g. audioread.NoBackendError
message = "wavfile2peaks: Error reading " + filename
if self.fail_on_error:
raise IOError(message)
print(message, "skipping")
d = []
sr = self.target_sr
            # Keep track of the duration (recorded on the instance below)
dur = float(len(d))/sr
if shifts is None or shifts < 2:
                peaks = self.find_peaks(d, sr)
else:
# Calculate hashes with optional part-frame shifts
peaklists = []
for shift in range(shifts):
shiftsamps = int(float(shift)/self.shifts*self.n_hop)
peaklists.append(self.find_peaks(d[shiftsamps:], sr))
peaks = peaklists
# instrumentation to track total amount of sound processed
self.soundfiledur = dur
self.soundfiletotaldur += dur
self.soundfilecount += 1
return peaks
def wavfile2hashes(self, filename):
""" Read a soundfile and return its fingerprint hashes as a
list of (time, hash) pairs. If specified, resample to sr first.
shifts > 1 causes hashes to be extracted from multiple shifts of
waveform, to reduce frame effects. """
ext = os.path.splitext(filename)[1]
if ext == PRECOMPEXT:
# short-circuit - precomputed fingerprint file
hashes = hashes_load(filename)
dur = np.max(hashes, axis=0)[0]*self.n_hop/float(self.target_sr)
# instrumentation to track total amount of sound processed
self.soundfiledur = dur
self.soundfiletotaldur += dur
self.soundfilecount += 1
else:
peaks = self.wavfile2peaks(filename, self.shifts)
if len(peaks) == 0:
return []
            # Did wavfile2peaks return a list of peak lists (one per shift)?
if isinstance(peaks[0], list):
peaklists = peaks
query_hashes = []
for peaklist in peaklists:
query_hashes.append(landmarks2hashes(
self.peaks2landmarks(peaklist)))
query_hashes = np.concatenate(query_hashes)
else:
query_hashes = landmarks2hashes(self.peaks2landmarks(peaks))
# Remove duplicates by merging each row into a single value.
hashes_hashes = (((query_hashes[:, 0].astype(np.uint64)) << 32)
+ query_hashes[:, 1].astype(np.uint64))
unique_hash_hash = np.sort(np.unique(hashes_hashes))
unique_hashes = np.hstack([
(unique_hash_hash >> 32)[:, np.newaxis],
(unique_hash_hash & ((1<<32) - 1))[:, np.newaxis]
]).astype(np.int32)
hashes = unique_hashes
# Or simply np.unique(query_hashes, axis=0) for numpy >= 1.13
#print("wavfile2hashes: read", len(hashes), "hashes from", filename)
return hashes
########### functions to link to actual hash table index database #######
def ingest(self, hashtable, filename):
""" Read an audio file and add it to the database
:params:
hashtable : HashTable object
the hash table to add to
filename : str
name of the soundfile to add
:returns:
dur : float
the duration of the track
nhashes : int
the number of hashes it mapped into
"""
#sr = 11025
#print("ingest: sr=",sr)
#d, sr = librosa.load(filename, sr=sr)
# librosa.load on mp3 files prepends 396 samples compared
# to Matlab audioread ??
#hashes = landmarks2hashes(peaks2landmarks(find_peaks(d, sr,
# density=density,
# n_fft=n_fft,
# n_hop=n_hop)))
hashes = self.wavfile2hashes(filename)
hashtable.store(filename, hashes)
#return (len(d)/float(sr), len(hashes))
#return (np.max(hashes, axis=0)[0]*n_hop/float(sr), len(hashes))
# soundfiledur is set up in wavfile2hashes, use result here
return self.soundfiledur, len(hashes)
########### functions to read/write hashes to file for a single track #####
# Format string for writing binary data to file
HASH_FMT = '<2i'
HASH_MAGIC = b'audfprinthashV00'  # 16 chars, FWIW (bytes, so Python 3 binary I/O works)
PEAK_FMT = '<2i'
PEAK_MAGIC = b'audfprintpeakV00'  # 16 chars, FWIW (bytes, so Python 3 binary I/O works)
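# A precomputed fingerprint file is therefore the 16-byte magic string
# followed by the (time, hash) or (time, bin) pairs packed back-to-back
# as pairs of little-endian 32-bit signed ints.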
def hashes_save(hashfilename, hashes):
""" Write out a list of (time, hash) pairs as 32 bit ints """
with open(hashfilename, 'wb') as f:
f.write(HASH_MAGIC)
for time_, hash_ in hashes:
f.write(struct.pack(HASH_FMT, time_, hash_))
def hashes_load(hashfilename):
""" Read back a set of hashes written by hashes_save """
hashes = []
fmtsize = struct.calcsize(HASH_FMT)
with open(hashfilename, 'rb') as f:
magic = f.read(len(HASH_MAGIC))
if magic != HASH_MAGIC:
raise IOError('%s is not a hash file (magic %s)'
% (hashfilename, magic))
data = f.read(fmtsize)
while data is not None and len(data) == fmtsize:
hashes.append(struct.unpack(HASH_FMT, data))
data = f.read(fmtsize)
return hashes
def peaks_save(peakfilename, peaks):
""" Write out a list of (time, bin) pairs as 32 bit ints """
with open(peakfilename, 'wb') as f:
f.write(PEAK_MAGIC)
for time_, bin_ in peaks:
f.write(struct.pack(PEAK_FMT, time_, bin_))
def peaks_load(peakfilename):
""" Read back a set of (time, bin) pairs written by peaks_save """
peaks = []
fmtsize = struct.calcsize(PEAK_FMT)
with open(peakfilename, 'rb') as f:
magic = f.read(len(PEAK_MAGIC))
if magic != PEAK_MAGIC:
raise IOError('%s is not a peak file (magic %s)'
% (peakfilename, magic))
data = f.read(fmtsize)
while data is not None and len(data) == fmtsize:
peaks.append(struct.unpack(PEAK_FMT, data))
data = f.read(fmtsize)
return peaks
######## function signature for Gordon feature extraction
######## which stores the precalculated hashes for each track separately
extract_features_analyzer = None
def extract_features(track_obj, *args, **kwargs):
""" Extract the audfprint fingerprint hashes for one file.
:params:
track_obj : object
Gordon's internal structure defining a track; we use
track_obj.fn_audio to find the actual audio file.
:returns:
hashes : list of (int, int)
The times (in frames) and hashes analyzed from the audio file.
"""
global extract_features_analyzer
    if extract_features_analyzer is None:
extract_features_analyzer = Analyzer()
    # Only override the analyzer's defaults for parameters actually supplied.
    if "density" in kwargs:
        extract_features_analyzer.density = kwargs["density"]
    if "n_fft" in kwargs:
        extract_features_analyzer.n_fft = kwargs["n_fft"]
    if "n_hop" in kwargs:
        extract_features_analyzer.n_hop = kwargs["n_hop"]
    if "sr" in kwargs:
        extract_features_analyzer.target_sr = kwargs["sr"]
return extract_features_analyzer.wavfile2hashes(track_obj.fn_audio)
# Handy function to build a new hash table from a file glob pattern
g2h_analyzer = None
def glob2hashtable(pattern, density=20.0):
""" Build a hash table from the files matching a glob pattern """
global g2h_analyzer
    if g2h_analyzer is None:
g2h_analyzer = Analyzer(density=density)
ht = hash_table.HashTable()
filelist = glob.glob(pattern)
    initticks = time.time()  # wall-clock; time.clock() was removed in Python 3.8
totdur = 0.0
tothashes = 0
for ix, file_ in enumerate(filelist):
print(time.ctime(), "ingesting #", ix, ":", file_, "...")
dur, nhash = g2h_analyzer.ingest(ht, file_)
totdur += dur
tothashes += nhash
    elapsedtime = time.time() - initticks
print("Added", tothashes, "(", tothashes/float(totdur), "hashes/sec) at ",
elapsedtime/totdur, "x RT")
return ht
def local_tester():
test_fn = '/Users/dpwe/Downloads/carol11k.wav'
test_ht = hash_table.HashTable()
test_analyzer = Analyzer()
test_analyzer.ingest(test_ht, test_fn)
test_ht.save('httest.pklz')
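# Minimal programmatic usage (an illustrative sketch; the file names below
# are placeholders, not files shipped with this module):
#   analyzer = Analyzer(density=DENSITY)
#   ht = hash_table.HashTable()
#   dur, nhashes = analyzer.ingest(ht, 'reference.wav')
#   ht.save('fpdbase.pklz')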
# Run the test function if called from the command line
if __name__ == "__main__":
local_tester()