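"""identify.py

Audio identification by fingerprint matching: for each query recording, count
how often hashes shared with a document line up at the same time offset, and
rank documents by the size of their largest offset peak.
"""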
import os
import time
from collections import defaultdict, Counter

from fingerprint import fingerprint_file
from utils import get_id, load, sec_to_min_sec_str, write_output_line
from visualise import plot_shift_frequnecy


# call each part of the identification process for a single query
def identify(Q, D):
    # calculate the frequency of matching offsets for every document
    matches = get_matches(Q, D)
    # find the most frequent shift for each document
    candidates = get_largest_matches(matches)
    # sort the document ids by count
    sorted_candidates = sort_dic(candidates)
    # return a list of document ids sorted from most to least frequent match
    return sorted_candidates


# count the number of valid shifts for each hash - returns counter dict {"id": {m: count}}
def get_matches(Q, D):
    # counter for shift frequency - defaultdict allows arbitrary keys
    matches = defaultdict(Counter)
    # find valid shifts for each hash in the query
    for h_q in Q:
        # only search hashes that also appear in the documents
        if h_q not in D: continue
        # get the query hash data
        n = Q[h_q]
        # for each timestamp at the document hash
        for l in D[h_q]:
            # calculate the valid shift
            m = l["time_stamp"] - n["time_stamp"]
            # store the id
            D_id = l["id"]
            # increment the number of valid shifts at the current id and shift
            matches[D_id][m] += 1
    # return the frequency of valid shifts per document id
    return matches


# create dict of the most common match count for each id - returns dict {"document id": count}
def get_largest_matches(matches):
    candidates = {}
    # for every id and its counter of valid shifts
    for D_id, counter in matches.items():
        # get the count of the most common time shift
        _, count = counter.most_common(1)[0]
        # visualise the shift histogram
        # if count > 100: plot_shift_frequnecy(counter)
        # add to the dict of possible matches
        candidates[D_id] = count
    return candidates


# sort a dict by value, descending - returns a list of keys
def sort_dic(d):
    keys_by_value = sorted(d, key=lambda k: -d[k])
    return keys_by_value


# report whether the top ranked document matches the query id
def print_correct_incorrect(D_ids, Q_id):
    most_likely = D_ids[0] if D_ids else None
    if most_likely == Q_id:
        print("correct match document: {} - query: {}".format(most_likely, Q_id))
    else:
        print("incorrect match document: {} - query: {}".format(most_likely, Q_id))


def audioIdentification(query_path, fingerprints_path, output_path):
    print("running audio identification on query set")
    # load the fingerprints data structure
    D = load(fingerprints_path, "documents")
    output_file = open(output_path, "w")
    # start the performance timer
    start_time = time.perf_counter()
    for entry in os.scandir(query_path):
        # skip non wave files
        if os.path.splitext(entry.name)[1] != ".wav": continue
        # process the query
        query_name = entry.name
        Q_id = get_id(query_name)
        # fingerprint the query
        Q = fingerprint_file(entry, id=Q_id)
        # find matching documents
        D_ids = identify(Q, D)
        # show correct and incorrect document matches
        print_correct_incorrect(D_ids, Q_id)
        # write the top 3 documents to file
        write_output_line(output_file, D_ids, query_name)
    # stop the performance timer
    end_time = time.perf_counter()
    run_time = end_time - start_time
    # print the runtime
    print("---time to run identification {} mins---".format(sec_to_min_sec_str(run_time)))
    output_file.close()
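

# Example entry point - a minimal sketch; the path values below are
# placeholders (not from the original project) and assume the fingerprint
# database has already been built elsewhere.
if __name__ == "__main__":
    audioIdentification(
        query_path="query_recordings",              # directory of .wav query files (example path)
        fingerprints_path="fingerprints_database",  # pre-computed document fingerprints (example path)
        output_path="output.txt",                   # ranked matches written here (example path)
    )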