-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscore_indexing.py
145 lines (132 loc) · 8.56 KB
/
score_indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse, json, sys
from math import sqrt
from popbot_src.indexing_common import load_document_sections

# Command-line interface of the indexing quality scorer.
argparser = argparse.ArgumentParser(description='Score quality of document indexing.')
argparser.add_argument('--print_titles', '-t', action='store_true')
argparser.add_argument('config_path')
argparser.add_argument('csv_path')
args = argparser.parse_args()

# Read the experiment config (JSON).
with open(args.config_path) as config_file:
    config = json.load(config_file)

# Load the indexed document sections from the csv.
document_sections = load_document_sections(args.csv_path, print_titles=args.print_titles)

# Without a ground-truth page index there is nothing to score against.
if 'dev__true_document_pages' not in config:
    print('No dev__true_document_pages specified, exiting.')
    sys.exit(0)
# Collect document lengths from both sources.
# A true document's length in pages runs from its first page to the next
# document's first page, inclusive on both ends (hence the +1), reproducing
# the original prev/next walk over dev__true_document_pages.
_pages = config['dev__true_document_pages']
true_document_lengths = [nxt - prev + 1 for prev, nxt in zip(_pages, _pages[1:])]
# Indexed document "lengths" are character counts of the collapsed section text.
csv_document_lengths = [len(sec.collapsed_text()) for sec in document_sections]
# Scale the indexed lengths as we would be distributing pages from the original index.
# Hoist the two totals out of the comprehension: the original recomputed
# sum(csv_document_lengths) and sum(true_document_lengths) once per element,
# which made this step accidentally O(n^2).
_csv_total = sum(csv_document_lengths)
_true_total = sum(true_document_lengths)
scaled_document_lengths = [l / _csv_total * _true_total for l in csv_document_lengths]
# Option 1: We indexed exactly as many documents as the truth.
# Computing the distance is straightforward.
if len(csv_document_lengths) == len(true_document_lengths):
    # Plain Euclidean distance between the two length sequences.
    distance = sqrt(sum(
        (true_len - scaled_len) ** 2
        for true_len, scaled_len in zip(true_document_lengths, scaled_document_lengths)))
# Option 2: Our indexed csv has more documents than the truth.
# Greedily alignments (merging documents) minimizing euclidean distance between length sequences.
elif len(scaled_document_lengths) > len(true_document_lengths):
    # Forward pass: walk indexed lengths left-to-right, at each step either
    # merging the current document into the last alignment slot or opening a
    # new slot for the next true document.
    forward_alignment = [ scaled_document_lengths[0] ]
    current_orig_index = 0  # index into true_document_lengths being filled
    for li, length in enumerate(scaled_document_lengths[1:]):
        # We need to have enough documents left to cover everything; if it is true, merge documents if this produces some distance reduction.
        # NOTE(review): `li` indexes the [1:] slice, so the count
        # `len(scaled_document_lengths)-li` includes the current element plus
        # one extra — possible off-by-one in the "enough left" test; confirm.
        if (current_orig_index == len(true_document_lengths)-1
                or ((len(scaled_document_lengths)-li) > len(true_document_lengths)-current_orig_index
                    and ((forward_alignment[-1]+length-true_document_lengths[current_orig_index])**2
                        < (forward_alignment[-1]-true_document_lengths[current_orig_index])**2))):
            # Merge: absorb this document into the current slot.
            forward_alignment[-1] += length
        else:
            # Open a new slot aligned to the next true document.
            forward_alignment.append(length)
            current_orig_index += 1
    # Euclidean distance of the forward alignment against the true lengths.
    forward_alignment_distance = 0
    for li, length in enumerate(true_document_lengths):
        forward_alignment_distance += (length - forward_alignment[li])**2
    forward_alignment_distance = sqrt(forward_alignment_distance)
    # Backward pass: the same greedy merging, right-to-left.
    backward_alignment = [ scaled_document_lengths[-1] ]
    current_orig_index = len(true_document_lengths) - 1
    for li, length in enumerate(reversed(scaled_document_lengths[:-1])):
        # We need to have enough documents left to cover everything; if it is true, merge documents if this produces some distance reduction. Also we must only merge when we are down to the first true page.
        if (((len(scaled_document_lengths)-li-1) > current_orig_index
                and (backward_alignment[-1]+length-true_document_lengths[current_orig_index])**2
                    < (backward_alignment[-1]-true_document_lengths[current_orig_index])**2)
                or current_orig_index == 0):
            backward_alignment[-1] += length
        else:
            backward_alignment.append(length)
            current_orig_index -= 1
    backward_alignment_distance = 0
    # The backward alignment was built in reverse true-document order.
    backward_alignment.reverse()
    for li, length in enumerate(true_document_lengths):
        backward_alignment_distance += (length - backward_alignment[li])**2
    backward_alignment_distance = sqrt(backward_alignment_distance)
    # Keep whichever greedy direction produced the smaller distance.
    distance = min([forward_alignment_distance, backward_alignment_distance])
# Option 3: Our indexed csv has fewer documents than the truth.
elif len(scaled_document_lengths)*2 > len(true_document_lengths):
forward_alignment = [ ]
current_orig_index = 0
for li, length in enumerate(scaled_document_lengths):
if current_orig_index+1 < len(true_document_lengths) and li+1 < len(scaled_document_lengths):
current_ngb_distances = ((length-true_document_lengths[current_orig_index])**2
+ (scaled_document_lengths[li+1] + true_document_lengths[current_orig_index+1])**2)
ngb_proportion = true_document_lengths[current_orig_index] / sum(true_document_lengths[current_orig_index:current_orig_index+2])
split_ngb_distances = ((ngb_proportion*length-true_document_lengths[current_orig_index])**2
+ ((1-ngb_proportion)*length-true_document_lengths[current_orig_index+1])**2)
# We are forced to split if there is less remaining indexed sections (x2 if we'd split them all) than there is remaining true ones.
# On the other hand, we must stop splitting if there is no room left for that in accomodating the remaining portion of true pages.
if (split_ngb_distances < current_ngb_distances or (len(scaled_document_lengths) - li)*2 <= len(true_document_lengths) - current_orig_index) and len(true_document_lengths) - current_orig_index > (len(scaled_document_lengths) - li):
forward_alignment += [ngb_proportion*length, (1-ngb_proportion)*length]
current_orig_index += 2
else:
forward_alignment.append(length)
current_orig_index += 1
else:
forward_alignment.append(length)
if current_orig_index+1 != len(true_document_lengths) or li+1 != len(scaled_document_lengths):
raise RuntimeError('bad forward alignment of shorter page index (there is an error in algorithm)')
forward_alignment_distance = 0
for li, length in enumerate(true_document_lengths):
forward_alignment_distance += (length - forward_alignment[li])**2
forward_alignment_distance = sqrt(forward_alignment_distance)
backward_alignment = [ ]
current_orig_index = len(true_document_lengths)-1
for li, length in enumerate(reversed(scaled_document_lengths)):
if current_orig_index != 0 and li+1 < len(scaled_document_lengths):
current_ngb_distances = ((length-true_document_lengths[current_orig_index])**2
+ (scaled_document_lengths[li+1] + true_document_lengths[current_orig_index-1])**2)
ngb_proportion = true_document_lengths[current_orig_index] / sum(true_document_lengths[current_orig_index-1:current_orig_index+1])
split_ngb_distances = ((ngb_proportion*length-true_document_lengths[current_orig_index])**2
+ ((1-ngb_proportion)*length-true_document_lengths[current_orig_index-1])**2)
# We are forced to split if there is less remaining indexed sections (x2 if we'd split them all) than there is remaining true ones.
# On the other hand, we must stop splitting if there is no room left for that in accomodating the remaining portion of true pages.
if (split_ngb_distances < current_ngb_distances or (len(scaled_document_lengths)-li)*2 <= current_orig_index) and current_orig_index > len(scaled_document_lengths) - li - 1:
backward_alignment += [ngb_proportion*length, (1-ngb_proportion)*length]
current_orig_index -= 2
else:
backward_alignment.append(length)
current_orig_index -= 1
else:
backward_alignment.append(length)
if current_orig_index != 0 or li+1 != len(scaled_document_lengths):
raise RuntimeError('bad backward alignment of shorter page index (there is an error in algorithm)')
backward_alignment.reverse()
backward_alignment_distance = 0
for li, length in enumerate(true_document_lengths):
backward_alignment_distance += (length - backward_alignment[li])**2
backward_alignment_distance = sqrt(backward_alignment_distance)
distance = min([forward_alignment_distance, backward_alignment_distance])
else:
raise NotImplementedError('the case with >2x fewer documents than truth is not implemented')
# Print the final score: the length distance plus the absolute document-count
# difference; the signed proportion is shown for reference only.
true_count = len(true_document_lengths)
indexed_count = len(scaled_document_lengths)
difference = abs(true_count - indexed_count)
proportion = (indexed_count - true_count) / true_count
print('Lengths distance / document count difference / total score')
print('{:.3f} {}({:.3f}) {:.3f}'.format(distance, difference, proportion, distance+difference))