Skip to content

Commit

Permalink
xmlquery: use compressed length when available (#633)
Browse files Browse the repository at this point in the history
The field is unfortunately misnamed compressedendoffset in the XML, but OWB
actually uses it for the compressed length ('S') CDX field.

Without this field, when WARC files are accessed over HTTP, pywb will make
open-ended byte range requests, which results in a lot more data being read
from disk than necessary.
  • Loading branch information
ato authored Apr 27, 2021
1 parent 73d6735 commit c5c4a54
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
5 changes: 5 additions & 0 deletions pywb/warcserver/index/indexsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,11 @@ def convert_to_cdx(self, item):
cdx['digest'] = self.gettext(item, 'digest')
cdx['offset'] = self.gettext(item, 'compressedoffset')
cdx['filename'] = self.gettext(item, 'file')

length = self.gettext(item, 'compressedendoffset')
if length:
cdx['length'] = length

return cdx

def gettext(self, item, name):
Expand Down
6 changes: 5 additions & 1 deletion pywb/warcserver/index/test/test_xmlquery_indexsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,16 @@ def do_query(self, params):
@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
def test_exact_query(self):
res, errs = self.do_query({'url': 'http://example.com/', 'limit': 100})
reslist = list(res)

expected = """\
com,example)/ 20180112200243 example.warc.gz
com,example)/ 20180216200300 example.warc.gz"""
assert(key_ts_res(res) == expected)
assert(key_ts_res(reslist) == expected)
assert(errs == {})
assert query_url == 'http://localhost:8080/path?q=limit%3A+100+type%3Aurlquery+url%3Ahttp%253A%252F%252Fexample.com%252F'
assert reslist[0]['length'] == '123'
assert 'length' not in reslist[1]


@patch('pywb.warcserver.index.indexsource.requests.sessions.Session.get', mock_get)
Expand Down Expand Up @@ -119,6 +122,7 @@ def _get_etree(cls):
<results>
<result>
<compressedoffset>10</compressedoffset>
<compressedendoffset>123</compressedendoffset>
<mimetype>text/html</mimetype>
<file>example.warc.gz</file>
<redirecturl>-</redirecturl>
Expand Down

0 comments on commit c5c4a54

Please sign in to comment.