Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revisit headers load fix #751

Merged
merged 2 commits into from
Aug 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pywb/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '2.6.7'
__version__ = '2.6.8'

if __name__ == '__main__':
print(__version__)
32 changes: 17 additions & 15 deletions pywb/warcserver/resource/responseloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def __init__(self, paths, cdx_source):
self.resolvers = self.make_resolvers(self.paths)

self.resolve_loader = ResolvingLoader(self.resolvers,
no_record_parse=True)
no_record_parse=False)

self.headers_parser = StatusAndHeadersParser([], verify=False)

Expand Down Expand Up @@ -206,18 +206,20 @@ def local_index_query(local_params):
local_index_query))

http_headers_buff = None

if payload.rec_type in ('response', 'revisit'):
status = cdx.get('status')

try:
orig_size = payload.raw_stream.tell()
except:
orig_size = 0

http_headers = headers.http_headers or payload.http_headers

# if status is not set, or is not 2xx, 4xx or 5xx,
# go through the self-redirect check just in case
if not status or not status.startswith(('2', '4', '5')):
http_headers = self.headers_parser.parse(payload.raw_stream)
try:
orig_size = payload.raw_stream.tell()
except:
orig_size = 0

try:
self.raise_on_self_redirect(params, cdx,
http_headers.get_statuscode(),
Expand All @@ -227,15 +229,15 @@ def local_index_query(local_params):
no_except_close(payload.raw_stream)
raise

http_headers_buff = http_headers.to_bytes()
http_headers_buff = http_headers and http_headers.to_bytes()

# if new http_headers_buff is different length,
# attempt to adjust content-length on the WARC record
if orig_size and len(http_headers_buff) != orig_size:
orig_cl = payload.rec_headers.get_header('Content-Length')
if orig_cl:
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
payload.rec_headers.replace_header('Content-Length', str(new_cl))
# if new http_headers_buff is different length,
# attempt to adjust content-length on the WARC record
if http_headers and orig_size and len(http_headers_buff) != orig_size:
orig_cl = payload.rec_headers.get_header('Content-Length')
if orig_cl:
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
payload.rec_headers.replace_header('Content-Length', str(new_cl))

warc_headers = payload.rec_headers

Expand Down
6 changes: 4 additions & 2 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,15 @@ def test_replay_content_head(self, fmod):
assert not resp.headers.get('Content-Length')

def test_replay_content_head_non_zero_content_length_match(self):
resp = self.testapp.get('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
length = resp.content_length
print('length', length)

# Content-Length included if non-zero
resp = self.testapp.head('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)

#assert resp.headers['Content-Length'] == length
print('length', resp.content_length)
assert resp.content_length == length

def test_replay_content(self, fmod):
Expand Down
146 changes: 146 additions & 0 deletions tests/test_redirect_revisits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@

from io import BytesIO
import os

from warcio import WARCWriter, StatusAndHeaders
from pywb.manager.manager import main as wb_manager

from .base_config_test import BaseConfigTest, CollsDirMixin, fmod


# ============================================================================
class TestRevisits(CollsDirMixin, BaseConfigTest):
    """Replay tests for 301 response and revisit records in one WARC.

    ``test_init`` writes a small gzipped WARC containing two response
    records and two revisit records, then indexes it into a ``revisits``
    collection.  The remaining tests replay individual captures and check
    the ``Custom`` and ``Location`` headers and the body that come back.

    NOTE(review): the replay tests rely on ``test_init`` having run first
    to create the collection -- confirm test ordering is guaranteed.
    """

    @classmethod
    def setup_class(cls):
        """Set up the shared test app from ``config_test.yaml``."""
        super(TestRevisits, cls).setup_class('config_test.yaml')

    @staticmethod
    def _redirect_http_headers(headers):
        """Wrap *headers* in an HTTP/1.0 ``301 Permanent Redirect`` status."""
        return StatusAndHeaders(
            "301 Permanent Redirect", headers, protocol="HTTP/1.0"
        )

    def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
        """Build (without writing) a revisit record for *url*.

        :param url: target URI of the revisit record
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP header tuples
        :param refers_to_uri: URI of the record this revisit refers to
        :param refers_to_date: date of the record this revisit refers to
        :return: the record produced by ``self.writer.create_revisit_record``
        """
        warc_headers = {"WARC-Date": date}

        return self.writer.create_revisit_record(
            url,
            digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
            refers_to_uri=refers_to_uri,
            refers_to_date=refers_to_date,
            warc_headers_dict=warc_headers,
            http_headers=self._redirect_http_headers(headers),
        )

    def create_response_record(self, url, date, headers, payload):
        """Build (without writing) a response record for *url*.

        :param url: target URI of the response record
        :param date: value stored in the ``WARC-Date`` header
        :param headers: list of ``(name, value)`` HTTP header tuples
        :param payload: response body as ``bytes``
        :return: the record produced by ``self.writer.create_warc_record``
        """
        body = BytesIO(payload)
        warc_headers = {"WARC-Date": date}

        return self.writer.create_warc_record(
            url,
            record_type="response",
            http_headers=self._redirect_http_headers(headers),
            payload=body,
            warc_headers_dict=warc_headers,
            length=len(payload),
        )

    def create(self):
        """Write the four fixture records to ``self.writer``, in order.

        Records 1 and 2 are responses for ``orig-1`` and ``orig-2``.
        Record 3 is a revisit of ``orig-2`` referring to ``orig-1``.
        Record 4 is a revisit of ``http://example.com/`` referring to
        ``orig-2``.  Each record carries its own ``Location`` and
        ``Custom`` header so the tests can tell which headers were served.
        """
        payload = b"some\ntext"
        content_length = str(len(payload))

        def redirect_headers(location, custom):
            # Same header order for every record; only Location/Custom vary.
            return [
                ("Content-Type", 'text/plain; charset="UTF-8"'),
                ("Location", location),
                ("Content-Length", content_length),
                ("Custom", custom),
            ]

        # record 1
        record_1 = self.create_response_record(
            "http://example.com/orig-1",
            "2020-01-01T00:00:00Z",
            redirect_headers("https://example.com/redirect-1", "1"),
            payload,
        )
        self.writer.write_record(record_1)

        # record 2
        record_2 = self.create_response_record(
            "http://example.com/orig-2",
            "2020-01-01T00:00:00Z",
            redirect_headers("https://example.com/redirect-2", "2"),
            payload,
        )
        self.writer.write_record(record_2)

        # record 3
        record_3 = self.create_revisit_record(
            "http://example.com/orig-2",
            "2022-01-01T00:00:00Z",
            redirect_headers("https://example.com/redirect-3", "3"),
            refers_to_uri="http://example.com/orig-1",
            refers_to_date="2020-01-01T00:00:00Z",
        )
        self.writer.write_record(record_3)

        # record 4
        record_4 = self.create_revisit_record(
            "http://example.com/",
            "2022-01-01T00:00:00Z",
            redirect_headers("https://example.com/redirect-4", "4"),
            refers_to_uri="http://example.com/orig-2",
            refers_to_date="2020-01-01T00:00:00Z",
        )
        self.writer.write_record(record_4)

    def test_init(self):
        """Write the fixture WARC and index it into the ``revisits`` collection."""
        warc_path = os.path.join(self.root_dir, 'redir.warc.gz')

        with open(warc_path, 'wb') as fh:
            self.writer = WARCWriter(fh, gzip=True)
            self.create()

        # The file is closed before it is handed to the collection manager.
        wb_manager(['init', 'revisits'])
        wb_manager(['add', 'revisits', warc_path])

        index_path = os.path.join(
            self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'
        )
        assert os.path.isfile(index_path)

    def test_different_url_revisit_orig_headers(self, fmod):
        """The 2022 revisit of ``http://example.com/`` is served with its own headers."""
        expected_location = "/20220101{0}/https://example.com/redirect-4".format(fmod)

        res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)

        assert res.headers["Custom"] == "4"
        assert res.headers["Location"].endswith(expected_location)
        assert res.text == 'some\ntext'

    def test_different_url_revisit_and_response(self, fmod):
        """``orig-2`` yields response headers for 2020 and revisit headers for 2022."""
        # 2020 capture: the response record (record 2)
        expected_location = "/20200101{0}/https://example.com/redirect-2".format(fmod)
        res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
        assert res.headers["Custom"] == "2"
        assert res.headers["Location"].endswith(expected_location)
        assert res.text == 'some\ntext'

        # 2022 capture: the revisit record (record 3)
        expected_location = "/20220101{0}/https://example.com/redirect-3".format(fmod)
        res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
        assert res.headers["Custom"] == "3"
        assert res.headers["Location"].endswith(expected_location)
        assert res.text == 'some\ntext'

    def test_orig(self, fmod):
        """The 2020 response for ``orig-1`` replays with its own headers and body."""
        expected_location = "/20200101{0}/https://example.com/redirect-1".format(fmod)

        res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)

        assert res.headers["Custom"] == "1"
        assert res.headers["Location"].endswith(expected_location)
        assert res.text == 'some\ntext'