From 90912b9190d2f8902590a32c256654bfbbae0006 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 12 Mar 2021 13:42:19 +0100 Subject: [PATCH 1/2] FrontendApp: forward HTTP status of CDX backend to allow clients to handle errors more easily --- pywb/apps/frontendapp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index c3ece2940..69382df2f 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -404,10 +404,12 @@ def serve_cdx(self, environ, coll='$root'): try: res = requests.get(cdx_url, stream=True) + status_line = '{} {}'.format(res.status_code, res.reason) content_type = res.headers.get('Content-Type') return WbResponse.bin_stream(StreamIter(res.raw), - content_type=content_type) + content_type=content_type, + status=status_line) except Exception as e: return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request') From ea3c8d40ea7230ec52b4804d08c874d02969a0c2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 18 Mar 2021 14:19:15 +0100 Subject: [PATCH 2/2] Handle CDXExceptions properly, returning the exception status code - make sure that CDXException is raised early so that it can be handled in the IndexHandler --- pywb/warcserver/handlers.py | 19 +++++++++++++++++-- tests/test_zipnum_auto_dir.py | 8 ++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 58d272d55..58dec4624 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -4,6 +4,7 @@ from warcio.recordloader import ArchiveLoadFailed +from pywb.warcserver.index.cdxobject import CDXException from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader @@ -96,13 +97,27 @@ def __call__(self, params): content_type, res = handler(cdx_iter, fields, params) out_headers = {'Content-Type': content_type} - 
def check_str(lines): + first_line = None + try: + # raise exceptions early so that they can be handled properly + first_line = next(res) + except StopIteration: + pass + except CDXException as e: + errs = dict(last_exc=e) + return None, None, errs + + def check_str(first_line, lines): + if first_line is not None: + if isinstance(first_line, six.text_type): + first_line = first_line.encode('utf-8') + yield first_line for line in lines: if isinstance(line, six.text_type): line = line.encode('utf-8') yield line - return out_headers, check_str(res), errs + return out_headers, check_str(first_line, res), errs #============================================================================= diff --git a/tests/test_zipnum_auto_dir.py b/tests/test_zipnum_auto_dir.py index 7a3f77b5e..c91c849e7 100644 --- a/tests/test_zipnum_auto_dir.py +++ b/tests/test_zipnum_auto_dir.py @@ -46,5 +46,13 @@ def test_paged_index_query(self): assert lines[2] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7} assert lines[3] == {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8} + def test_paged_index_query_out_of_range(self): + res = self.testapp.get( + '/testzip/cdx?url=http://iana.org/domains/&matchType=domain&output=json&showPagedIndex=true&pageSize=4&page=10', + expect_errors=True) + + assert res.status_code == 400 + assert res.json == {"message": "Page 10 invalid: First Page is 0, Last Page is 9"} +