From 7d7a2d4f9f27816eb56a0be59f567612a6ee173b Mon Sep 17 00:00:00 2001 From: Alex Bocharov Date: Thu, 7 Nov 2024 13:10:30 -0600 Subject: [PATCH] Fix WARC header parsing when a record with `Content-Length: 0` is followed by another record. Validated against the Python implementation: https://github.com/webrecorder/warcio Also fixed flaky test `record::verify_display` by sorting header names in the test. --- src/record.rs | 38 ++++++++++++++++-------- src/warc_reader.rs | 74 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 15 deletions(-) diff --git a/src/record.rs b/src/record.rs index 7da05da..d37122f 100644 --- a/src/record.rs +++ b/src/record.rs @@ -1046,24 +1046,36 @@ mod raw_tests { #[test] fn verify_display() { + let header_entries = vec![ + (WarcHeader::WarcType, b"dunno".to_vec()), + (WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()), + ]; + let headers = RawRecordHeader { version: "1.0".to_owned(), - headers: vec![ - (WarcHeader::WarcType, b"dunno".to_vec()), - (WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()), - ] - .into_iter() - .collect(), + headers: header_entries.into_iter().collect(), }; - let expected = "\ - WARC/1.0\n\ - warc-type: dunno\n\ - warc-date: 2024-01-01T00:00:00Z\n\ - \n\ - "; + let output = headers.to_string(); + + let expected_lines = vec![ + "WARC/1.0", + "warc-type: dunno", + "warc-date: 2024-01-01T00:00:00Z", + "", + ]; + let actual_lines: Vec<_> = output.lines().collect(); + + let mut expected_headers: Vec<_> = expected_lines[1..expected_lines.len() - 1].to_vec(); + expected_headers.sort(); + + let mut actual_headers: Vec<_> = actual_lines[1..actual_lines.len() - 1].to_vec(); + actual_headers.sort(); - assert_eq!(headers.to_string(), expected); + // verify parts + assert_eq!(actual_lines[0], expected_lines[0]); // WARC version + assert_eq!(actual_headers, expected_headers); // headers (sorted) + assert_eq!(actual_lines.last(), expected_lines.last()); // empty line } } diff --git a/src/warc_reader.rs 
b/src/warc_reader.rs index 08fd228..5aba65b 100644 --- a/src/warc_reader.rs +++ b/src/warc_reader.rs @@ -120,7 +120,7 @@ impl Iterator for RawRecordIter { let expected_body_len = headers_parsed.2; let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB); - let mut found_body = expected_body_len == 0; + let mut found_body = false; let mut body_bytes_read = 0; let maximum_read_range = expected_body_len + 4; while !found_body { @@ -203,7 +203,7 @@ impl Iterator for RecordIter { let expected_body_len = headers_parsed.2; let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB); - let mut found_body = expected_body_len == 0; + let mut found_body = false; let mut body_bytes_read = 0; let maximum_read_range = expected_body_len + 4; while !found_body { @@ -683,4 +683,74 @@ mod next_item_tests { assert_eq!(record.body(), b"12345678"); } } + + #[test] + fn empty_content_length() { + let raw = b"\ + WARC/1.0\r\n\ + Warc-Type: empty-record\r\n\ + Content-Length: 0\r\n\ + WARC-Record-Id: \r\n\ + WARC-Date: 2020-07-08T02:52:57Z\r\n\ + \r\n\ + \r\n\ + "; + + let mut reader = WarcReader::new(create_reader!(raw)); + let mut stream_iter = reader.stream_records(); + + let record = stream_iter + .next_item() + .unwrap() + .unwrap() + .into_buffered() + .unwrap(); + assert_eq!(record.warc_version(), "1.0"); + assert_eq!(record.content_length(), 0); + assert_eq!(record.warc_id(), ""); + assert_eq!(record.body(), b""); + } + + #[test] + fn zero_and_nonzero_content_length() { + let raw = b"\ + WARC/1.0\r\n\ + Warc-Type: empty-record\r\n\ + Content-Length: 0\r\n\ + WARC-Record-Id: \r\n\ + WARC-Date: 2020-07-08T02:52:57Z\r\n\ + \r\n\ + \r\n\ + \r\n\ + WARC/1.0\r\n\ + Warc-Type: non-empty-record\r\n\ + Content-Length: 7\r\n\ + WARC-Record-Id: \r\n\ + WARC-Date: 2020-07-08T02:52:58Z\r\n\ + \r\n\ + 1234567\r\n\ + \r\n\ + "; + + let reader = WarcReader::new(create_reader!(raw)); + let mut iter = reader.iter_records(); + + // Test the first record with Content-Length: 0 + { + let record = 
iter.next().unwrap().unwrap(); + assert_eq!(record.warc_version(), "1.0"); + assert_eq!(record.content_length(), 0); + assert_eq!(record.warc_id(), ""); + assert_eq!(record.body(), b""); + } + + // Test the second record with non-zero Content-Length + { + let record = iter.next().unwrap().unwrap(); + assert_eq!(record.warc_version(), "1.0"); + assert_eq!(record.content_length(), 7); + assert_eq!(record.warc_id(), ""); + assert_eq!(record.body(), b"1234567"); + } + } }