diff --git a/src/record.rs b/src/record.rs
index 7da05da..d9e0a70 100644
--- a/src/record.rs
+++ b/src/record.rs
@@ -155,12 +155,12 @@ impl std::convert::TryFrom for Record {
 impl std::fmt::Display for RawRecordHeader {
     fn fmt(&self, w: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
         writeln!(w, "WARC/{}", self.version)?;
-        for (key, value) in self.as_ref().iter() {
+        let mut sorted: Vec<_> = self.as_ref().iter().collect();
+        sorted.sort_by_key(|(key, _)| key.to_string());
+        for (key, value) in sorted {
             writeln!(w, "{}: {}", key.to_string(), String::from_utf8_lossy(value))?;
         }
-        writeln!(w)?;
-
-        Ok(())
+        writeln!(w)
     }
 }
 
@@ -1058,8 +1058,8 @@ mod raw_tests {
 
         let expected = "\
         WARC/1.0\n\
-        warc-type: dunno\n\
         warc-date: 2024-01-01T00:00:00Z\n\
+        warc-type: dunno\n\
         \n\
         ";
 
diff --git a/src/warc_reader.rs b/src/warc_reader.rs
index 08fd228..5aba65b 100644
--- a/src/warc_reader.rs
+++ b/src/warc_reader.rs
@@ -120,7 +120,7 @@ impl Iterator for RawRecordIter {
         let expected_body_len = headers_parsed.2;
 
         let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);
-        let mut found_body = expected_body_len == 0;
+        let mut found_body = false;
         let mut body_bytes_read = 0;
         let maximum_read_range = expected_body_len + 4;
         while !found_body {
@@ -203,7 +203,7 @@ impl Iterator for RecordIter {
         let expected_body_len = headers_parsed.2;
 
         let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);
-        let mut found_body = expected_body_len == 0;
+        let mut found_body = false;
         let mut body_bytes_read = 0;
         let maximum_read_range = expected_body_len + 4;
         while !found_body {
@@ -683,4 +683,74 @@ mod next_item_tests {
             assert_eq!(record.body(), b"12345678");
         }
     }
+
+    #[test]
+    fn empty_content_length() {
+        let raw = b"\
+            WARC/1.0\r\n\
+            Warc-Type: empty-record\r\n\
+            Content-Length: 0\r\n\
+            WARC-Record-Id: \r\n\
+            WARC-Date: 2020-07-08T02:52:57Z\r\n\
+            \r\n\
+            \r\n\
+        ";
+
+        let mut reader = WarcReader::new(create_reader!(raw));
+        let mut stream_iter = reader.stream_records();
+
+        let record = stream_iter
+            .next_item()
+            .unwrap()
+            .unwrap()
+            .into_buffered()
+            .unwrap();
+        assert_eq!(record.warc_version(), "1.0");
+        assert_eq!(record.content_length(), 0);
+        assert_eq!(record.warc_id(), "");
+        assert_eq!(record.body(), b"");
+    }
+
+    #[test]
+    fn zero_and_nonzero_content_length() {
+        let raw = b"\
+            WARC/1.0\r\n\
+            Warc-Type: empty-record\r\n\
+            Content-Length: 0\r\n\
+            WARC-Record-Id: \r\n\
+            WARC-Date: 2020-07-08T02:52:57Z\r\n\
+            \r\n\
+            \r\n\
+            \r\n\
+            WARC/1.0\r\n\
+            Warc-Type: non-empty-record\r\n\
+            Content-Length: 7\r\n\
+            WARC-Record-Id: \r\n\
+            WARC-Date: 2020-07-08T02:52:58Z\r\n\
+            \r\n\
+            1234567\r\n\
+            \r\n\
+        ";
+
+        let reader = WarcReader::new(create_reader!(raw));
+        let mut iter = reader.iter_records();
+
+        // Test the first record with Content-Length: 0
+        {
+            let record = iter.next().unwrap().unwrap();
+            assert_eq!(record.warc_version(), "1.0");
+            assert_eq!(record.content_length(), 0);
+            assert_eq!(record.warc_id(), "");
+            assert_eq!(record.body(), b"");
+        }
+
+        // Test the second record with non-zero Content-Length
+        {
+            let record = iter.next().unwrap().unwrap();
+            assert_eq!(record.warc_version(), "1.0");
+            assert_eq!(record.content_length(), 7);
+            assert_eq!(record.warc_id(), "");
+            assert_eq!(record.body(), b"1234567");
+        }
+    }
 }