Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix WARC header parsing when a record has Content-Length: 0 and is followed by another record. #42

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 25 additions & 13 deletions src/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1046,24 +1046,36 @@ mod raw_tests {

#[test]
fn verify_display() {
let header_entries = vec![
(WarcHeader::WarcType, b"dunno".to_vec()),
(WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()),
];

let headers = RawRecordHeader {
version: "1.0".to_owned(),
headers: vec![
(WarcHeader::WarcType, b"dunno".to_vec()),
(WarcHeader::Date, b"2024-01-01T00:00:00Z".to_vec()),
]
.into_iter()
.collect(),
headers: header_entries.into_iter().collect(),
};

let expected = "\
WARC/1.0\n\
warc-type: dunno\n\
warc-date: 2024-01-01T00:00:00Z\n\
\n\
";
let output = headers.to_string();

let expected_lines = vec![
"WARC/1.0",
"warc-type: dunno",
"warc-date: 2024-01-01T00:00:00Z",
"",
];
let actual_lines: Vec<_> = output.lines().collect();

let mut expected_headers: Vec<_> = expected_lines[1..expected_lines.len() - 1].to_vec();
expected_headers.sort();

let mut actual_headers: Vec<_> = actual_lines[1..actual_lines.len() - 1].to_vec();
actual_headers.sort();

assert_eq!(headers.to_string(), expected);
// verify parts
assert_eq!(actual_lines[0], expected_lines[0]); // WARC version
assert_eq!(actual_headers, expected_headers); // headers (sorted)
assert_eq!(actual_lines.last(), expected_lines.last()); // empty line
}
}

Expand Down
74 changes: 72 additions & 2 deletions src/warc_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@
let file = fs::OpenOptions::new()
.read(true)
.write(true)
.create(true)

Check warning on line 58 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

file opened with `create`, but `truncate` behavior not defined
.open(&path)?;
let reader = BufReader::with_capacity(1 * MB, file);

Check warning on line 60 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect

Ok(WarcReader::new(reader))
}
Expand All @@ -71,7 +71,7 @@
pub fn from_path_gzip<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let file = fs::File::open(&path)?;

let gzip_stream = GzipReader::new(BufReader::with_capacity(1 * MB, file))?;

Check warning on line 74 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
Ok(WarcReader::new(BufReader::new(gzip_stream)))
}
}
Expand Down Expand Up @@ -119,8 +119,8 @@
let headers_ref = headers_parsed.1;
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 122 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -202,8 +202,8 @@
let headers_ref = headers_parsed.1;
let expected_body_len = headers_parsed.2;

let mut body_buffer: Vec<u8> = Vec::with_capacity(1 * MB);

Check warning on line 205 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let mut found_body = expected_body_len == 0;
let mut found_body = false;
let mut body_bytes_read = 0;
let maximum_read_range = expected_body_len + 4;
while !found_body {
Expand Down Expand Up @@ -271,7 +271,7 @@
}

fn skip_body(&mut self) -> Result<(), Error> {
let mut read_buffer = [0u8; 1 * MB];

Check warning on line 274 in src/warc_reader.rs

View workflow job for this annotation

GitHub Actions / Clippy linting

this operation has no effect
let maximum_read_range = self.current_item_size;
let mut body_bytes_left = maximum_read_range;
while body_bytes_left > 0 {
Expand Down Expand Up @@ -683,4 +683,74 @@
assert_eq!(record.body(), b"12345678");
}
}

/// Streams a single record that declares `Content-Length: 0` and checks that,
/// once buffered, it exposes the parsed header values and an empty body.
#[test]
fn empty_content_length() {
    // Header block, the blank line that ends it, then one more CRLF and end of input.
    // Each trailing `\` is a string continuation, so the indentation below is not
    // part of the byte string.
    let input = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:empty-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
    ";

    let mut warc = WarcReader::new(create_reader!(input));
    let mut records = warc.stream_records();

    // Pull the first streamed item, then convert it into its buffered form.
    let streamed = records.next_item().unwrap().unwrap();
    let buffered = streamed.into_buffered().unwrap();

    assert_eq!(buffered.warc_version(), "1.0");
    assert_eq!(buffered.content_length(), 0);
    assert_eq!(buffered.warc_id(), "<urn:test:empty-content-length>");
    assert_eq!(buffered.body(), b"");
}

/// Iterates over two back-to-back records, the first declaring `Content-Length: 0`
/// and the second `Content-Length: 7`, and checks the values reported for each.
#[test]
fn zero_and_nonzero_content_length() {
    // Record 1: headers, the blank line ending them, then two CRLF lines before the
    // next `WARC/1.0` line. Record 2: headers, blank line, the 7-byte body, then two
    // CRLF sequences. Each trailing `\` is a string continuation, so the indentation
    // below is not part of the byte string.
    let input = b"\
        WARC/1.0\r\n\
        Warc-Type: empty-record\r\n\
        Content-Length: 0\r\n\
        WARC-Record-Id: <urn:test:zero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:57Z\r\n\
        \r\n\
        \r\n\
        \r\n\
        WARC/1.0\r\n\
        Warc-Type: non-empty-record\r\n\
        Content-Length: 7\r\n\
        WARC-Record-Id: <urn:test:nonzero-content-length>\r\n\
        WARC-Date: 2020-07-08T02:52:58Z\r\n\
        \r\n\
        1234567\r\n\
        \r\n\
    ";

    let warc = WarcReader::new(create_reader!(input));
    let mut records = warc.iter_records();

    // First item: the record with a zero-length body.
    {
        let empty = records.next().unwrap().unwrap();
        assert_eq!(empty.warc_version(), "1.0");
        assert_eq!(empty.content_length(), 0);
        assert_eq!(empty.warc_id(), "<urn:test:zero-content-length>");
        assert_eq!(empty.body(), b"");
    }

    // Second item: the record that follows it, carrying a 7-byte body.
    {
        let filled = records.next().unwrap().unwrap();
        assert_eq!(filled.warc_version(), "1.0");
        assert_eq!(filled.content_length(), 7);
        assert_eq!(filled.warc_id(), "<urn:test:nonzero-content-length>");
        assert_eq!(filled.body(), b"1234567");
    }
}
}
Loading