Skip to content

Commit

Permalink
ensure dates are parsed appropriately
Browse files Browse the repository at this point in the history
  • Loading branch information
maxcountryman committed Jun 21, 2022
1 parent 4aeddce commit 0719d65
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ fn main() -> Result<(), Error> {
let props = Some(
WriterProperties::builder()
.set_compression(args.compression.into())
.set_created_by(String::from("warc-parquet"))
.build(),
);
let mut writer = ArrowWriter::try_new(parquet_file, batch.schema(), props)?;
Expand Down
25 changes: 14 additions & 11 deletions src/record_columns.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Arc;

use arrow::array::{ArrayRef, BinaryArray, StringArray, TimestampSecondArray, UInt32Array};
use arrow::array::{ArrayRef, BinaryArray, StringArray, TimestampMillisecondArray, UInt32Array};
use chrono::NaiveDateTime;
use warc::{BufferedBody, Record, WarcHeader};

Expand Down Expand Up @@ -71,16 +71,19 @@ impl RecordColumns {
.unwrap()]))
}

fn date(&self) -> Arc<TimestampSecondArray> {
Arc::new(TimestampSecondArray::from(vec![self
.record
.header(WarcHeader::Date)
.map(|s| {
NaiveDateTime::parse_from_str(&s, "%Y-%m-%dT%H:%M:%SZ")
.unwrap()
.timestamp()
})
.unwrap()]))
/// Builds the single-element `date` column from this record's
/// `WarcHeader::Date` header.
///
/// The header value is parsed with the fixed format `%Y-%m-%dT%H:%M:%SZ`
/// and converted with `timestamp_millis()` before being wrapped in a
/// `TimestampMillisecondArray`.
///
/// # Panics
///
/// Panics if the header lookup yields nothing, or if the value does not
/// match the expected format.
fn date(&self) -> Arc<TimestampMillisecondArray> {
    // A missing date header is treated as an invariant violation.
    let raw = self.record.header(WarcHeader::Date).unwrap();

    // Only values of the exact shape `YYYY-MM-DDTHH:MM:SSZ` are accepted.
    let millis = NaiveDateTime::parse_from_str(&raw, "%Y-%m-%dT%H:%M:%SZ")
        .unwrap()
        .timestamp_millis();

    // NOTE(review): the trailing `None` presumably means "no timezone
    // attached" — confirm against the arrow version in use.
    Arc::new(TimestampMillisecondArray::from_vec(vec![millis], None))
}

fn r#type(&self) -> Arc<StringArray> {
Expand Down
6 changes: 5 additions & 1 deletion src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ fn fields() -> Vec<Field> {
// Mandatory fields.
Field::new("id", DataType::Utf8, false),
Field::new("content_length", DataType::UInt32, false),
Field::new("date", DataType::Timestamp(TimeUnit::Second, None), false),
Field::new(
"date",
DataType::Timestamp(TimeUnit::Millisecond, None),
false,
),
Field::new("type", DataType::Utf8, false),
// Optional fields.
Field::new("content_type", DataType::Utf8, true),
Expand Down

0 comments on commit 0719d65

Please sign in to comment.