diff --git a/Cargo.toml b/Cargo.toml index bb9f0735..7f81d03d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,8 +7,8 @@ authors = [ ] description = "Parsing ISO8601 dates using nom" -keywords = [ "iso8601", "date-time", "parser", "nom" ] -categories = [ "parser-implementations", "date-and-time" ] +keywords = ["iso8601", "date-time", "parser", "nom"] +categories = ["parser-implementations", "date-and-time"] repository = "https://github.com/badboy/iso8601" documentation = "https://docs.rs/iso8601/" @@ -18,12 +18,12 @@ readme = "README.md" edition = "2021" [dependencies] -nom = { version = "7", default-features = false } +nom = { version = "8", default-features = false } chrono = { version = "0.4", default-features = false, optional = true } num-traits = { version = "0.2", optional = true } serde = { version = "1.0", optional = true } -[dev-dependencies ] +[dev-dependencies] serde_json = "1.0" [features] diff --git a/docs/parsing-iso8601-dates-using-nom.md b/docs/parsing-iso8601-dates-using-nom.md index 294859a7..bdeed5ef 100644 --- a/docs/parsing-iso8601-dates-using-nom.md +++ b/docs/parsing-iso8601-dates-using-nom.md @@ -44,7 +44,7 @@ It has several parts we need to parse: with the following meaning: | Characters | Meaning | -| ---------- | ------- | +| ---------- | ------------------------------------------------------------------ | | YYYY | The year, can be negative or null and can be extended if necessary | | MM | Month from 1 to 12 (0-prefixed) | | DD | Day from 1 to 31 (0-prefixed) | @@ -63,29 +63,28 @@ We will built a small parser for each of these parts and at the end combine them We will need to make a lib project. -~~~bash +```bash cargo new --lib date_parse -~~~ +``` Edit `Cargo.toml` and `src/lib.rs` so that our project depends on nom. -~~~toml +```toml [dependencies] nom = "^4.0" -~~~ +``` -~~~rust +```rust #[macro_use] extern crate nom; -~~~ - +``` ### Parsing the date: 2015-07-16 Let's start with the sign. 
As we need it several times, we create its own parser for that. Parsers are created by giving them a name, stating the return value (or defaulting to a byte slice) and the parser combinators to handle the input. -~~~rust +```rust named!(sign <&[u8], i32>, alt!( tag!("-") => { |_| -1 } | tag!("+") => { |_| 1 } @@ -109,7 +108,7 @@ mod tests { assert_eq!(sign(b" "), Err(Error(Code(&b" "[..], Alt)))); } } -~~~ +``` First, we parse either a plus or a minus sign. This combines two already existing parsers: `tag!`, which will match the given byte array (in our case a single character) and `alt!`, which will try a list of parsers, returning on the first successful one. @@ -117,7 +116,7 @@ We can directly map the result of the sub-parsers to either `-1` or `1`, so we d Next we parse the year, which consists of an optional sign and 4 digits (I know, I know, it is possible to extend this to more digits, but let's keep it simple for now). -~~~rust +```rust use std::ops::{AddAssign, MulAssign}; fn buf_to_int(s: &[u8]) -> T @@ -157,66 +156,65 @@ mod tests { } } -~~~ +``` A lot of additional stuff here. So let's separate it. -~~~rust +```rust named!(positive_year <&[u8], i32>, map!(take_while_m_n!(4, 4, nom::is_digit), buf_to_int)); -~~~ +``` This creates a new named parser, that again returns the remaining input and an 32-bit integer. To work, it first calls `take_4_digits` and then maps that result to the corresponding integer. `take_while_m_n` is another small helper parser. We will also use one for 2 digits: -~~~rust +```rust take_while_m_n!(4, 4, nom::is_digit) take_while_m_n!(2, 2, nom::is_digit) -~~~ +``` This takes 4 (or 2) characters from the input and checks that each character is a digit. -~~~rust +```rust named!(pub year <&[u8], i32>, do_parse!( -~~~ +``` The year is also returned as a 32-bit integer (there's a pattern!). Using the `do_parse!` macro, we can chain together multiple parsers and work with the sub-results. 
-~~~rust +```rust pref: opt!(sign) >> y: positive_year >> -~~~ +``` Our sign is directly followed by 4 digits. It's optional though, that's why we use `opt!`. `>>` is the concatenation operator in the `chain!` macro. We save the sub-results to variables (`pref` and `y`). - -~~~rust +```rust (pref.unwrap_or(1) * y) -~~~ +``` To get the final result, we multiply the prefix (which comes back as either `1` or `-1`) with the year. We can now successfully parse a year: -~~~rust +```rust assert_eq!(year(b"2018"), Ok((&[][..], 2018))); assert_eq!(year(b"-0333"), Ok((&[][..], -0333))); -~~~ +``` Our nom parser will return an `IResult`. -~~~rust +```rust type IResult = Result<(I, O), Err>; pub enum Err { Incomplete(Needed), Error(Context), Failure(Context), } -~~~ +``` If all went well, we get `Ok(I,O)` with `I` and `O` being the appropriate types. For our case `I` is the same as the input, a buffer slice (`&[u8]`), and `O` is the output of the parser itself, an integer (`i32`). @@ -224,7 +222,7 @@ The return value could also be an `Err(Failure)`, if something went completely w Parsing the month and day is a bit easier now: we simply take the digits and map them to an integer: -~~~rust +```rust named!(month <&[u8], u8>, map!(take_while_m_n!(2, 2, nom::is_digit), buf_to_int)); named!(day <&[u8], u8>, map!(take_while_m_n!(2, 2, nom::is_digit), buf_to_int)); @@ -243,12 +241,12 @@ mod tests { assert_eq!(day(b"18"), Ok((&[][..], 18))); } } -~~~ +``` All that's left is combining these 3 parts to parse a full date. Again we can chain the different parsers and map it to some useful value: -~~~rust +```rust #[derive(Eq, PartialEq, Debug)] pub struct Date { year: i32, @@ -297,7 +295,7 @@ mod tests { } } -~~~ +``` And running the tests shows it already works! @@ -305,15 +303,15 @@ And running the tests shows it already works! Next, we parse the time. 
The individual parts are really simple, just some digits: -~~~rust +```rust named!(pub hour <&[u8], u8>, map!(take_while_m_n!(2, 2, nom::is_digit), buf_to_int)); named!(pub minute <&[u8], u8>, map!(take_while_m_n!(2, 2, nom::is_digit), buf_to_int)); named!(pub second <&[u8], u8>, map!(take_while_m_n!(2, 2, nom::is_digit), buf_to_int)); -~~~ +``` Putting them together becomes a bit more complex, as the `second` part is optional: -~~~rust +```rust #[derive(Eq, PartialEq, Debug)] pub struct Time { hour: u8, @@ -367,7 +365,7 @@ mod tests { ); } } -~~~ +``` As you can see, even `do_parse!` parsers can be nested. The sub-parts then must be mapped once for the inner parser and once into the final value of the outer parser. @@ -378,11 +376,11 @@ But it leaves out one important bit: the timezone. ### Parsing the timezone: +0100 -~~~ +``` 2015-07-02T19:45:00-0500 2015-07-02T19:45:00Z 2015-07-02T19:45:00+01 -~~~ +``` Above are three variants of valid dates with timezones. The timezone in an ISO8601 string is either an appended `Z`, indicating UTC, @@ -390,16 +388,16 @@ or it's separated using a sign (`+` or `-`) and appends the offset from UTC in h Let's cover the UTC special case first: -~~~rust +```rust named!(timezone_utc <&[u8], i32>, map!(tag!("Z"), |_| 0)); -~~~ +``` This should look familiar by now. It's a simple `Z` character, which we map to `0`. The other case is the sign-separated hour and minute offset. -~~~rust +```rust named!(timezone_hour <&[u8], i32>, do_parse!( sign: sign >> hour: hour >> @@ -408,7 +406,7 @@ named!(timezone_hour <&[u8], i32>, do_parse!( ))) >> ((sign * (hour as i32 * 3600 + minute.unwrap_or(0) as i32 * 60))) )); -~~~ +``` We can re-use our already existing parsers and once again chain them to get what we want. The minutes are optional (and might be separated using a colon). @@ -419,13 +417,13 @@ We could also just map it to a tuple like
`(sign, hour, minute.unwrap_or(0)) Combined we get -~~~rust +```rust named!(timezone <&[u8], i32>, alt!(timezone_utc | timezone_hour)); -~~~ +``` Putting this back into time we get: -~~~rust +```rust named!(pub time <&[u8], Time>, do_parse!( hour: hour >> tag!(":") >> @@ -497,7 +495,7 @@ mod tests { } } -~~~ +``` ### Putting it all together @@ -505,7 +503,7 @@ We now got individual parsers for the date, the time and the timezone offset. Putting it all together, our final datetime parser looks quite small and easy to understand: -~~~rust +```rust #[derive(Eq, PartialEq, Debug)] pub struct DateTime { date: Date, @@ -551,21 +549,21 @@ mod tests { ); } } -~~~ +``` Nothing special anymore. We can now parse all kinds of date strings: -~~~rust +```rust datetime("2007-08-31T16:47+00:00"); datetime("2007-12-24T18:21Z"); datetime("2008-02-01T09:00:22+05"); -~~~ +``` But it will also parse invalid dates and times: -~~~rust +```rust datetime("2234-13-42T25:70Z"); -~~~ +``` But this is fine for now. We can handle the actual validation in a later step. For example, we could use [chrono][], a time library, [to handle this for us][chrono-convert]. diff --git a/src/assert.rs b/src/assert.rs index e6351408..7bf1dd4a 100644 --- a/src/assert.rs +++ b/src/assert.rs @@ -8,6 +8,7 @@ pub fn print_result(input: &str, rest: &[u8], result: &T) { } #[macro_export] +#[allow(missing_docs)] macro_rules! 
assert_parser { ($parser:ident, $line:expr, $expectation:expr) => {{ use std::string::ToString; diff --git a/src/parsers.rs b/src/parsers.rs index 4835d84c..acd439d7 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -14,11 +14,10 @@ use nom::{ branch::alt, bytes::complete::{tag, take_while, take_while_m_n}, character::complete::one_of, - character::is_digit, combinator::{map_res, not, opt}, error::Error, - sequence::{preceded, separated_pair, terminated, tuple}, - Err, IResult, Parser, + sequence::{preceded, separated_pair, terminated}, + AsChar, Err, IResult, Parser, }; use crate::{Date, DateTime, Duration, Time}; @@ -29,7 +28,7 @@ mod tests; // UTILITY fn take_digits(i: &[u8]) -> IResult<&[u8], u32> { - let (i, digits) = take_while(is_digit)(i)?; + let (i, digits) = take_while(AsChar::is_dec_digit).parse(i)?; if digits.is_empty() { return Err(Err::Error(Error::new(i, nom::error::ErrorKind::Eof))); @@ -44,7 +43,7 @@ fn take_digits(i: &[u8]) -> IResult<&[u8], u32> { } fn take_n_digits(i: &[u8], n: usize) -> IResult<&[u8], u32> { - let (i, digits) = take_while_m_n(n, n, is_digit)(i)?; + let (i, digits) = take_while_m_n(n, n, AsChar::is_dec_digit).parse(i)?; let s = str::from_utf8(digits).expect("Invalid data, expected UTF-8 string"); let res = s @@ -69,7 +68,7 @@ fn n_digit_in_range( } fn sign(i: &[u8]) -> IResult<&[u8], i32> { - alt((tag(b"-"), tag(b"+"))) + alt((tag("-"), tag("+"))) .map(|s: &[u8]| match s { b"-" => -1, _ => 1, @@ -82,12 +81,12 @@ fn sign(i: &[u8]) -> IResult<&[u8], i32> { // [+/-]YYYY fn date_year(i: &[u8]) -> IResult<&[u8], i32> { // The sign is optional, but defaults to `+` - tuple(( + ( opt(sign), // [+/-] |i| take_n_digits(i, 4), // year - )) - .map(|(s, year)| s.unwrap_or(1) * year as i32) - .parse(i) + ) + .map(|(s, year)| s.unwrap_or(1) * year as i32) + .parse(i) } // MM @@ -116,42 +115,42 @@ fn date_ord_day(i: &[u8]) -> IResult<&[u8], u32> { // YYYY-MM-DD fn date_ymd(i: &[u8]) -> IResult<&[u8], Date> { - tuple(( - date_year, // YYYY - 
opt(tag(b"-")), // - - date_month, // MM - opt(tag(b"-")), // - - date_day, //DD - )) - .map(|(year, _, month, _, day)| Date::YMD { year, month, day }) - .parse(i) + ( + date_year, // YYYY + opt(tag("-")), // - + date_month, // MM + opt(tag("-")), // - + date_day, //DD + ) + .map(|(year, _, month, _, day)| Date::YMD { year, month, day }) + .parse(i) } // YYYY-DDD fn date_ordinal(i: &[u8]) -> IResult<&[u8], Date> { - separated_pair(date_year, opt(tag(b"-")), date_ord_day) + separated_pair(date_year, opt(tag("-")), date_ord_day) .map(|(year, ddd)| Date::Ordinal { year, ddd }) .parse(i) } // YYYY-"W"WW-D fn date_iso_week(i: &[u8]) -> IResult<&[u8], Date> { - tuple(( - date_year, // y - tuple((opt(tag(b"-")), tag(b"W"))), // [-]W - date_week, // w - opt(tag(b"-")), // [-] - date_week_day, // d - )) - .map(|(year, _, ww, _, d)| Date::Week { year, ww, d }) - .parse(i) + ( + date_year, // y + (opt(tag("-")), tag("W")), // [-]W + date_week, // w + opt(tag("-")), // [-] + date_week_day, // d + ) + .map(|(year, _, ww, _, d)| Date::Week { year, ww, d }) + .parse(i) } /// Parses a date string. /// /// See [`date()`][`crate::date()`] for the supported formats. pub fn parse_date(i: &[u8]) -> IResult<&[u8], Date> { - alt((date_ymd, date_iso_week, date_ordinal))(i) + alt((date_ymd, date_iso_week, date_ordinal)).parse(i) } // TIME @@ -175,7 +174,7 @@ fn time_second(i: &[u8]) -> IResult<&[u8], u32> { // truncating towards zero if there are more than three digits. // e.g. "" -> 0, "1" -> 100, "12" -> 120, "123" -> 123, "1234" -> 123 fn fraction_millisecond(i: &[u8]) -> IResult<&[u8], u32> { - let (i, mut digits) = take_while(is_digit)(i)?; + let (i, mut digits) = take_while(AsChar::is_dec_digit).parse(i)?; let mut l = digits.len(); if l > 3 { digits = digits.get(0..3).unwrap(); @@ -197,37 +196,37 @@ fn fraction_millisecond(i: &[u8]) -> IResult<&[u8], u32> { /// See [`time()`][`crate::time()`] for the supported formats. 
// HH:MM:[SS][.(m*)][(Z|+...|-...)] pub fn parse_time(i: &[u8]) -> IResult<&[u8], Time> { - tuple(( + ( time_hour, // HH - opt(tag(b":")), // : + opt(tag(":")), // : time_minute, // MM - opt(preceded(opt(tag(b":")), time_second)), // [SS] + opt(preceded(opt(tag(":")), time_second)), // [SS] opt(preceded(one_of(",."), fraction_millisecond)), // [.(m*)] opt(alt((timezone_hour, timezone_utc))), // [(Z|+...|-...)] - )) - .map(|(h, _, m, s, ms, z)| { - let (tz_offset_hours, tz_offset_minutes) = z.unwrap_or((0, 0)); - - Time { - hour: h, - minute: m, - second: s.unwrap_or(0), - millisecond: ms.unwrap_or(0), - tz_offset_hours, - tz_offset_minutes, - } - }) - .parse(i) + ) + .map(|(h, _, m, s, ms, z)| { + let (tz_offset_hours, tz_offset_minutes) = z.unwrap_or((0, 0)); + + Time { + hour: h, + minute: m, + second: s.unwrap_or(0), + millisecond: ms.unwrap_or(0), + tz_offset_hours, + tz_offset_minutes, + } + }) + .parse(i) } fn timezone_hour(i: &[u8]) -> IResult<&[u8], (i32, i32)> { - tuple((sign, time_hour, opt(preceded(opt(tag(b":")), time_minute)))) + (sign, time_hour, opt(preceded(opt(tag(":")), time_minute))) .map(|(s, h, m)| (s * (h as i32), s * (m.unwrap_or(0) as i32))) .parse(i) } fn timezone_utc(input: &[u8]) -> IResult<&[u8], (i32, i32)> { - tag(b"Z").map(|_| (0, 0)).parse(input) + tag("Z").map(|_| (0, 0)).parse(input) } /// Parses a datetime string. @@ -235,7 +234,7 @@ fn timezone_utc(input: &[u8]) -> IResult<&[u8], (i32, i32)> { /// See [`datetime()`][`crate::datetime()`] for supported formats. 
// Full ISO8601 datetime pub fn parse_datetime(i: &[u8]) -> IResult<&[u8], DateTime> { - separated_pair(parse_date, tag(b"T"), parse_time) + separated_pair(parse_date, tag("T"), parse_time) .map(|(d, t)| DateTime { date: d, time: t }) .parse(i) } @@ -244,38 +243,38 @@ pub fn parse_datetime(i: &[u8]) -> IResult<&[u8], DateTime> { /// dur-year = 1*DIGIT "Y" [dur-month] fn duration_year(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"Y"))(i) + terminated(take_digits, tag("Y")).parse(i) } /// dur-month = 1*DIGIT "M" [dur-day] fn duration_month(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"M"))(i) + terminated(take_digits, tag("M")).parse(i) } /// dur-week = 1*DIGIT "W" fn duration_week(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"W"))(i) + terminated(take_digits, tag("W")).parse(i) } // dur-day = 1*DIGIT "D" fn duration_day(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"D"))(i) + terminated(take_digits, tag("D")).parse(i) } /// dur-hour = 1*DIGIT "H" [dur-minute] /// dur-time = "T" (dur-hour / dur-minute / dur-second) fn duration_hour(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"H"))(i) + terminated(take_digits, tag("H")).parse(i) } /// dur-minute = 1*DIGIT "M" [dur-second] fn duration_minute(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"M"))(i) + terminated(take_digits, tag("M")).parse(i) } /// dur-second = 1*DIGIT "S" fn duration_second(i: &[u8]) -> IResult<&[u8], u32> { - terminated(take_digits, tag(b"S"))(i) + terminated(take_digits, tag("S")).parse(i) } /// dur-second-ext = 1*DIGIT (,|.) 
1*DIGIT "S" @@ -286,40 +285,41 @@ fn duration_second_and_millisecond(i: &[u8]) -> IResult<&[u8], (u32, u32)> { terminated( // with milliseconds separated_pair(take_digits, one_of(",."), fraction_millisecond), - tag(b"S"), + tag("S"), ), - ))(i) + )) + .parse(i) } fn duration_time(i: &[u8]) -> IResult<&[u8], (u32, u32, u32, u32)> { - tuple(( + ( opt(duration_hour), opt(duration_minute), opt(duration_second_and_millisecond), - )) - .map(|(h, m, s)| { - let (s, ms) = s.unwrap_or((0, 0)); + ) + .map(|(h, m, s)| { + let (s, ms) = s.unwrap_or((0, 0)); - (h.unwrap_or(0), m.unwrap_or(0), s, ms) - }) - .parse(i) + (h.unwrap_or(0), m.unwrap_or(0), s, ms) + }) + .parse(i) } fn duration_ymdhms(i: &[u8]) -> IResult<&[u8], Duration> { map_res( preceded( - tag(b"P"), - tuple(( + tag("P"), + ( opt(duration_year), opt(duration_month), opt(duration_day), - opt(preceded(tag(b"T"), duration_time)), - )), + opt(preceded(tag("T"), duration_time)), + ), ), |(y, mo, d, time)| { // at least one element must be present for a valid duration representation if y.is_none() && mo.is_none() && d.is_none() && time.is_none() { - return Err(Err::Error((i, nom::error::ErrorKind::Eof))); + return Err((i, nom::error::ErrorKind::Eof)); } let (h, mi, s, ms) = time.unwrap_or((0, 0, 0, 0)); @@ -334,11 +334,12 @@ fn duration_ymdhms(i: &[u8]) -> IResult<&[u8], Duration> { millisecond: ms, }) }, - )(i) + ) + .parse(i) } fn duration_weeks(i: &[u8]) -> IResult<&[u8], Duration> { - preceded(tag(b"P"), duration_week) + preceded(tag("P"), duration_week) .map(Duration::Weeks) .parse(i) } @@ -350,16 +351,16 @@ fn duration_datetime_year(i: &[u8]) -> IResult<&[u8], u32> { fn duration_datetime(i: &[u8]) -> IResult<&[u8], Duration> { preceded( - tuple((tag(b"P"), not(sign))), - tuple(( + (tag("P"), not(sign)), + ( duration_datetime_year, - opt(tag(b"-")), + opt(tag("-")), date_month, - opt(tag(b"-")), + opt(tag("-")), date_day, - tag(b"T"), + tag("T"), parse_time, - )), + ), ) .map(|(year, _, month, _, day, _, t)| 
Duration::YMDHMS { year, @@ -377,5 +378,5 @@ fn duration_datetime(i: &[u8]) -> IResult<&[u8], Duration> { /// /// See [`duration()`][`crate::duration()`] for supported formats. pub fn parse_duration(i: &[u8]) -> IResult<&[u8], Duration> { - alt((duration_ymdhms, duration_weeks, duration_datetime))(i) + alt((duration_ymdhms, duration_weeks, duration_datetime)).parse(i) }