From ad97f080487d058f924d9dca8e83f17e97ec0534 Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Sun, 18 Jun 2023 21:41:29 +0200 Subject: [PATCH] [YouTube] Fix parsing short relative date formats (English only) (#1068) --- .../extractor/localization/TimeAgoParser.java | 3 +- .../newpipe/extractor/utils/TimeagoTest.java | 154 ++++++++++++++++++ timeago-parser/raw/unique_patterns.json | 39 +++-- .../extractor/timeago/patterns/en.java | 14 +- .../extractor/timeago/patterns/en_GB.java | 12 +- 5 files changed, 195 insertions(+), 27 deletions(-) create mode 100644 extractor/src/test/java/org/schabi/newpipe/extractor/utils/TimeagoTest.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/localization/TimeAgoParser.java b/extractor/src/main/java/org/schabi/newpipe/extractor/localization/TimeAgoParser.java index 7680bebd9f..1981b2002f 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/localization/TimeAgoParser.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/localization/TimeAgoParser.java @@ -93,7 +93,8 @@ private boolean textualDateMatches(final String textualDate, final String agoPhr final String escapedSeparator = patternsHolder.wordSeparator().equals(" ") // From JDK8 → \h - Treat horizontal spaces as a normal one // (non-breaking space, thin space, etc.) - ? "[ \\t\\xA0\\u1680\\u180e\\u2000-\\u200a\\u202f\\u205f\\u3000]" + // Also split the string on numbers to be able to parse strings like "2wk" + ? "[ \\t\\xA0\\u1680\\u180e\\u2000-\\u200a\\u202f\\u205f\\u3000\\d]" : Pattern.quote(patternsHolder.wordSeparator()); // (^|separator)pattern($|separator) diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/utils/TimeagoTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/utils/TimeagoTest.java new file mode 100644 index 0000000000..0b4eecb50b --- /dev/null +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/utils/TimeagoTest.java @@ -0,0 +1,154 @@ +package org.schabi.newpipe.extractor.utils; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.localization.Localization; +import org.schabi.newpipe.extractor.localization.TimeAgoParser; +import org.schabi.newpipe.extractor.localization.TimeAgoPatternsManager; + +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TimeagoTest { + private static TimeAgoParser parser; + private static OffsetDateTime now; + + @BeforeAll + public static void setUp() { + parser = TimeAgoPatternsManager.getTimeAgoParserFor(Localization.DEFAULT); + now = OffsetDateTime.now(ZoneOffset.UTC); + } + + @Test + void parseTimeago() throws ParsingException { + assertTimeWithin1s( + now.minus(1, ChronoUnit.SECONDS), + parser.parse("1 second ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(12, ChronoUnit.SECONDS), + parser.parse("12 second ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(1, ChronoUnit.MINUTES), + parser.parse("1 minute ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(23, ChronoUnit.MINUTES), + parser.parse("23 minutes ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(1, ChronoUnit.HOURS), + parser.parse("1 hour ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(8, ChronoUnit.HOURS), + parser.parse("8 hours ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.DAYS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 day ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.DAYS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 days ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.WEEKS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 week ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.WEEKS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 weeks ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.MONTHS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 month ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.MONTHS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 months ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.YEARS).minusDays(1).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 year ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.YEARS).minusDays(1).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 years ago").offsetDateTime() + ); + } + + @Test + void parseTimeagoShort() throws ParsingException { + final TimeAgoParser parser = TimeAgoPatternsManager.getTimeAgoParserFor(Localization.DEFAULT); + final OffsetDateTime now = OffsetDateTime.now(ZoneOffset.UTC); + + assertTimeWithin1s( + now.minus(1, ChronoUnit.SECONDS), + parser.parse("1 sec ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(12, ChronoUnit.SECONDS), + parser.parse("12 sec ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(1, ChronoUnit.MINUTES), + parser.parse("1 min ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(23, ChronoUnit.MINUTES), + parser.parse("23 min ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(1, ChronoUnit.HOURS), + parser.parse("1 hr ago").offsetDateTime() + ); + assertTimeWithin1s( + now.minus(8, ChronoUnit.HOURS), + parser.parse("8 hr ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.DAYS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 day ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.DAYS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 days ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.WEEKS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 wk ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.WEEKS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 wk ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.MONTHS).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 mo ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.MONTHS).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 mo ago").offsetDateTime() + ); + assertEquals( + now.minus(1, ChronoUnit.YEARS).minusDays(1).truncatedTo(ChronoUnit.HOURS), + parser.parse("1 yr ago").offsetDateTime() + ); + assertEquals( + now.minus(3, ChronoUnit.YEARS).minusDays(1).truncatedTo(ChronoUnit.HOURS), + parser.parse("3 yr ago").offsetDateTime() + ); + } + + void assertTimeWithin1s(final OffsetDateTime expected, final OffsetDateTime actual) { + final long delta = Math.abs(expected.toEpochSecond() - actual.toEpochSecond()); + assertTrue(delta <= 1, String.format("Expected: %s\nActual: %s", expected, actual)); + } +} diff --git a/timeago-parser/raw/unique_patterns.json b/timeago-parser/raw/unique_patterns.json index 1d92538286..a78218aa95 100644 --- a/timeago-parser/raw/unique_patterns.json +++ b/timeago-parser/raw/unique_patterns.json @@ -415,46 +415,56 @@ "word_separator": " ", "seconds": [ "second", - "seconds" + "seconds", + "sec" ], "minutes": [ "minute", - "minutes" + "minutes", + "min" ], "hours": [ "hour", - "hours" + "hours", + "h" ], "days": [ "day", - "days" + "days", + "d" ], "weeks": [ "week", - "weeks" + "weeks", + "w" ], "months": [ "month", - "months" + "months", + "mo" ], "years": [ "year", - "years" + "years", + "y" ] }, "en-GB": { "word_separator": " ", "seconds": [ "second", - "seconds" + "seconds", + "sec" ], "minutes": [ "minute", - "minutes" + "minutes", + "min" ], "hours": [ "hour", - "hours" + "hours", + "hr" ], "days": [ "day", @@ -462,15 +472,18 @@ ], "weeks": [ "week", - "weeks" + "weeks", + "wk" ], "months": [ "month", - "months" + "months", + "mo" ], "years": [ "year", - "years" + "years", + "yr" ] }, "es": { diff --git a/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en.java b/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en.java index 817825df57..b2a76f3520 100644 --- a/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en.java +++ b/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en.java @@ -9,13 +9,13 @@ public class en extends PatternsHolder { private static final String WORD_SEPARATOR = " "; private static final String[] - SECONDS /**/ = {"second", "seconds"}, - MINUTES /**/ = {"minute", "minutes"}, - HOURS /**/ = {"hour", "hours"}, - DAYS /**/ = {"day", "days"}, - WEEKS /**/ = {"week", "weeks"}, - MONTHS /**/ = {"month", "months"}, - YEARS /**/ = {"year", "years"}; + SECONDS /**/ = {"second", "seconds", "sec"}, + MINUTES /**/ = {"minute", "minutes", "min"}, + HOURS /**/ = {"hour", "hours", "h"}, + DAYS /**/ = {"day", "days", "d"}, + WEEKS /**/ = {"week", "weeks", "w"}, + MONTHS /**/ = {"month", "months", "mo"}, + YEARS /**/ = {"year", "years", "y"}; private static final en INSTANCE = new en(); diff --git a/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en_GB.java b/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en_GB.java index d8ce88782a..680d6a95ea 100644 --- a/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en_GB.java +++ b/timeago-parser/src/main/java/org/schabi/newpipe/extractor/timeago/patterns/en_GB.java @@ -9,13 +9,13 @@ public class en_GB extends PatternsHolder { private static final String WORD_SEPARATOR = " "; private static final String[] - SECONDS /**/ = {"second", "seconds"}, - MINUTES /**/ = {"minute", "minutes"}, - HOURS /**/ = {"hour", "hours"}, + SECONDS /**/ = {"second", "seconds", "sec"}, + MINUTES /**/ = {"minute", "minutes", "min"}, + HOURS /**/ = {"hour", "hours", "hr"}, DAYS /**/ = {"day", "days"}, - WEEKS /**/ = {"week", "weeks"}, - MONTHS /**/ = {"month", "months"}, - YEARS /**/ = {"year", "years"}; + WEEKS /**/ = {"week", "weeks", "wk"}, + MONTHS /**/ = {"month", "months", "mo"}, + YEARS /**/ = {"year", "years", "yr"}; private static final en_GB INSTANCE = new en_GB();