From 459bee930a76beba91954da45a213473cf00bf45 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Wed, 28 Aug 2024 03:36:22 -0400 Subject: [PATCH 01/10] feat(YouTube - Keyword filter): Remove keyword minimum length. Match whole keywords and not substrings. --- .../components/KeywordContentFilter.java | 133 +++++++++++++++--- 1 file changed, 112 insertions(+), 21 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index 4e0e6f58b9..dcb724493b 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -46,11 +46,6 @@ @RequiresApi(api = Build.VERSION_CODES.N) final class KeywordContentFilter extends Filter { - /** - * Minimum keyword/phrase length to prevent excessively broad content filtering. - */ - private static final int MINIMUM_KEYWORD_LENGTH = 3; - /** * Strings found in the buffer for every videos. * Full strings should be specified, as they are compared using {@link String#contains(CharSequence)}. @@ -223,14 +218,112 @@ private static String capitalizeAllFirstLetters(String sentence) { * @return If the phrase will will hide all videos. Not an exhaustive check. 
*/ private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases) { - for (String commonString : STRINGS_IN_EVERY_BUFFER) { - if (Utils.containsAny(commonString, phrases)) { - return true; + for (String phrase : phrases) { + for (String commonString : STRINGS_IN_EVERY_BUFFER) { + byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8); + int matchIndex = 0; + while (true) { + matchIndex = commonString.indexOf(phrase, matchIndex); + if (matchIndex < 0) break; + + if (keywordMatchIsWholeWord(commonStringBytes, matchIndex, phrase.length())) { + return true; + } + + matchIndex++; + } } } + return false; } + /** + * @return If the start and end indexes are not surrounded by other letters. + * If the indexes are surrounded by numbers/symbols/punctuation it is considered a whole word. + */ + private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartIndex, int keywordLength) { + final Integer codePointBefore = getUtf8CodePointBefore(text, keywordStartIndex); + if (codePointBefore != null && Character.isLetter(codePointBefore)) { + return false; + } + + final Integer codePointAfter = getUtf8CodePointAt(text, keywordStartIndex + keywordLength); + //noinspection RedundantIfStatement + if (codePointAfter != null && Character.isLetter(codePointAfter)) { + return false; + } + + return true; + } + + /** + * @return The UTF8 character point immediately before the index, + * or null if the bytes before the index is not a valid UTF8 character. 
+ */ + @Nullable + private static Integer getUtf8CodePointBefore(byte[] data, int index) { + if (index == 0) return null; + + final int UTF8_MAX_BYTE_COUNT = 4; + int startIndex = index - 1; + int characterByteCount = 1; + do { + final int characterByteLength = getUTF8CharacterLengthFromStartByte(data[startIndex]); + if (characterByteLength > 0) { + return decodeUtf8ToCodePoint(data, startIndex, characterByteLength); + } + } while (--startIndex >= 0 && ++characterByteCount < UTF8_MAX_BYTE_COUNT); + + return null; + } + + /** + * @return The UTF8 character point at the index, + * or null if the index holds no valid UTF8 character. + */ + @Nullable + private static Integer getUtf8CodePointAt(byte[] data, int startIndex) { + if (startIndex >= data.length) { + return null; + } + + final int characterByteLength = getUTF8CharacterLengthFromStartByte(data[startIndex]); + if (characterByteLength <= 0 || startIndex + characterByteLength > data.length) { + return null; + } + + return decodeUtf8ToCodePoint(data, startIndex, characterByteLength); + } + + private static int getUTF8CharacterLengthFromStartByte(byte startByte) { + if ((startByte & 0x80) == 0) return 1; // 0xxxxxxx (ASCII) + if ((startByte & 0xE0) == 0xC0) return 2; // 110xxxxx + if ((startByte & 0xF0) == 0xE0) return 3; // 1110xxxx + if ((startByte & 0xF8) == 0xF0) return 4; // 11110xxx + return -1; // Not a UTF8 character. 
+ } + + public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int length) { + switch (length) { + case 1: + return data[startIndex]; + case 2: + return ((data[startIndex] & 0x1F) << 6) | + (data[startIndex + 1] & 0x3F); + case 3: + return ((data[startIndex] & 0x0F) << 12) | + ((data[startIndex + 1] & 0x3F) << 6) | + (data[startIndex + 2] & 0x3F); + case 4: + return ((data[startIndex] & 0x07) << 18) | + ((data[startIndex + 1] & 0x3F) << 12) | + ((data[startIndex + 2] & 0x3F) << 6) | + (data[startIndex + 3] & 0x3F); + } + throw new IllegalArgumentException("length is: " + length); + } + private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded. String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get(); //noinspection StringEquality @@ -250,12 +343,6 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho phrase = phrase.stripTrailing(); if (phrase.isBlank()) continue; - if (phrase.length() < MINIMUM_KEYWORD_LENGTH) { - // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake. - Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH)); - continue; - } - // Add common casing that might appear. // // This could be simplified by adding case insensitive search to the prefix search, @@ -282,14 +369,18 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho } for (String keyword : keywords) { - // Use a callback to get the keyword that matched. - // TrieSearch could have this built in, but that's slightly more complicated since - // the strings are stored as a byte array and embedded in the search tree. + // Verify the keyword is a whole word and not a substring, + // so a keyword like "ai" is matched but "fair" is not. 
TrieSearch.TriePatternMatchedCallback callback = - (textSearched, matchedStartIndex, matchedLength, callbackParameter) -> { - // noinspection unchecked - ((MutableReference) callbackParameter).value = keyword; - return true; + (textSearched, startIndex, matchLength, callbackParameter) -> { + if (keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { + Logger.printDebug(() -> "Matched keyword: '" + keyword + "'"); + // noinspection unchecked + ((MutableReference) callbackParameter).value = keyword; + return true; + } + + return false; }; byte[] stringBytes = keyword.getBytes(StandardCharsets.UTF_8); search.addPattern(stringBytes, callback); From 40bc0ef9dbd20b253fc5b3c484039054b959e709 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Wed, 28 Aug 2024 05:38:12 -0400 Subject: [PATCH 02/10] fix: Can now strip away leading spaces --- .../youtube/patches/components/KeywordContentFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index dcb724493b..a6a1a0852a 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -339,8 +339,8 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho Set keywords = new LinkedHashSet<>(10 * split.length); for (String phrase : split) { - // Remove any trailing white space the user may have accidentally included. - phrase = phrase.stripTrailing(); + // Remove any white space padding the user may have accidentally included. + phrase = phrase.stripLeading().stripTrailing(); if (phrase.isBlank()) continue; // Add common casing that might appear. 
From ab628e61eb2b50c630fc133de05e40924b650735 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Wed, 28 Aug 2024 07:57:17 -0400 Subject: [PATCH 03/10] fix: Validate the entire byte sequence is valid UTF8 and not just the first byte --- .../components/KeywordContentFilter.java | 65 +++++++++++-------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index a6a1a0852a..d8deb1a3e3 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -145,6 +145,8 @@ final class KeywordContentFilter extends Filter { private static final long ALL_VIDEOS_FILTERED_BACKOFF_MILLISECONDS = 60 * 1000; // 60 seconds + private static final int UTF8_MAX_BYTE_COUNT = 4; + /** * Rolling average of how many videos were filtered by a keyword. 
* Used to detect if a keyword passes the initial check against {@link #STRINGS_IN_EVERY_BUFFER} @@ -263,17 +265,12 @@ private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartInde */ @Nullable private static Integer getUtf8CodePointBefore(byte[] data, int index) { - if (index == 0) return null; - - final int UTF8_MAX_BYTE_COUNT = 4; - int startIndex = index - 1; - int characterByteCount = 1; - do { - final int characterByteLength = getUTF8CharacterLengthFromStartByte(data[startIndex]); - if (characterByteLength > 0) { - return decodeUtf8ToCodePoint(data, startIndex, characterByteLength); + int characterByteCount = 0; + while (--index >= 0 && ++characterByteCount <= UTF8_MAX_BYTE_COUNT) { + if (isValidUTF8(data, index, characterByteCount)) { + return decodeUTF8ToCodePoint(data, index, characterByteCount); } - } while (--startIndex >= 0 && ++characterByteCount < UTF8_MAX_BYTE_COUNT); + } return null; } @@ -283,29 +280,41 @@ private static Integer getUtf8CodePointBefore(byte[] data, int index) { * or null if the index holds no valid UTF8 character. 
*/ @Nullable - private static Integer getUtf8CodePointAt(byte[] data, int startIndex) { - if (startIndex >= data.length) { - return null; - } - - final int characterByteLength = getUTF8CharacterLengthFromStartByte(data[startIndex]); - if (characterByteLength <= 0 || startIndex + characterByteLength > data.length) { - return null; + private static Integer getUtf8CodePointAt(byte[] data, int index) { + int characterByteCount = 0; + final int dataLength = data.length; + while (index + characterByteCount < dataLength && ++characterByteCount <= UTF8_MAX_BYTE_COUNT) { + if (isValidUTF8(data, index, characterByteCount)) { + return decodeUTF8ToCodePoint(data, index, characterByteCount); + } } - return decodeUtf8ToCodePoint(data, startIndex, characterByteLength); + return null; } - private static int getUTF8CharacterLengthFromStartByte(byte startByte) { - if ((startByte & 0x80) == 0) return 1; // 0xxxxxxx (ASCII) - if ((startByte & 0xE0) == 0xC0) return 2; // 110xxxxx - if ((startByte & 0xF0) == 0xE0) return 3; // 1110xxxx - if ((startByte & 0xF8) == 0xF0) return 4; // 11110xxx - return -1; // Not a UTF8 character. 
+ public static boolean isValidUTF8(byte[] data, int startIndex, int numberOfBytes) { + switch (numberOfBytes) { + case 1: // 0xxxxxxx (ASCII) + return (data[startIndex] & 0x80) == 0; + case 2: // 110xxxxx, 10xxxxxx + return (data[startIndex] & 0xE0) == 0xC0 + && (data[startIndex + 1] & 0xC0) == 0x80; + case 3: // 1110xxxx, 10xxxxxx, 10xxxxxx + return (data[startIndex] & 0xF0) == 0xE0 + && (data[startIndex + 1] & 0xC0) == 0x80 + && (data[startIndex + 2] & 0xC0) == 0x80; + case 4: // 11110xxx, 10xxxxxx, 10xxxxxx, 10xxxxxx + return (data[startIndex] & 0xF8) == 0xF0 + && (data[startIndex + 1] & 0xC0) == 0x80 + && (data[startIndex + 2] & 0xC0) == 0x80 + && (data[startIndex + 3] & 0xC0) == 0x80; + } + + throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes); } - public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int length) { - switch (length) { + public static int decodeUTF8ToCodePoint(byte[] data, int startIndex, int numberOfBytes) { + switch (numberOfBytes) { case 1: return data[startIndex]; case 2: @@ -321,7 +330,7 @@ public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int length) ((data[startIndex + 2] & 0x3F) << 6) | (data[startIndex + 3] & 0x3F); } - throw new IllegalArgumentException("length is: " + length); + throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes); } private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded. 
From 86f2f24d5a95cdd529e55236c72ffee1b5809866 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Wed, 28 Aug 2024 18:20:28 -0400 Subject: [PATCH 04/10] fix: Ignore metadata component --- .../components/KeywordContentFilter.java | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index d8deb1a3e3..cf06a97f6a 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -41,6 +41,11 @@ * (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST"). * - Keywords present in the layout or video data cannot be used as filters, otherwise all videos * will always be hidden. This patch checks for some words of these words. + * - Keywords are matched against whole words, and not as substrings. + * So 'ai' will hide 'Model #ai123 release!' and 'Is AI self aware?', + * but not hide 'DMCA guide to fair use'. This behavior is desired and intentional + * to prevent false hiding, but it also may require adding plural versions of some words. + * (use keywords of 'fox' and 'foxes'). */ @SuppressWarnings("unused") @RequiresApi(api = Build.VERSION_CODES.N) @@ -102,7 +107,8 @@ final class KeywordContentFilter extends Filter { "search_video_with_context.eml", "video_with_context.eml", // Subscription tab videos. "related_video_with_context.eml", - "video_lockup_with_attachment.eml", // A/B test for subscribed video. + // A/B test for subscribed video, and sometimes when tablet layout is enabled. 
+ "video_lockup_with_attachment.eml", "compact_video.eml", "inline_shorts", "shorts_video_cell", @@ -131,7 +137,8 @@ final class KeywordContentFilter extends Filter { "metadata.eml", "thumbnail.eml", "avatar.eml", - "overflow_button.eml" + "overflow_button.eml", + "metadata.eml" ); /** @@ -213,6 +220,7 @@ private static String capitalizeAllFirstLetters(String sentence) { capitalizeNext = false; } } + return new String(codePoints, 0, codePoints.length); } @@ -267,8 +275,8 @@ private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartInde private static Integer getUtf8CodePointBefore(byte[] data, int index) { int characterByteCount = 0; while (--index >= 0 && ++characterByteCount <= UTF8_MAX_BYTE_COUNT) { - if (isValidUTF8(data, index, characterByteCount)) { - return decodeUTF8ToCodePoint(data, index, characterByteCount); + if (isValidUtf8(data, index, characterByteCount)) { + return decodeUtf8ToCodePoint(data, index, characterByteCount); } } @@ -284,15 +292,15 @@ private static Integer getUtf8CodePointAt(byte[] data, int index) { int characterByteCount = 0; final int dataLength = data.length; while (index + characterByteCount < dataLength && ++characterByteCount <= UTF8_MAX_BYTE_COUNT) { - if (isValidUTF8(data, index, characterByteCount)) { - return decodeUTF8ToCodePoint(data, index, characterByteCount); + if (isValidUtf8(data, index, characterByteCount)) { + return decodeUtf8ToCodePoint(data, index, characterByteCount); } } return null; } - public static boolean isValidUTF8(byte[] data, int startIndex, int numberOfBytes) { + public static boolean isValidUtf8(byte[] data, int startIndex, int numberOfBytes) { switch (numberOfBytes) { case 1: // 0xxxxxxx (ASCII) return (data[startIndex] & 0x80) == 0; @@ -313,7 +321,7 @@ public static boolean isValidUTF8(byte[] data, int startIndex, int numberOfBytes throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes); } - public static int decodeUTF8ToCodePoint(byte[] data, int startIndex, int 
numberOfBytes) { + public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberOfBytes) { switch (numberOfBytes) { case 1: return data[startIndex]; @@ -378,10 +386,10 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho } for (String keyword : keywords) { - // Verify the keyword is a whole word and not a substring, - // so a keyword like "ai" is matched but "fair" is not. TrieSearch.TriePatternMatchedCallback callback = (textSearched, startIndex, matchLength, callbackParameter) -> { + // Verify the keyword is a whole word and not a substring, + // so a keyword like "ai" is matched but "fair" is not. if (keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { Logger.printDebug(() -> "Matched keyword: '" + keyword + "'"); // noinspection unchecked From 18adeb2d70318b5301de45de653bdea6576545a4 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Wed, 28 Aug 2024 22:11:13 -0400 Subject: [PATCH 05/10] fix: Add a toggle to enable/disable whole word matching --- .../components/KeywordContentFilter.java | 79 ++++++++++++------- .../youtube/settings/Settings.java | 2 + 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index cf06a97f6a..41e7d38cc3 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -52,8 +52,7 @@ final class KeywordContentFilter extends Filter { /** - * Strings found in the buffer for every videos. - * Full strings should be specified, as they are compared using {@link String#contains(CharSequence)}. + * Strings found in the buffer for every videos. 
Full strings should be specified. * * This list does not include every common buffer string, and this can be added/changed as needed. * Words must be entered with the exact casing as found in the buffer. @@ -88,7 +87,7 @@ final class KeywordContentFilter extends Filter { "search_vwc_description_transition_key", "g-high-recZ", // Text and litho components found in the buffer that belong to path filters. - "metadata.eml", + "expandable_metadata.eml", "thumbnail.eml", "avatar.eml", "overflow_button.eml", @@ -137,10 +136,15 @@ final class KeywordContentFilter extends Filter { "metadata.eml", "thumbnail.eml", "avatar.eml", - "overflow_button.eml", - "metadata.eml" + "overflow_button.eml" ); + /** + * Minimum keyword/phrase length to prevent excessively broad content filtering. + * Only applies when {@link Settings#HIDE_KEYWORD_CONTENT_SEARCH} is not enabled. + */ + private static final int MINIMUM_KEYWORD_LENGTH = 3; + /** * Threshold for {@link #filteredVideosPercentage} * that indicates all or nearly all videos have been filtered. @@ -182,6 +186,8 @@ final class KeywordContentFilter extends Filter { */ private volatile String lastKeywordPhrasesParsed; + private volatile boolean matchWholeWords; + private volatile ByteTrieSearch bufferSearch; /** @@ -227,20 +233,24 @@ private static String capitalizeAllFirstLetters(String sentence) { /** * @return If the phrase will will hide all videos. Not an exhaustive check. 
*/ - private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases) { + private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases, boolean matchWholeWords) { for (String phrase : phrases) { for (String commonString : STRINGS_IN_EVERY_BUFFER) { - byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8); - int matchIndex = 0; - while (true) { - matchIndex = commonString.indexOf(phrase, matchIndex); - if (matchIndex < 0) break; - - if (keywordMatchIsWholeWord(commonStringBytes, matchIndex, phrase.length())) { - return true; + if (matchWholeWords) { + byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8); + int matchIndex = 0; + while (true) { + matchIndex = commonString.indexOf(phrase, matchIndex); + if (matchIndex < 0) break; + + if (keywordMatchIsWholeWord(commonStringBytes, matchIndex, phrase.length())) { + return true; + } + + matchIndex++; } - - matchIndex++; + } else if (Utils.containsAny(commonString, phrases)) { + return true; } } } @@ -342,9 +352,11 @@ public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberO } private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded. + final boolean matchWholeWordsEnabled = Settings.HIDE_KEYWORD_CONTENT_WHOLE_WORDS.get(); String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get(); + //noinspection StringEquality - if (rawKeywords == lastKeywordPhrasesParsed) { + if (rawKeywords == lastKeywordPhrasesParsed && matchWholeWordsEnabled == matchWholeWords) { Logger.printDebug(() -> "Using previously initialized search"); return; // Another thread won the race, and search is already initialized. } @@ -356,10 +368,19 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho Set keywords = new LinkedHashSet<>(10 * split.length); for (String phrase : split) { - // Remove any white space padding the user may have accidentally included. 
- phrase = phrase.stripLeading().stripTrailing(); + // Remove any trailing spaces the user may have accidentally included. + phrase = phrase.stripTrailing(); if (phrase.isBlank()) continue; + if (matchWholeWordsEnabled) { + // Can strip off any leading spaces since it's whole word matching. + phrase = phrase.stripTrailing(); + } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH) { + // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake. + Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH)); + continue; + } + // Add common casing that might appear. // // This could be simplified by adding case insensitive search to the prefix search, @@ -377,7 +398,7 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho capitalizeAllFirstLetters(phrase), phrase.toUpperCase() }; - if (phrasesWillHideAllVideos(phraseVariations)) { + if (phrasesWillHideAllVideos(phraseVariations, matchWholeWordsEnabled)) { Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_common", phrase)); continue; } @@ -390,14 +411,14 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho (textSearched, startIndex, matchLength, callbackParameter) -> { // Verify the keyword is a whole word and not a substring, // so a keyword like "ai" is matched but "fair" is not. 
- if (keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { - Logger.printDebug(() -> "Matched keyword: '" + keyword + "'"); - // noinspection unchecked - ((MutableReference) callbackParameter).value = keyword; - return true; + if (matchWholeWordsEnabled && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { + return false; } - return false; + Logger.printDebug(() -> "Matched keyword: '" + keyword + "'"); + // noinspection unchecked + ((MutableReference) callbackParameter).value = keyword; + return true; }; byte[] stringBytes = keyword.getBytes(StandardCharsets.UTF_8); search.addPattern(stringBytes, callback); @@ -409,6 +430,7 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho bufferSearch = search; timeToResumeFiltering = 0; filteredVideosPercentage = 0; + matchWholeWords = matchWholeWordsEnabled; lastKeywordPhrasesParsed = rawKeywords; // Must set last. } @@ -489,8 +511,9 @@ boolean isFiltered(@Nullable String identifier, String path, byte[] protobufBuff // Field is intentionally compared using reference equality. //noinspection StringEquality - if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) { - // User changed the keywords. + if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed + || Settings.HIDE_KEYWORD_CONTENT_WHOLE_WORDS.get() != matchWholeWords) { + // User changed the keywords or whole word setting. 
parseKeywords(); } diff --git a/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java b/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java index 8708d579c5..ef89dea60e 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java +++ b/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java @@ -74,6 +74,8 @@ public class Settings extends BaseSettings { public static final BooleanSetting HIDE_KEYWORD_CONTENT_HOME = new BooleanSetting("revanced_hide_keyword_content_home", FALSE); public static final BooleanSetting HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS = new BooleanSetting("revanced_hide_keyword_content_subscriptions", FALSE); public static final BooleanSetting HIDE_KEYWORD_CONTENT_SEARCH = new BooleanSetting("revanced_hide_keyword_content_search", FALSE); + public static final BooleanSetting HIDE_KEYWORD_CONTENT_WHOLE_WORDS = new BooleanSetting("revanced_hide_keyword_content_whole_word", FALSE, + parentsAny(HIDE_KEYWORD_CONTENT_HOME, HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS, HIDE_KEYWORD_CONTENT_SEARCH)); public static final StringSetting HIDE_KEYWORD_CONTENT_PHRASES = new StringSetting("revanced_hide_keyword_content_phrases", "", parentsAny(HIDE_KEYWORD_CONTENT_HOME, HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS, HIDE_KEYWORD_CONTENT_SEARCH)); From 90c9fb1cc35cb4109ef86c7862304a91006d05f8 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Thu, 29 Aug 2024 17:42:29 -0400 Subject: [PATCH 06/10] fix: Use whole word syntax --- .../components/KeywordContentFilter.java | 82 +++++++++++++------ .../youtube/settings/Settings.java | 2 - .../settings/preference/HtmlPreference.java | 35 ++++++++ 3 files changed, 91 insertions(+), 28 deletions(-) create mode 100644 app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java diff --git 
a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index 41e7d38cc3..f2da4af9de 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -10,9 +10,8 @@ import androidx.annotation.RequiresApi; import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.LinkedHashSet; -import java.util.Set; +import java.util.LinkedHashMap; +import java.util.Map; import java.util.concurrent.atomic.AtomicReference; import app.revanced.integrations.shared.Logger; @@ -141,7 +140,7 @@ final class KeywordContentFilter extends Filter { /** * Minimum keyword/phrase length to prevent excessively broad content filtering. - * Only applies when {@link Settings#HIDE_KEYWORD_CONTENT_SEARCH} is not enabled. + * Only applies when not using whole word syntax. */ private static final int MINIMUM_KEYWORD_LENGTH = 3; @@ -186,8 +185,6 @@ final class KeywordContentFilter extends Filter { */ private volatile String lastKeywordPhrasesParsed; - private volatile boolean matchWholeWords; - private volatile ByteTrieSearch bufferSearch; /** @@ -351,12 +348,19 @@ public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberO throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes); } + private static boolean phraseUsesWholeWordSyntax(String phrase) { + return phrase.startsWith("\"") && phrase.endsWith("\""); + } + + private static String stripWholeWordSyntax(String phrase) { + return phrase.substring(1, phrase.length() - 1); + } + private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded. 
- final boolean matchWholeWordsEnabled = Settings.HIDE_KEYWORD_CONTENT_WHOLE_WORDS.get(); String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get(); //noinspection StringEquality - if (rawKeywords == lastKeywordPhrasesParsed && matchWholeWordsEnabled == matchWholeWords) { + if (rawKeywords == lastKeywordPhrasesParsed) { Logger.printDebug(() -> "Using previously initialized search"); return; // Another thread won the race, and search is already initialized. } @@ -365,23 +369,30 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho String[] split = rawKeywords.split("\n"); if (split.length != 0) { // Linked Set so log statement are more organized and easier to read. - Set keywords = new LinkedHashSet<>(10 * split.length); + // Map is: Phrase -> isWholeWord + Map keywords = new LinkedHashMap<>(10 * split.length); for (String phrase : split) { // Remove any trailing spaces the user may have accidentally included. phrase = phrase.stripTrailing(); if (phrase.isBlank()) continue; - if (matchWholeWordsEnabled) { - // Can strip off any leading spaces since it's whole word matching. - phrase = phrase.stripTrailing(); + final boolean wholeWordMatching; + if (phraseUsesWholeWordSyntax(phrase)) { + if (phrase.length() == 2) { + continue; // Empty "" phrase + } + phrase = stripWholeWordSyntax(phrase); + wholeWordMatching = true; } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH) { // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake. Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH)); continue; + } else { + wholeWordMatching = false; } - // Add common casing that might appear. + // Common casing that might appear. // // This could be simplified by adding case insensitive search to the prefix search, // which is very simple to add to StringTreSearch for Unicode and ByteTrieSearch for ASCII. 
@@ -390,7 +401,7 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho // UTF-8 characters can be different byte lengths, which does // not allow comparing two different byte arrays using simple plain array indexes. // - // Instead add all common case variations of the words. + // Instead use all common case variations of the words. String[] phraseVariations = { phrase, phrase.toLowerCase(), @@ -398,24 +409,45 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho capitalizeAllFirstLetters(phrase), phrase.toUpperCase() }; - if (phrasesWillHideAllVideos(phraseVariations, matchWholeWordsEnabled)) { - Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_common", phrase)); + + if (phrasesWillHideAllVideos(phraseVariations, wholeWordMatching)) { + String toastMessage; + // If whole word matching is off, but would pass with on, then show a different toast. + if (!wholeWordMatching && !phrasesWillHideAllVideos(phraseVariations, true)) { + toastMessage = "revanced_hide_keyword_toast_invalid_common_whole_word_required"; + } else { + toastMessage = "revanced_hide_keyword_toast_invalid_common"; + } + + Utils.showToastLong(str(toastMessage, phrase)); continue; } - keywords.addAll(Arrays.asList(phraseVariations)); + for (String variation : phraseVariations) { + // Check if the same phrase is declared both with and without quotes. 
+ Boolean existing = keywords.get(variation); + if (existing != null && existing != wholeWordMatching) { + Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_conflicting", phrase)); + break; + } + + keywords.put(variation, wholeWordMatching); + } } - for (String keyword : keywords) { + for (Map.Entry entry : keywords.entrySet()) { + String keyword = entry.getKey(); + //noinspection ExtractMethodRecommender + final boolean isWholeWord = entry.getValue(); + TrieSearch.TriePatternMatchedCallback callback = (textSearched, startIndex, matchLength, callbackParameter) -> { - // Verify the keyword is a whole word and not a substring, - // so a keyword like "ai" is matched but "fair" is not. - if (matchWholeWordsEnabled && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { + if (isWholeWord && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) { return false; } - Logger.printDebug(() -> "Matched keyword: '" + keyword + "'"); + Logger.printDebug(() -> (isWholeWord ? "Matched whole keyword: '" + : "Matched keyword: '") + keyword + "'"); // noinspection unchecked ((MutableReference) callbackParameter).value = keyword; return true; @@ -424,13 +456,12 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho search.addPattern(stringBytes, callback); } - Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords); + Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords.keySet()); } bufferSearch = search; timeToResumeFiltering = 0; filteredVideosPercentage = 0; - matchWholeWords = matchWholeWordsEnabled; lastKeywordPhrasesParsed = rawKeywords; // Must set last. } @@ -511,8 +542,7 @@ boolean isFiltered(@Nullable String identifier, String path, byte[] protobufBuff // Field is intentionally compared using reference equality. 
//noinspection StringEquality - if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed - || Settings.HIDE_KEYWORD_CONTENT_WHOLE_WORDS.get() != matchWholeWords) { + if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) { // User changed the keywords or whole word setting. parseKeywords(); } diff --git a/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java b/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java index ef89dea60e..8708d579c5 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java +++ b/app/src/main/java/app/revanced/integrations/youtube/settings/Settings.java @@ -74,8 +74,6 @@ public class Settings extends BaseSettings { public static final BooleanSetting HIDE_KEYWORD_CONTENT_HOME = new BooleanSetting("revanced_hide_keyword_content_home", FALSE); public static final BooleanSetting HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS = new BooleanSetting("revanced_hide_keyword_content_subscriptions", FALSE); public static final BooleanSetting HIDE_KEYWORD_CONTENT_SEARCH = new BooleanSetting("revanced_hide_keyword_content_search", FALSE); - public static final BooleanSetting HIDE_KEYWORD_CONTENT_WHOLE_WORDS = new BooleanSetting("revanced_hide_keyword_content_whole_word", FALSE, - parentsAny(HIDE_KEYWORD_CONTENT_HOME, HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS, HIDE_KEYWORD_CONTENT_SEARCH)); public static final StringSetting HIDE_KEYWORD_CONTENT_PHRASES = new StringSetting("revanced_hide_keyword_content_phrases", "", parentsAny(HIDE_KEYWORD_CONTENT_HOME, HIDE_KEYWORD_CONTENT_SUBSCRIPTIONS, HIDE_KEYWORD_CONTENT_SEARCH)); diff --git a/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java b/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java new file mode 100644 index 0000000000..96d29645d2 --- /dev/null +++ 
b/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java @@ -0,0 +1,35 @@ +package app.revanced.integrations.youtube.settings.preference; + +import static android.text.Html.FROM_HTML_MODE_COMPACT; + +import android.content.Context; +import android.os.Build; +import android.preference.Preference; +import android.text.Html; +import android.util.AttributeSet; + +import androidx.annotation.RequiresApi; + +/** + * Allows using basic html for the summary text. + */ +@SuppressWarnings({"unused", "deprecation"}) +@RequiresApi(api = Build.VERSION_CODES.O) +public class HtmlPreference extends Preference { + { + setSummary(Html.fromHtml(getSummary().toString(), FROM_HTML_MODE_COMPACT)); + } + + public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr, int defStyleRes) { + super(context, attrs, defStyleAttr, defStyleRes); + } + public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr) { + super(context, attrs, defStyleAttr); + } + public HtmlPreference(Context context, AttributeSet attrs) { + super(context, attrs); + } + public HtmlPreference(Context context) { + super(context); + } +} From 05a0176bc2b42b15b1c5ee6895bfb430098a1b25 Mon Sep 17 00:00:00 2001 From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:40:30 -0400 Subject: [PATCH 07/10] Comments --- .../youtube/patches/components/KeywordContentFilter.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java index f2da4af9de..83e3dc25f5 100644 --- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java +++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java @@ -25,7 +25,7 @@ /** *
- * Allows hiding home feed and search results based on keywords and/or channel names.
+ * Allows hiding home feed and search results based on video title keywords and/or channel names.
  *
  * Limitations:
  * - Searching for a keyword phrase will give no search results.
@@ -40,11 +40,7 @@
  *   (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST").
  * - Keywords present in the layout or video data cannot be used as filters, otherwise all videos
  *   will always be hidden.  This patch checks for some words of these words.
- * - Keywords are matched against whole words, and not as substrings.
- *   So 'ai' will hide 'Model #ai123 release!' and 'Is AI self aware?',
- *   but not hide 'DMCA guide to fair use'. This behavior is desired and intentional
- *   to prevent false hiding, but it also may require adding plural versions of some words.
- *   (use keywords of 'fox' and 'foxes').
+ * - When using whole word syntax, some keywords may need additional pluralized variations.
  */
 @SuppressWarnings("unused")
 @RequiresApi(api = Build.VERSION_CODES.N)

From e592b5f8bbef44e9d533e0d021b4d3df92829456 Mon Sep 17 00:00:00 2001
From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com>
Date: Thu, 29 Aug 2024 21:14:30 -0400
Subject: [PATCH 08/10] fix: Allow 1 and 2 character words if using keywords in
 languages that do not use spaces

---
 .../components/KeywordContentFilter.java      | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
index 83e3dc25f5..d649e7129a 100644
--- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
@@ -2,6 +2,7 @@
 
 import static app.revanced.integrations.shared.StringRef.str;
 import static app.revanced.integrations.youtube.shared.NavigationBar.NavigationButton;
+import static java.lang.Character.UnicodeBlock.*;
 
 import android.os.Build;
 
@@ -223,6 +224,31 @@ private static String capitalizeAllFirstLetters(String sentence) {
         return new String(codePoints, 0, codePoints.length);
     }
 
+    /**
+     * @return If any code point is in a Unicode block used by languages that do not put spaces between words.
+     */
+    private static boolean isLanguageWithNoSpaces(String text) {
+        for (int i = 0, length = text.length(); i < length;) { // Index is advanced at the end of the loop body.
+            final int codePoint = text.codePointAt(i);
+
+            Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint); // null if not in a defined block.
+            if (block == CJK_UNIFIED_IDEOGRAPHS // Chinese and Kanji
+                    || block == HIRAGANA // Japanese Hiragana
+                    || block == KATAKANA // Japanese Katakana
+                    || block == THAI
+                    || block == LAO
+                    || block == MYANMAR
+                    || block == KHMER
+                    || block == TIBETAN) {
+                return true; // A single matching code point is enough.
+            }
+
+            i += Character.charCount(codePoint); // Step over both chars of a supplementary code point.
+        }
+
+        return false; // No code point was in any of the checked blocks.
+    }
+
     /**
      * @return If the phrase will will hide all videos. Not an exhaustive check.
      */
@@ -380,7 +406,10 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho
                     }
                     phrase = stripWholeWordSyntax(phrase);
                     wholeWordMatching = true;
-                } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH) {
+                } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH && !isLanguageWithNoSpaces(phrase)) {
+                    // Allow phrases of 1 and 2 characters if using a
+                    // language that does not use spaces between words.
+
                     // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake.
                     Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH));
                     continue;

From 98927ff2298fed506f0f9c196348a01eb17d5bda Mon Sep 17 00:00:00 2001
From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com>
Date: Fri, 30 Aug 2024 16:28:42 -0400
Subject: [PATCH 09/10] Update
 app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java

Co-authored-by: oSumAtrIX 
---
 .../youtube/patches/components/KeywordContentFilter.java        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
index d649e7129a..9a8ac649bf 100644
--- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
@@ -250,7 +250,7 @@ private static boolean isLanguageWithNoSpaces(String text) {
     }
 
     /**
-     * @return If the phrase will will hide all videos. Not an exhaustive check.
+     * @return If the phrase will hide all videos. Not an exhaustive check.
      */
     private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases, boolean matchWholeWords) {
         for (String phrase : phrases) {

From fba8d61f2bf0d5656c2766ab65d796ed63822061 Mon Sep 17 00:00:00 2001
From: LisoUseInAIKyrios <118716522+LisoUseInAIKyrios@users.noreply.github.com>
Date: Fri, 30 Aug 2024 17:38:20 -0400
Subject: [PATCH 10/10] refactor

---
 .../youtube/patches/components/KeywordContentFilter.java    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
index 9a8ac649bf..3185036c98 100644
--- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
@@ -451,12 +451,12 @@ private synchronized void parseKeywords() { // Must be synchronized since Litho
                 for (String variation : phraseVariations) {
                     // Check if the same phrase is declared both with and without quotes.
                     Boolean existing = keywords.get(variation);
-                    if (existing != null && existing != wholeWordMatching) {
+                    if (existing == null) {
+                        keywords.put(variation, wholeWordMatching);
+                    } else if (existing != wholeWordMatching) {
                         Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_conflicting", phrase));
                         break;
                     }
-
-                    keywords.put(variation, wholeWordMatching);
                 }
             }