From f4dcab16004e566f1969bc6f627facf198573fee Mon Sep 17 00:00:00 2001 From: Stypox Date: Sun, 7 Apr 2024 23:37:55 +0200 Subject: [PATCH] [YouTube] Replace link text with accessibility label --- .../youtube/YoutubeDescriptionHelper.java | 81 ++++++++++++++++--- .../YoutubeStreamExtractorDefaultTest.java | 12 ++- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java index 0eeecdac76..a42b15a225 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java @@ -12,6 +12,9 @@ import java.util.Comparator; import java.util.List; import java.util.Stack; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -29,6 +32,11 @@ private YoutubeDescriptionHelper() { public static final String ITALIC_OPEN = ""; public static final String ITALIC_CLOSE = ""; + // special link chips (e.g. for YT videos, YT channels or social media accounts): + // (u00a0) u00a0 u00a0 [/•] u00a0 u00a0 u00a0 + private static final Pattern LINK_CONTENT_CLEANER_REGEX + = Pattern.compile("(?s)^\u00a0+[/•]\u00a0+(.*)\u00a0+$"); + /** * Can be a command run, or a style run. */ @@ -37,17 +45,30 @@ static final class Run { @Nonnull final String close; final int pos; final boolean isClose; + @Nullable final Function transformContent; + int openPosInOutput = -1; Run( @Nonnull final String open, @Nonnull final String close, final int pos, final boolean isClose + ) { + this(open, close, pos, isClose, null); + } + + Run( + @Nonnull final String open, + @Nonnull final String close, + final int pos, + final boolean isClose, + @Nullable final Function transformContent ) { this.open = open; this.close = close; this.pos = pos; this.isClose = isClose; + this.transformContent = transformContent; } public boolean sameOpen(@Nonnull final Run other) { @@ -148,12 +169,22 @@ static String runsToHtml( // condition, because no run will close before being opened, but let's be sure while (!openRuns.empty()) { final Run popped = openRuns.pop(); - textBuilder.append(popped.close); if (popped.sameOpen(closer)) { + // before closing the current run, if the run has a transformContent + // function, use it to transform the content of the current run, based on + // the openPosInOutput set when the current run was opened + if (popped.transformContent != null && popped.openPosInOutput >= 0) { + textBuilder.replace(popped.openPosInOutput, textBuilder.length(), + popped.transformContent.apply( + textBuilder.substring(popped.openPosInOutput))); + } + // close the run that we really need to close + textBuilder.append(popped.close); break; } // we keep popping from openRuns, closing all of the runs we find, // until we find the run that we really need to close ... + textBuilder.append(popped.close); tempStack.push(popped); } while (!tempStack.empty()) { @@ -168,8 +199,10 @@ static String runsToHtml( } else { // this will never be reached if openersIndex >= openers.size() because of the // way minPos is calculated - textBuilder.append(openers.get(openersIndex).open); - openRuns.push(openers.get(openersIndex)); + final Run opener = openers.get(openersIndex); + textBuilder.append(opener.open); + opener.openPosInOutput = textBuilder.length(); // save for transforming later + openRuns.push(opener); ++openersIndex; } } @@ -180,11 +213,7 @@ static String runsToHtml( return textBuilder.toString() .replace("\n", "
") .replace(" ", "  ") - // special link chips (e.g. for YT videos, YT channels or social media accounts): - // u00a0 u00a0 [/•] u00a0 u00a0 u00a0 - .replace("\">\u00a0\u00a0/\u00a0", "\">") - .replace("\">\u00a0\u00a0•\u00a0", "\">") - .replace("\u00a0\u00a0", ""); + .replace('\u00a0', ' '); } private static void addAllCommandRuns( @@ -212,12 +241,44 @@ private static void addAllCommandRuns( } final String open = ""; + final Function transformContent = getTransformContentFun(run); - openers.add(new Run(open, LINK_CLOSE, startIndex, false)); - closers.add(new Run(open, LINK_CLOSE, startIndex + length, true)); + openers.add(new Run(open, LINK_CLOSE, startIndex, false, + transformContent)); + closers.add(new Run(open, LINK_CLOSE, startIndex + length, true, + transformContent)); }); } + private static Function getTransformContentFun(final JsonObject run) { + final String accessibilityLabel = run.getObject("onTapOptions") + .getObject("accessibilityInfo") + .getString("accessibilityLabel", "") + // accessibility labels are e.g. "Instagram Channel Link: instagram_profile_name" + .replaceFirst(" Channel Link", ""); + + final Function transformContent; + if (accessibilityLabel.isEmpty() || accessibilityLabel.startsWith("YouTube: ")) { + // if there is no accessibility label, or the link points to YouTube, cleanup the link + // text, see LINK_CONTENT_CLEANER_REGEX's documentation for more details + transformContent = (content) -> { + final Matcher m = LINK_CONTENT_CLEANER_REGEX.matcher(content); + if (m.find()) { + return m.group(1); + } + return content; + }; + } else { + // if there is an accessibility label, replace the link text with it, because on the + // YouTube website an ambiguous link text is next to an icon explaining which service it + // belongs to, but since we can't add icons, we instead use the accessibility label + // which contains information about the service + transformContent = (content) -> accessibilityLabel; + } + + return transformContent; + } + private static void addAllStyleRuns( @Nonnull final JsonObject attributedDescription, @Nonnull final List openers, diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java index cb773dcd40..40684db12b 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/stream/YoutubeStreamExtractorDefaultTest.java @@ -51,6 +51,9 @@ import org.schabi.newpipe.extractor.stream.StreamSegment; import org.schabi.newpipe.extractor.stream.StreamType; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; @@ -162,7 +165,7 @@ public static void setUp() throws Exception { } public static class DescriptionTestUnboxing extends DefaultStreamExtractorTest { - private static final String ID = "cV5TjZCJkuA"; + private static final String ID = "ZeerrnuLi5E"; private static final String URL = BASE_URL + ID; private static StreamExtractor extractor; @@ -187,6 +190,13 @@ public static void setUp() throws Exception { @Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; } @Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; } @Override public List expectedDescriptionContains() { + try(FileOutputStream a = new FileOutputStream("/home/stypox/Desktop/newpipestream.html")) { + a.write(extractor().getDescription().getContent().getBytes()); + } catch (IOException e) { + throw new RuntimeException(e); + } catch (ParsingException e) { + throw new RuntimeException(e); + } return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34", "https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34", "https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",