From 293c3e9e4754237506b4b43d0748dd4daa62b377 Mon Sep 17 00:00:00 2001 From: AudricV <74829229+AudricV@users.noreply.github.com> Date: Sun, 17 Mar 2024 15:08:58 +0100 Subject: [PATCH 1/6] [YouTube] Support new A/B tested comments data Also improve current comments code by removing outdated comment renderer data. --- .../YoutubeCommentsEUVMInfoItemExtractor.java | 235 ++++++++++++++++++ .../extractors/YoutubeCommentsExtractor.java | 121 +++++++-- .../YoutubeCommentsInfoItemExtractor.java | 92 ++++--- 3 files changed, 377 insertions(+), 71 deletions(-) create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java new file mode 100644 index 0000000000..857e6096fd --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java @@ -0,0 +1,235 @@ +package org.schabi.newpipe.extractor.services.youtube.extractors; + +import com.grack.nanojson.JsonObject; +import org.schabi.newpipe.extractor.Image; +import org.schabi.newpipe.extractor.Page; +import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor; +import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.localization.DateWrapper; +import org.schabi.newpipe.extractor.localization.TimeAgoParser; +import org.schabi.newpipe.extractor.stream.Description; +import org.schabi.newpipe.extractor.utils.Utils; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.util.List; +import java.util.Objects; + +import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription; +import static 
org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray; +import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; + +/** + * A {@link CommentsInfoItemExtractor} for YouTube comment data returned in a view model and entity + * updates. + */ +class YoutubeCommentsEUVMInfoItemExtractor implements CommentsInfoItemExtractor { + + private static final String AUTHOR = "author"; + private static final String PROPERTIES = "properties"; + + @Nonnull + private final JsonObject commentViewModel; + @Nullable + private final JsonObject commentRepliesRenderer; + @Nonnull + private final JsonObject commentEntityPayload; + @Nonnull + private final JsonObject engagementToolbarStateEntityPayload; + @Nonnull + private final String videoUrl; + @Nonnull + private final TimeAgoParser timeAgoParser; + + YoutubeCommentsEUVMInfoItemExtractor( + @Nonnull final JsonObject commentViewModel, + @Nullable final JsonObject commentRepliesRenderer, + @Nonnull final JsonObject commentEntityPayload, + @Nonnull final JsonObject engagementToolbarStateEntityPayload, + @Nonnull final String videoUrl, + @Nonnull final TimeAgoParser timeAgoParser) { + this.commentViewModel = commentViewModel; + this.commentRepliesRenderer = commentRepliesRenderer; + this.commentEntityPayload = commentEntityPayload; + this.engagementToolbarStateEntityPayload = engagementToolbarStateEntityPayload; + this.videoUrl = videoUrl; + this.timeAgoParser = timeAgoParser; + } + + @Override + public String getName() throws ParsingException { + return getUploaderName(); + } + + @Override + public String getUrl() throws ParsingException { + return videoUrl; + } + + @Nonnull + @Override + public List getThumbnails() throws ParsingException { + return getUploaderAvatars(); + } + + @Override + public int getLikeCount() throws ParsingException { + final String textualLikeCount = getTextualLikeCount(); + try { + if (Utils.isBlank(textualLikeCount)) { + return 0; + } + + return (int) 
Utils.mixedNumberWordToLong(textualLikeCount); + } catch (final Exception e) { + throw new ParsingException( + "Unexpected error while converting textual like count to like count", e); + } + } + + @Override + public String getTextualLikeCount() { + return commentEntityPayload.getObject("toolbar") + .getString("likeCountNotliked"); + } + + @Override + public Description getCommentText() throws ParsingException { + // Comments' text work in the same way as an attributed video description + return new Description( + getAttributedDescription(commentEntityPayload.getObject(PROPERTIES) + .getObject("content")), Description.HTML); + } + + @Override + public String getTextualUploadDate() throws ParsingException { + return commentEntityPayload.getObject(PROPERTIES) + .getString("publishedTime"); + } + + @Nullable + @Override + public DateWrapper getUploadDate() throws ParsingException { + final String textualPublishedTime = getTextualUploadDate(); + if (isNullOrEmpty(textualPublishedTime)) { + return null; + } + + return timeAgoParser.parse(textualPublishedTime); + } + + @Override + public String getCommentId() throws ParsingException { + String commentId = commentEntityPayload.getObject(PROPERTIES) + .getString("commentId"); + if (isNullOrEmpty(commentId)) { + commentId = commentViewModel.getString("commentId"); + if (isNullOrEmpty(commentId)) { + throw new ParsingException("Could not get comment ID"); + } + } + return commentId; + } + + @Override + public String getUploaderUrl() throws ParsingException { + final JsonObject author = commentEntityPayload.getObject(AUTHOR); + String channelId = author.getString("channelId"); + if (isNullOrEmpty(channelId)) { + channelId = author.getObject("channelCommand") + .getObject("innertubeCommand") + .getObject("browseEndpoint") + .getString("browseId"); + if (isNullOrEmpty(channelId)) { + channelId = author.getObject("avatar") + .getObject("endpoint") + .getObject("innertubeCommand") + .getObject("browseEndpoint") + 
.getString("browseId"); + if (isNullOrEmpty(channelId)) { + throw new ParsingException("Could not get channel ID"); + } + } + } + return "https://www.youtube.com/channel/" + channelId; + } + + @Override + public String getUploaderName() throws ParsingException { + return commentEntityPayload.getObject(AUTHOR) + .getString("displayName"); + } + + @Nonnull + @Override + public List getUploaderAvatars() throws ParsingException { + return getImagesFromThumbnailsArray(commentEntityPayload.getObject("avatar") + .getObject("image") + .getArray("sources")); + } + + @Override + public boolean isHeartedByUploader() { + return "TOOLBAR_HEART_STATE_HEARTED".equals( + engagementToolbarStateEntityPayload.getString("heartState")); + } + + @Override + public boolean isPinned() { + return commentViewModel.has("pinnedText"); + } + + @Override + public boolean isUploaderVerified() throws ParsingException { + final JsonObject author = commentEntityPayload.getObject(AUTHOR); + return author.getBoolean("isVerified") || author.getBoolean("isArtist"); + } + + @Override + public int getReplyCount() throws ParsingException { + // As YouTube allows replies up to 750 comments, we cannot check if the count returned is a + // mixed number or a real number + // Assume it is a mixed one, as it matches how numbers of most properties are returned + final String replyCountString = commentEntityPayload.getObject("toolbar") + .getString("replyCount"); + if (isNullOrEmpty(replyCountString)) { + return 0; + } + return (int) Utils.mixedNumberWordToLong(replyCountString); + } + + @Nullable + @Override + public Page getReplies() throws ParsingException { + if (isNullOrEmpty(commentRepliesRenderer)) { + return null; + } + + final String continuation = commentRepliesRenderer.getArray("contents") + .stream() + .filter(JsonObject.class::isInstance) + .map(JsonObject.class::cast) + .map(content -> content.getObject("continuationItemRenderer", null)) + .filter(Objects::nonNull) + .findFirst() + 
.map(continuationItemRenderer -> + continuationItemRenderer.getObject("continuationEndpoint") + .getObject("continuationCommand") + .getString("token")) + .orElseThrow(() -> + new ParsingException("Could not get comment replies continuation")); + return new Page(videoUrl, continuation); + } + + @Override + public boolean isChannelOwner() { + return commentEntityPayload.getObject(AUTHOR) + .getBoolean("isCreator"); + } + + @Override + public boolean hasCreatorReply() { + return commentRepliesRenderer != null + && commentRepliesRenderer.has("viewRepliesCreatorThumbnail"); + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java index 84e6c3e1e1..8667768a4b 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java @@ -13,6 +13,7 @@ import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.localization.Localization; +import org.schabi.newpipe.extractor.localization.TimeAgoParser; import org.schabi.newpipe.extractor.utils.JsonUtils; import org.schabi.newpipe.extractor.utils.Utils; @@ -21,7 +22,6 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Collections; -import java.util.List; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getTextFromObject; @@ -30,6 +30,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { + private static final String COMMENT_VIEW_MODEL_KEY = "commentViewModel"; + private static final 
String COMMENT_RENDERER_KEY = "commentRenderer"; + /** * Whether comments are disabled on video. */ @@ -74,8 +77,7 @@ private String findInitialCommentsToken(final JsonObject nextResponse) { return null; } - final String token = contents - .stream() + final String token = contents.stream() // Only use JsonObjects .filter(JsonObject.class::isInstance) .map(JsonObject.class::cast) @@ -120,6 +122,21 @@ private JsonArray getJsonContents(final JsonObject nextResponse) { } } + @Nonnull + private JsonObject getMutationPayloadFromEntityKey(@Nonnull final JsonArray mutations, + @Nonnull final String commentKey) + throws ParsingException { + return mutations.stream() + .filter(JsonObject.class::isInstance) + .map(JsonObject.class::cast) + .filter(mutation -> commentKey.equals( + mutation.getString("entityKey"))) + .findFirst() + .orElseThrow(() -> new ParsingException( + "Could not get comment entity payload mutation")) + .getObject("payload"); + } + @Nonnull private InfoItemsPage getInfoItemsPageForDisabledComments() { return new InfoItemsPage<>(Collections.emptyList(), null, Collections.emptyList()); @@ -207,8 +224,8 @@ private InfoItemsPage extractComments(final JsonObject jsonObj return new InfoItemsPage<>(collector, getNextPage(jsonObject)); } - private void collectCommentsFrom(final CommentsInfoItemsCollector collector, - final JsonObject jsonObject) + private void collectCommentsFrom(@Nonnull final CommentsInfoItemsCollector collector, + @Nonnull final JsonObject jsonObject) throws ParsingException { final JsonArray onResponseReceivedEndpoints = @@ -233,6 +250,8 @@ private void collectCommentsFrom(final CommentsInfoItemsCollector collector, final JsonArray contents; try { + // A copy of the array is needed, otherwise the continuation item is removed from the + // original object which is used to get the continuation contents = new JsonArray(JsonUtils.getArray(commentsEndpoint, path)); } catch (final Exception e) { // No comments @@ -244,23 +263,80 @@ private void 
collectCommentsFrom(final CommentsInfoItemsCollector collector, contents.remove(index); } - final String jsonKey = contents.getObject(0).has("commentThreadRenderer") - ? "commentThreadRenderer" - : "commentRenderer"; + // The mutations object, which is returned in the comments' continuation + // It contains parts of comment data when comments are returned with a view model + final JsonArray mutations = jsonObject.getObject("frameworkUpdates") + .getObject("entityBatchUpdate") + .getArray("mutations"); + final String videoUrl = getUrl(); + final TimeAgoParser timeAgoParser = getTimeAgoParser(); - final List comments; - try { - comments = JsonUtils.getValues(contents, jsonKey); - } catch (final Exception e) { - throw new ParsingException("Unable to get parse youtube comments", e); + for (final Object o : contents) { + if (!(o instanceof JsonObject)) { + continue; + } + + collectCommentItem(mutations, (JsonObject) o, collector, videoUrl, timeAgoParser); } + } - final String url = getUrl(); - comments.stream() - .filter(JsonObject.class::isInstance) - .map(JsonObject.class::cast) - .map(jObj -> new YoutubeCommentsInfoItemExtractor(jObj, url, getTimeAgoParser())) - .forEach(collector::commit); + private void collectCommentItem(@Nonnull final JsonArray mutations, + @Nonnull final JsonObject content, + @Nonnull final CommentsInfoItemsCollector collector, + @Nonnull final String videoUrl, + @Nonnull final TimeAgoParser timeAgoParser) + throws ParsingException { + if (content.has("commentThreadRenderer")) { + final JsonObject commentThreadRenderer = + content.getObject("commentThreadRenderer"); + if (commentThreadRenderer.has(COMMENT_VIEW_MODEL_KEY)) { + final JsonObject commentViewModel = + commentThreadRenderer.getObject(COMMENT_VIEW_MODEL_KEY) + .getObject(COMMENT_VIEW_MODEL_KEY); + collector.commit(new YoutubeCommentsEUVMInfoItemExtractor( + commentViewModel, + commentThreadRenderer.getObject("replies") + .getObject("commentRepliesRenderer"), + 
getMutationPayloadFromEntityKey(mutations, + commentViewModel.getString("commentKey", "")) + .getObject("commentEntityPayload"), + getMutationPayloadFromEntityKey(mutations, + commentViewModel.getString("toolbarStateKey", "")) + .getObject("engagementToolbarStateEntityPayload"), + videoUrl, + timeAgoParser)); + } else if (commentThreadRenderer.has("comment")) { + collector.commit(new YoutubeCommentsInfoItemExtractor( + commentThreadRenderer.getObject("comment") + .getObject(COMMENT_RENDERER_KEY), + commentThreadRenderer.getObject("replies") + .getObject("commentRepliesRenderer"), + videoUrl, + timeAgoParser)); + } + } else if (content.has(COMMENT_VIEW_MODEL_KEY)) { + final JsonObject commentViewModel = content.getObject(COMMENT_VIEW_MODEL_KEY); + collector.commit(new YoutubeCommentsEUVMInfoItemExtractor( + commentViewModel, + null, + getMutationPayloadFromEntityKey(mutations, + commentViewModel.getString("commentKey", "")) + .getObject("commentEntityPayload"), + getMutationPayloadFromEntityKey(mutations, + commentViewModel.getString("toolbarStateKey", "")) + .getObject("engagementToolbarStateEntityPayload"), + videoUrl, + timeAgoParser)); + } else if (content.has(COMMENT_RENDERER_KEY)) { + // commentRenderers are directly returned for comment replies, so there is no + // commentRepliesRenderer to provide + // Also, YouTube has only one comment reply level + collector.commit(new YoutubeCommentsInfoItemExtractor( + content.getObject(COMMENT_RENDERER_KEY), + null, + videoUrl, + timeAgoParser)); + } } @Override @@ -307,10 +383,11 @@ public int getCommentsCount() throws ExtractionException { return -1; } - final JsonObject countText = ajaxJson - .getArray("onResponseReceivedEndpoints").getObject(0) + final JsonObject countText = ajaxJson.getArray("onResponseReceivedEndpoints") + .getObject(0) .getObject("reloadContinuationItemsCommand") - .getArray("continuationItems").getObject(0) + .getArray("continuationItems") + .getObject(0) .getObject("commentsHeaderRenderer") 
.getObject("countText"); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java index 06b68fe5e4..ddc7b7bcc0 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java @@ -22,40 +22,36 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtractor { - private final JsonObject json; - private JsonObject commentRenderer; + @Nonnull + private final JsonObject commentRenderer; + @Nullable + private final JsonObject commentRepliesRenderer; + @Nonnull private final String url; + @Nonnull private final TimeAgoParser timeAgoParser; - public YoutubeCommentsInfoItemExtractor(final JsonObject json, - final String url, - final TimeAgoParser timeAgoParser) { - this.json = json; + public YoutubeCommentsInfoItemExtractor(@Nonnull final JsonObject commentRenderer, + @Nullable final JsonObject commentRepliesRenderer, + @Nonnull final String url, + @Nonnull final TimeAgoParser timeAgoParser) { + this.commentRenderer = commentRenderer; + this.commentRepliesRenderer = commentRepliesRenderer; this.url = url; this.timeAgoParser = timeAgoParser; } - private JsonObject getCommentRenderer() throws ParsingException { - if (commentRenderer == null) { - if (json.has("comment")) { - commentRenderer = JsonUtils.getObject(json, "comment.commentRenderer"); - } else { - commentRenderer = json; - } - } - return commentRenderer; - } - @Nonnull private List getAuthorThumbnails() throws ParsingException { try { - return getImagesFromThumbnailsArray(JsonUtils.getArray(getCommentRenderer(), + return getImagesFromThumbnailsArray(JsonUtils.getArray(commentRenderer, 
"authorThumbnail.thumbnails")); } catch (final Exception e) { throw new ParsingException("Could not get author thumbnails", e); } } + @Nonnull @Override public String getUrl() throws ParsingException { return url; @@ -70,7 +66,7 @@ public List getThumbnails() throws ParsingException { @Override public String getName() throws ParsingException { try { - return getTextFromObject(JsonUtils.getObject(getCommentRenderer(), "authorText")); + return getTextFromObject(JsonUtils.getObject(commentRenderer, "authorText")); } catch (final Exception e) { return ""; } @@ -79,7 +75,7 @@ public String getName() throws ParsingException { @Override public String getTextualUploadDate() throws ParsingException { try { - return getTextFromObject(JsonUtils.getObject(getCommentRenderer(), + return getTextFromObject(JsonUtils.getObject(commentRenderer, "publishedTimeText")); } catch (final Exception e) { throw new ParsingException("Could not get publishedTimeText", e); @@ -90,8 +86,7 @@ public String getTextualUploadDate() throws ParsingException { @Override public DateWrapper getUploadDate() throws ParsingException { final String textualPublishedTime = getTextualUploadDate(); - if (timeAgoParser != null && textualPublishedTime != null - && !textualPublishedTime.isEmpty()) { + if (textualPublishedTime != null && !textualPublishedTime.isEmpty()) { return timeAgoParser.parse(textualPublishedTime); } else { return null; @@ -118,7 +113,7 @@ public int getLikeCount() throws ParsingException { // Try first to get the exact like count by using the accessibility data final String likeCount; try { - likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(getCommentRenderer(), + likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(commentRenderer, "actionButtons.commentActionButtonsRenderer.likeButton.toggleButtonRenderer" + ".accessibilityData.accessibilityData.label")); } catch (final Exception e) { @@ -170,11 +165,11 @@ public String getTextualLikeCount() throws ParsingException 
{ */ try { // If a comment has no likes voteCount is not set - if (!getCommentRenderer().has("voteCount")) { + if (!commentRenderer.has("voteCount")) { return ""; } - final JsonObject voteCountObj = JsonUtils.getObject(getCommentRenderer(), "voteCount"); + final JsonObject voteCountObj = JsonUtils.getObject(commentRenderer, "voteCount"); if (voteCountObj.isEmpty()) { return ""; } @@ -188,7 +183,7 @@ public String getTextualLikeCount() throws ParsingException { @Override public Description getCommentText() throws ParsingException { try { - final JsonObject contentText = JsonUtils.getObject(getCommentRenderer(), "contentText"); + final JsonObject contentText = JsonUtils.getObject(commentRenderer, "contentText"); if (contentText.isEmpty()) { // completely empty comments as described in // https://github.com/TeamNewPipe/NewPipeExtractor/issues/380#issuecomment-668808584 @@ -208,7 +203,7 @@ public Description getCommentText() throws ParsingException { @Override public String getCommentId() throws ParsingException { try { - return JsonUtils.getString(getCommentRenderer(), "commentId"); + return JsonUtils.getString(commentRenderer, "commentId"); } catch (final Exception e) { throw new ParsingException("Could not get comment id", e); } @@ -221,27 +216,26 @@ public List getUploaderAvatars() throws ParsingException { } @Override - public boolean isHeartedByUploader() throws ParsingException { - final JsonObject commentActionButtonsRenderer = getCommentRenderer() - .getObject("actionButtons") + public boolean isHeartedByUploader() { + final JsonObject commentActionButtonsRenderer = commentRenderer.getObject("actionButtons") .getObject("commentActionButtonsRenderer"); return commentActionButtonsRenderer.has("creatorHeart"); } @Override - public boolean isPinned() throws ParsingException { - return getCommentRenderer().has("pinnedCommentBadge"); + public boolean isPinned() { + return commentRenderer.has("pinnedCommentBadge"); } @Override public boolean isUploaderVerified() 
throws ParsingException { - return getCommentRenderer().has("authorCommentBadge"); + return commentRenderer.has("authorCommentBadge"); } @Override public String getUploaderName() throws ParsingException { try { - return getTextFromObject(JsonUtils.getObject(getCommentRenderer(), "authorText")); + return getTextFromObject(JsonUtils.getObject(commentRenderer, "authorText")); } catch (final Exception e) { return ""; } @@ -250,7 +244,7 @@ public String getUploaderName() throws ParsingException { @Override public String getUploaderUrl() throws ParsingException { try { - return "https://www.youtube.com/channel/" + JsonUtils.getString(getCommentRenderer(), + return "https://www.youtube.com/channel/" + JsonUtils.getString(commentRenderer, "authorEndpoint.browseEndpoint.browseId"); } catch (final Exception e) { return ""; @@ -258,19 +252,22 @@ public String getUploaderUrl() throws ParsingException { } @Override - public int getReplyCount() throws ParsingException { - final JsonObject commentRendererJsonObject = getCommentRenderer(); - if (commentRendererJsonObject.has("replyCount")) { - return commentRendererJsonObject.getInt("replyCount"); + public int getReplyCount() { + if (commentRenderer.has("replyCount")) { + return commentRenderer.getInt("replyCount"); } return UNKNOWN_REPLY_COUNT; } @Override public Page getReplies() { + if (commentRepliesRenderer == null) { + return null; + } + try { final String id = JsonUtils.getString( - JsonUtils.getArray(json, "replies.commentRepliesRenderer.contents") + JsonUtils.getArray(commentRepliesRenderer, "contents") .getObject(0), "continuationItemRenderer.continuationEndpoint.continuationCommand.token"); return new Page(url, id); @@ -280,20 +277,17 @@ public Page getReplies() { } @Override - public boolean isChannelOwner() throws ParsingException { - return getCommentRenderer().getBoolean("authorIsChannelOwner"); + public boolean isChannelOwner() { + return commentRenderer.getBoolean("authorIsChannelOwner"); } - @Override - public 
boolean hasCreatorReply() throws ParsingException { - try { - final JsonObject commentRepliesRenderer = JsonUtils.getObject(json, - "replies.commentRepliesRenderer"); - return commentRepliesRenderer.has("viewRepliesCreatorThumbnail"); - } catch (final Exception e) { + public boolean hasCreatorReply() { + if (commentRepliesRenderer == null) { return false; } + + return commentRepliesRenderer.has("viewRepliesCreatorThumbnail"); } } From 09732d6785034544a63d068dd67421edcd91694a Mon Sep 17 00:00:00 2001 From: Stypox Date: Sun, 31 Mar 2024 22:54:47 +0200 Subject: [PATCH 2/6] [YouTube] Add support for styles in attributed descriptions Also refactor descriptions parsing. --- .../youtube/YoutubeDescriptionHelper.java | 255 ++++++++++++++++++ .../youtube/YoutubeParsingHelper.java | 80 ------ .../YoutubeCommentsEUVMInfoItemExtractor.java | 4 +- .../extractors/YoutubeStreamExtractor.java | 4 +- 4 files changed, 259 insertions(+), 84 deletions(-) create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java new file mode 100644 index 0000000000..0eeecdac76 --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java @@ -0,0 +1,255 @@ +package org.schabi.newpipe.extractor.services.youtube; + +import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getUrlFromNavigationEndpoint; +import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; + +import com.grack.nanojson.JsonObject; + +import org.jsoup.nodes.Entities; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Stack; + +import javax.annotation.Nonnull; +import 
javax.annotation.Nullable; + +public final class YoutubeDescriptionHelper { + + private YoutubeDescriptionHelper() { + } + + public static final String LINK_CLOSE = ""; + public static final String STRIKETHROUGH_OPEN = ""; + public static final String STRIKETHROUGH_CLOSE = ""; + public static final String BOLD_OPEN = ""; + public static final String BOLD_CLOSE = ""; + public static final String ITALIC_OPEN = ""; + public static final String ITALIC_CLOSE = ""; + + /** + * Can be a command run, or a style run. + */ + static final class Run { + @Nonnull final String open; + @Nonnull final String close; + final int pos; + final boolean isClose; + + Run( + @Nonnull final String open, + @Nonnull final String close, + final int pos, + final boolean isClose + ) { + this.open = open; + this.close = close; + this.pos = pos; + this.isClose = isClose; + } + + public boolean sameOpen(@Nonnull final Run other) { + return open.equals(other.open); + } + } + + /** + * Parse a video description in the new "attributed" format, which contains the entire visible + * plaintext ({@code content}) and an array of {@code commandRuns}. + * + *

+ * The {@code commandRuns} include the links and their position in the text. + *

+ * + * @param attributedDescription the JSON object of the attributed description + * @return the parsed description, in HTML format, as a string + */ + public static String attributedDescriptionToHtml( + @Nullable final JsonObject attributedDescription + ) { + if (isNullOrEmpty(attributedDescription)) { + return null; + } + + final String content = attributedDescription.getString("content"); + if (content == null) { + return null; + } + + // all run pairs must always of length at least 1, or they should be discarded, + // otherwise various assumptions made in runsToHtml may fail + final List openers = new ArrayList<>(); + final List closers = new ArrayList<>(); + addAllCommandRuns(attributedDescription, openers, closers); + addAllStyleRuns(attributedDescription, openers, closers); + + // Note that sorting this way might put closers with the same close position in the wrong + // order with respect to their openers, causing unnecessary closes and reopens. E.g. + // bb&i is instead generated as bb&i if the is + // encountered before the . Solving this wouldn't be difficult, thanks to stable sort, + // but would require additional sorting steps which would just make this slower for the + // general case where it's unlikely there are coincident closes. + Collections.sort(openers, Comparator.comparingInt(run -> run.pos)); + Collections.sort(closers, Comparator.comparingInt(run -> run.pos)); + + return runsToHtml(openers, closers, content); + } + + /** + * Applies the formatting specified by the intervals stored in {@code openers} and {@code + * closers} to {@code content} in order to obtain valid HTML even when intervals overlap. For + * example <b>b<i>b&i</b>i</i> would not be valid HTML, so this function + * instead generates <b>b<i>b&i</i></b><i>i</i>. + *

+ * Every opener in {@code openers} must have a corresponding closer in {@code closers}. Every + * corresponding (opener, closer) pair must have a length of at least one (i.e. empty intervals + * are not allowed). + *

+ * + * @param openers contains all of the places where a run begins, must have the same size of + * closers, must be ordered by {@link Run#pos} + * @param closers contains all of the places where a run ends, must have the same size of + * openers, must be ordered by {@link Run#pos} + * @param content the content to apply formatting to + * @return the formatted content in HTML + */ + static String runsToHtml( + @Nonnull final List openers, + @Nonnull final List closers, + @Nonnull final String content + ) { + final Stack openRuns = new Stack<>(); + final Stack tempStack = new Stack<>(); + final StringBuilder textBuilder = new StringBuilder(); + int currentTextPos = 0; + int openersIndex = 0; + int closersIndex = 0; + + // openers and closers have the same length, but we will surely finish openers earlier than + // closers, since every opened interval needs to be closed at some point and there can't be + // empty intervals, hence check only closersIndex < closers.size() + while (closersIndex < closers.size()) { + final int minPos = openersIndex < openers.size() + ? Math.min(closers.get(closersIndex).pos, openers.get(openersIndex).pos) + : closers.get(closersIndex).pos; + + // append piece of text until current index + textBuilder.append(content, currentTextPos, minPos); + currentTextPos = minPos; + + if (closers.get(closersIndex).pos == minPos) { + // even in case of position tie, first process closers + final Run closer = closers.get(closersIndex); + ++closersIndex; + + // because of the assumptions, this while wouldn't need the !openRuns.empty() + // condition, because no run will close before being opened, but let's be sure + while (!openRuns.empty()) { + final Run popped = openRuns.pop(); + textBuilder.append(popped.close); + if (popped.sameOpen(closer)) { + break; + } + // we keep popping from openRuns, closing all of the runs we find, + // until we find the run that we really need to close ... 
+ tempStack.push(popped); + } + while (!tempStack.empty()) { + // ... and then we reopen all of the runs that we didn't need to close + // e.g. in bb&ii, when is encountered, is printed + // instead, to make sure the HTML is valid, obtaining bb&ii + final Run popped = tempStack.pop(); + textBuilder.append(popped.open); + openRuns.push(popped); + } + + } else { + // this will never be reached if openersIndex >= openers.size() because of the + // way minPos is calculated + textBuilder.append(openers.get(openersIndex).open); + openRuns.push(openers.get(openersIndex)); + ++openersIndex; + } + } + + // append last piece of text + textBuilder.append(content, currentTextPos, content.length()); + + return textBuilder.toString() + .replace("\n", "
") + .replace(" ", "  ") + // special link chips (e.g. for YT videos, YT channels or social media accounts): + // u00a0 u00a0 [/•] u00a0 u00a0 u00a0 + .replace("\">\u00a0\u00a0/\u00a0", "\">") + .replace("\">\u00a0\u00a0•\u00a0", "\">") + .replace("\u00a0\u00a0", ""); + } + + private static void addAllCommandRuns( + @Nonnull final JsonObject attributedDescription, + @Nonnull final List openers, + @Nonnull final List closers + ) { + attributedDescription.getArray("commandRuns") + .stream() + .filter(JsonObject.class::isInstance) + .map(JsonObject.class::cast) + .forEach(run -> { + final JsonObject navigationEndpoint = run.getObject("onTap") + .getObject("innertubeCommand"); + + final int startIndex = run.getInt("startIndex", -1); + final int length = run.getInt("length", 0); + if (startIndex < 0 || length < 1 || navigationEndpoint == null) { + return; + } + + final String url = getUrlFromNavigationEndpoint(navigationEndpoint); + if (url == null) { + return; + } + + final String open = ""; + + openers.add(new Run(open, LINK_CLOSE, startIndex, false)); + closers.add(new Run(open, LINK_CLOSE, startIndex + length, true)); + }); + } + + private static void addAllStyleRuns( + @Nonnull final JsonObject attributedDescription, + @Nonnull final List openers, + @Nonnull final List closers + ) { + attributedDescription.getArray("styleRuns") + .stream() + .filter(JsonObject.class::isInstance) + .map(JsonObject.class::cast) + .forEach(run -> { + final int start = run.getInt("startIndex", -1); + final int length = run.getInt("length", 0); + if (start < 0 || length < 1) { + return; + } + final int end = start + length; + + if (run.has("strikethrough")) { + openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start, false)); + closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end, true)); + } + + if (run.getBoolean("italic", false)) { + openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start, false)); + closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end, true)); + } 
+ + if (run.has("weightLabel") + && !"FONT_WEIGHT_NORMAL".equals(run.getString("weightLabel"))) { + openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start, false)); + closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end, true)); + } + }); + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java index d6560cd809..b6ea901d3e 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java @@ -996,86 +996,6 @@ public static String getTextFromObject(final JsonObject textObject, final boolea return text; } - /** - * Parse a video description in the new "attributed" format, which contains the entire visible - * plaintext ({@code content}) and an array of {@code commandRuns}. - * - *

<p> - * The {@code commandRuns} include the links and their position in the text. - * </p>

- * - * @param attributedDescription the JSON object of the attributed description - * @return the parsed description, in HTML format, as a string - */ - @Nullable - public static String getAttributedDescription( - @Nullable final JsonObject attributedDescription) { - if (isNullOrEmpty(attributedDescription)) { - return null; - } - - final String content = attributedDescription.getString("content"); - if (content == null) { - return null; - } - - final JsonArray commandRuns = attributedDescription.getArray("commandRuns"); - - final StringBuilder textBuilder = new StringBuilder(); - int textStart = 0; - - for (final Object commandRun : commandRuns) { - if (!(commandRun instanceof JsonObject)) { - continue; - } - - final JsonObject run = ((JsonObject) commandRun); - final int startIndex = run.getInt("startIndex", -1); - final int length = run.getInt("length"); - final JsonObject navigationEndpoint = run.getObject("onTap") - .getObject("innertubeCommand"); - - if (startIndex < 0 || length < 1 || navigationEndpoint == null) { - continue; - } - - final String url = getUrlFromNavigationEndpoint(navigationEndpoint); - - if (url == null) { - continue; - } - - // Append text before the link - if (startIndex > textStart) { - textBuilder.append(content, textStart, startIndex); - } - - // Trim and append link text - // Channel/Video format: 3xu00a0, (/ •), u00a0, , 2xu00a0 - final String linkText = content.substring(startIndex, startIndex + length) - .replace('\u00a0', ' ') - .trim() - .replaceFirst("^[/•] *", ""); - - textBuilder.append("
") - .append(Entities.escape(linkText)) - .append(""); - - textStart = startIndex + length; - } - - // Append the remaining text - if (textStart < content.length()) { - textBuilder.append(content.substring(textStart)); - } - - return textBuilder.toString() - .replaceAll("\\n", "
") - .replaceAll(" {2}", "  "); - } - @Nonnull public static String getTextFromObjectOrThrow(final JsonObject textObject, final String error) throws ParsingException { diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java index 857e6096fd..96b08f9dce 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java @@ -15,7 +15,7 @@ import java.util.List; import java.util.Objects; -import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription; +import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray; import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; @@ -97,7 +97,7 @@ public String getTextualLikeCount() { public Description getCommentText() throws ParsingException { // Comments' text work in the same way as an attributed video description return new Description( - getAttributedDescription(commentEntityPayload.getObject(PROPERTIES) + attributedDescriptionToHtml(commentEntityPayload.getObject(PROPERTIES) .getObject("content")), Description.HTML); } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index 6dd3520a1f..f660e16163 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ 
b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -22,6 +22,7 @@ import static org.schabi.newpipe.extractor.services.youtube.ItagItem.APPROX_DURATION_MS_UNKNOWN; import static org.schabi.newpipe.extractor.services.youtube.ItagItem.CONTENT_LENGTH_UNKNOWN; +import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CONTENT_CHECK_OK; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CPN; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.RACY_CHECK_OK; @@ -30,7 +31,6 @@ import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.fixThumbnailUrl; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateContentPlaybackNonce; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateTParameter; -import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonAndroidPostResponse; import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonIosPostResponse; @@ -261,7 +261,7 @@ public Description getDescription() throws ParsingException { return new Description(videoSecondaryInfoRendererDescription, Description.HTML); } - final String attributedDescription = getAttributedDescription( + final String attributedDescription = attributedDescriptionToHtml( getVideoSecondaryInfoRenderer().getObject("attributedDescription")); if (!isNullOrEmpty(attributedDescription)) { return new Description(attributedDescription, Description.HTML); From b80c3f5d5158685ce59c883d41d425426a632563 
Mon Sep 17 00:00:00 2001 From: Stypox Date: Mon, 8 Apr 2024 00:14:28 +0200 Subject: [PATCH 3/6] [YouTube] Replace link text with accessibility label --- .../youtube/YoutubeDescriptionHelper.java | 81 ++++++++++++++++--- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java index 0eeecdac76..49b94f6e90 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java @@ -12,6 +12,9 @@ import java.util.Comparator; import java.util.List; import java.util.Stack; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -29,6 +32,11 @@ private YoutubeDescriptionHelper() { public static final String ITALIC_OPEN = ""; public static final String ITALIC_CLOSE = ""; + // special link chips (e.g. for YT videos, YT channels or social media accounts): + // (u00a0) u00a0 u00a0 [/•] u00a0 u00a0 u00a0 + private static final Pattern LINK_CONTENT_CLEANER_REGEX + = Pattern.compile("(?s)^\u00a0+[/•]\u00a0+(.*?)\u00a0+$"); + /** * Can be a command run, or a style run. 
*/ @@ -37,17 +45,30 @@ static final class Run { @Nonnull final String close; final int pos; final boolean isClose; + @Nullable final Function transformContent; + int openPosInOutput = -1; Run( @Nonnull final String open, @Nonnull final String close, final int pos, final boolean isClose + ) { + this(open, close, pos, isClose, null); + } + + Run( + @Nonnull final String open, + @Nonnull final String close, + final int pos, + final boolean isClose, + @Nullable final Function transformContent ) { this.open = open; this.close = close; this.pos = pos; this.isClose = isClose; + this.transformContent = transformContent; } public boolean sameOpen(@Nonnull final Run other) { @@ -148,12 +169,22 @@ static String runsToHtml( // condition, because no run will close before being opened, but let's be sure while (!openRuns.empty()) { final Run popped = openRuns.pop(); - textBuilder.append(popped.close); if (popped.sameOpen(closer)) { + // before closing the current run, if the run has a transformContent + // function, use it to transform the content of the current run, based on + // the openPosInOutput set when the current run was opened + if (popped.transformContent != null && popped.openPosInOutput >= 0) { + textBuilder.replace(popped.openPosInOutput, textBuilder.length(), + popped.transformContent.apply( + textBuilder.substring(popped.openPosInOutput))); + } + // close the run that we really need to close + textBuilder.append(popped.close); break; } // we keep popping from openRuns, closing all of the runs we find, // until we find the run that we really need to close ... 
+ textBuilder.append(popped.close); tempStack.push(popped); } while (!tempStack.empty()) { @@ -168,8 +199,10 @@ static String runsToHtml( } else { // this will never be reached if openersIndex >= openers.size() because of the // way minPos is calculated - textBuilder.append(openers.get(openersIndex).open); - openRuns.push(openers.get(openersIndex)); + final Run opener = openers.get(openersIndex); + textBuilder.append(opener.open); + opener.openPosInOutput = textBuilder.length(); // save for transforming later + openRuns.push(opener); ++openersIndex; } } @@ -180,11 +213,7 @@ static String runsToHtml( return textBuilder.toString() .replace("\n", "
") .replace(" ", "  ") - // special link chips (e.g. for YT videos, YT channels or social media accounts): - // u00a0 u00a0 [/•] u00a0 u00a0 u00a0 - .replace("\">\u00a0\u00a0/\u00a0", "\">") - .replace("\">\u00a0\u00a0•\u00a0", "\">") - .replace("\u00a0\u00a0", ""); + .replace('\u00a0', ' '); } private static void addAllCommandRuns( @@ -212,12 +241,44 @@ private static void addAllCommandRuns( } final String open = ""; + final Function transformContent = getTransformContentFun(run); - openers.add(new Run(open, LINK_CLOSE, startIndex, false)); - closers.add(new Run(open, LINK_CLOSE, startIndex + length, true)); + openers.add(new Run(open, LINK_CLOSE, startIndex, false, + transformContent)); + closers.add(new Run(open, LINK_CLOSE, startIndex + length, true, + transformContent)); }); } + private static Function getTransformContentFun(final JsonObject run) { + final String accessibilityLabel = run.getObject("onTapOptions") + .getObject("accessibilityInfo") + .getString("accessibilityLabel", "") + // accessibility labels are e.g. 
"Instagram Channel Link: instagram_profile_name" + .replaceFirst(" Channel Link", ""); + + final Function transformContent; + if (accessibilityLabel.isEmpty() || accessibilityLabel.startsWith("YouTube: ")) { + // if there is no accessibility label, or the link points to YouTube, cleanup the link + // text, see LINK_CONTENT_CLEANER_REGEX's documentation for more details + transformContent = (content) -> { + final Matcher m = LINK_CONTENT_CLEANER_REGEX.matcher(content); + if (m.find()) { + return m.group(1); + } + return content; + }; + } else { + // if there is an accessibility label, replace the link text with it, because on the + // YouTube website an ambiguous link text is next to an icon explaining which service it + // belongs to, but since we can't add icons, we instead use the accessibility label + // which contains information about the service + transformContent = (content) -> accessibilityLabel; + } + + return transformContent; + } + private static void addAllStyleRuns( @Nonnull final JsonObject attributedDescription, @Nonnull final List openers, From a90237816a748dc02cc47a92f9369dddbc9e22e8 Mon Sep 17 00:00:00 2001 From: Stypox Date: Mon, 8 Apr 2024 09:47:15 +0200 Subject: [PATCH 4/6] [YouTube] Cleanup description helper Remove unneeded isClose field, and make constants private --- .../youtube/YoutubeDescriptionHelper.java | 41 ++++++++----------- .../newpipe/extractor/stream/Description.java | 4 +- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java index 49b94f6e90..33148d7ebc 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java @@ -24,13 +24,13 @@ public final class 
YoutubeDescriptionHelper { private YoutubeDescriptionHelper() { } - public static final String LINK_CLOSE = ""; - public static final String STRIKETHROUGH_OPEN = ""; - public static final String STRIKETHROUGH_CLOSE = ""; - public static final String BOLD_OPEN = ""; - public static final String BOLD_CLOSE = ""; - public static final String ITALIC_OPEN = ""; - public static final String ITALIC_CLOSE = ""; + private static final String LINK_CLOSE = ""; + private static final String STRIKETHROUGH_OPEN = ""; + private static final String STRIKETHROUGH_CLOSE = ""; + private static final String BOLD_OPEN = ""; + private static final String BOLD_CLOSE = ""; + private static final String ITALIC_OPEN = ""; + private static final String ITALIC_CLOSE = ""; // special link chips (e.g. for YT videos, YT channels or social media accounts): // (u00a0) u00a0 u00a0 [/•] u00a0 u00a0 u00a0 @@ -44,30 +44,26 @@ static final class Run { @Nonnull final String open; @Nonnull final String close; final int pos; - final boolean isClose; @Nullable final Function transformContent; int openPosInOutput = -1; Run( @Nonnull final String open, @Nonnull final String close, - final int pos, - final boolean isClose + final int pos ) { - this(open, close, pos, isClose, null); + this(open, close, pos, null); } Run( @Nonnull final String open, @Nonnull final String close, final int pos, - final boolean isClose, @Nullable final Function transformContent ) { this.open = open; this.close = close; this.pos = pos; - this.isClose = isClose; this.transformContent = transformContent; } @@ -87,6 +83,7 @@ public boolean sameOpen(@Nonnull final Run other) { * @param attributedDescription the JSON object of the attributed description * @return the parsed description, in HTML format, as a string */ + @Nullable public static String attributedDescriptionToHtml( @Nullable final JsonObject attributedDescription ) { @@ -243,10 +240,8 @@ private static void addAllCommandRuns( final String open = ""; final Function 
transformContent = getTransformContentFun(run); - openers.add(new Run(open, LINK_CLOSE, startIndex, false, - transformContent)); - closers.add(new Run(open, LINK_CLOSE, startIndex + length, true, - transformContent)); + openers.add(new Run(open, LINK_CLOSE, startIndex, transformContent)); + closers.add(new Run(open, LINK_CLOSE, startIndex + length, transformContent)); }); } @@ -297,19 +292,19 @@ private static void addAllStyleRuns( final int end = start + length; if (run.has("strikethrough")) { - openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start, false)); - closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end, true)); + openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start)); + closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end)); } if (run.getBoolean("italic", false)) { - openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start, false)); - closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end, true)); + openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start)); + closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end)); } if (run.has("weightLabel") && !"FONT_WEIGHT_NORMAL".equals(run.getString("weightLabel"))) { - openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start, false)); - closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end, true)); + openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start)); + closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end)); } }); } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java index d55237b259..2641815b12 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java @@ -3,6 +3,8 @@ import java.io.Serializable; import java.util.Objects; +import javax.annotation.Nullable; + public class Description implements Serializable { public static final int HTML = 1; @@ -13,7 +15,7 @@ public 
class Description implements Serializable { private final String content; private final int type; - public Description(final String content, final int type) { + public Description(@Nullable final String content, final int type) { this.type = type; if (content == null) { this.content = ""; From 3f7b2653e3d2245b33e5fcc5e385d0739c4b6ac7 Mon Sep 17 00:00:00 2001 From: Stypox Date: Mon, 8 Apr 2024 09:47:38 +0200 Subject: [PATCH 5/6] [YouTube] Add YoutubeDescriptionHelperTest --- .../youtube/YoutubeDescriptionHelperTest.java | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java new file mode 100644 index 0000000000..7c0713651c --- /dev/null +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java @@ -0,0 +1,81 @@ +package org.schabi.newpipe.extractor.services.youtube; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.runsToHtml; + +import org.junit.jupiter.api.Test; +import org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.Run; + +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; + +public class YoutubeDescriptionHelperTest { + + private static void assertRunsToHtml(final String expectedHtml, + final List openers, + final List closers, + final String content) { + assertEquals( + expectedHtml, + runsToHtml( + openers.stream() + .sorted(Comparator.comparingInt(run -> run.pos)) + .collect(Collectors.toList()), + closers.stream() + .sorted(Comparator.comparingInt(run -> run.pos)) + 
.collect(Collectors.toList()), + content + ) + ); + } + + @Test + public void testNoRuns() { + assertRunsToHtml( + "abc *a* _c_

test  &", + List.of(), + List.of(), + "abc *a* _c_
\u00a0\n\u00a0test &" + ); + } + + @Test + public void testNormalRuns() { + assertRunsToHtml( + "hello nice test", + List.of(new Run("", "", 0), new Run("", "", 3), + new Run("", "", 11)), + List.of(new Run("", "", 9), new Run("", "", 6), + new Run("", "", 15)), + "hello nice test" + ); + } + + @Test + public void testOverlappingRuns() { + assertRunsToHtml( + "0123456789", + List.of(new Run("", "", 2), new Run("", "", 4)), + List.of(new Run("", "", 6), new Run("", "", 8)), + "0123456789" + ); + } + + @Test + public void testTransformingRuns() { + final Function tA = content -> "whatever"; + final Function tD + = content -> Integer.parseInt(content) % 2 == 0 ? "even" : "odd"; + + assertRunsToHtml( + "0whatever45odd89", + List.of(new Run("", "", 1, tA), new Run("", "", 2), + new Run("", "", 3), new Run("", "", 6, tD)), + List.of(new Run("", "", 4, tA), new Run("", "", 3), + new Run("", "", 5), new Run("", "", 8, tD)), + "0123456789" + ); + } +} From 02274d5395dfb6275a202245726f7bada2e2c5fe Mon Sep 17 00:00:00 2001 From: Stypox Date: Mon, 8 Apr 2024 10:19:26 +0200 Subject: [PATCH 6/6] [YouTube] Avoid XSS attacks in description or comments --- .../youtube/YoutubeDescriptionHelper.java | 33 +++++++++++-------- .../youtube/YoutubeDescriptionHelperTest.java | 2 +- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java index 33148d7ebc..afe053a358 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java @@ -35,7 +35,7 @@ private YoutubeDescriptionHelper() { // special link chips (e.g. 
for YT videos, YT channels or social media accounts): // (u00a0) u00a0 u00a0 [/•] u00a0 u00a0 u00a0 private static final Pattern LINK_CONTENT_CLEANER_REGEX - = Pattern.compile("(?s)^\u00a0+[/•]\u00a0+(.*?)\u00a0+$"); + = Pattern.compile("(?s)^ +[/•] +(.*?) +$"); /** * Can be a command run, or a style run. @@ -74,10 +74,13 @@ public boolean sameOpen(@Nonnull final Run other) { /** * Parse a video description in the new "attributed" format, which contains the entire visible - * plaintext ({@code content}) and an array of {@code commandRuns}. + * plaintext ({@code content}) and an array of {@code commandRuns} and {@code styleRuns}. + * Returns the formatted content in HTML format, and escapes the text to make sure there are no + * XSS attacks. * *

<p> - * The {@code commandRuns} include the links and their position in the text. + * {@code commandRuns} include the links and their range in the text, while {@code styleRuns} + * include the styling to apply to various ranges in the text. * </p>

* * @param attributedDescription the JSON object of the attributed description @@ -119,25 +122,28 @@ public static String attributedDescriptionToHtml( * Applies the formatting specified by the intervals stored in {@code openers} and {@code * closers} to {@code content} in order to obtain valid HTML even when intervals overlap. For * example <b>b<i>b&i</b>i</i> would not be valid HTML, so this function - * instead generates <b>b<i>b&i</i></b><i>i</i>. + * instead generates <b>b<i>b&i</i></b><i>i</i>. Any HTML + * special characters in {@code rawContent} are escaped to make sure there are no XSS attacks. + * *

<p> * Every opener in {@code openers} must have a corresponding closer in {@code closers}. Every * corresponding (opener, closer) pair must have a length of at least one (i.e. empty intervals * are not allowed). * </p>

* - * @param openers contains all of the places where a run begins, must have the same size of - * closers, must be ordered by {@link Run#pos} - * @param closers contains all of the places where a run ends, must have the same size of - * openers, must be ordered by {@link Run#pos} - * @param content the content to apply formatting to + * @param openers contains all of the places where a run begins, must have the same size of + * closers, must be ordered by {@link Run#pos} + * @param closers contains all of the places where a run ends, must have the same size of + * openers, must be ordered by {@link Run#pos} + * @param rawContent the content to apply formatting to, and to escape to avoid XSS * @return the formatted content in HTML */ static String runsToHtml( @Nonnull final List openers, @Nonnull final List closers, - @Nonnull final String content + @Nonnull final String rawContent ) { + final String content = rawContent.replace('\u00a0', ' '); final Stack openRuns = new Stack<>(); final Stack tempStack = new Stack<>(); final StringBuilder textBuilder = new StringBuilder(); @@ -154,7 +160,7 @@ static String runsToHtml( : closers.get(closersIndex).pos; // append piece of text until current index - textBuilder.append(content, currentTextPos, minPos); + textBuilder.append(Entities.escape(content.substring(currentTextPos, minPos))); currentTextPos = minPos; if (closers.get(closersIndex).pos == minPos) { @@ -205,12 +211,11 @@ static String runsToHtml( } // append last piece of text - textBuilder.append(content, currentTextPos, content.length()); + textBuilder.append(Entities.escape(content.substring(currentTextPos))); return textBuilder.toString() .replace("\n", "
") - .replace(" ", "  ") - .replace('\u00a0', ' '); + .replace(" ", "  "); } private static void addAllCommandRuns( diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java index 7c0713651c..6e9b193034 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java @@ -34,7 +34,7 @@ private static void assertRunsToHtml(final String expectedHtml, @Test public void testNoRuns() { assertRunsToHtml( - "abc *a* _c_

test  &", + "abc *a* _c_ <br>
<a href=\"#\">test</a>  &amp;", List.of(), List.of(), "abc *a* _c_
\u00a0\n\u00a0test &"