add support for webtoon.xyz

mikf · Jan 31, 2024 · 4cbbeb7 · 4cbbeb7
1 parent 22647c2
commit 4cbbeb7
Show file tree

Hide file tree

Showing 5 changed files with 170 additions and 0 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -973,6 +973,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Comics, Episodes</td>
     <td></td>
 </tr>
+<tr>
+    <td>Webtoon.xyz</td>
+    <td>https://www.webtoon.xyz/</td>
+    <td>Chapters, Manga</td>
+    <td></td>
+</tr>
 <tr>
     <td>Weibo</td>
     <td>https://www.weibo.com/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -174,6 +174,7 @@
     "weasyl",
     "webmshare",
     "webtoons",
+    "webtoonxyz",
     "weibo",
     "wikiart",
     "wikifeet",

diff --git a/gallery_dl/extractor/webtoonxyz.py b/gallery_dl/extractor/webtoonxyz.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.webtoon.xyz/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+import re
+
+
+class WebtoonxyzBase():
+    """Base class for Webtoon.xyz extractors"""
+    category = "webtoonxyz"
+    root = "https://www.webtoon.xyz"
+
+    @staticmethod
+    def parse_chapter_string(chapter_string, data):
+        match = re.match(
+            r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?",
+            text.unescape(chapter_string).strip())
+        manga, chapter, minor, title = match.groups()
+        manga = manga.strip() if manga else ""
+        data["manga"] = data.pop("manga", manga)
+        data["chapter"] = text.parse_int(chapter)
+        data["chapter_minor"] = minor or ""
+        data["title"] = title or ""
+        data["lang"] = "en"
+        data["language"] = "English"
+
+
+class WebtoonxyzChapterExtractor(WebtoonxyzBase, ChapterExtractor):
+    """Extractor for manga-chapters from www.webtoon.xyz"""
+    pattern = (r"(?:https?://)?(?:www\.)?webtoon\.xyz"
+               r"(/read/[^/?#]+/[^/?#]+)")
+    example = "https://www.webtoon.xyz/read/MANGA/chapter-01/"
+
+    def metadata(self, page):
+        tags = text.extr(page, 'class="wp-manga-tags-list">', '</div>')
+        data = {"tags": list(text.split_html(tags)[::2])}
+        info = text.extr(page, '<h1 id="chapter-heading">', "</h1>")
+        if not info:
+            raise exception.NotFoundError("chapter")
+        self.parse_chapter_string(info, data)
+        return data
+
+    def images(self, page):
+        page = text.extr(
+            page, '<div class="reading-content">', '<div class="entry-header')
+        return [
+            (text.extr(img, 'src="', '"').strip(), None)
+            for img in text.extract_iter(page, '<img id="image-', '>')
+        ]
+
+
+class WebtoonxyzMangaExtractor(WebtoonxyzBase, MangaExtractor):
+    """Extractor for manga from www.webtoon.xyz"""
+    chapterclass = WebtoonxyzChapterExtractor
+    pattern = r"(?:https?://)?(?:www\.)?webtoon\.xyz(/read/[^/?#]+)/?$"
+    example = "https://www.webtoon.xyz/read/MANGA"
+
+    def chapters(self, page):
+        if 'class="error404' in page:
+            raise exception.NotFoundError("manga")
+        data = self.metadata(page)
+        result = []
+        for chapter in text.extract_iter(
+                page, '<li class="wp-manga-chapter', "</li>"):
+            url , pos = text.extract(chapter, '<a href="', '"')
+            info, _ = text.extract(chapter, ">", "</a>", pos)
+            self.parse_chapter_string(info, data)
+            result.append((url, data.copy()))
+        return result
+
+    def metadata(self, page):
+        extr = text.extract_from(text.extr(
+            page, 'class="summary_content">', 'class="manga-action"'))
+        return {
+            "manga"      : text.extr(page, "<h1>", "</h1>").strip(),
+            "description": text.unescape(text.remove_html(text.extract(
+                page, ">", "</div>", page.index("summary__content"))[0])),
+            "rating"     : text.parse_float(
+                extr('total_votes">', "</span>").strip()),
+            "manga_alt"  : text.remove_html(
+                extr("Alternative </h5>\n</div>", "</div>")).split("; "),
+            "author"     : list(text.extract_iter(
+                extr('class="author-content">', "</div>"), '"tag">', "</a>")),
+            "artist"     : list(text.extract_iter(
+                extr('class="artist-content">', "</div>"), '"tag">', "</a>")),
+            "genres"     : list(text.extract_iter(
+                extr('class="genres-content">', "</div>"), '"tag">', "</a>")),
+            "type"       : text.remove_html(
+                extr("Type </h5>\n</div>", "</div>")),
+            "release"    : text.parse_int(text.remove_html(
+                extr("Release </h5>\n</div>", "</div>"))),
+            "status"     : text.remove_html(
+                extr("Status </h5>\n</div>", "</div>")),
+        }
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -140,6 +140,7 @@
     "wallpapercave"  : "Wallpaper Cave",
     "webmshare"      : "webmshare",
     "webtoons"       : "Webtoon",
+    "webtoonxyz"     : "Webtoon.xyz",
     "wikiart"        : "WikiArt.org",
     "wikimediacommons": "Wikimedia Commons",
     "xbunkr"         : "xBunkr",

diff --git a/test/results/webtoonxyz.py b/test/results/webtoonxyz.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import webtoonxyz
+from gallery_dl import exception
+
+
+__tests__ = (
+{
+    "#url"     : "https://www.webtoon.xyz/read/the-world-after-the-end/chapter-105/",
+    "#category": ("", "webtoonxyz", "chapter"),
+    "#class"   : webtoonxyz.WebtoonxyzChapterExtractor,
+    "#pattern" : r"https://www\.webtoon\.xyz/wp-content/uploads/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+",
+    "#count"   : 11,
+
+    "manga"        : "The World After The End",
+    "title"        : "",
+    "chapter"      : 105,
+    "lang"         : "en",
+    "language"     : "English",
+},
+
+{
+    "#url"     : "https://www.webtoon.xyz/read/the-world-after-the-end/chapter-1000000/",
+    "#category": ("", "webtoonxyz", "chapter"),
+    "#class"   : webtoonxyz.WebtoonxyzChapterExtractor,
+    "#exception": exception.NotFoundError,
+},
+
+{
+    "#url"     : "https://www.webtoon.xyz/read/the-world-after-the-end/",
+    "#category": ("", "webtoonxyz", "manga"),
+    "#class"   : webtoonxyz.WebtoonxyzMangaExtractor,
+    "#pattern" : r"https://www\.webtoon\.xyz/read/the-world-after-the-end/chapter-\d+([_-].+)?/",
+    "#count"   : ">= 13",
+
+    "manga"      : "The World After The End",
+    "author"     : ["S-Cynaan", "Sing Shong"],
+    "artist"     : ["Undead Potato"],
+    "genres"     : [
+        "Action",
+        "Adventure",
+        "Fantasy",
+    ],
+    "rating"     : float,
+    "status"     : "OnGoing",
+    "lang"       : "en",
+    "language"   : "English",
+    "manga_alt"  : list,
+},
+
+{
+    "#url"     : "https://www.webtoon.xyz/read/doesnotexist",
+    "#category": ("", "webtoonxyz", "manga"),
+    "#class"   : webtoonxyz.WebtoonxyzMangaExtractor,
+    "#exception": exception.HttpError,
+},
+
+)