Skip to content

Commit

Permalink
update(v1.6.3-0): add subsystem-deduplicate;
Browse files Browse the repository at this point in the history
  • Loading branch information
jasoneri committed Feb 13, 2025
1 parent 4be07b0 commit d2220ee
Show file tree
Hide file tree
Showing 18 changed files with 269 additions and 72 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ log/
# PyBuilder
target/

# vscode
.vscode

# Jupyter Notebook
.ipynb_checkpoints

Expand Down Expand Up @@ -170,6 +173,7 @@ analyze/
*.ico
temp
conf.yml
record.db
requirements-in.txt
__temp
_bug_log
Expand Down
10 changes: 6 additions & 4 deletions ComicSpider/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.python import get_func_args

from utils import conf
from utils.website import JmUtils, set_author_ahead, MangabzUtils
from assets import res

Expand Down Expand Up @@ -67,10 +68,11 @@ def image_downloaded(self, response, request, info, *, item=None):
class JmComicPipeline(ComicPipeline):
def file_folder(self, basepath, section, spider, title, meta: dict):
path = super(JmComicPipeline, self).file_folder(basepath, section, spider, title, meta)
# jm上传者太多命名规范太杂有重名情况出现(例如'満开开花-催眠で'),重名时加上车号确保不重
_epsId = re.search(r"(\d+)$", meta.get("referer", ""))
if bool(_epsId):
path = f"{path}[{_epsId.group(1)}]"
if not conf.isDeduplicate:
# jm上传者太多命名规范太杂有重名情况出现(例如'満开开花-催眠で'),重名时加上车号确保不重
_epsId = re.search(r"(\d+)$", meta.get("referer", ""))
if bool(_epsId):
path = f"{path}[{_epsId.group(1)}]"
return path

def get_images(self, response, request, info, *, item=None):
Expand Down
44 changes: 28 additions & 16 deletions ComicSpider/spiders/basecomicspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
from variables import *
from assets import res as ori_res
from ComicSpider.items import ComicspiderItem
from utils import font_color, Queues, QueuesManager, PresetHtmlEl, correct_domain, temp_p
from utils import (
font_color, Queues, QueuesManager, PresetHtmlEl, correct_domain, temp_p, conf, md5
)
from utils.processed_class import (
TextBrowserState, ProcessState, QueueHandler, refresh_state, Url
)
from utils.sql import SqlUtils


class SayToGui:
Expand Down Expand Up @@ -86,6 +89,7 @@ class BaseComicSpider(scrapy.Spider):
manager: QueuesManager = None
Q: QueueHandler = None
say: SayToGui = None
sql_handler: SqlUtils = None
ua = {}

num_of_row = 5
Expand Down Expand Up @@ -270,6 +274,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.Q('ProcessQueue').send(spider.process_state)

spider.say = SayToGui(spider, q, spider.text_browser_state)
spider.sql_handler = SqlUtils(spider.name)
return spider

def close(self, reason):
Expand All @@ -279,6 +284,7 @@ def close(self, reason):
except:
pass
sleep(0.3)
self.sql_handler.close()
if reason == "ConnectionResetError":
return
elif 'init' not in self.process_state.process:
Expand Down Expand Up @@ -312,18 +318,21 @@ def parse_section(self, response):
self.Q('ProcessQueue').send(self.process_state)

title = PresetHtmlEl.sub(response.meta.get('title'))
self.say(f'{"=" * 15}{title}》')
results = self.frame_section(response) # {1: url1……}
referer = response.url
for page, url in results.items():
item = ComicspiderItem()
item['title'] = title
item['page'] = str(page)
item['section'] = 'meaningless'
item['image_urls'] = [url]
item['referer'] = referer
self.total += 1
yield item
title_md5 = md5(title)
if not conf.isDeduplicate or not (conf.isDeduplicate and self.sql_handler.check_dupe(title_md5)):
self.sql_handler.add(title_md5)
self.say(f'{"=" * 15}{title}》')
results = self.frame_section(response) # {1: url1……}
referer = response.url
for page, url in results.items():
item = ComicspiderItem()
item['title'] = title
item['page'] = str(page)
item['section'] = 'meaningless'
item['image_urls'] = [url]
item['referer'] = referer
self.total += 1
yield item
self.process_state.process = 'fin'
self.Q('ProcessQueue').send(self.process_state)

Expand All @@ -344,9 +353,12 @@ def parse_section(self, response):
yield scrapy.Request(url=next_page_flag, callback=self.parse_section, meta=meta)
else:
title = PresetHtmlEl.sub(response.meta.get('title'))
for page, url in results.items():
meta = {'title': title, 'page': page}
yield scrapy.Request(url=url, callback=self.parse_fin_page, meta=meta)
title_md5 = md5(title)
if not conf.isDeduplicate or not self.sql_handler.check_dupe(title_md5):
self.sql_handler.add(title_md5)
for page, url in results.items():
meta = {'title': title, 'page': page}
yield scrapy.Request(url=url, callback=self.parse_fin_page, meta=meta)


class BodyFormat:
Expand Down
2 changes: 1 addition & 1 deletion ComicSpider/spiders/wnacg.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def frame_book(self, response):
self.say(example_b.format(str(x + 1), title, chr(12288)))
self.say('') if (x + 1) % self.num_of_row == 0 else None
frame_results[x + 1] = [url, title]
preview.add(x + 1, img_preview, PresetHtmlEl.sub(title), preview_url) # 其实title已兜底处理,但preview受其影响所以前置一下
preview.add(x + 1, img_preview, title, preview_url)
self.say(preview.created_temp_html)
return self.say.frame_book_print(frame_results, url=response.url)

Expand Down
14 changes: 14 additions & 0 deletions GUI/conf_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,27 @@ def __init__(self, parent=None):
def setupUi(self, Dialog):
    """Build the generated UI, then wire up this dialog's signals and tooltips.

    :param Dialog: the dialog object handed on to the parent class' ``setupUi``.
    """
    super(ConfDialog, self).setupUi(Dialog)

    # Signal wiring: accepting the button box saves the config, and toggling
    # the isBanCover checkbox is routed to its state-change handler.
    self.buttonBox.accepted.connect(self.save_conf)
    self.isBanCover.stateChanged.connect(self.handleBanCoverStateChange)

    # The same tooltip (JSON dump of SPIDERS) is shown on the completer
    # editor and on its label.
    tip = QtCore.QCoreApplication.translate("Dialog", F"idx corresponds/序号对应:\n{json.dumps(SPIDERS)}")
    for widget in (self.completerEdit, self.label_completer):
        widget.setToolTip(tip)

def handleBanCoverStateChange(self, state):
    """React to a state change of the isBanCover checkbox.

    When the new state is ``Qt.Checked`` the isDeduplicate checkbox is
    unchecked and disabled; for any other state it is re-enabled (its
    checked value is left untouched).

    :param state: the check state value delivered with the change signal.
    """
    cover_banned = state == QtCore.Qt.Checked
    if cover_banned:
        # Clear the deduplicate option before locking the checkbox.
        self.isDeduplicate.setChecked(False)
    self.isDeduplicate.setDisabled(cover_banned)

def show_self(self):
    """Fill the dialog widgets from the current ``conf`` values and show the dialog.

    Intentionally not named ``show``: according to the original author's
    note, naming it ``show`` made this code run only once.
    """
    # 1. Text settings: widget ``<name>Edit`` receives conf.<name>
    #    (falsy values are replaced by an empty string before conversion).
    text_fields = ('sv_path', 'proxies', 'custom_map', 'cv_proj_path', "completer", "eh_cookies",
                   "clip_db", "clip_read_num")
    for field in text_fields:
        editor = getattr(self, f"{field}Edit")
        editor.setText(self.transfer_to_gui(getattr(conf, field) or ""))

    # Select the combo-box entry whose text equals the configured log level.
    combo = self.logLevelComboBox
    combo.setCurrentIndex(combo.findText(conf.log_level))

    # 2. Checkbox settings: the widget attribute and the conf attribute share one name.
    for flag in ('isBanCover', 'isDeduplicate'):
        getattr(self, flag).setChecked(getattr(conf, flag))

    super(ConfDialog, self).show()

@staticmethod
Expand All @@ -51,6 +63,8 @@ def save_conf(self):
"eh_cookies": yaml.safe_load(cp(getattr(self, f"eh_cookiesEdit").toPlainText().replace("\t", ""))),
"proxies": cp(self.proxiesEdit.text()).replace(" ", "").split(",") if self.proxiesEdit.text() else None,
"log_level": getattr(self, "logLevelComboBox").currentText(),
"isBanCover": getattr(self, "isBanCover").isChecked(),
"isDeduplicate": getattr(self, "isDeduplicate").isChecked(),
"clip_db": getattr(self, f"clip_dbEdit").text(),
"clip_read_num": getattr(self, f"clip_read_numEdit").text()
}
Expand Down
18 changes: 9 additions & 9 deletions GUI/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
import re
import sys
import time
import traceback
from multiprocessing import Process
import multiprocessing.managers as m
from PyQt5.QtCore import QThread, Qt, pyqtSignal, QCoreApplication, QRect
from PyQt5.QtWidgets import QMainWindow, QMenu, QAction, QMessageBox, QCompleter
import traceback

from GUI.uic.ui_mainwindow import Ui_MainWindow
from GUI.conf_dialog import ConfDialog
Expand All @@ -21,24 +21,22 @@
conf, p, ori_path)
from utils.processed_class import (
InputFieldState, TextBrowserState, ProcessState,
GuiQueuesManger, QueueHandler, refresh_state, crawl_what, ClipManager
GuiQueuesManger, QueueHandler, refresh_state, crawl_what, ClipManager,
PreviewHtml
)
from utils.website import httpx, Cookies, MangabzUtils, JmUtils, WnacgUtils, EHentaiKits
from utils.website import spider_utils_map
from utils.comic_viewer_tools import combine_then_mv, show_max
from deploy import curr_os

spider_utils_map = {1: object, 2: JmUtils, 3: WnacgUtils, 4: EHentaiKits, 5: MangabzUtils}


class ClipTasksThread(QThread):
info_signal = pyqtSignal(tuple)
total_signal = pyqtSignal(dict)

def __init__(self, gui, parse_func, tasks):
def __init__(self, gui, tasks):
super(ClipTasksThread, self).__init__()
self.gui: SpiderGUI = gui
self.tasks = tasks
self.parse_func = parse_func

def run(self):
self.msleep(1200) # 延后1s,否则子线程太快导致主界面没跟上
Expand All @@ -47,7 +45,7 @@ def run(self):
for idx, url in enumerate(self.tasks):
try:
resp = cli.get(url, follow_redirects=True, timeout=3)
info = self.parse_func(resp.text)
info = self.gui.spiderUtils.parse_book(resp.text)
self.msleep(50)
self.info_signal.emit((idx + 1, url, *info[1:]))
total[idx + 1] = [info[2], info[0]]
Expand Down Expand Up @@ -378,6 +376,8 @@ def set_preview(self):
self.previewBtn.setEnabled(True)
self.previewBtn.setFocus()
self.BrowserWindow.ensureBtn.clicked.connect(self.ensure_preview)
if conf.isDeduplicate:
PreviewHtml.tip_duplication(SPIDERS[self.chooseBox.currentIndex()], self.tf)

def show_preview(self):
"""prevent PreviewWindow is None when init"""
Expand Down Expand Up @@ -411,7 +411,7 @@ def init_clip_handle(self, tf, match_urls):
self.BrowserWindow.resize(self.BrowserWindow.width(), 860)
self.BrowserWindow.show()
self.page = self.BrowserWindow.view.page()
self.clipTasksThread = ClipTasksThread(self, getattr(self.spiderUtils, "parse_book"), match_urls)
self.clipTasksThread = ClipTasksThread(self, match_urls)
self.clipTasksThread.info_signal.connect(self.single_clip_tasks_data)
self.clipTasksThread.total_signal.connect(self.all_clip_tasks_data)
self.clipTasksThread.start()
Expand Down
14 changes: 14 additions & 0 deletions GUI/src/preview_format/bootstrap.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Preview</title>
<link rel="stylesheet" href="https://cdn.staticfile.net/bootstrap/5.3.2/css/bootstrap.min.css">
<style>
/* Dim every element that carries the `downloaded` class. */
.downloaded {
filter: grayscale(100%); /* render the image in grayscale */
opacity: 0.6; /* lower the image opacity */
}
</style>
</head>
<body>
<script src="https://cdn.staticfile.net/bootstrap/5.3.2/js/bootstrap.min.js"></script>
Expand All @@ -20,6 +26,14 @@
return selectedValues
}
</script>
<script>
// Once the DOM is ready, give the `.form-check` container of every thumbnail
// marked `downloaded` a light-salmon background.
document.addEventListener("DOMContentLoaded", function () {
    const downloadedImages = document.querySelectorAll('.img-thumbnail.downloaded');
    downloadedImages.forEach(img => {
        // closest() returns null when no ancestor matches the selector.
        // Skip such thumbnails instead of throwing, because an exception here
        // would abort forEach and leave the remaining thumbnails unhighlighted.
        const container = img.closest('.form-check');
        if (container) {
            container.style.backgroundColor = 'lightsalmon';
        }
    });
});
</script>
<div class="row">
{body}
</div>
Expand Down
6 changes: 5 additions & 1 deletion GUI/src/preview_format/bootstrap_by_clip.html
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,15 @@
border-radius: 5px;
font-size: 12px;
}

.modal-dialog {
max-width: 65%; /* change this to any width you need, e.g. 600px or 80% */
align-items: center;
}

.downloaded {
filter: grayscale(100%); /* render the image in grayscale */
opacity: 0.6; /* lower the image opacity */
}
</style>
</head>
<body>
Expand Down
28 changes: 28 additions & 0 deletions GUI/uic/conf_dia.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,32 @@ def setupUi(self, Dialog):
self.logLevelComboBox.addItem("")
self.logLevelComboBox.addItem("")
self.horizontalLayout_log_level.addWidget(self.logLevelComboBox)
self.line = QtWidgets.QFrame(Dialog)
self.line.setFrameShape(QtWidgets.QFrame.VLine)
self.line.setFrameShadow(QtWidgets.QFrame.Sunken)
self.line.setObjectName("line")
self.horizontalLayout_log_level.addWidget(self.line)
self.isDeduplicate = QtWidgets.QCheckBox(Dialog)
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Maximum, QtWidgets.QSizePolicy.Expanding)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.isDeduplicate.sizePolicy().hasHeightForWidth())
self.isDeduplicate.setSizePolicy(sizePolicy)
self.isDeduplicate.setObjectName("isDeduplicate")
self.horizontalLayout_log_level.addWidget(self.isDeduplicate)
self.line_2 = QtWidgets.QFrame(Dialog)
self.line_2.setFrameShape(QtWidgets.QFrame.VLine)
self.line_2.setFrameShadow(QtWidgets.QFrame.Sunken)
self.line_2.setObjectName("line_2")
self.horizontalLayout_log_level.addWidget(self.line_2)
self.isBanCover = QtWidgets.QCheckBox(Dialog)
sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Maximum, QtWidgets.QSizePolicy.Expanding)
sizePolicy.setHorizontalStretch(0)
sizePolicy.setVerticalStretch(0)
sizePolicy.setHeightForWidth(self.isBanCover.sizePolicy().hasHeightForWidth())
self.isBanCover.setSizePolicy(sizePolicy)
self.isBanCover.setObjectName("isBanCover")
self.horizontalLayout_log_level.addWidget(self.isBanCover)
self.gridLayout_2.addLayout(self.horizontalLayout_log_level, 1, 0, 1, 1)
self.horizontalLayout_proxies = QtWidgets.QHBoxLayout()
self.horizontalLayout_proxies.setObjectName("horizontalLayout_proxies")
Expand Down Expand Up @@ -258,6 +284,8 @@ def retranslateUi(self, Dialog):
self.logLevelComboBox.setItemText(1, _translate("Dialog", "DEBUG"))
self.logLevelComboBox.setItemText(2, _translate("Dialog", "INFO"))
self.logLevelComboBox.setItemText(3, _translate("Dialog", "ERROR"))
self.isDeduplicate.setText(_translate("Dialog", "去重"))
self.isBanCover.setText(_translate("Dialog", "禁止覆盖"))
self.label_4.setToolTip(_translate("Dialog", "proxies"))
self.label_4.setText(_translate("Dialog", "代理"))
self.proxiesEdit.setToolTip(_translate("Dialog", "proxies"))
Expand Down
5 changes: 4 additions & 1 deletion GUI/uic/ui_mainwindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def setupUi(self, MainWindow):
font.setBold(False)
font.setWeight(50)
self.textBrowser.setFont(font)
self.textBrowser.setStyleSheet("QTextBrowser {\n"
"color: black;\n"
"}")
self.textBrowser.setObjectName("textBrowser")
self.funcLayout.addWidget(self.textBrowser)
self.funcGroupBox = QtWidgets.QGroupBox(self.centralwidget)
Expand Down Expand Up @@ -476,7 +479,7 @@ def setupUi(self, MainWindow):

def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "ComicGUISpider v1.6.2"))
MainWindow.setWindowTitle(_translate("MainWindow", "ComicGUISpider v1.6.3"))
self.chooseBox.setToolTip(_translate("MainWindow", "选中网站后看状态栏有输入提示"))
self.chooseBox.setItemText(0, _translate("MainWindow", "点选一个网站"))
self.chooseBox.setItemText(1, _translate("MainWindow", "1、拷贝漫画"))
Expand Down
Loading

1 comment on commit d2220ee

@jasoneri
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#21

Please sign in to comment.