Skip to content

Commit

Permalink
修复输出路径错误
Browse files Browse the repository at this point in the history
  • Loading branch information
tumuyan committed Apr 5, 2022
1 parent 933279f commit a6fe521
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 9 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/out/
*txt
/DumpMoegirl.jar
/Clean.jar
4 changes: 3 additions & 1 deletion DumpMoegirlDebugDemo.bat
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
chcp 65001
java -jar out\artifacts\DumpMoegirl\DumpMoegirl.jar -o out\moegirl-debug -p 6
java -jar out\artifacts\DumpMoegirl\DumpMoegirl.jar -o out\moegirl-debug

pause
27 changes: 27 additions & 0 deletions config.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#使用命令 java -jar DumpMoegirl.jar -c config.txt 来调用配置文件完成爬虫任务
#使用命令 java -jar Clean.jar -c config.txt 来调用配置文件完成纯文本词条过滤任务
#使用#开头表示对参数进行注释(注意把参数名称和值同时注释掉)
#允许多个参数,每行一个。行首行末的空白字符会在处理时自动去除
#参数请尽量不要包含空格和非英文
#-debug
#debug用参数,使用此开关可以不填部分参数,使用默认路径
-pageLimit
#debug使用,限制爬虫任务爬的页面数量
3
-input_files
#输入文件的列表,允许多个参数;爬虫任务无需此参数
-output_files
#输出文件的列表,wiki clean任务可以不填入此参数
moe.txt
-opencc_path
#opencc可执行文件所在的路径(不含文件名)。如果不设置opencc路径,则不使用简繁转换
-opencc_config
#opencc的配置文件所在的路径,是相对于opencc_path的路径
-blacklist
#废词列表文件,允许多个。当缺少废词列表时,不做废词过滤
-blacklist_fix
#修复过杀废词
-blacklist_regex
#废词正则表达式,允许多个。当词条不在废词列表中,但是与此正则表达式匹配时,输出到graylist文件中
-less-output
#不保留转换过程产生的大部分中间文件。
6 changes: 3 additions & 3 deletions src/com/tumuyan/dictspider/Clean.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static void main(String[] args) {
config_file = args[index + 1];
}
Config config = Utils.ReadConfig(config_file);
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220320-all-titles-in-ns0");
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220401-all-titles-in-ns0");
config.Parse(args);

if (!config.verifyInputPath()) {
Expand All @@ -73,13 +73,13 @@ public static void main(String[] args) {
public static void OutputWords(Dict dict, Config config) {
try {
Set<String> chs = dict.getChs();
String opencc_path = config.getOpencc_path();
String opencc_config = config.getOpencc_config();
String path_w = config.getPath_w();
boolean auto_delete = config.isAuto_delete();


if (config.verifyOpencc()) {
String opencc_path = config.getOpencc_path();
String opencc_config = config.getOpencc_config();
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);

OpenCC(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", opencc_path, opencc_config);
Expand Down
5 changes: 3 additions & 2 deletions src/com/tumuyan/dictspider/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,11 @@ public boolean verifyInputPath() {
System.out.println("[Err]Input path missing.");
return false;
}
}else{
path = input_files.get(0);
}

if (path_w.length() < 1) {
path_w = path.replace(".dict.txt", "");
path_w = path.replace(".txt", "");
}
return true;

Expand Down
2 changes: 1 addition & 1 deletion src/com/tumuyan/dictspider/DumpMoeGirl.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public static void main(String[] args) {
}
Config config = Utils.ReadConfig(config_file);

config.setDefault_path_w("moegirl.txt");
config.setDefault_path_w("A:\\ProjectPython\\moegirl.txt");

config.Parse(args);

Expand Down
2 changes: 1 addition & 1 deletion wikiClean.bat
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
chcp 65001
@echo off
echo input %1
java -jar %~dp0\out\artifacts\Clean\Clean-o %~dp0\out\clean -i %1 -d
java -jar %~dp0\out\artifacts\Clean\Clean.jar -o %~dp0\out\clean -i %1 -d

pause

0 comments on commit a6fe521

Please sign in to comment.