Commit

Reorganize the parameters so the program can be used outside debug mode
tumuyan committed Mar 26, 2022
1 parent 5c6bf86 commit c162404
Showing 9 changed files with 433 additions and 68 deletions.
10 changes: 10 additions & 0 deletions .idea/artifacts/Clean.xml

Some generated files are not rendered by default.

9 changes: 0 additions & 9 deletions .idea/artifacts/dic_spider_jar.xml

This file was deleted.

2 changes: 2 additions & 0 deletions DumpMoegirlDebugDemo.bat
@@ -0,0 +1,2 @@
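REM chcp 65001 switches the console code page to UTF-8 so the Chinese output displays correctly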
chcp 65001
java -jar out\artifacts\DumpMoegirl\DumpMoegirl.jar -o out\moegirl-debug -p 6
29 changes: 29 additions & 0 deletions Readme.md
@@ -0,0 +1,29 @@
# Dict Trick

This is a set of tools for conveniently processing dictionaries (lexicons).

The following have been given an initial cleanup so far:

## Clean

Filters junk words out of a dictionary and performs Simplified/Traditional Chinese conversion.


## DumpMoeGirl

Dumps the Moegirl Wiki (萌娘百科) lexicon and invokes the Clean tool to process the result.


## Usage

1. Download or build the jar files.
2. Download OpenCC (Java is cross-platform and OpenCC itself is cross-platform, so in theory these tools can also be used on Linux).
3. Edit OpenCC's Simplified/Traditional conversion configuration as needed.
4. Edit the junk-word (blacklist) files as needed.
5. Edit the configuration file as needed. `config.txt` in this repository is an example with the supported parameters annotated (the configuration file can have any name).
6. Run `java -jar DumpMoegirl.jar -c config.txt` to run the crawler against the configuration file,
   and `java -jar Clean.jar -c config.txt` to filter plain-text entries with the configuration file.
7. You can also skip the configuration file and pass the required parameters directly on the command line (see the sketch below).
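
A minimal sketch of step 7, modeled on `DumpMoegirlDebugDemo.bat` from this commit: the output prefix `out\moegirl` is hypothetical, and the meanings of `-o` (output path) and `-p` (a numeric crawl parameter) are assumptions; only `-c`/`-config` is confirmed by the source shown below.

```bat
chcp 65001
REM hypothetical output prefix; the -o and -p flag meanings are assumptions
java -jar DumpMoegirl.jar -o out\moegirl -p 6
```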

79 changes: 56 additions & 23 deletions src/com/tumuyan/dictspider/Clean.java
Original file line number Diff line number Diff line change
@@ -3,8 +3,7 @@
import java.io.*;
import java.util.*;

import static com.tumuyan.dictspider.Utils.OpenCC_T2S;
import static com.tumuyan.dictspider.Utils.WriteList;
import static com.tumuyan.dictspider.Utils.*;

// Split the dumped wiki entries into Chinese, English, and mixed entries

@@ -39,46 +38,60 @@
public class Clean {

public static void main(String[] args) {
// System.out.println("args.length="+args.length + ", class="+ Clean.class.getSimpleName());
// for(String s :args){
// System.out.println(" args:"+s);
// System.out.println("args.length=" + args.length + ", class=" + Clean.class.getSimpleName());
// for (String s : args) {
// System.out.println(" args:" + s);
// }

Config config = new Config();
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220220-all-titles-in-ns0");
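// Look for an optional -c / -config flag; if present, the next argument is used as the config file path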
String config_file = null;
List<String> arg_list = Arrays.asList(args);
int index = arg_list.indexOf("-c");
if (index < 0)
index = arg_list.indexOf("-config");

if (index >= 0 && index < args.length - 1) {
config_file = args[index + 1];
}
Config config = Utils.ReadConfig(config_file);
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220320-all-titles-in-ns0");
config.Parse(args);

if (!config.verifyInputPath()) {
return;
}


Dict dict = new Dict();
for (String p : config.getInput_files()) {
System.out.println("Load file: " + p);
dict.add(ReadFile(p));
}

OutputWords(dict, config.getPath_w(), config.auto_delete, true);
OutputWords(dict, config);
}


public static void OutputWords(Dict dict, String path_w, boolean auto_delete, boolean t2s) {
public static void OutputWords(Dict dict, Config config) {
try {
Set<String> chs = dict.getChs();
String opencc_path = config.getOpencc_path();
String opencc_config = config.getOpencc_config();
String path_w = config.getPath_w();
boolean auto_delete = config.isAuto_delete();

if (t2s) {

if (config.verifyOpencc()) {
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);

OpenCC_T2S(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", "A:\\EBookTools\\OpenCC\\bin");
OpenCC(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", opencc_path, opencc_config);
chs = Utils.ReadWords(path_w + ".chs.dict.txt");
chs.removeAll(ReadWords());
WriteList(chs, path_w + ".chs2.dict.txt", auto_delete, false);
}

} else {
chs.removeAll(ReadWords());
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);
if (config.verifyBlacklist()) {
chs.removeAll(ReadBlackWords(config.getBlacklist()));
WriteGrayWords(chs, path_w, config.getBlacklist_fix(), config.getBlacklist_regex());
}

WriteList(chs, path_w + ".dict.txt", auto_delete, false);
WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false);
WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false);
WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false);
@@ -89,15 +102,37 @@ public static void OutputWords(Dict dict, String path_w, boolean auto_delete, bo
System.out.println("Finish");
}

// Junk-word (blacklist) list
public static Set<String> ReadBlackWords(List<String> list) {

public static Set<String> ReadWords() {
Set<String> words = new HashSet<>();

words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词.txt"));
words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词-村县镇乡路村縣鎮鄉路.txt"));
for (String str : list) {
words.addAll(Utils.ReadWords(str));
}
return words;
}

// Some entries are neither in the blacklist nor in the fix lists; they will most likely be added to the blacklist later
public static void WriteGrayWords(Set<String> chs, String path_w, List<String> black_fix, List<String> black_regix) throws Exception {
System.out.println(new Date() + " WriteGrayWords...");
Set<String> grayWords = new HashSet<>();
Set<String> words = new HashSet<>();
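// Collect every entry from the fix lists; entries present there are never written to the gray list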
for (String rule : black_fix) {
words.addAll(Utils.ReadWords(rule));
}

for (String rule : black_regix) {
for (String str : chs) {
if (str.matches(rule)) {
if (!words.contains(str))
grayWords.add(str);
}
}
}
WriteList(grayWords, path_w + ".gray.dict.txt", true, false);
}


// When path_w is empty: read each line of path; if a line contains a tab, add the first field into keys, and return the keys
// When path_w is not empty: write the entries with pinyin to path_w and return the keys
@@ -109,9 +144,7 @@ public static Dict ReadFile(String path) {

BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fileInputStream));

String line = null;

StringBuffer buffer = new StringBuffer();
String line;

while ((line = bufferedReader.readLine()) != null) {

