Skip to content

Commit

Permalink
增加less-output参数,更新readme
Browse files Browse the repository at this point in the history
  • Loading branch information
tumuyan committed Mar 26, 2022
1 parent c162404 commit 933279f
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 7 deletions.
25 changes: 23 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,15 @@

## Clean

用于过滤词库中的废词,完成简繁转换

用于过滤词库中的废词,完成简繁转换,供中文输入法使用。针对wiki词条和萌娘百科词条做了精准优化和大量测试。处理过程并非直接使用黑名单/正则进行匹配。

- 一定程度上保留了中英文混合词条
- 统一调整`-`、`·`等符号
- 从空格、标点切分词条
- 切分和抛弃大量含序号/数字/数量词的词条,包括但不限于`①⒛Ⅰⅻ甲`
- 切分和去除大量非常见汉字的词条
- 去除部分过短的词条
- ...

## DumpMoeGirl

Expand All @@ -26,4 +32,19 @@
6. 使用命令 `java -jar DumpMoegirl.jar -c config.txt` 来调用配置文件完成爬虫任务
使用命令 `java -jar Clean.jar -c config.txt` 来调用配置文件完成纯文本词条过滤任务
7. 当然也可以不使用配置文件,直接在命令行内输入所需参数
8. 程序运行结束后,名称以`.dict.txt`结尾且没有额外后缀的文件为最终文件。如果没有使用`-less-output`参数,还可以得到转换过程产生的其他文件,可用于进一步分析和改善。



## 附言

我在使用此工具以及深蓝词库转换工具持续更新萌娘百科、维基百科的rime词库文件,但是由于你并不一定使用了和我相同的配置文件和相同版本的软件,转换的结果可能存在差异。



转换结果和部分配置文件在我的[仓库](https://github.com/tumuyan/rime-melt)中:

- `pinyin_simp_wiki.dict.yaml`[Github下载](https://github.com/tumuyan/rime-pinyin-simp/raw/master/pinyin_simp_wiki.dict.yaml)
- `pinyin_simp_moe.dict.yaml`[Github下载](https://github.com/tumuyan/rime-pinyin-simp/raw/master/pinyin_simp_moe.dict.yaml)

- 废词文件:https://github.com/tumuyan/rime-melt/tree/master/others
14 changes: 11 additions & 3 deletions src/com/tumuyan/dictspider/Clean.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ public static void OutputWords(Dict dict, Config config) {

OpenCC(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", opencc_path, opencc_config);
chs = Utils.ReadWords(path_w + ".chs.dict.txt");

if (config.isLess_output()) {
File file = new File(path_w + ".cn.dict.txt");
file.delete();
}
}

if (config.verifyBlacklist()) {
Expand All @@ -92,9 +97,12 @@ public static void OutputWords(Dict dict, Config config) {
}

WriteList(chs, path_w + ".dict.txt", auto_delete, false);
WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false);
WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false);
WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false);
if (!config.isLess_output()) {
WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false);
WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false);
WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false);
}

} catch (Exception e) {
e.printStackTrace();
}
Expand Down
11 changes: 9 additions & 2 deletions src/com/tumuyan/dictspider/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class Config {
private List<String> blacklist = new ArrayList<>(); // junk words
private List<String> blacklist_fix = new ArrayList<>(); // fixes for over-filtered junk words
private List<String> blacklist_regex = new ArrayList<>(); // junk-word regular expressions
private boolean less_output; // output fewer files

public void setDefault_opencc_config(String default_opencc_config) {
this.default_opencc_config = default_opencc_config;
Expand Down Expand Up @@ -90,15 +91,19 @@ public boolean isAuto_delete() {
return auto_delete;
}

/**
 * Returns the value of the {@code less_output} flag.
 *
 * @return the current {@code less_output} field value
 */
public boolean isLess_output() {
return less_output;
}

/**
 * Creates a configuration object without performing any additional setup;
 * the constructor body is intentionally empty, so every field keeps the
 * value given by its declaration.
 */
public Config() {

}

public static String[] short_name = new String[]{
"h", "a", "d", "o", "p", "i", "cc", "ccc", "b", "bf", "bs"
"l", "h", "a", "d", "o", "p", "i", "cc", "ccc", "b", "bf", "bs"
};
public static List<String> full_name = Arrays.asList(
"help", "a", "debug", "output", "pagelimit", "input", "opencc", "opencc-config", "blacklist", "blacklist-fix", "blackstring"
"less-output", "help", "a", "debug", "output", "pagelimit", "input", "opencc", "opencc-config", "blacklist", "blacklist-fix", "blackstring"
);


Expand All @@ -113,6 +118,8 @@ public void Parse(String[] args) {
auto_delete = true;
} else if (arg.equals("-d") || arg.equals("-debug")) {
debug = true;
} else if (arg.equals("-l") || arg.equals("-less-output")) {
less_output = true;
} else if (arg.equals("-o") || arg.equals("-output")) {
i++;
if (args.length > i) {
Expand Down

0 comments on commit 933279f

Please sign in to comment.