Skip to content

Commit

Permalink
实装调用参数,重构增加代码重用程度
Browse files Browse the repository at this point in the history
输出文件的命名存在问题
  • Loading branch information
tumuyan committed Mar 2, 2022
1 parent 9421429 commit 5c6bf86
Show file tree
Hide file tree
Showing 13 changed files with 817 additions and 1,055 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/out/
*txt
13 changes: 13 additions & 0 deletions .idea/artifacts/DumpMoegirl.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions meta/DumpMoegirl/META-INF/MANIFEST.MF
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: com.tumuyan.dictspider.DumpMoeGirl

2 changes: 1 addition & 1 deletion src/META-INF/MANIFEST.MF
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Manifest-Version: 1.0
-Main-Class: com.tumuyan.dictspider.WikiCClean
+Main-Class: com.tumuyan.dictspider.Clean

134 changes: 134 additions & 0 deletions src/com/tumuyan/dictspider/Clean.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package com.tumuyan.dictspider;

import java.io.*;
import java.util.*;

import static com.tumuyan.dictspider.Utils.OpenCC_T2S;
import static com.tumuyan.dictspider.Utils.WriteList;

// 把dump的wiki词条,拆解为中文、英文、混合词条


/*
env.engine.context:commit_history().latest_text()
pinyin_simp_wiki.dict.yaml
# Rime dictionary
# encoding: utf-8
#
# 维基百科词库
# by tumuyan
---
name: pinyin_simp_wiki
version: "20210628"
sort: by_weight
use_preset_vocabulary: false
...
手动清理
^[^a-zA-Z][a-zA-Z]+\s
^.{1,2}[村县]$
*/


/**
 * Command-line tool that loads dumped wiki titles into a {@link Dict} and writes the
 * Chinese, English, mixed and suffix word sets exposed by that dictionary to separate files.
 */
public class Clean {

    /** Input used by {@link Config#verifyInputPath()} when no {@code -i} argument is given in debug mode. */
    private static final String DEFAULT_INPUT = "A:\\ProjectPython\\zhwiki-20220220-all-titles-in-ns0";

    /** Directory handed to {@code OpenCC_T2S}; presumably the OpenCC binary folder (TODO confirm in Utils). */
    private static final String OPENCC_DIR = "A:\\EBookTools\\OpenCC\\bin";

    /** Word lists whose entries are removed from the Chinese output. */
    private static final String[] BLOCKLIST_FILES = {
            "A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词.txt",
            "A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词-村县镇乡路村縣鎮鄉路.txt",
    };

    /**
     * Parses the command line, loads every input file into one dictionary and writes the word lists.
     * Returns without output when no input path can be determined.
     *
     * @param args command-line arguments, interpreted by {@link Config#Parse(String[])}
     */
    public static void main(String[] args) {
        Config config = new Config();
        config.setDefault_path(DEFAULT_INPUT);
        config.Parse(args);

        if (!config.verifyInputPath()) {
            return;
        }

        Dict dict = new Dict();
        for (String inputFile : config.getInput_files()) {
            System.out.println("Load file: " + inputFile);
            dict.add(ReadFile(inputFile));
        }

        // Use the accessor rather than reaching into Config's field directly.
        OutputWords(dict, config.getPath_w(), config.isAuto_delete(), true);
    }


    /**
     * Writes the word sets of {@code dict} to files named {@code path_w + <suffix>}.
     *
     * <p>With {@code t2s} set, the Chinese set is first written to {@code .cn.dict.txt}, passed through
     * {@code OpenCC_T2S} into {@code .chs.dict.txt}, read back, stripped of the words returned by
     * {@link #ReadWords()} and written to {@code .chs2.dict.txt}. Without it, the Chinese set is stripped
     * of those words and written to {@code .cn.dict.txt} directly. The English, mixed and suffix sets are
     * written in both cases.
     *
     * <p>Any exception is printed and swallowed; "Finish" is printed regardless of success.
     *
     * @param dict        dictionary providing the word sets
     * @param path_w      output path prefix that every file suffix is appended to
     * @param auto_delete forwarded unchanged to {@code WriteList}
     * @param t2s         whether to run the OpenCC conversion step on the Chinese words
     */
    public static void OutputWords(Dict dict, String path_w, boolean auto_delete, boolean t2s) {
        try {
            Set<String> chs = dict.getChs();

            if (t2s) {
                String rawFile = path_w + ".cn.dict.txt";
                String convertedFile = path_w + ".chs.dict.txt";

                WriteList(chs, rawFile, auto_delete, false);
                OpenCC_T2S(rawFile, convertedFile, OPENCC_DIR);

                chs = Utils.ReadWords(convertedFile);
                chs.removeAll(ReadWords());
                WriteList(chs, path_w + ".chs2.dict.txt", auto_delete, false);
            } else {
                // NOTE(review): this mutates the set returned by dict.getChs(); confirm whether
                // that set is the dictionary's own storage.
                chs.removeAll(ReadWords());
                WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);
            }

            WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false);
            WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false);
            WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false);
        } catch (Exception e) {
            e.printStackTrace();
        }

        System.out.println("Finish");
    }


    /**
     * Loads the words that must be removed from the Chinese output.
     *
     * @return a new mutable set holding the union of all block-list files
     */
    public static Set<String> ReadWords() {
        Set<String> words = new HashSet<>();
        for (String blocklistFile : BLOCKLIST_FILES) {
            words.addAll(Utils.ReadWords(blocklistFile));
        }
        return words;
    }


    /**
     * Reads {@code path} line by line and adds every line of at least two characters to a new
     * dictionary. Empty and single-character lines are skipped.
     *
     * <p>Reading is best-effort: an exception is printed and whatever was collected so far is returned.
     *
     * @param path file to read
     * @return the dictionary built from the file's lines, possibly empty
     */
    public static Dict ReadFile(String path) {
        Dict dict = new Dict();

        // try-with-resources closes the stream even when reading fails half-way.
        // The input is decoded as UTF-8 explicitly instead of relying on the platform default
        // (assumes the title dumps are UTF-8 encoded).
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() < 2) {
                    continue;
                }
                dict.add(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dict;
    }


}
146 changes: 146 additions & 0 deletions src/com/tumuyan/dictspider/Config.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
package com.tumuyan.dictspider;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Command-line configuration: output path prefix, input files, page limit and the
 * auto-delete / debug switches, filled in by {@link #Parse(String[])}.
 */
public class Config {
    /** Output path prefix; empty until set by {@code -o} or by one of the verify methods. */
    private String path_w = "";
    /** Output prefix used by {@link #verifyOutputPath()} in debug mode when {@code -o} is absent. */
    String default_path_w = "";
    /** Input file used by {@link #verifyInputPath()} in debug mode when {@code -i} is absent. */
    String default_path = "";
    /** Set by {@code -a}. */
    boolean auto_delete = false;
    /** Set by {@code -d} / {@code -debug}; enables the default input and output paths. */
    boolean debug = false;
    /** Set by {@code -p}; {@link Integer#MAX_VALUE} means the limit is disabled. */
    Integer pageLimit = Integer.MAX_VALUE;
    /** Existing input files collected from {@code -i}, without duplicates. */
    List<String> input_files = new ArrayList<>();

    public void setDefault_path(String default_path) {
        this.default_path = default_path;
    }

    public void setDefault_path_w(String default_path_w) {
        this.default_path_w = default_path_w;
    }

    public List<String> getInput_files() {
        return input_files;
    }

    public Integer getPageLimit() {
        return pageLimit;
    }

    public String getPath_w() {
        return path_w;
    }

    public boolean isAuto_delete() {
        return auto_delete;
    }

    public Config() {

    }

    /**
     * Reads the command-line options into this configuration.
     *
     * <p>Recognised options: {@code -h}/{@code -help} (prints help and stops parsing), {@code -a},
     * {@code -d}/{@code -debug}, {@code -o}/{@code -output <path>}, {@code -p}/{@code -pagelimit <n>}
     * and {@code -i}/{@code -input <file>} (repeatable). Unknown arguments are ignored. Problems are
     * reported on standard output and never thrown.
     *
     * @param args raw command-line arguments
     */
    public void Parse(String[] args) {
        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "-h":
                case "-help":
                    System.out.println("help\n");
                    return;
                case "-a":
                    auto_delete = true;
                    break;
                case "-d":
                case "-debug":
                    debug = true;
                    break;
                case "-o":
                case "-output":
                    i++;
                    if (i < args.length) {
                        applyOutput(args[i]);
                    } else {
                        System.out.println("[Err]Output arg not exist.");
                    }
                    break;
                case "-p":
                case "-pagelimit":
                    i++;
                    if (i < args.length) {
                        applyPageLimit(args[i]);
                    } else {
                        System.out.println("[Err]pageLimit arg not exist.");
                    }
                    break;
                case "-i":
                case "-input":
                    i++;
                    // Guard the index: a trailing "-i" used to throw ArrayIndexOutOfBoundsException.
                    if (i < args.length) {
                        addInputFile(args[i]);
                    } else {
                        System.out.println("[Err]Input arg not exist.");
                    }
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Turns the {@code -o} value into the output prefix: the value is trimmed, a trailing file
     * extension and/or path separator is replaced by a single {@code "."}, and a value without a
     * parent folder is resolved against the folder containing this class's code source.
     * The prefix is reset to empty when the target folder does not exist.
     */
    private void applyOutput(String value) {
        path_w = value.trim().replaceFirst("(\\.[^./\\\\]+)?[/\\\\]?$", ".");
        File folder = new File(path_w).getParentFile();
        if (folder == null) {
            // Bare file name: place the output next to the running jar / classes directory.
            // Config.class is used as the anchor so this class does not depend on a specific tool class.
            File codeSource = new File(Config.class.getProtectionDomain().getCodeSource().getLocation().getFile());
            File resolved = new File(codeSource.getParentFile(), path_w);

            folder = resolved.getParentFile();
            path_w = resolved.getPath();
        }

        if (folder.exists()) {
            System.out.println("Output to: " + path_w);
        } else {
            path_w = "";
            System.out.println("[Err]Output folder not exist: " + folder.getPath());
        }
    }

    /** Parses the {@code -p} value; a non-numeric value disables the limit instead of crashing. */
    private void applyPageLimit(String value) {
        try {
            pageLimit = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            pageLimit = Integer.MAX_VALUE;
            System.out.println("[Err]unexpected pageLimit arg and disable pageLimit: " + value);
        }
    }

    /** Registers an input file if it exists; a file that was already registered is ignored. */
    private void addInputFile(String value) {
        if (input_files.contains(value)) {
            return;
        }
        if (new File(value).exists()) {
            input_files.add(value);
            System.out.println("Input: " + value);
        } else {
            System.out.println("[Err]Input file not exist: " + value);
        }
    }

    /**
     * Checks that an output prefix is available, falling back to the default prefix in debug mode.
     *
     * @return {@code true} when {@link #getPath_w()} is non-empty afterwards
     */
    public boolean verifyOutputPath() {
        if (path_w.isEmpty()) {
            if (debug && default_path_w.length() > 0) {
                path_w = default_path_w;
            } else {
                System.out.println("[Err]Output path missing.");
                return false;
            }
        }
        return true;
    }


    /**
     * Checks that at least one input file is available, falling back to the default input in debug
     * mode. When no output prefix has been set, it is derived from the first input file by removing
     * {@code ".dict.txt"} from its path.
     *
     * @return {@code true} when at least one input file is configured afterwards
     */
    public boolean verifyInputPath() {
        if (input_files.isEmpty()) {
            if (debug && default_path.length() > 1) {
                input_files.add(default_path);
            } else {
                System.out.println("[Err]Input path missing.");
                return false;
            }
        }

        // The original test was "length() < 0", which can never be true, and it derived the prefix
        // from a local that stayed empty whenever inputs came from "-i".
        if (path_w.isEmpty()) {
            path_w = input_files.get(0).replace(".dict.txt", "");
        }
        return true;
    }
}
Loading

0 comments on commit 5c6bf86

Please sign in to comment.