-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
输出文件的命名存在问题
- Loading branch information
Showing
13 changed files
with
817 additions
and
1,055 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
/out/ | ||
*txt |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Manifest-Version: 1.0 | ||
Main-Class: com.tumuyan.dictspider.DumpMoeGirl | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
Manifest-Version: 1.0 | ||
Main-Class: com.tumuyan.dictspider.WikiCClean | ||
Main-Class: com.tumuyan.dictspider.Clean | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
package com.tumuyan.dictspider; | ||
|
||
import java.io.*; | ||
import java.util.*; | ||
|
||
import static com.tumuyan.dictspider.Utils.OpenCC_T2S; | ||
import static com.tumuyan.dictspider.Utils.WriteList; | ||
|
||
// 把dump的wiki词条,拆解为中文、英文、混合词条 | ||
|
||
|
||
/* | ||
env.engine.context:commit_history().latest_text() | ||
pinyin_simp_wiki.dict.yaml | ||
# Rime dictionary | ||
# encoding: utf-8 | ||
# | ||
# 维基百科词库 | ||
# by tumuyan | ||
--- | ||
name: pinyin_simp_wiki | ||
version: "20210628" | ||
sort: by_weight | ||
use_preset_vocabulary: false | ||
... | ||
手动清理 | ||
^[^a-zA-Z][a-zA-Z]+\s | ||
^.{1,2}[村县]$ | ||
*/ | ||
|
||
|
||
public class Clean { | ||
|
||
public static void main(String[] args) { | ||
// System.out.println("args.length="+args.length + ", class="+ Clean.class.getSimpleName()); | ||
// for(String s :args){ | ||
// System.out.println(" args:"+s); | ||
// } | ||
|
||
Config config = new Config(); | ||
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220220-all-titles-in-ns0"); | ||
config.Parse(args); | ||
|
||
if (!config.verifyInputPath()) { | ||
return; | ||
} | ||
|
||
Dict dict = new Dict(); | ||
for (String p : config.getInput_files()) { | ||
System.out.println("Load file: " + p); | ||
dict.add(ReadFile(p)); | ||
} | ||
|
||
OutputWords(dict, config.getPath_w(), config.auto_delete, true); | ||
} | ||
|
||
|
||
public static void OutputWords(Dict dict, String path_w, boolean auto_delete, boolean t2s) { | ||
try { | ||
Set<String> chs = dict.getChs(); | ||
|
||
if (t2s) { | ||
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false); | ||
|
||
OpenCC_T2S(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", "A:\\EBookTools\\OpenCC\\bin"); | ||
chs = Utils.ReadWords(path_w + ".chs.dict.txt"); | ||
chs.removeAll(ReadWords()); | ||
WriteList(chs, path_w + ".chs2.dict.txt", auto_delete, false); | ||
|
||
} else { | ||
chs.removeAll(ReadWords()); | ||
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false); | ||
} | ||
|
||
WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false); | ||
WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false); | ||
WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
|
||
System.out.println("Finish"); | ||
} | ||
|
||
|
||
public static Set<String> ReadWords() { | ||
Set<String> words = new HashSet<>(); | ||
|
||
words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词.txt")); | ||
words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词-村县镇乡路村縣鎮鄉路.txt")); | ||
return words; | ||
} | ||
|
||
|
||
// path_w为空时,读取path每一行文本,如果包含tab,把第一个字到keys中;并返回key | ||
// path_w不为空时,把带拼音的写入path_w并返回key | ||
public static Dict ReadFile(String path) { | ||
Dict dict = new Dict(); | ||
|
||
try { | ||
FileInputStream fileInputStream = new FileInputStream(path); | ||
|
||
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fileInputStream)); | ||
|
||
String line = null; | ||
|
||
StringBuffer buffer = new StringBuffer(); | ||
|
||
while ((line = bufferedReader.readLine()) != null) { | ||
|
||
// 如果匹配到空行 | ||
if (line.length() < 2) | ||
continue; | ||
dict.add(line); | ||
} | ||
|
||
fileInputStream.close(); | ||
|
||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
return dict; | ||
|
||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
package com.tumuyan.dictspider; | ||
|
||
import java.io.File; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/**
 * Command-line configuration shared by the tools in this package.
 *
 * <p>Recognised arguments:
 * <ul>
 *   <li>{@code -h}, {@code -help}: print a help line and stop parsing</li>
 *   <li>{@code -a}: enable {@code auto_delete}</li>
 *   <li>{@code -d}, {@code -debug}: enable debug mode, which allows the default paths to be used</li>
 *   <li>{@code -o}, {@code -output} PATH: output path prefix</li>
 *   <li>{@code -p}, {@code -pagelimit} N: page limit</li>
 *   <li>{@code -i}, {@code -input} FILE: input file, may be repeated</li>
 * </ul>
 */
public class Config {
    /** Output path prefix; empty while no usable output path is known. */
    private String path_w = "";
    /** Output path used by {@link #verifyOutputPath()} in debug mode when none was given. */
    String default_path_w = "";
    /** Input path used by {@link #verifyInputPath()} in debug mode when none was given. */
    String default_path = "";
    boolean auto_delete = false;
    boolean debug = false;
    /** Page limit; {@link Integer#MAX_VALUE} means the limit is disabled. */
    Integer pageLimit = Integer.MAX_VALUE;
    /** Existing input files in the order they were given, without duplicates. */
    List<String> input_files = new ArrayList<>();

    public Config() {
    }

    public void setDefault_path(String default_path) {
        this.default_path = default_path;
    }

    public void setDefault_path_w(String default_path_w) {
        this.default_path_w = default_path_w;
    }

    public List<String> getInput_files() {
        return input_files;
    }

    public Integer getPageLimit() {
        return pageLimit;
    }

    public String getPath_w() {
        return path_w;
    }

    public boolean isAuto_delete() {
        return auto_delete;
    }

    /**
     * Applies the given command-line arguments to this configuration.
     *
     * <p>Problems are reported on standard output and the offending argument
     * is skipped; this method does not throw for malformed or missing values.
     * Unknown arguments are ignored.
     *
     * @param args raw command-line arguments
     */
    public void Parse(String[] args) {
        for (int i = 0; i < args.length; i++) {
            final String arg = args[i];
            if (arg.equals("-h") || arg.equals("-help")) {
                System.out.println("help\n");
                return;
            } else if (arg.equals("-a")) {
                auto_delete = true;
            } else if (arg.equals("-d") || arg.equals("-debug")) {
                debug = true;
            } else if (arg.equals("-o") || arg.equals("-output")) {
                i++;
                if (args.length > i) {
                    parseOutputPath(args[i]);
                } else {
                    System.out.println("[Err]Output arg not exist.");
                }
            } else if (arg.equals("-p") || arg.equals("-pagelimit")) {
                i++;
                if (args.length > i) {
                    parsePageLimit(args[i]);
                } else {
                    System.out.println("[Err]pageLimit arg not exist.");
                }
            } else if (arg.equals("-i") || arg.equals("-input")) {
                i++;
                // Guard against "-i" being the last argument, which used to
                // raise ArrayIndexOutOfBoundsException.
                if (args.length > i) {
                    addInputFile(args[i]);
                } else {
                    System.out.println("[Err]Input arg not exist.");
                }
            }
        }
    }

    /** Derives {@code path_w} from an {@code -o} value and checks that its folder exists. */
    private void parseOutputPath(String value) {
        // Replace a trailing ".ext" and/or a trailing path separator with a single ".".
        path_w = value.trim().replaceFirst("(\\.[^./\\\\]+)?[/\\\\]?$", ".");
        File folder = new File(path_w).getParentFile();
        if (folder == null) {
            // A bare name without a folder is resolved next to the location this class
            // was loaded from.
            // NOTE(review): assumes every class of this package is loaded from the same
            // code source, so Config.class is equivalent to the entry-point class.
            File location = new File(Config.class.getProtectionDomain().getCodeSource().getLocation().getFile());
            File resolved = new File(location.getParentFile(), path_w);

            folder = resolved.getParentFile();
            path_w = resolved.getPath();
        }

        if (folder.exists()) {
            System.out.println("Output to: " + path_w);
        } else {
            path_w = "";
            System.out.println("[Err]Output folder not exist: " + folder.getPath());
        }
    }

    /** Parses a {@code -p} value; a non-numeric value disables the limit. */
    private void parsePageLimit(String value) {
        try {
            pageLimit = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // Integer.parseInt never returns null; it throws on bad input.
            pageLimit = Integer.MAX_VALUE;
            System.out.println("[Err]unexpected pageLimit arg and disable pageLimit: " + value);
        }
    }

    /** Registers an {@code -i} value if the file exists and was not given before. */
    private void addInputFile(String value) {
        if (input_files.contains(value)) {
            // Already registered; repeating an input file is harmless.
            return;
        }
        if (new File(value).exists()) {
            input_files.add(value);
            System.out.println("Input: " + value);
        } else {
            System.out.println("[Err]Input file not exist: " + value);
        }
    }

    /**
     * Checks that an output path is available.
     *
     * <p>In debug mode a missing output path is replaced by the default
     * output path, if one was set.
     *
     * @return {@code true} when {@link #getPath_w()} is non-empty afterwards
     */
    public boolean verifyOutputPath() {
        if (path_w.length() < 1) {
            if (debug && default_path_w.length() > 0) {
                path_w = default_path_w;
            } else {
                System.out.println("[Err]Output path missing.");
                return false;
            }
        }
        return true;
    }

    /**
     * Checks that at least one input file is available.
     *
     * <p>In debug mode a missing input is replaced by the default input path.
     * In that case, if no output path is set either, the output path is
     * derived from the default input path by removing {@code ".dict.txt"}.
     * When input files were given explicitly the output path is left
     * untouched.
     *
     * @return {@code true} when {@link #getInput_files()} is non-empty afterwards
     */
    public boolean verifyInputPath() {
        String path = "";

        if (input_files.size() < 1) {
            if (debug && default_path.length() > 1) {
                path = default_path;
                input_files = new ArrayList<>();
                input_files.add(path);
            } else {
                System.out.println("[Err]Input path missing.");
                return false;
            }
        }

        // The original test was "length() < 0", which can never be true.
        if (path_w.isEmpty()) {
            path_w = path.replace(".dict.txt", "");
        }
        return true;
    }
}
Oops, something went wrong.