Commit

Reorganize the parameters so the program can be used outside debug mode
tumuyan committed Mar 26, 2022
1 parent 5c6bf86 commit c162404
Showing 9 changed files with 433 additions and 68 deletions.
10 changes: 10 additions & 0 deletions .idea/artifacts/Clean.xml

Some generated files are not rendered by default.

9 changes: 0 additions & 9 deletions .idea/artifacts/dic_spider_jar.xml

This file was deleted.

2 changes: 2 additions & 0 deletions DumpMoegirlDebugDemo.bat
@@ -0,0 +1,2 @@
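REM chcp 65001 switches the console code page to UTF-8 so the Chinese output displays correctly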
chcp 65001
java -jar out\artifacts\DumpMoegirl\DumpMoegirl.jar -o out\moegirl-debug -p 6
29 changes: 29 additions & 0 deletions Readme.md
@@ -0,0 +1,29 @@
# Dict Trick

This is a set of tools for conveniently processing dictionaries (lexicons).

The following have been given an initial cleanup so far:

## Clean

Filters junk words out of a dictionary and performs Simplified/Traditional Chinese conversion.


## DumpMoeGirl

Dumps the Moegirl Wiki (萌娘百科) lexicon and invokes the Clean tool to process the result.


## Usage

1. Download or build the jar files.
2. Download OpenCC (Java is cross-platform and OpenCC itself is cross-platform, so in theory these tools can also be used on Linux).
3. Edit OpenCC's Simplified/Traditional conversion configuration as needed.
4. Edit the junk-word (blacklist) files as needed.
5. Edit the configuration file as needed. `config.txt` in this repository is an example with the supported parameters annotated (the configuration file can have any name).
6. Run `java -jar DumpMoegirl.jar -c config.txt` to run the crawler against the configuration file,
   and `java -jar Clean.jar -c config.txt` to filter plain-text entries with the configuration file.
7. You can also skip the configuration file and pass the required parameters directly on the command line (see the sketch below).
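
A minimal sketch of step 7, modeled on `DumpMoegirlDebugDemo.bat` from this commit: the output prefix `out\moegirl` is hypothetical, and the meanings of `-o` (output path) and `-p` (a numeric crawl parameter) are assumptions; only `-c`/`-config` is confirmed by the source shown below.

```bat
chcp 65001
REM hypothetical output prefix; the -o and -p flag meanings are assumptions
java -jar DumpMoegirl.jar -o out\moegirl -p 6
```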

79 changes: 56 additions & 23 deletions src/com/tumuyan/dictspider/Clean.java
Original file line number Diff line number Diff line change
@@ -3,8 +3,7 @@
import java.io.*;
import java.util.*;

import static com.tumuyan.dictspider.Utils.OpenCC_T2S;
import static com.tumuyan.dictspider.Utils.WriteList;
import static com.tumuyan.dictspider.Utils.*;

// Split the dumped wiki entries into Chinese, English, and mixed entries

@@ -39,46 +38,60 @@
public class Clean {

public static void main(String[] args) {
// System.out.println("args.length="+args.length + ", class="+ Clean.class.getSimpleName());
// for(String s :args){
// System.out.println(" args:"+s);
// System.out.println("args.length=" + args.length + ", class=" + Clean.class.getSimpleName());
// for (String s : args) {
// System.out.println(" args:" + s);
// }

Config config = new Config();
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220220-all-titles-in-ns0");
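// Look for an optional -c / -config flag; if present, the next argument is used as the config file path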
String config_file = null;
List<String> arg_list = Arrays.asList(args);
int index = arg_list.indexOf("-c");
if (index < 0)
index = arg_list.indexOf("-config");

if (index >= 0 && index < args.length - 1) {
config_file = args[index + 1];
}
Config config = Utils.ReadConfig(config_file);
config.setDefault_path("A:\\ProjectPython\\zhwiki-20220320-all-titles-in-ns0");
config.Parse(args);

if (!config.verifyInputPath()) {
return;
}


Dict dict = new Dict();
for (String p : config.getInput_files()) {
System.out.println("Load file: " + p);
dict.add(ReadFile(p));
}

OutputWords(dict, config.getPath_w(), config.auto_delete, true);
OutputWords(dict, config);
}


public static void OutputWords(Dict dict, String path_w, boolean auto_delete, boolean t2s) {
public static void OutputWords(Dict dict, Config config) {
try {
Set<String> chs = dict.getChs();
String opencc_path = config.getOpencc_path();
String opencc_config = config.getOpencc_config();
String path_w = config.getPath_w();
boolean auto_delete = config.isAuto_delete();

if (t2s) {

if (config.verifyOpencc()) {
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);

OpenCC_T2S(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", "A:\\EBookTools\\OpenCC\\bin");
OpenCC(path_w + ".cn.dict.txt", path_w + ".chs.dict.txt", opencc_path, opencc_config);
chs = Utils.ReadWords(path_w + ".chs.dict.txt");
chs.removeAll(ReadWords());
WriteList(chs, path_w + ".chs2.dict.txt", auto_delete, false);
}

} else {
chs.removeAll(ReadWords());
WriteList(chs, path_w + ".cn.dict.txt", auto_delete, false);
if (config.verifyBlacklist()) {
chs.removeAll(ReadBlackWords(config.getBlacklist()));
WriteGrayWords(chs, path_w, config.getBlacklist_fix(), config.getBlacklist_regex());
}

WriteList(chs, path_w + ".dict.txt", auto_delete, false);
WriteList(dict.getEng(), path_w + ".eng.dict.txt", auto_delete, false);
WriteList(dict.getMix(), path_w + ".mix.dict.txt", auto_delete, false);
WriteList(dict.getSuffix(), path_w + ".chs.suffix.txt", auto_delete, false);
@@ -89,15 +102,37 @@ public static void OutputWords(Dict dict, String path_w, boolean auto_delete, bo
System.out.println("Finish");
}

// Junk-word (blacklist) list
public static Set<String> ReadBlackWords(List<String> list) {

public static Set<String> ReadWords() {
Set<String> words = new HashSet<>();

words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词.txt"));
words.addAll(Utils.ReadWords("A:\\ProjectOthers\\rime-pinyin-simp\\others\\废词-村县镇乡路村縣鎮鄉路.txt"));
for (String str : list) {
words.addAll(Utils.ReadWords(str));
}
return words;
}

// Some entries are neither in the blacklist nor in the fix lists; they will most likely be added to the blacklist later
public static void WriteGrayWords(Set<String> chs, String path_w, List<String> black_fix, List<String> black_regix) throws Exception {
System.out.println(new Date() + " WriteGrayWords...");
Set<String> grayWords = new HashSet<>();
Set<String> words = new HashSet<>();
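// Collect every entry from the fix lists; entries present there are never written to the gray list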
for (String rule : black_fix) {
words.addAll(Utils.ReadWords(rule));
}

for (String rule : black_regix) {
for (String str : chs) {
if (str.matches(rule)) {
if (!words.contains(str))
grayWords.add(str);
}
}
}
WriteList(grayWords, path_w + ".gray.dict.txt", true, false);
}


// When path_w is empty: read each line of path; if a line contains a tab, add the first field into keys, and return the keys
// When path_w is not empty: write the entries with pinyin to path_w and return the keys
@@ -109,9 +144,7 @@ public static Dict ReadFile(String path) {

BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fileInputStream));

String line = null;

StringBuffer buffer = new StringBuffer();
String line;

while ((line = bufferedReader.readLine()) != null) {

