fullname 辞書が UTF-8 になった関係の Makefile 調整

skk-dev/skktools#27 から annotation-filter.rb を持ってきて script に置いた
tamo · Aug 16, 2024 · 6fc84f9 · 6fc84f9
1 parent ceec4a0
commit 6fc84f9
Show file tree

Hide file tree

Showing 2 changed files with 151 additions and 18 deletions.
diff --git a/Makefile b/Makefile
@@ -5,22 +5,25 @@
 COUNT	  = skkdic-count
 CURL      = curl
 DATE	  = date
+DENO	  = deno
 EMACS	  = emacs --batch --directory ./
 EXPR	  = skkdic-expr
 EXPR2	  = skkdic-expr2
 GAWK	  = LC_ALL=C gawk
 GREP	  = grep
-SED	  = sed
 GZIP	  = gzip -9
+ICONV	  = iconv
 MD5	  = md5
 MV	  = mv --force
 RM	  = /bin/rm -f
 RUBY	  = ruby -I $(TOOLS_DIR)/filters
+SED	  = sed
 SORT	  = skkdic-sort
 TAR	  = tar
+TOUCH	  = touch
 UNZIP	  = unzip -o
+
 ZIPDIC_DIR  = ./zipcode
-DENO 	= deno
 
 DIC2PDB = dic2pdb
 DICCOMPACT = diccompact.rb
@@ -49,7 +52,8 @@ CDB_TARGET = ./`basename $(CDB_SOURCE)`.cdb
 
 clean:
 	$(RM) *.gz* *~ `find . -name '*~'` `find . -name '.*~'` `find . -name '.#*'` \
-	*.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.w PBinlineDB.dic *.taciturn SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \
+	*.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.u8 *.w PBinlineDB.dic *.taciturn \
+	SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \
 	emoji-list.txt
 
 archive: gzip
@@ -117,33 +121,35 @@ SKK-JISYO.L+: SKK-JISYO.L SKK-JISYO.L.header
 	$(EXPR2) SKK-JISYO.L + SKK-JISYO.tmp | cat SKK-JISYO.L.header - > SKK-JISYO.L+
 	$(RM) SKK-JISYO.tmp SKK-JISYO.addition
 
+# SKK-JISYO.total is built in UTF-8.  The EUC-JP dictionaries are merged through
+# their *.u8 conversions, SKK-JISYO.fullname is merged as is, and the filter
+# output collected in SKK-JISYO.tmp is converted with iconv before the merge.
+# Both scratch files (SKK-JISYO.tmp and SKK-JISYO.tmp.u8) are removed at the end.
-SKK-JISYO.total: SKK-JISYO.L SKK-JISYO.geo SKK-JISYO.station SKK-JISYO.jinmei SKK-JISYO.propernoun SKK-JISYO.fullname SKK-JISYO.law SKK-JISYO.okinawa SKK-JISYO.hukugougo SKK-JISYO.assoc SKK-JISYO.notes SKK-JISYO.L.header
+SKK-JISYO.total: SKK-JISYO.L.u8 SKK-JISYO.geo.u8 SKK-JISYO.station.u8 SKK-JISYO.jinmei.u8 SKK-JISYO.propernoun.u8 SKK-JISYO.fullname SKK-JISYO.law.u8 SKK-JISYO.okinawa.u8 SKK-JISYO.hukugougo.u8 SKK-JISYO.assoc.u8 SKK-JISYO.notes SKK-JISYO.L.header.u8
 	$(RUBY) $(TOOLS_DIR)/filters/conjugation.rb -Cpox SKK-JISYO.notes > SKK-JISYO.tmp
 	$(RUBY) $(TOOLS_DIR)/filters/asayaKe.rb -p SKK-JISYO.L >> SKK-JISYO.tmp
 	$(RUBY) $(TOOLS_DIR)/filters/complete-numerative.rb -pU SKK-JISYO.L >> SKK-JISYO.tmp
 	$(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -K -s 2 SKK-JISYO.L >> SKK-JISYO.tmp
 	$(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -w -s 2 SKK-JISYO.L >> SKK-JISYO.tmp
+	$(ICONV) -f euc-jp -t utf-8 SKK-JISYO.tmp > SKK-JISYO.tmp.u8
 	# order is very important here
-	$(EXPR2) SKK-JISYO.geo + SKK-JISYO.station + SKK-JISYO.jinmei + SKK-JISYO.propernoun + SKK-JISYO.fullname + SKK-JISYO.tmp + SKK-JISYO.law + SKK-JISYO.okinawa + SKK-JISYO.hukugougo + SKK-JISYO.assoc - SKK-JISYO.L > SKK-JISYO.addition
+	$(EXPR2) SKK-JISYO.geo.u8 + SKK-JISYO.station.u8 + SKK-JISYO.jinmei.u8 + SKK-JISYO.propernoun.u8 + SKK-JISYO.fullname + SKK-JISYO.tmp.u8 + SKK-JISYO.law.u8 + SKK-JISYO.okinawa.u8 + SKK-JISYO.hukugougo.u8 + SKK-JISYO.assoc.u8 - SKK-JISYO.L.u8 > SKK-JISYO.addition
 	# why eliminating SKK-JISYO.L once? -- to not add too noisy
 	# annotations from SKK-JISYO.jinmei and so on.
-	$(EXPR2) SKK-JISYO.L + SKK-JISYO.addition | cat SKK-JISYO.L.header - > SKK-JISYO.total
+	$(EXPR2) SKK-JISYO.L.u8 + SKK-JISYO.addition | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total
-	$(RM) SKK-JISYO.tmp SKK-JISYO.addition
+	$(RM) SKK-JISYO.tmp SKK-JISYO.tmp.u8 SKK-JISYO.addition
 
-SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode SKK-JISYO.L.header
-	$(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode
+# For the case where the zipcode dictionaries are not yet UTF-8: they are merged through their .u8 conversions
+SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 SKK-JISYO.L.header.u8
+	$(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode
 
 SKK-JISYO.L.taciturn: SKK-JISYO.L SKK-JISYO.L.header
 	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.L | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.L.taciturn
 
 SKK-JISYO.L+.taciturn: SKK-JISYO.L+ SKK-JISYO.L.header
 	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.L+ | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.L+.taciturn
 
-SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header
-	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total.taciturn
+# The two rules below filter the UTF-8 "total" dictionaries, so they run the
+# local script/annotation-filter.rb with -8 (read and write in utf8) and
+# prepend the UTF-8 header instead of the EUC-JP one.
+SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header.u8
+	$(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total.taciturn
 
-SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header
-	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode.taciturn
+SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header.u8
+	$(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode.taciturn
 
 SKK-JISYO.L+.unannotated: SKK-JISYO.L+
 	$(GAWK) -f $(TOOLS_DIR)/unannotation.awk SKK-JISYO.L+ > SKK-JISYO.L+.unannotated
@@ -158,6 +164,15 @@ SKK-JISYO.L.header: SKK-JISYO.L
 	echo ';; (This dictionary was automatically generated from SKK dictionaries)' > SKK-JISYO.L.header
 	$(SED) -n '/^;; okuri-ari entries./q;p' SKK-JISYO.L >> SKK-JISYO.L.header
 
+# UTF-8 conversions (*.u8).  The two explicit rules take precedence over the
+# generic pattern rule at the end.
+
+# The office zipcode dictionary is decoded as euc-jisx0213 rather than euc-jp.
+$(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8: $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode
+	$(ICONV) -f euc-jisx0213 -t utf-8 $< > $@
+
+# Convert the header and rewrite the coding cookie on its second line.
+# The conversion goes through a scratch file and sed writes the target itself:
+# "sed -i" without a suffix is not accepted by every sed, and this way the
+# target never exists with the stale "coding: euc-jp" cookie.
+SKK-JISYO.L.header.u8: SKK-JISYO.L.header
+	$(ICONV) -f euc-jp -t utf-8 $< > $@.tmp
+	$(SED) "2s/coding: euc-jp /coding: utf-8 /" $@.tmp > $@
+	$(RM) $@.tmp
+
+# Generic conversion for any file that is decoded as euc-jp.
+%.u8: %
+	$(ICONV) -f euc-jp -t utf-8 $< > $@
+
+
 unannotated-all: unannotated SKK-JISYO.L+.unannotated SKK-JISYO.total.unannotated SKK-JISYO.total+zipcode.unannotated
 
 taciturn-all: SKK-JISYO.L.taciturn SKK-JISYO.L+.taciturn SKK-JISYO.total.taciturn SKK-JISYO.total+zipcode.taciturn
@@ -242,26 +257,27 @@ EUC_SRCS = SKK-JISYO.assoc SKK-JISYO.china_taiwan SKK-JISYO.edict SKK-JISYO.geo
 UTF_SRCS = SKK-JISYO.edict2 SKK-JISYO.emoji SKK-JISYO.fullname SKK-JISYO.pinyin
 EUC_JSON = $(EUC_SRCS:%=json/%.json)
 UTF_JSON = $(UTF_SRCS:%=json/%.json)
+
 json: euc_json utf_json
 euc_json: $(EUC_SRCS)
 	for file in $(EUC_SRCS); do \
 		$(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \
 		-c EUC-JP -i $$file -m meta/$$file.yaml -o json/$$file.json ; \
-	done
+	done &&	$(TOUCH) euc_json
 utf_json: $(UTF_SRCS)
 	for file in $(UTF_SRCS); do \
 		$(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \
 		-c UTF-8 -i $$file -m meta/$$file.yaml -o json/$$file.json ; \
-	done
-euc: $(EUC_JSON)
+	done && $(TOUCH) utf_json
+euc_txt: $(EUC_JSON)
 	for file in $(EUC_SRCS); do \
 		$(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \
 		-c EUC-JP -i json/$$file.json -o $$file ; \
-	done
-utf: $(UTF_JSON)
+	done &&	$(TOUCH) euc_txt
+utf_txt: $(UTF_JSON)
 	for file in $(UTF_SRCS); do \
 		$(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \
 		-c UTF-8 -i json/$$file.json -o $$file ; \
-	done
+	done &&	$(TOUCH) utf_txt
 
 # end of Makefile.
diff --git a/script/annotation-filter.rb b/script/annotation-filter.rb
@@ -0,0 +1,117 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+## Copyright (C) 2005 MITA Yuusuke <[email protected]>
+##
+## Author: MITA Yuusuke <[email protected]>
+## Maintainer: SKK Development Team <[email protected]>
+## Keywords: japanese, dictionary
+##
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2, or (at your option)
+## any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program, see the file COPYING.  If not, write to the
+## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston,
+## MA 02110-1301, USA.
+##
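+### Instruction:
+##
+## Reads SKK dictionary entries from the files named on the command line
+## (or from standard input when none are given), applies the annotation
+## rules selected by the options to every candidate, and prints the
+## resulting midasi/candidate pairs.  Run with -h for the list of options.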
+
+require 'jcode' if RUBY_VERSION.to_f < 1.9
+#require 'kconv'
+# NOTE(review): parse_skk_entry, skk_split_tokens and print_pair are not defined
+# in this file; presumably they come from skkdictools -- confirm.
+require 'skkdictools'
+require 'optparse'
+opt = OptionParser.new
+
+# Defaults; each can be changed by one of the switches declared below.
+keep_annotation = false       # -k
+output_all = true             # -t turns this off
+unannotate_unique = false     # -s
+unannotate_cap = 99999999     # -j
+doublebar = "remove"          # "remove", "sticky" (-b) or "dumb" (-B)
+rulesets = Array.new          # [action, pattern] pairs, applied in the order given
+default_rulesets = [
+  [ "exclude", '※|\?$' ],
+  # [ "exclude", "\[卑\]" ],
+  [ "keep", '旧字|異体字|本字|大字|†|→' ],
+  # [ "keep", "NB:|=|≒|≠|和製|<rare>" ],
+  # [ "cut", "‖" ] - 'doublebar' handles it inplace
+]
+encoding = "euc-jis-2004"     # -8 switches to utf-8
+
+
+opt.on('-c pattern', 'cut annotations after <pattern>') { |pattern| rulesets << [ "cut", pattern]}
+opt.on('-e pattern', 'eliminate candidates if <pattern> matches') { |pattern| rulesets << [ "exclude", pattern]}
+opt.on('-x pattern', 'output pairs if <pattern> matches (use with -t)') { |pattern| rulesets << [ "extract", pattern]}
+opt.on('-u pattern', 'unannotate candidates if <pattern> matches (use with -k)') { |pattern| rulesets << [ "unannotate", pattern]}
+opt.on('-U pattern', 'keep annotations matching <pattern>') { |pattern| rulesets << [ "keep", pattern]}
+
+opt.on('-s', 'unannotate if the candidate is "unique"') { unannotate_unique = true }
+opt.on('-j VAL', "never unannotate if an entry has more than <VAL> candidates") { |v| unannotate_cap = v.to_i }
+opt.on('-k', 'keep annotations by default') { keep_annotation = true }
+opt.on('-t', "extraction mode: output requested pairs only") { output_all = false }
+opt.on('-d', "apply default rulesets") { rulesets += default_rulesets }
+opt.on('-8', "read and write in utf8") { encoding = "utf-8" }
+
+opt.on('-b', "sticky '‖' -- annotation after '‖' will always be kept") { doublebar = "sticky" }
+#opt.on('-B', "always remove annotations after '‖'") { doublebar = "remove" }
+opt.on('-B', "treat '‖' as a part of annotation") { doublebar = "dumb" }
+
+
+begin
+  opt.parse!(ARGV)
+  #rulesets = default_rulesets if rulesets.empty?
+rescue OptionParser::ParseError
+  # ParseError is the common parent of InvalidOption, MissingArgument and the
+  # other option errors, so e.g. "-c" without a pattern is reported the same
+  # way as an unknown switch.  The hint goes to stderr because stdout carries
+  # dictionary data (the Makefile pipes it into another tool).
+  $stderr.print "'#{$0} -h' for help.\n"
+  exit 1
+end
+# Input is read in the selected encoding; lines are handled as UTF-8
+# internally and transcoded back to the selected encoding on output.
+Encoding.default_external = encoding
+STDOUT.set_encoding(encoding, "utf-8")
+
+# Compile every rule pattern once instead of once per candidate.
+compiled_rules = rulesets.map { |action, pattern| [action, Regexp.compile(pattern)] }
+
+
+while gets
+  $_.encode!("utf-8")
+  # Skip comment lines and empty lines.
+  next if $_ =~ /^;/ || $_ =~ /^$/
+  midasi, tokens = $_.parse_skk_entry
+  total = tokens.count {|item| !item.nil? }
+  #results = Array.new
+
+  tokens.each do |token|
+    word, annotation, comment = token.skk_split_tokens( doublebar == "dumb" ? nil : '‖')
+
+    # Per-candidate decisions start from the global defaults ...
+    do_unannotate = !keep_annotation
+    do_output = output_all
+    do_unannotate = true if unannotate_unique && total == 1
+    do_unannotate = false if unannotate_cap <= total
+
+    # ... and are then adjusted by each matching rule in turn; a later rule
+    # overrides an earlier one.  Rules only apply to annotated candidates.
+    compiled_rules.each do |action, regexp|
+      next if annotation.nil?
+      match = (annotation =~ regexp)
+      next unless match
+      case action
+      when "cut"
+        # Keep only the part of the annotation before the match.
+        annotation = annotation[0, match]
+      when "extract"
+        do_output = true
+      when "exclude"
+        do_output = false
+      when "unannotate"
+        do_unannotate = true
+      when "keep"
+        do_unannotate = false
+      end
+    end
+    next if !do_output
+    #results << [word, do_unannotate ? nil : annotation, doublebar == "sticky" ? comment : nil]
+    print_pair(midasi, word, do_unannotate ? nil : annotation, doublebar == "sticky" ? comment : nil)
+  end
+end