From 6fc84f947a6acb627b0c0264b020821a52cae2a9 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Fri, 16 Aug 2024 17:26:02 +0900 Subject: [PATCH] =?UTF-8?q?fullname=20=E8=BE=9E=E6=9B=B8=E3=81=8C=20UTF-8?= =?UTF-8?q?=20=E3=81=AB=E3=81=AA=E3=81=A3=E3=81=9F=E9=96=A2=E4=BF=82?= =?UTF-8?q?=E3=81=AE=20Makefile=20=E8=AA=BF=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://github.com/skk-dev/skktools/pull/27 から annotation-filter.rb を持ってきて script に置いた --- Makefile | 52 ++++++++++------ script/annotation-filter.rb | 117 ++++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 18 deletions(-) create mode 100755 script/annotation-filter.rb diff --git a/Makefile b/Makefile index 2ccf2a3..50094fc 100644 --- a/Makefile +++ b/Makefile @@ -5,22 +5,25 @@ COUNT = skkdic-count CURL = curl DATE = date +DENO = deno EMACS = emacs --batch --directory ./ EXPR = skkdic-expr EXPR2 = skkdic-expr2 GAWK = LC_ALL=C gawk GREP = grep -SED = sed GZIP = gzip -9 +ICONV = iconv MD5 = md5 MV = mv --force RM = /bin/rm -f RUBY = ruby -I $(TOOLS_DIR)/filters +SED = sed SORT = skkdic-sort TAR = tar +TOUCH = touch UNZIP = unzip -o + ZIPDIC_DIR = ./zipcode -DENO = deno DIC2PDB = dic2pdb DICCOMPACT = diccompact.rb @@ -49,7 +52,8 @@ CDB_TARGET = ./`basename $(CDB_SOURCE)`.cdb clean: $(RM) *.gz* *~ `find . -name '*~'` `find . -name '.*~'` `find . 
-name '.#*'` \ - *.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.w PBinlineDB.dic *.taciturn SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \ + *.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.u8 *.w PBinlineDB.dic *.taciturn \ + SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \ emoji-list.txt archive: gzip @@ -117,21 +121,23 @@ SKK-JISYO.L+: SKK-JISYO.L SKK-JISYO.L.header $(EXPR2) SKK-JISYO.L + SKK-JISYO.tmp | cat SKK-JISYO.L.header - > SKK-JISYO.L+ $(RM) SKK-JISYO.tmp SKK-JISYO.addition -SKK-JISYO.total: SKK-JISYO.L SKK-JISYO.geo SKK-JISYO.station SKK-JISYO.jinmei SKK-JISYO.propernoun SKK-JISYO.fullname SKK-JISYO.law SKK-JISYO.okinawa SKK-JISYO.hukugougo SKK-JISYO.assoc SKK-JISYO.notes SKK-JISYO.L.header +SKK-JISYO.total: SKK-JISYO.L.u8 SKK-JISYO.geo.u8 SKK-JISYO.station.u8 SKK-JISYO.jinmei.u8 SKK-JISYO.propernoun.u8 SKK-JISYO.fullname SKK-JISYO.law.u8 SKK-JISYO.okinawa.u8 SKK-JISYO.hukugougo.u8 SKK-JISYO.assoc.u8 SKK-JISYO.notes SKK-JISYO.L.header.u8 $(RUBY) $(TOOLS_DIR)/filters/conjugation.rb -Cpox SKK-JISYO.notes > SKK-JISYO.tmp $(RUBY) $(TOOLS_DIR)/filters/asayaKe.rb -p SKK-JISYO.L >> SKK-JISYO.tmp $(RUBY) $(TOOLS_DIR)/filters/complete-numerative.rb -pU SKK-JISYO.L >> SKK-JISYO.tmp $(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -K -s 2 SKK-JISYO.L >> SKK-JISYO.tmp $(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -w -s 2 SKK-JISYO.L >> SKK-JISYO.tmp + $(ICONV) -f euc-jp -t utf-8 SKK-JISYO.tmp > SKK-JISYO.tmp.u8 # order is very important here - $(EXPR2) SKK-JISYO.geo + SKK-JISYO.station + SKK-JISYO.jinmei + SKK-JISYO.propernoun + SKK-JISYO.fullname + SKK-JISYO.tmp + SKK-JISYO.law + SKK-JISYO.okinawa + SKK-JISYO.hukugougo + SKK-JISYO.assoc - SKK-JISYO.L > SKK-JISYO.addition + $(EXPR2) SKK-JISYO.geo.u8 + SKK-JISYO.station.u8 + SKK-JISYO.jinmei.u8 + SKK-JISYO.propernoun.u8 + SKK-JISYO.fullname + SKK-JISYO.tmp.u8 + SKK-JISYO.law.u8 + SKK-JISYO.okinawa.u8 + 
SKK-JISYO.hukugougo.u8 + SKK-JISYO.assoc.u8 - SKK-JISYO.L.u8 > SKK-JISYO.addition # why eliminating SKK-JISYO.L once? -- to not add too noisy # annotations from SKK-JISYO.jinmei and so on. - $(EXPR2) SKK-JISYO.L + SKK-JISYO.addition | cat SKK-JISYO.L.header - > SKK-JISYO.total + $(EXPR2) SKK-JISYO.L.u8 + SKK-JISYO.addition | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total $(RM) SKK-JISYO.tmp SKK-JISYO.addition -SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode SKK-JISYO.L.header - $(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode +# zipcode がまだ UTF-8 ではない場合 +SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 SKK-JISYO.L.header.u8 + $(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode SKK-JISYO.L.taciturn: SKK-JISYO.L SKK-JISYO.L.header $(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.L | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.L.taciturn @@ -139,11 +145,11 @@ SKK-JISYO.L.taciturn: SKK-JISYO.L SKK-JISYO.L.header SKK-JISYO.L+.taciturn: SKK-JISYO.L+ SKK-JISYO.L.header $(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.L+ | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.L+.taciturn -SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header - $(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total.taciturn +SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header.u8 + $(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total.taciturn -SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header - $(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d 
SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode.taciturn +SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header.u8 + $(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode.taciturn SKK-JISYO.L+.unannotated: SKK-JISYO.L+ $(GAWK) -f $(TOOLS_DIR)/unannotation.awk SKK-JISYO.L+ > SKK-JISYO.L+.unannotated @@ -158,6 +164,15 @@ SKK-JISYO.L.header: SKK-JISYO.L echo ';; (This dictionary was automatically generated from SKK dictionaries)' > SKK-JISYO.L.header $(SED) -n '/^;; okuri-ari entries./q;p' SKK-JISYO.L >> SKK-JISYO.L.header +$(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8: $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode + $(ICONV) -f euc-jisx0213 -t utf-8 $< > $@ +SKK-JISYO.L.header.u8: SKK-JISYO.L.header + $(ICONV) -f euc-jp -t utf-8 SKK-JISYO.L.header > SKK-JISYO.L.header.u8 + $(SED) -i "2s/coding: euc-jp /coding: utf-8 /" SKK-JISYO.L.header.u8 +%.u8: % + $(ICONV) -f euc-jp -t utf-8 $< > $@ + + unannotated-all: unannotated SKK-JISYO.L+.unannotated SKK-JISYO.total.unannotated SKK-JISYO.total+zipcode.unannotated taciturn-all: SKK-JISYO.L.taciturn SKK-JISYO.L+.taciturn SKK-JISYO.total.taciturn SKK-JISYO.total+zipcode.taciturn @@ -242,26 +257,27 @@ EUC_SRCS = SKK-JISYO.assoc SKK-JISYO.china_taiwan SKK-JISYO.edict SKK-JISYO.geo UTF_SRCS = SKK-JISYO.edict2 SKK-JISYO.emoji SKK-JISYO.fullname SKK-JISYO.pinyin EUC_JSON = $(EUC_SRCS:%=json/%.json) UTF_JSON = $(UTF_SRCS:%=json/%.json) + json: euc_json utf_json euc_json: $(EUC_SRCS) for file in $(EUC_SRCS); do \ $(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \ -c EUC-JP -i $$file -m meta/$$file.yaml -o json/$$file.json ; \ - done + done && $(TOUCH) euc_json utf_json: $(UTF_SRCS) for file in $(UTF_SRCS); do \ $(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \ -c UTF-8 -i $$file -m meta/$$file.yaml -o json/$$file.json ; \ - done -euc: $(EUC_JSON) + done && 
$(TOUCH) utf_json +euc_txt: $(EUC_JSON) for file in $(EUC_SRCS); do \ $(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \ -c EUC-JP -i json/$$file.json -o $$file ; \ - done -utf: $(UTF_JSON) + done && $(TOUCH) euc_txt +utf_txt: $(UTF_JSON) for file in $(UTF_SRCS); do \ $(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \ -c UTF-8 -i json/$$file.json -o $$file ; \ - done + done && $(TOUCH) utf_txt # end of Makefile. diff --git a/script/annotation-filter.rb b/script/annotation-filter.rb new file mode 100755 index 0000000..367f3ab --- /dev/null +++ b/script/annotation-filter.rb @@ -0,0 +1,117 @@ +#!/usr/bin/env ruby +# -*- coding: utf-8 -*- +## Copyright (C) 2005 MITA Yuusuke +## +## Author: MITA Yuusuke +## Maintainer: SKK Development Team +## Keywords: japanese, dictionary +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2, or (at your option) +## any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program, see the file COPYING. If not, write to the +## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston, +## MA 02110-1301, USA. 
+##
+### Instruction:
+##   annotation-filter.rb [options] [dictionary ...]      ('-h' lists the options)
+##   Reads dictionary lines (named files or stdin), applies the rules, emits pairs via print_pair.
+
+require 'jcode' if RUBY_VERSION.to_f < 1.9
+#require 'kconv'
+require 'skkdictools'
+require 'optparse'
+opt = OptionParser.new
+
+keep_annotation = false
+output_all = true
+unannotate_unique = false
+unannotate_cap = 99999999
+doublebar = "remove"
+rulesets = []
+default_rulesets = [
+  [ "exclude", '※|\?$' ],
+  # [ "exclude", "\[卑\]" ],
+  [ "keep", '旧字|異体字|本字|大字|†|→' ],
+  # [ "keep", "NB:|=|≒|≠|和製|" ],
+  # [ "cut", "‖" ] - 'doublebar' handles it inplace
+]
+encoding = "euc-jis-2004"
+
+# Rule options append [action, pattern] pairs; rules are applied in command-line order.
+opt.on('-c pattern', 'cut annotations after <pattern>') { |pattern| rulesets << [ "cut", pattern ] }
+opt.on('-e pattern', 'eliminate candidates if <pattern> matches') { |pattern| rulesets << [ "exclude", pattern ] }
+opt.on('-x pattern', 'output pairs if <pattern> matches (use with -t)') { |pattern| rulesets << [ "extract", pattern ] }
+opt.on('-u pattern', 'unannotate candidates if <pattern> matches (use with -k)') { |pattern| rulesets << [ "unannotate", pattern ] }
+opt.on('-U pattern', 'keep annotations matching <pattern>') { |pattern| rulesets << [ "keep", pattern ] }
+
+opt.on('-s', 'unannotate if the candidate is "unique"') { unannotate_unique = true }
+opt.on('-j VAL', "never unannotate if an entry has more than <VAL> candidates") { |v| unannotate_cap = v.to_i }
+opt.on('-k', 'keep annotations by default') { keep_annotation = true }
+opt.on('-t', "extraction mode: output requested pairs only") { output_all = false }
+opt.on('-d', "apply default rulesets") { rulesets += default_rulesets }
+opt.on('-8', "read and write in utf8") { encoding = "utf-8" }
+
+opt.on('-b', "sticky '‖' -- annotation after '‖' will always be kept") { doublebar = "sticky" }
+#opt.on('-B', "always remove annotations after '‖'") { doublebar = "remove" }
+opt.on('-B', "treat '‖' as a part of annotation") { doublebar = "dumb" }
+
+# Diagnostics go to stderr: stdout carries the dictionary data that callers pipe onward.
+begin
+  opt.parse!(ARGV)
+  # Compile each pattern once here instead of once per candidate and rule.
+  rules = rulesets.map { |action, pattern| [action, Regexp.compile(pattern)] }
+rescue OptionParser::ParseError, RegexpError => e
+  warn "#{$0}: #{e.message}"
+  warn "'#{$0} -h' for help."
+  exit 1
+end
+Encoding.default_external = encoding
+STDOUT.set_encoding(encoding, "utf-8")
+
+# Input is read in `encoding` and transcoded to UTF-8 before the rules are applied.
+while (line = gets)
+  line.encode!("utf-8")
+  next if line =~ /^;/ || line =~ /^$/
+  midasi, tokens = line.parse_skk_entry
+  total = tokens.count { |item| !item.nil? }
+
+  tokens.each do |token|
+    word, annotation, comment = token.skk_split_tokens(doublebar == "dumb" ? nil : '‖')
+
+    # Per-candidate defaults come from the option flags; matching rules may override them.
+    do_unannotate = !keep_annotation
+    do_output = output_all
+    do_unannotate = true if unannotate_unique && total == 1
+    do_unannotate = false if unannotate_cap <= total
+
+    rules.each do |action, regexp|
+      break if annotation.nil? # rules only look at annotated candidates
+      match = (annotation =~ regexp)
+      next unless match
+      case action
+      when "cut"
+        annotation = annotation[0, match]
+      when "extract"
+        do_output = true
+      when "exclude"
+        do_output = false
+      when "unannotate"
+        do_unannotate = true
+      when "keep"
+        do_unannotate = false
+      end
+    end
+    next if !do_output
+    # The text split off at '‖' is passed on only in sticky (-b) mode.
+    print_pair(midasi, word, do_unannotate ? nil : annotation, doublebar == "sticky" ? comment : nil)
+  end
+end