Skip to content

Commit

Permalink
fullname 辞書が UTF-8 になった関係の Makefile 調整
Browse files Browse the repository at this point in the history
skk-dev/skktools#27
から annotation-filter.rb を持ってきて script に置いた
  • Loading branch information
tamo committed Aug 16, 2024
1 parent ceec4a0 commit 6fc84f9
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 18 deletions.
52 changes: 34 additions & 18 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,25 @@
COUNT = skkdic-count
CURL = curl
DATE = date
DENO = deno
EMACS = emacs --batch --directory ./
EXPR = skkdic-expr
EXPR2 = skkdic-expr2
GAWK = LC_ALL=C gawk
GREP = grep
SED = sed
GZIP = gzip -9
ICONV = iconv
MD5 = md5
MV = mv --force
RM = /bin/rm -f
RUBY = ruby -I $(TOOLS_DIR)/filters
SED = sed
SORT = skkdic-sort
TAR = tar
TOUCH = touch
UNZIP = unzip -o

ZIPDIC_DIR = ./zipcode
DENO = deno

DIC2PDB = dic2pdb
DICCOMPACT = diccompact.rb
Expand Down Expand Up @@ -49,7 +52,8 @@ CDB_TARGET = ./`basename $(CDB_SOURCE)`.cdb

clean:
$(RM) *.gz* *~ `find . -name '*~'` `find . -name '.*~'` `find . -name '.#*'` \
*.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.w PBinlineDB.dic *.taciturn SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \
*.unannotated SKK-JISYO.wrong PBinlineDB.pdb *.tmp *.u8 *.w PBinlineDB.dic *.taciturn \
SKK-JISYO.L+ SKK-JISYO.total SKK-JISYO.total+zipcode SKK-JISYO.L.header SKK-JISYO.china_taiwan \
emoji-list.txt

archive: gzip
Expand Down Expand Up @@ -117,33 +121,35 @@ SKK-JISYO.L+: SKK-JISYO.L SKK-JISYO.L.header
$(EXPR2) SKK-JISYO.L + SKK-JISYO.tmp | cat SKK-JISYO.L.header - > SKK-JISYO.L+
$(RM) SKK-JISYO.tmp SKK-JISYO.addition

SKK-JISYO.total: SKK-JISYO.L SKK-JISYO.geo SKK-JISYO.station SKK-JISYO.jinmei SKK-JISYO.propernoun SKK-JISYO.fullname SKK-JISYO.law SKK-JISYO.okinawa SKK-JISYO.hukugougo SKK-JISYO.assoc SKK-JISYO.notes SKK-JISYO.L.header
SKK-JISYO.total: SKK-JISYO.L.u8 SKK-JISYO.geo.u8 SKK-JISYO.station.u8 SKK-JISYO.jinmei.u8 SKK-JISYO.propernoun.u8 SKK-JISYO.fullname SKK-JISYO.law.u8 SKK-JISYO.okinawa.u8 SKK-JISYO.hukugougo.u8 SKK-JISYO.assoc.u8 SKK-JISYO.notes SKK-JISYO.L.header.u8
$(RUBY) $(TOOLS_DIR)/filters/conjugation.rb -Cpox SKK-JISYO.notes > SKK-JISYO.tmp
$(RUBY) $(TOOLS_DIR)/filters/asayaKe.rb -p SKK-JISYO.L >> SKK-JISYO.tmp
$(RUBY) $(TOOLS_DIR)/filters/complete-numerative.rb -pU SKK-JISYO.L >> SKK-JISYO.tmp
$(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -K -s 2 SKK-JISYO.L >> SKK-JISYO.tmp
$(RUBY) $(TOOLS_DIR)/filters/abbrev-convert.rb -w -s 2 SKK-JISYO.L >> SKK-JISYO.tmp
$(ICONV) -f euc-jp -t utf-8 SKK-JISYO.tmp > SKK-JISYO.tmp.u8
# order is very important here
$(EXPR2) SKK-JISYO.geo + SKK-JISYO.station + SKK-JISYO.jinmei + SKK-JISYO.propernoun + SKK-JISYO.fullname + SKK-JISYO.tmp + SKK-JISYO.law + SKK-JISYO.okinawa + SKK-JISYO.hukugougo + SKK-JISYO.assoc - SKK-JISYO.L > SKK-JISYO.addition
$(EXPR2) SKK-JISYO.geo.u8 + SKK-JISYO.station.u8 + SKK-JISYO.jinmei.u8 + SKK-JISYO.propernoun.u8 + SKK-JISYO.fullname + SKK-JISYO.tmp.u8 + SKK-JISYO.law.u8 + SKK-JISYO.okinawa.u8 + SKK-JISYO.hukugougo.u8 + SKK-JISYO.assoc.u8 - SKK-JISYO.L.u8 > SKK-JISYO.addition
# why eliminating SKK-JISYO.L once? -- to not add too noisy
# annotations from SKK-JISYO.jinmei and so on.
$(EXPR2) SKK-JISYO.L + SKK-JISYO.addition | cat SKK-JISYO.L.header - > SKK-JISYO.total
$(EXPR2) SKK-JISYO.L.u8 + SKK-JISYO.addition | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total
$(RM) SKK-JISYO.tmp SKK-JISYO.addition

SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode SKK-JISYO.L.header
$(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode
# zipcode がまだ UTF-8 ではない場合
SKK-JISYO.total+zipcode: SKK-JISYO.total $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 SKK-JISYO.L.header.u8
$(EXPR2) SKK-JISYO.total + $(ZIPDIC_DIR)/SKK-JISYO.zipcode.u8 + $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8 | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode

# Run the first prerequisite through annotation-filter.rb with -d, pass the
# result through $(EXPR2), and write it to the target with the second
# prerequisite (the shared header) prepended.
SKK-JISYO.L.taciturn: SKK-JISYO.L SKK-JISYO.L.header
	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d $< | $(EXPR2) | cat $(word 2,$^) - > $@

# Same pipeline as above, applied to SKK-JISYO.L+.
SKK-JISYO.L+.taciturn: SKK-JISYO.L+ SKK-JISYO.L.header
	$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d $< | $(EXPR2) | cat $(word 2,$^) - > $@

SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header
$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total.taciturn
SKK-JISYO.total.taciturn: SKK-JISYO.total SKK-JISYO.L.header.u8
$(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total.taciturn

SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header
$(RUBY) $(TOOLS_DIR)/filters/annotation-filter.rb -d SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header - > SKK-JISYO.total+zipcode.taciturn
SKK-JISYO.total+zipcode.taciturn: SKK-JISYO.total+zipcode SKK-JISYO.L.header.u8
$(RUBY) script/annotation-filter.rb -8 -d SKK-JISYO.total+zipcode | $(EXPR2) | cat SKK-JISYO.L.header.u8 - > SKK-JISYO.total+zipcode.taciturn

SKK-JISYO.L+.unannotated: SKK-JISYO.L+
$(GAWK) -f $(TOOLS_DIR)/unannotation.awk SKK-JISYO.L+ > SKK-JISYO.L+.unannotated
Expand All @@ -158,6 +164,15 @@ SKK-JISYO.L.header: SKK-JISYO.L
echo ';; (This dictionary was automatically generated from SKK dictionaries)' > SKK-JISYO.L.header
$(SED) -n '/^;; okuri-ari entries./q;p' SKK-JISYO.L >> SKK-JISYO.L.header

# Convert the office zipcode dictionary to UTF-8. This file is decoded as
# EUC-JISX0213 rather than EUC-JP, so it needs an explicit rule instead of
# falling through to the generic %.u8 rule below.
$(ZIPDIC_DIR)/SKK-JISYO.office.zipcode.u8: $(ZIPDIC_DIR)/SKK-JISYO.office.zipcode
	$(ICONV) -f euc-jisx0213 -t utf-8 $< > $@

# Convert the generated header to UTF-8 and rewrite the "coding:" marker on
# its second line to match the new encoding.
# The substitution goes through a temporary file instead of `sed -i`:
# `sed -i SCRIPT FILE` only works with GNU sed (BSD/macOS sed takes the next
# argument as a backup suffix). The temporary file matches the *.tmp pattern
# removed by `clean`.
SKK-JISYO.L.header.u8: SKK-JISYO.L.header
	$(ICONV) -f euc-jp -t utf-8 $< > $@.tmp
	$(SED) "2s/coding: euc-jp /coding: utf-8 /" $@.tmp > $@
	$(RM) $@.tmp

# Generic fallback: FILE.u8 is FILE converted from EUC-JP to UTF-8.
%.u8: %
	$(ICONV) -f euc-jp -t utf-8 $< > $@


unannotated-all: unannotated SKK-JISYO.L+.unannotated SKK-JISYO.total.unannotated SKK-JISYO.total+zipcode.unannotated

taciturn-all: SKK-JISYO.L.taciturn SKK-JISYO.L+.taciturn SKK-JISYO.total.taciturn SKK-JISYO.total+zipcode.taciturn
Expand Down Expand Up @@ -242,26 +257,27 @@ EUC_SRCS = SKK-JISYO.assoc SKK-JISYO.china_taiwan SKK-JISYO.edict SKK-JISYO.geo
UTF_SRCS = SKK-JISYO.edict2 SKK-JISYO.emoji SKK-JISYO.fullname SKK-JISYO.pinyin
EUC_JSON = $(EUC_SRCS:%=json/%.json)
UTF_JSON = $(UTF_SRCS:%=json/%.json)

json: euc_json utf_json
euc_json: $(EUC_SRCS)
for file in $(EUC_SRCS); do \
$(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \
-c EUC-JP -i $$file -m meta/$$file.yaml -o json/$$file.json ; \
done
done && $(TOUCH) euc_json
utf_json: $(UTF_SRCS)
for file in $(UTF_SRCS); do \
$(DENO) run --allow-read --allow-write --allow-net script/txt2json.ts \
-c UTF-8 -i $$file -m meta/$$file.yaml -o json/$$file.json ; \
done
euc: $(EUC_JSON)
done && $(TOUCH) utf_json
euc_txt: $(EUC_JSON)
for file in $(EUC_SRCS); do \
$(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \
-c EUC-JP -i json/$$file.json -o $$file ; \
done
utf: $(UTF_JSON)
done && $(TOUCH) euc_txt
utf_txt: $(UTF_JSON)
for file in $(UTF_SRCS); do \
$(DENO) run --allow-read --allow-write --allow-net script/json2txt.ts \
-c UTF-8 -i json/$$file.json -o $$file ; \
done
done && $(TOUCH) utf_txt

# end of Makefile.
117 changes: 117 additions & 0 deletions script/annotation-filter.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
## Copyright (C) 2005 MITA Yuusuke <[email protected]>
##
## Author: MITA Yuusuke <[email protected]>
## Maintainer: SKK Development Team <[email protected]>
## Keywords: japanese, dictionary
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program, see the file COPYING. If not, write to the
## Free Software Foundation Inc., 51 Franklin St, Fifth Floor, Boston,
## MA 02110-1301, USA.
##
### Instruction:
##
##

require 'jcode' if RUBY_VERSION.to_f < 1.9
#require 'kconv'
require 'skkdictools'
require 'optparse'
# Filter the annotations of an SKK dictionary read via `gets` (files named on
# the command line, or stdin) and print the resulting midasi/candidate pairs.
# Each candidate's annotation is matched against an ordered list of
# [action, pattern] rules built from the command-line options.
opt = OptionParser.new

# Defaults; each can be changed by an option below.
keep_annotation = false     # -k: start from "keep" instead of "strip"
output_all = true           # -t clears this; "extract" rules re-enable output
unannotate_unique = false   # -s: strip when the entry has exactly one candidate
unannotate_cap = 99999999   # -j: entries with at least this many candidates default to "keep"
doublebar = "remove"        # '‖' handling: "remove" (default), "sticky" (-b), "dumb" (-B)
rulesets = Array.new        # ordered [action, pattern] pairs

# Rules appended by -d. The commented-out entries are alternatives that are
# currently disabled.
default_rulesets = [
  [ "exclude", '※|\?$' ],
  # [ "exclude", "\[卑\]" ],
  [ "keep", '旧字|異体字|本字|大字|†|→' ],
  # [ "keep", "NB:|=|≒|≠|和製|<rare>" ],
  # [ "cut", "‖" ] - 'doublebar' handles it inplace
]
encoding = "euc-jis-2004"

opt.on('-c pattern', 'cut annotations after <pattern>') { |pattern| rulesets << [ "cut", pattern] }
opt.on('-e pattern', 'eliminate candidates if <pattern> matches') { |pattern| rulesets << [ "exclude", pattern] }
opt.on('-x pattern', 'output pairs if <pattern> matches (use with -t)') { |pattern| rulesets << [ "extract", pattern] }
opt.on('-u pattern', 'unannotate candidates if <pattern> matches (use with -k)') { |pattern| rulesets << [ "unannotate", pattern] }
opt.on('-U pattern', 'keep annotations matching <pattern>') { |pattern| rulesets << [ "keep", pattern] }

opt.on('-s', 'unannotate if the candidate is "unique"') { unannotate_unique = true }
opt.on('-j VAL', "never unannotate if an entry has more than <VAL> candidates") { |v| unannotate_cap = v.to_i }
opt.on('-k', 'keep annotations by default') { keep_annotation = true }
opt.on('-t', "extraction mode: output requested pairs only") { output_all = false }
opt.on('-d', "apply default rulesets") { rulesets += default_rulesets }
opt.on('-8', "read and write in utf8") { encoding = "utf-8" }

opt.on('-b', "sticky '‖' -- annotation after '‖' will always be kept") { doublebar = "sticky" }
opt.on('-B', "treat '‖' as a part of annotation") { doublebar = "dumb" }

begin
  opt.parse!(ARGV)
rescue OptionParser::ParseError => e
  # ParseError is the common parent of InvalidOption and MissingArgument, so
  # a bare "-c" or "-j" is reported here too instead of escaping as an
  # uncaught exception. The message goes to stderr because stdout is
  # normally piped into another dictionary tool.
  $stderr.print "#{e.message}\n'#{$0} -h' for help.\n"
  exit 1
end
Encoding.default_external = encoding
# Lines are handled as UTF-8 internally and written in the selected encoding.
STDOUT.set_encoding(encoding, "utf-8")

# Compile every rule pattern once up front rather than once per candidate.
begin
  compiled_rules = rulesets.map { |action, pattern| [action, Regexp.compile(pattern)] }
rescue RegexpError => e
  $stderr.print "invalid pattern: #{e.message}\n"
  exit 1
end

while gets
  $_.encode!("utf-8")
  # Skip comment lines and empty lines.
  next if $_ =~ /^;/ || $_ =~ /^$/
  midasi, tokens = $_.parse_skk_entry
  total = tokens.count { |item| !item.nil? }

  tokens.each do |token|
    # In "dumb" mode no separator is passed, so '‖' stays inside the annotation.
    word, annotation, comment = token.skk_split_tokens(doublebar == "dumb" ? nil : '‖')

    # Per-candidate defaults; matching rules below may override them.
    do_unannotate = !keep_annotation
    do_output = output_all
    do_unannotate = true if unannotate_unique && total == 1
    do_unannotate = false if unannotate_cap <= total

    unless annotation.nil?
      # Rules run in order; a "cut" shortens the annotation seen by later rules.
      compiled_rules.each do |action, regexp|
        match = (annotation =~ regexp)
        next unless match
        case action
        when "cut"
          annotation = annotation[0, match]
        when "extract"
          do_output = true
        when "exclude"
          do_output = false
        when "unannotate"
          do_unannotate = true
        when "keep"
          do_unannotate = false
        end
      end
    end
    next if !do_output
    print_pair(midasi, word, do_unannotate ? nil : annotation, doublebar == "sticky" ? comment : nil)
  end
end

0 comments on commit 6fc84f9

Please sign in to comment.