Skip to content

Commit

Permalink
fix: 完善长词优先的逻辑 close #592
Browse files Browse the repository at this point in the history
不提升包含英文、数字、emoji、假名的候选项
  • Loading branch information
iDvel committed Jan 2, 2024
1 parent 2852474 commit e5d1229
Showing 1 changed file with 71 additions and 35 deletions.
106 changes: 71 additions & 35 deletions lua/long_word_filter.lua
Original file line number Diff line number Diff line change
@@ -1,45 +1,81 @@
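-- Long-word-first filter: raises the priority of words such as 「西安」「提案」「图案」「饥饿」.
-- Thanks to, and based on: https://github.com/tumuyan/rime-melt
-- Candidates containing Latin letters, digits, emoji, or kana are not promoted.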

local isEmoji
do
    -- Inclusive { first, last } code point pairs treated as emoji-related.
    local EMOJI_RANGES = {
        { 0x1F600, 0x1F64F }, -- Emoticons
        { 0x1F300, 0x1F5FF }, -- Misc Symbols and Pictographs
        { 0x1F680, 0x1F6FF }, -- Transport and Map
        { 0x2600, 0x26FF }, -- Misc symbols
        { 0x2700, 0x27BF }, -- Dingbats
        { 0xFE00, 0xFE0F }, -- Variation Selectors
        { 0x1F900, 0x1F9FF }, -- Supplemental Symbols and Pictographs
        { 0x1F1E6, 0x1F1FF }, -- Flags (iOS)
    }

    --- Report whether a string holds at least one code point that falls
    -- inside any of the ranges listed in EMOJI_RANGES.
    -- @tparam string text UTF-8 text to scan
    -- @treturn boolean true as soon as one matching code point is found,
    --   false when the whole string has been scanned without a match
    function isEmoji(text)
        for _, codepoint in utf8.codes(text) do
            for k = 1, #EMOJI_RANGES do
                local span = EMOJI_RANGES[k]
                if codepoint >= span[1] and codepoint <= span[2] then
                    return true
                end
            end
        end
        return false
    end
end

--- Report whether a string contains any hiragana or katakana code point.
-- @tparam string str UTF-8 text to scan
-- @treturn boolean true on the first kana code point, false if none is found
local function containsJapaneseKana(str)
    -- Hiragana (U+3040..U+309F) is immediately followed by Katakana
    -- (U+30A0..U+30FF), so a single inclusive span covers both ranges.
    local KANA_FIRST, KANA_LAST = 0x3040, 0x30FF
    for _, codepoint in utf8.codes(str) do
        if not (codepoint < KANA_FIRST or codepoint > KANA_LAST) then
            return true
        end
    end
    return false
end

local M = {}

--- Load the filter's options from the schema configuration.
-- Promotes `count` words, inserting them at position `idx`; the defaults
-- are 2 and 4 and apply whenever `get_int` returns nil/false for the key.
-- The values are stored on the module table as `M.count` and `M.idx`.
-- @param env table providing `env.engine.schema.config` (an object with a
--   `get_int(key)` method) and `env.name_space` (a string); `env.name_space`
--   is overwritten with its leading "*" removed
function M.init(env)
    local config = env.engine.schema.config
    -- Strip one leading literal "*" from the name space. The asterisk is
    -- escaped so it cannot be mistaken for a pattern quantifier. gsub's
    -- second result (the match count) is dropped by the single assignment.
    env.name_space = env.name_space:gsub("^%*", "")
    M.count = config:get_int(env.name_space .. "/count") or 2
    M.idx = config:get_int(env.name_space .. "/idx") or 4
end

--- Long-word-first filter: move up to `M.count` longer candidates ahead of
-- the shorter ones that follow position `M.idx`.
--
-- Candidates are read from `input:iter()` and emitted through `yield`
-- (a global not defined in this file; presumably supplied by the host
-- scripting environment, confirm against the runtime).
--
--   * The first `M.idx - 1` candidates are emitted unchanged.
--   * After that, a candidate is emitted immediately ("promoted") only if it
--     is strictly longer than the first candidate and contains no ASCII
--     letters or digits, no kana, and no emoji. Every other candidate is
--     held back in a buffer.
--   * Scanning stops once `M.count` candidates were promoted or the buffer
--     holds more than 50 entries; the buffer is then emitted in its original
--     order, followed by whatever `input:iter()` still produces.
--
-- @param input candidate source; only its `iter()` method is used here
function M.func(input)
    local deferred = {}
    local firstWordLength = 0 -- length of the first candidate; promoted words must be longer
    local done = 0 -- number of candidates promoted so far (at most M.count)
    local i = 1
    for cand in input:iter() do
        local text = cand.text
        -- utf8.len yields a falsy result for malformed UTF-8. Treat such text
        -- as length 0 so the comparisons below never see nil; a length of 0
        -- also means the candidate is never promoted.
        local leng = utf8.len(text) or 0
        -- Only the first candidate's length serves as the reference.
        if firstWordLength < 1 then
            firstWordLength = leng
        end
        if i < M.idx then
            -- Candidates in front of position M.idx are passed through as is.
            i = i + 1
            yield(cand)
        elseif leng <= firstWordLength
            or text:find("[%a%d]")
            or containsJapaneseKana(text)
            or isEmoji(text) then
            -- Not eligible for promotion: keep for later, in original order.
            deferred[#deferred + 1] = cand
        else
            -- Long word: emit right away, ahead of the deferred ones.
            yield(cand)
            done = done + 1
        end
        -- Stop once enough words were promoted or the buffer is large enough
        -- (looking at roughly the first 50 candidates is normally sufficient).
        if done == M.count or #deferred > 50 then
            break
        end
    end
    -- Emit the held-back candidates, then everything not yet consumed.
    for _, cand in ipairs(deferred) do
        yield(cand)
    end
    -- NOTE(review): assumes a second input:iter() resumes after the last
    -- candidate consumed above rather than restarting; confirm with the host API.
    for cand in input:iter() do
        yield(cand)
    end
end

return M

0 comments on commit e5d1229

Please sign in to comment.