From caf046f34fbde018e95c2b392401730be67e46f5 Mon Sep 17 00:00:00 2001 From: Hai Liang Wang Date: Thu, 24 Sep 2020 14:56:38 +0800 Subject: [PATCH] export jieba, fix prob files and remove deps of absl-py --- Requirements.txt | 2 +- VALUATION.md | 67 +++-- demo.py | 16 +- scripts/publish.sh | 10 + scripts/test.sh | 2 +- setup.py | 7 +- synonyms/__init__.py | 1 + synonyms/jieba/README.md | 460 +++++++++++++++++--------------- synonyms/jieba/__init__.py | 26 +- synonyms/jieba/analyse/tfidf.py | 11 +- synonyms/synonyms.py | 24 +- synonyms/word2vec.py | 24 +- 12 files changed, 336 insertions(+), 314 deletions(-) diff --git a/Requirements.txt b/Requirements.txt index d5541ff..c8c6cb4 100644 --- a/Requirements.txt +++ b/Requirements.txt @@ -1 +1 @@ -synonyms>=3.12 \ No newline at end of file +synonyms>=3.13 \ No newline at end of file diff --git a/VALUATION.md b/VALUATION.md index 72f9471..d468a4a 100644 --- a/VALUATION.md +++ b/VALUATION.md @@ -1,34 +1,33 @@ -# synonyms 分数评测 [(v3.12.0)](https://pypi.python.org/pypi/synonyms/3.12.0) - -| 词 1 | 词 2 | synonyms | 人工评定 | -| ------ | -------- | -------- | -------- | -| 轿车 | 汽车 | 0.892 | 0.98 | -| 宝石 | 宝物 | 1.0 | 0.96 | -| 旅游 | 游历 | 0.649 | 0.96 | -| 男孩子 | 小伙子 | 0.77 | 0.94 | -| 海岸 | 海滨 | 0.889 | 0.925 | -| 庇护所 | 精神病院 | 0.211 | 0.9025 | -| 魔术师 | 巫师 | 0.95 | 0.875 | -| 中午 | 正午 | 0.9 | 0.855 | -| 火炉 | 炉灶 | 0.889 | 0.7775 | -| 食物 | 水果 | 0.363 | 0.77 | -| 鸟 | 公鸡 | 0.895 | 0.7625 | -| 鸟 | 鹤 | 1.0 | 0.7425 | -| 工具 | 器械 | 0.881 | 0.7375 | -| 兄弟 | 和尚 | 0.139 | 0.705 | -| 起重机 | 器械 | 0.195 | 0.42 | -| 小伙子 | 兄弟 | 0.703 | 0.415 | -| 旅行 | 轿车 | 0.088 | 0.29 | -| 和尚 | 圣贤 | 0.222 | 0.275 | -| 墓地 | 林地 | 0.874 | 0.2375 | -| 食物 | 公鸡 | 0.151 | 0.2225 | -| 海岸 | 丘陵 | 0.248 | 0.2175 | -| 森林 | 墓地 | 0.14 | 0.21 | -| 岸边 | 林地 | 0.193 | 0.1575 | -| 和尚 | 奴隶 | 0.059 | 0.1375 | -| 海岸 | 森林 | 0.23 | 0.105 | -| 小伙子 | 巫师 | 0.182 | 0.105 | -| 琴弦 | 微笑 | 0.089 | 0.0325 | -| 玻璃 | 魔术师 | 0.02 | 0.0275 | -| 中午 | 绳子 | 0.049 | 0.02 | -| 公鸡 | 航行 | 0.0 | 0.02 | +# synonyms 
分数评测 [(v3.13.0)](https://pypi.python.org/pypi/synonyms/3.13.0) +| 词1 | 词2 | synonyms | 人工评定 | +| --- | --- | --- | --- | +| 轿车 | 汽车 | 0.892 | 0.98 | +| 宝石 | 宝物 | 1.0 | 0.96 | +| 旅游 | 游历 | 0.649 | 0.96 | +| 男孩子 | 小伙子 | 0.77 | 0.94 | +| 海岸 | 海滨 | 0.889 | 0.925 | +| 庇护所 | 精神病院 | 0.211 | 0.9025 | +| 魔术师 | 巫师 | 0.95 | 0.875 | +| 中午 | 正午 | 0.9 | 0.855 | +| 火炉 | 炉灶 | 0.889 | 0.7775 | +| 食物 | 水果 | 0.363 | 0.77 | +| 鸟 | 公鸡 | 0.895 | 0.7625 | +| 鸟 | 鹤 | 1.0 | 0.7425 | +| 工具 | 器械 | 0.881 | 0.7375 | +| 兄弟 | 和尚 | 0.139 | 0.705 | +| 起重机 | 器械 | 0.195 | 0.42 | +| 小伙子 | 兄弟 | 0.703 | 0.415 | +| 旅行 | 轿车 | 0.088 | 0.29 | +| 和尚 | 圣贤 | 0.222 | 0.275 | +| 墓地 | 林地 | 0.874 | 0.2375 | +| 食物 | 公鸡 | 0.151 | 0.2225 | +| 海岸 | 丘陵 | 0.248 | 0.2175 | +| 森林 | 墓地 | 0.14 | 0.21 | +| 岸边 | 林地 | 0.193 | 0.1575 | +| 和尚 | 奴隶 | 0.059 | 0.1375 | +| 海岸 | 森林 | 0.23 | 0.105 | +| 小伙子 | 巫师 | 0.182 | 0.105 | +| 琴弦 | 微笑 | 0.089 | 0.0325 | +| 玻璃 | 魔术师 | 0.02 | 0.0275 | +| 中午 | 绳子 | 0.049 | 0.02 | +| 公鸡 | 航行 | 0.0 | 0.02 | diff --git a/demo.py b/demo.py index af3c64c..bd2ef54 100755 --- a/demo.py +++ b/demo.py @@ -33,10 +33,6 @@ # raise "Must be using Python 3" # -from absl import flags -from absl import logging - -FLAGS = flags.FLAGS import synonyms # https://github.com/huyingxi/Synonyms import numpy import unittest @@ -144,10 +140,20 @@ def test_basecase_2(self): sen2 = "今天天气怎么样" r = synonyms.compare(sen1, sen2, seg=True) + + def test_analyse_extract_tags(self): + ''' + 使用 Tag 方式获得关键词 + https://github.com/fxsjy/jieba/tree/v0.39 + ''' + from synonyms.jieba import analyse + sentence = "华为芯片被断供,源于美国关于华为的修订版禁令生效——9月15日以来,台积电、高通、三星等华为的重要合作伙伴,只要没有美国的相关许可证,都无法供应芯片给华为,而中芯国际等国产芯片企业,也因采用美国技术,而无法供货给华为。目前华为部分型号的手机产品出现货少的现象,若该形势持续下去,华为手机业务将遭受重创。" + keywords = analyse.extract_tags(sentence, topK=5, withWeight=False, allowPOS=()) + print("[test_analyse_extract_tags] keywords %s" % keywords) + def test(): unittest.main() if __name__ == '__main__': - FLAGS([__file__, '--verbosity', '1']) test() diff --git a/scripts/publish.sh 
b/scripts/publish.sh index 1ea3fec..273e548 100755 --- a/scripts/publish.sh +++ b/scripts/publish.sh @@ -13,4 +13,14 @@ export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH # main [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return cd $baseDir/.. + +if [ ! -d tmp ]; then + mkdir tmp +fi + +if [ -f synonyms/data/words.vector.gz ]; then + mv synonyms/data/words.vector.gz tmp +fi + python setup.py sdist upload -r pypi +mv tmp/words.vector.gz synonyms/data/words.vector.gz \ No newline at end of file diff --git a/scripts/test.sh b/scripts/test.sh index cdbaa3e..755c8e9 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -15,7 +15,7 @@ export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH cd $baseDir/.. if [ -f .env ]; then echo "load env with" `pwd`"/.env" - #source .env + source .env fi python demo.py diff --git a/setup.py b/setup.py index 8cbf537..7963f10 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='synonyms', - version='3.12.0', + version='3.13.0', description='中文近义词:聊天机器人,智能问答工具包;Chinese Synonyms for Natural Language Processing and Understanding', long_description=LONGDOC, author='Hai Liang Wang, Hu Ying Xi', @@ -41,11 +41,12 @@ 'six>=1.11.0', 'numpy>=1.13.1', 'scipy>=1.0.0', - 'scikit-learn>=0.19.1', - 'absl-py>=0.4' + 'scikit-learn>=0.19.1' ], package_data={ 'synonyms': [ + '**/**/idf.txt', + '**/**/*.p', '**/*.gz', '**/*.txt', 'LICENSE']}) diff --git a/synonyms/__init__.py b/synonyms/__init__.py index 1614464..492507f 100644 --- a/synonyms/__init__.py +++ b/synonyms/__init__.py @@ -1,4 +1,5 @@ __all__ = ["seg", + "jieba", "nearby", "compare", "display", diff --git a/synonyms/jieba/README.md b/synonyms/jieba/README.md index 12f09f8..63c803d 100644 --- a/synonyms/jieba/README.md +++ b/synonyms/jieba/README.md @@ -1,58 +1,62 @@ -jieba -======== +# jieba + “结巴”中文分词:做最好的 Python 中文分词组件 +``` +https://github.com/fxsjy/jieba/tree/v0.39 +``` + "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python 
Chinese word segmentation module. - _Scroll down for English documentation._ +# 特点 -特点 -======== -* 支持三种分词模式: - * 精确模式,试图将句子最精确地切开,适合文本分析; - * 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义; - * 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 +- 支持三种分词模式: -* 支持繁体分词 -* 支持自定义词典 -* MIT 授权协议 + - 精确模式,试图将句子最精确地切开,适合文本分析; + - 全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义; + - 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。 + +- 支持繁体分词 +- 支持自定义词典 +- MIT 授权协议 + +# 在线演示 -在线演示 -========= http://jiebademo.ap01.aws.af.cm/ (Powered by Appfog) 网站代码:https://github.com/fxsjy/jiebademo - -安装说明 -======= +# 安装说明 代码对 Python 2/3 均兼容 -* 全自动安装:`easy_install jieba` 或者 `pip install jieba` / `pip3 install jieba` -* 半自动安装:先下载 http://pypi.python.org/pypi/jieba/ ,解压后运行 `python setup.py install` -* 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 -* 通过 `import jieba` 来引用 +- 全自动安装:`easy_install jieba` 或者 `pip install jieba` / `pip3 install jieba` +- 半自动安装:先下载 http://pypi.python.org/pypi/jieba/ ,解压后运行 `python setup.py install` +- 手动安装:将 jieba 目录放置于当前目录或者 site-packages 目录 +- 通过 `import jieba` 来引用 + +# 算法 -算法 -======== -* 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG) -* 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合 -* 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法 +- 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG) +- 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合 +- 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法 + +# 主要功能 -主要功能 -======= 1. 
分词 --------- -* `jieba.cut` 方法接受三个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型 -* `jieba.cut_for_search` 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细 -* 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8 -* `jieba.cut` 以及 `jieba.cut_for_search` 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用 -* `jieba.lcut` 以及 `jieba.lcut_for_search` 直接返回 list -* `jieba.Tokenizer(dictionary=DEFAULT_DICT)` 新建自定义分词器,可用于同时使用不同词典。`jieba.dt` 为默认分词器,所有全局分词相关函数都是该分词器的映射。 + +--- + +- `jieba.cut` 方法接受三个输入参数: 需要分词的字符串;cut_all 参数用来控制是否采用全模式;HMM 参数用来控制是否使用 HMM 模型 +- `jieba.cut_for_search` 方法接受两个参数:需要分词的字符串;是否使用 HMM 模型。该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细 +- 待分词的字符串可以是 unicode 或 UTF-8 字符串、GBK 字符串。注意:不建议直接输入 GBK 字符串,可能无法预料地错误解码成 UTF-8 +- `jieba.cut` 以及 `jieba.cut_for_search` 返回的结构都是一个可迭代的 generator,可以使用 for 循环来获得分词后得到的每一个词语(unicode),或者用 +- `jieba.lcut` 以及 `jieba.lcut_for_search` 直接返回 list +- `jieba.Tokenizer(dictionary=DEFAULT_DICT)` 新建自定义分词器,可用于同时使用不同词典。`jieba.dt` 为默认分词器,所有全局分词相关函数都是该分词器的映射。 代码示例 @@ -84,14 +88,15 @@ print(", ".join(seg_list)) 【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造 2. 
添加自定义词典 ----------------- + +--- ### 载入词典 -* 开发者可以指定自己自定义的词典,以便包含 jieba 词库里没有的词。虽然 jieba 有新词识别能力,但是自行添加新词可以保证更高的正确率 -* 用法: jieba.load_userdict(file_name) # file_name 为文件类对象或自定义词典的路径 -* 词典格式和 `dict.txt` 一样,一个词占一行;每一行分三部分:词语、词频(可省略)、词性(可省略),用空格隔开,顺序不可颠倒。`file_name` 若为路径或二进制方式打开的文件,则文件必须为 UTF-8 编码。 -* 词频省略时使用自动计算的能保证分出该词的词频。 +- 开发者可以指定自己自定义的词典,以便包含 jieba 词库里没有的词。虽然 jieba 有新词识别能力,但是自行添加新词可以保证更高的正确率 +- 用法: jieba.load_userdict(file_name) # file_name 为文件类对象或自定义词典的路径 +- 词典格式和 `dict.txt` 一样,一个词占一行;每一行分三部分:词语、词频(可省略)、词性(可省略),用空格隔开,顺序不可颠倒。`file_name` 若为路径或二进制方式打开的文件,则文件必须为 UTF-8 编码。 +- 词频省略时使用自动计算的能保证分出该词的词频。 **例如:** @@ -102,25 +107,24 @@ print(", ".join(seg_list)) 台中 ``` -* 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可分别指定缓存文件所在的文件夹及其文件名,用于受限的文件系统。 +- 更改分词器(默认为 `jieba.dt`)的 `tmp_dir` 和 `cache_file` 属性,可分别指定缓存文件所在的文件夹及其文件名,用于受限的文件系统。 -* 范例: +- 范例: - * 自定义词典:https://github.com/fxsjy/jieba/blob/master/test/userdict.txt + - 自定义词典:https://github.com/fxsjy/jieba/blob/master/test/userdict.txt - * 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_userdict.py + - 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_userdict.py + - 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / - * 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 / - - * 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / + - 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / ### 调整词典 -* 使用 `add_word(word, freq=None, tag=None)` 和 `del_word(word)` 可在程序中动态修改词典。 -* 使用 `suggest_freq(segment, tune=True)` 可调节单个词语的词频,使其能(或不能)被分出来。 +- 使用 `add_word(word, freq=None, tag=None)` 和 `del_word(word)` 可在程序中动态修改词典。 +- 使用 `suggest_freq(segment, tune=True)` 可调节单个词语的词频,使其能(或不能)被分出来。 -* 注意:自动计算的词频在使用 HMM 新词发现功能时可能无效。 +- 注意:自动计算的词频在使用 HMM 新词发现功能时可能无效。 代码示例: @@ -139,20 +143,22 @@ print(", ".join(seg_list)) 「/台中/」/正确/应该/不会/被/切开 ``` -* "通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14 +- "通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14 3. 
关键词提取 -------------- + +--- + ### 基于 TF-IDF 算法的关键词抽取 `import jieba.analyse` -* jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=()) - * sentence 为待提取的文本 - * topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20 - * withWeight 为是否一并返回关键词权重值,默认值为 False - * allowPOS 仅包括指定词性的词,默认值为空,即不筛选 -* jieba.analyse.TFIDF(idf_path=None) 新建 TFIDF 实例,idf_path 为 IDF 频率文件 +- jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=()) + - sentence 为待提取的文本 + - topK 为返回几个 TF/IDF 权重最大的关键词,默认值为 20 + - withWeight 为是否一并返回关键词权重值,默认值为 False + - allowPOS 仅包括指定词性的词,默认值为空,即不筛选 +- jieba.analyse.TFIDF(idf_path=None) 新建 TFIDF 实例,idf_path 为 IDF 频率文件 代码示例 (关键词提取) @@ -160,42 +166,44 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py 关键词提取所使用逆向文件频率(IDF)文本语料库可以切换成自定义语料库的路径 -* 用法: jieba.analyse.set_idf_path(file_name) # file_name为自定义语料库的路径 -* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big -* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py +- 用法: jieba.analyse.set_idf_path(file_name) # file_name 为自定义语料库的路径 +- 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big +- 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py 关键词提取所使用停止词(Stop Words)文本语料库可以切换成自定义语料库的路径 -* 用法: jieba.analyse.set_stop_words(file_name) # file_name为自定义语料库的路径 -* 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt -* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py +- 用法: jieba.analyse.set_stop_words(file_name) # file_name 为自定义语料库的路径 +- 自定义语料库示例:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt +- 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py 关键词一并返回关键词权重值示例 -* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py +- 用法示例:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_with_weight.py ### 基于 TextRank 算法的关键词抽取 -* jieba.analyse.textrank(sentence, topK=20, 
withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 直接使用,接口相同,注意默认过滤词性。 -* jieba.analyse.TextRank() 新建自定义 TextRank 实例 +- jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 直接使用,接口相同,注意默认过滤词性。 +- jieba.analyse.TextRank() 新建自定义 TextRank 实例 算法论文: [TextRank: Bringing Order into Texts](http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) #### 基本思想: 1. 将待抽取关键词的文本进行分词 -2. 以固定窗口大小(默认为5,通过span属性调整),词之间的共现关系,构建图 -3. 计算图中节点的PageRank,注意是无向带权图 +2. 以固定窗口大小(默认为 5,通过 span 属性调整),词之间的共现关系,构建图 +3. 计算图中节点的 PageRank,注意是无向带权图 #### 使用示例: 见 [test/demo.py](https://github.com/fxsjy/jieba/blob/master/test/demo.py) 4. 词性标注 ------------ -* `jieba.posseg.POSTokenizer(tokenizer=None)` 新建自定义分词器,`tokenizer` 参数可指定内部使用的 `jieba.Tokenizer` 分词器。`jieba.posseg.dt` 为默认词性标注分词器。 -* 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法。 -* 用法示例 + +--- + +- `jieba.posseg.POSTokenizer(tokenizer=None)` 新建自定义分词器,`tokenizer` 参数可指定内部使用的 `jieba.Tokenizer` 分词器。`jieba.posseg.dt` 为默认词性标注分词器。 +- 标注句子分词后每个词的词性,采用和 ictclas 兼容的标记法。 +- 用法示例 ```pycon >>> import jieba.posseg as pseg @@ -210,23 +218,28 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py ``` 5. 
并行分词 ------------ -* 原理:将目标文本按行分隔后,把各行文本分配到多个 Python 进程并行分词,然后归并结果,从而获得分词速度的可观提升 -* 基于 python 自带的 multiprocessing 模块,目前暂不支持 Windows -* 用法: - * `jieba.enable_parallel(4)` # 开启并行分词模式,参数为并行进程数 - * `jieba.disable_parallel()` # 关闭并行分词模式 -* 例子:https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py +--- + +- 原理:将目标文本按行分隔后,把各行文本分配到多个 Python 进程并行分词,然后归并结果,从而获得分词速度的可观提升 +- 基于 python 自带的 multiprocessing 模块,目前暂不支持 Windows +- 用法: + + - `jieba.enable_parallel(4)` # 开启并行分词模式,参数为并行进程数 + - `jieba.disable_parallel()` # 关闭并行分词模式 + +- 例子:https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py -* 实验结果:在 4 核 3.4GHz Linux 机器上,对金庸全集进行精确分词,获得了 1MB/s 的速度,是单进程版的 3.3 倍。 +- 实验结果:在 4 核 3.4GHz Linux 机器上,对金庸全集进行精确分词,获得了 1MB/s 的速度,是单进程版的 3.3 倍。 -* **注意**:并行分词仅支持默认分词器 `jieba.dt` 和 `jieba.posseg.dt`。 +- **注意**:并行分词仅支持默认分词器 `jieba.dt` 和 `jieba.posseg.dt`。 6. Tokenize:返回词语在原文的起止位置 ----------------------------------- -* 注意,输入参数只接受 unicode -* 默认模式 + +--- + +- 注意,输入参数只接受 unicode +- 默认模式 ```python result = jieba.tokenize(u'永和服装饰品有限公司') @@ -242,7 +255,7 @@ word 有限公司 start: 6 end:10 ``` -* 搜索模式 +- 搜索模式 ```python result = jieba.tokenize(u'永和服装饰品有限公司', mode='search') @@ -259,14 +272,16 @@ word 公司 start: 8 end:10 word 有限公司 start: 6 end:10 ``` - 7. ChineseAnalyzer for Whoosh 搜索引擎 --------------------------------------------- -* 引用: `from jieba.analyse import ChineseAnalyzer` -* 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py + +--- + +- 引用: `from jieba.analyse import ChineseAnalyzer` +- 用法示例:https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py 8. 命令行分词 -------------------- + +--- 使用示例:`python -m jieba news.txt > cut_result.txt` @@ -324,92 +339,87 @@ word 有限公司 start: 6 end:10 If no filename specified, use STDIN instead. 
-延迟加载机制 ------------- +## 延迟加载机制 jieba 采用延迟加载,`import jieba` 和 `jieba.Tokenizer()` 不会立即触发词典的加载,一旦有必要才开始加载词典构建前缀字典。如果你想手工初始 jieba,也可以手动初始化。 import jieba jieba.initialize() # 手动初始化(可选) - 在 0.28 之前的版本是不能指定主词典的路径的,有了延迟加载机制后,你可以改变主词典的路径: jieba.set_dictionary('data/dict.txt.big') 例子: https://github.com/fxsjy/jieba/blob/master/test/test_change_dictpath.py -其他词典 -======== +# 其他词典 + 1. 占用内存较小的词典文件 -https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small + https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small 2. 支持繁体分词更好的词典文件 -https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big + https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big 下载你所需要的词典,然后覆盖 jieba/dict.txt 即可;或者用 `jieba.set_dictionary('data/dict.txt.big')` -其他语言实现 -========== +# 其他语言实现 + +## 结巴分词 Java 版本 -结巴分词 Java 版本 ----------------- 作者:piaolingxue 地址:https://github.com/huaban/jieba-analysis -结巴分词 C++ 版本 ----------------- +## 结巴分词 C++ 版本 + 作者:yanyiwu 地址:https://github.com/yanyiwu/cppjieba -结巴分词 Node.js 版本 ----------------- +## 结巴分词 Node.js 版本 + 作者:yanyiwu 地址:https://github.com/yanyiwu/nodejieba -结巴分词 Erlang 版本 ----------------- +## 结巴分词 Erlang 版本 + 作者:falood 地址:https://github.com/falood/exjieba -结巴分词 R 版本 ----------------- +## 结巴分词 R 版本 + 作者:qinwf 地址:https://github.com/qinwf/jiebaR -结巴分词 iOS 版本 ----------------- +## 结巴分词 iOS 版本 + 作者:yanyiwu 地址:https://github.com/yanyiwu/iosjieba -结巴分词 PHP 版本 ----------------- +## 结巴分词 PHP 版本 + 作者:fukuball 地址:https://github.com/fukuball/jieba-php -结巴分词 .NET(C#) 版本 ----------------- +## 结巴分词 .NET(C#) 版本 + 作者:anderscui 地址:https://github.com/anderscui/jieba.NET/ -结巴分词 Go 版本 ----------------- +## 结巴分词 Go 版本 -+ 作者: wangbin 地址: https://github.com/wangbin/jiebago -+ 作者: yanyiwu 地址: https://github.com/yanyiwu/gojieba +- 作者: wangbin 地址: https://github.com/wangbin/jiebago +- 作者: yanyiwu 地址: https://github.com/yanyiwu/gojieba + +# 系统集成 -系统集成 -======== 1. 
Solr: https://github.com/sing1ee/jieba-solr -分词速度 -========= -* 1.5 MB / Second in Full Mode -* 400 KB / Second in Default Mode -* 测试环境: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt +# 分词速度 + +- 1.5 MB / Second in Full Mode +- 400 KB / Second in Default Mode +- 测试环境: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt -常见问题 -========= +# 常见问题 ## 1. 模型的数据是如何生成的? @@ -440,60 +450,59 @@ P(台中) < P(台)×P(中),“台中”词频不够导致其成词概率较 **更多问题请点击**:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed -修订历史 -========== +# 修订历史 + https://github.com/fxsjy/jieba/blob/master/Changelog --------------------- +--- + +# jieba -jieba -======== "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module. -Features -======== -* Support three types of segmentation mode: +# Features + +- Support three types of segmentation mode: 1. Accurate Mode attempts to cut the sentence into the most accurate segmentations, which is suitable for text analysis. 2. Full Mode gets all the possible words from the sentence. Fast but not accurate. 3. Search Engine Mode, based on the Accurate Mode, attempts to cut long words into several short words, which can raise the recall rate. Suitable for search engines. -* Supports Traditional Chinese -* Supports customized dictionaries -* MIT License +- Supports Traditional Chinese +- Supports customized dictionaries +- MIT License +# Online demo -Online demo -========= http://jiebademo.ap01.aws.af.cm/ (Powered by Appfog) -Usage -======== -* Fully automatic installation: `easy_install jieba` or `pip install jieba` -* Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , run `python setup.py install` after extracting. -* Manual installation: place the `jieba` directory in the current directory or python `site-packages` directory. -* `import jieba`. 
+# Usage + +- Fully automatic installation: `easy_install jieba` or `pip install jieba` +- Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , run `python setup.py install` after extracting. +- Manual installation: place the `jieba` directory in the current directory or python `site-packages` directory. +- `import jieba`. -Algorithm -======== -* Based on a prefix dictionary structure to achieve efficient word graph scanning. Build a directed acyclic graph (DAG) for all possible word combinations. -* Use dynamic programming to find the most probable combination based on the word frequency. -* For unknown words, a HMM-based model is used with the Viterbi algorithm. +# Algorithm -Main Functions -============== +- Based on a prefix dictionary structure to achieve efficient word graph scanning. Build a directed acyclic graph (DAG) for all possible word combinations. +- Use dynamic programming to find the most probable combination based on the word frequency. +- For unknown words, a HMM-based model is used with the Viterbi algorithm. + +# Main Functions 1. Cut --------- -* The `jieba.cut` function accepts three input parameters: the first parameter is the string to be cut; the second parameter is `cut_all`, controlling the cut mode; the third parameter is to control whether to use the Hidden Markov Model. -* `jieba.cut_for_search` accepts two parameter: the string to be cut; whether to use the Hidden Markov Model. This will cut the sentence into short words suitable for search engines. -* The input string can be an unicode/str object, or a str/bytes object which is encoded in UTF-8 or GBK. Note that using GBK encoding is not recommended because it may be unexpectly decoded as UTF-8. -* `jieba.cut` and `jieba.cut_for_search` returns an generator, from which you can use a `for` loop to get the segmentation result (in unicode). -* `jieba.lcut` and `jieba.lcut_for_search` returns a list. 
-* `jieba.Tokenizer(dictionary=DEFAULT_DICT)` creates a new customized Tokenizer, which enables you to use different dictionaries at the same time. `jieba.dt` is the default Tokenizer, to which almost all global functions are mapped. +--- + +- The `jieba.cut` function accepts three input parameters: the first parameter is the string to be cut; the second parameter is `cut_all`, controlling the cut mode; the third parameter is to control whether to use the Hidden Markov Model. +- `jieba.cut_for_search` accepts two parameters: the string to be cut; whether to use the Hidden Markov Model. This will cut the sentence into short words suitable for search engines. +- The input string can be a unicode/str object, or a str/bytes object which is encoded in UTF-8 or GBK. Note that using GBK encoding is not recommended because it may be unexpectedly decoded as UTF-8. +- `jieba.cut` and `jieba.cut_for_search` return a generator, from which you can use a `for` loop to get the segmentation result (in unicode). +- `jieba.lcut` and `jieba.lcut_for_search` return a list. +- `jieba.Tokenizer(dictionary=DEFAULT_DICT)` creates a new customized Tokenizer, which enables you to use different dictionaries at the same time. `jieba.dt` is the default Tokenizer, to which almost all global functions are mapped. **Code example: segmentation** @@ -524,16 +533,16 @@ Output: [Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造 - 2. Add a custom dictionary ----------------------------- + +--- ### Load dictionary -* Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but you can add your own new words can ensure a higher accuracy. 
-* Usage: `jieba.load_userdict(file_name)` # file_name is a file-like object or the path of the custom dictionary -* The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space: word, word frequency, POS tag. If `file_name` is a path or a file opened in binary mode, the dictionary must be UTF-8 encoded. -* The word frequency and POS tag can be omitted respectively. The word frequency will be filled with a suitable value if omitted. +- Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but adding your own new words can ensure a higher accuracy. +- Usage: `jieba.load_userdict(file_name)` # file_name is a file-like object or the path of the custom dictionary +- The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space: word, word frequency, POS tag. If `file_name` is a path or a file opened in binary mode, the dictionary must be UTF-8 encoded. +- The word frequency and POS tag can each be omitted. The word frequency will be filled with a suitable value if omitted. **For example:** @@ -544,10 +553,9 @@ Output: 台中 ``` +- Change a Tokenizer's `tmp_dir` and `cache_file` to specify the path of the cache file, for use on a restricted file system. -* Change a Tokenizer's `tmp_dir` and `cache_file` to specify the path of the cache file, for using on a restricted file system. - -* Example: +- Example: 云计算 5 李小福 2 @@ -557,13 +565,12 @@ Output: [After]: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 / - ### Modify dictionary -* Use `add_word(word, freq=None, tag=None)` and `del_word(word)` to modify the dictionary dynamically in programs. -* Use `suggest_freq(segment, tune=True)` to adjust the frequency of a single word so that it can (or cannot) be segmented. 
+- Use `add_word(word, freq=None, tag=None)` and `del_word(word)` to modify the dictionary dynamically in programs. +- Use `suggest_freq(segment, tune=True)` to adjust the frequency of a single word so that it can (or cannot) be segmented. -* Note that HMM may affect the final result. +- Note that HMM may affect the final result. Example: @@ -583,15 +590,17 @@ Example: ``` 3. Keyword Extraction ------------------------ + +--- + `import jieba.analyse` -* `jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())` - * `sentence`: the text to be extracted - * `topK`: return how many keywords with the highest TF/IDF weights. The default value is 20 - * `withWeight`: whether return TF/IDF weights with the keywords. The default value is False - * `allowPOS`: filter words with which POSs are included. Empty for no filtering. -* `jieba.analyse.TFIDF(idf_path=None)` creates a new TFIDF instance, `idf_path` specifies IDF file path. +- `jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())` + - `sentence`: the text to extract keywords from + - `topK`: the number of keywords with the highest TF/IDF weights to return. The default value is 20 + - `withWeight`: whether to return TF/IDF weights with the keywords. The default value is False + - `allowPOS`: only include words with the given POS tags. Empty for no filtering. +- `jieba.analyse.TFIDF(idf_path=None)` creates a new TFIDF instance; `idf_path` specifies the IDF file path. 
Example (keyword extraction) @@ -599,15 +608,15 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py Developers can specify their own custom IDF corpus in jieba keyword extraction -* Usage: `jieba.analyse.set_idf_path(file_name) # file_name is the path for the custom corpus` -* Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big -* Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py +- Usage: `jieba.analyse.set_idf_path(file_name) # file_name is the path for the custom corpus` +- Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/idf.txt.big +- Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_idfpath.py Developers can specify their own custom stop words corpus in jieba keyword extraction -* Usage: `jieba.analyse.set_stop_words(file_name) # file_name is the path for the custom corpus` -* Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt -* Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py +- Usage: `jieba.analyse.set_stop_words(file_name) # file_name is the path for the custom corpus` +- Custom Corpus Sample:https://github.com/fxsjy/jieba/blob/master/extra_dict/stop_words.txt +- Sample Code:https://github.com/fxsjy/jieba/blob/master/test/extract_tags_stop_words.py There's also a [TextRank](http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf) implementation available. @@ -618,10 +627,12 @@ Note that it filters POS by default. `jieba.analyse.TextRank()` creates a new TextRank instance. 4. Part of Speech Tagging -------------------------- -* `jieba.posseg.POSTokenizer(tokenizer=None)` creates a new customized Tokenizer. `tokenizer` specifies the jieba.Tokenizer to internally use. `jieba.posseg.dt` is the default POSTokenizer. -* Tags the POS of each word after segmentation, using labels compatible with ictclas. 
-* Example: + +--- + +- `jieba.posseg.POSTokenizer(tokenizer=None)` creates a new customized Tokenizer. `tokenizer` specifies the jieba.Tokenizer to internally use. `jieba.posseg.dt` is the default POSTokenizer. +- Tags the POS of each word after segmentation, using labels compatible with ictclas. +- Example: ```pycon >>> import jieba.posseg as pseg @@ -636,24 +647,29 @@ Note that it filters POS by default. ``` 5. Parallel Processing ----------------------- -* Principle: Split target text by line, assign the lines into multiple Python processes, and then merge the results, which is considerably faster. -* Based on the multiprocessing module of Python. -* Usage: - * `jieba.enable_parallel(4)` # Enable parallel processing. The parameter is the number of processes. - * `jieba.disable_parallel()` # Disable parallel processing. -* Example: - https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py +--- -* Result: On a four-core 3.4GHz Linux machine, do accurate word segmentation on Complete Works of Jin Yong, and the speed reaches 1MB/s, which is 3.3 times faster than the single-process version. +- Principle: Split target text by line, assign the lines into multiple Python processes, and then merge the results, which is considerably faster. +- Based on the multiprocessing module of Python. +- Usage: -* **Note** that parallel processing supports only default tokenizers, `jieba.dt` and `jieba.posseg.dt`. + - `jieba.enable_parallel(4)` # Enable parallel processing. The parameter is the number of processes. + - `jieba.disable_parallel()` # Disable parallel processing. + +- Example: + https://github.com/fxsjy/jieba/blob/master/test/parallel/test_file.py + +- Result: On a four-core 3.4GHz Linux machine, do accurate word segmentation on Complete Works of Jin Yong, and the speed reaches 1MB/s, which is 3.3 times faster than the single-process version. + +- **Note** that parallel processing supports only default tokenizers, `jieba.dt` and `jieba.posseg.dt`. 6. 
Tokenize: return words with position ----------------------------------------- -* The input must be unicode -* Default mode + +--- + +- The input must be unicode +- Default mode ```python result = jieba.tokenize(u'永和服装饰品有限公司') @@ -669,7 +685,7 @@ word 有限公司 start: 6 end:10 ``` -* Search mode +- Search mode ```python result = jieba.tokenize(u'永和服装饰品有限公司',mode='search') @@ -686,14 +702,16 @@ word 公司 start: 8 end:10 word 有限公司 start: 6 end:10 ``` - 7. ChineseAnalyzer for Whoosh -------------------------------- -* `from jieba.analyse import ChineseAnalyzer` -* Example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py + +--- + +- `from jieba.analyse import ChineseAnalyzer` +- Example: https://github.com/fxsjy/jieba/blob/master/test/test_whoosh.py 8. Command Line Interface --------------------------------- + +--- $> python -m jieba --help Jieba command line interface. @@ -720,8 +738,8 @@ word 有限公司 start: 6 end:10 If no filename specified, use STDIN instead. -Initialization ---------------- +## Initialization + By default, Jieba don't build the prefix dictionary unless it's necessary. This takes 1-3 seconds, after which it is not initialized again. If you want to initialize Jieba manually, you can call: import jieba @@ -731,24 +749,22 @@ You can also specify the dictionary (not supported before version 0.28) : jieba.set_dictionary('data/dict.txt.big') - -Using Other Dictionaries -=========================== +# Using Other Dictionaries It is possible to use your own dictionary with Jieba, and there are also two dictionaries ready for download: 1. A smaller dictionary for a smaller memory footprint: -https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small + https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small 2. 
There is also a bigger dictionary that has better support for traditional Chinese (繁體): -https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big + https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big By default, an in-between dictionary is used, called `dict.txt` and included in the distribution. In either case, download the file you want, and then call `jieba.set_dictionary('data/dict.txt.big')` or just replace the existing `dict.txt`. -Segmentation speed -========= -* 1.5 MB / Second in Full Mode -* 400 KB / Second in Default Mode -* Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt +# Segmentation speed + +- 1.5 MB / Second in Full Mode +- 400 KB / Second in Default Mode +- Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt diff --git a/synonyms/jieba/__init__.py b/synonyms/jieba/__init__.py index 7f2a7c3..4b4266e 100644 --- a/synonyms/jieba/__init__.py +++ b/synonyms/jieba/__init__.py @@ -13,7 +13,6 @@ from hashlib import md5 from ._compat import * from . 
import finalseg -from absl import logging if os.name == 'nt': from shutil import move as _replace_file @@ -25,6 +24,8 @@ DEFAULT_DICT = None DEFAULT_DICT_NAME = os.path.join(os.path.pardir, "data", "vocab.txt") +print("[jieba] default dict file path %s" % DEFAULT_DICT_NAME) + DICT_WRITING = {} pool = None @@ -80,10 +81,10 @@ def gen_pfdict(self, f): return lfreq, ltotal def initialize(self, dictionary=None): - logging.debug("initialize dictionary: %s| initialized: %s"% (dictionary, self.initialized)) + # print("initialize dictionary: %s| initialized: %s"% (dictionary, self.initialized)) if dictionary: abs_path = _get_abs_path(dictionary) - logging.debug("abs_path: %s| self.dictionary: %s" % (abs_path, self.dictionary)) + # print("abs_path: %s| self.dictionary: %s" % (abs_path, self.dictionary)) if self.dictionary == abs_path and self.initialized: return else: @@ -101,7 +102,7 @@ def initialize(self, dictionary=None): if self.initialized: return - logging.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary')) + # print("Building prefix dict from %s ..." 
% (abs_path or 'the default dictionary')) t1 = time.time() if self.cache_file: cache_file = self.cache_file @@ -120,8 +121,7 @@ def initialize(self, dictionary=None): load_from_cache_fail = True if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or os.path.getmtime(cache_file) > os.path.getmtime(abs_path)): - logging.debug( - "Loading model from cache %s" % cache_file) + # print("Loading model from cache %s" % cache_file) try: with open(cache_file, 'rb') as cf: self.FREQ, self.total = marshal.load(cf) @@ -134,8 +134,7 @@ def initialize(self, dictionary=None): DICT_WRITING[abs_path] = wlock with wlock: self.FREQ, self.total = self.gen_pfdict(self.get_dict_file()) - logging.debug( - "Dumping model to file cache %s" % cache_file) + # print("Dumping model to file cache %s" % cache_file) try: # prevent moving across different filesystems fd, fpath = tempfile.mkstemp(dir=tmpdir) @@ -144,7 +143,7 @@ def initialize(self, dictionary=None): (self.FREQ, self.total), temp_cache_file) _replace_file(fpath, cache_file) except Exception: - logging.exception("Dump cache file failed.") + print("[Synonyms] jieba dump cache file failed.") try: del DICT_WRITING[abs_path] @@ -152,12 +151,12 @@ def initialize(self, dictionary=None): pass self.initialized = True - logging.debug( - "Loading model cost %.3f seconds." % (time.time() - t1)) - logging.debug("Prefix dict has been built succesfully.") + # print( + # "Loading model cost %.3f seconds." % (time.time() - t1)) + # print("Prefix dict has been built succesfully.") def check_initialized(self): - # logging.debug("check_initialized: %s" % self.initialized) + # print("check_initialized: %s" % self.initialized) if not self.initialized: self.initialize() @@ -343,6 +342,7 @@ def _lcut_for_search_no_hmm(self, sentence): def get_dict_file(self): if self.dictionary == DEFAULT_DICT: + print("[jieba] load default dict %s ..." 
% DEFAULT_DICT_NAME) return get_module_res(DEFAULT_DICT_NAME) else: return open(self.dictionary, 'rb') diff --git a/synonyms/jieba/analyse/tfidf.py b/synonyms/jieba/analyse/tfidf.py index 680b889..9685877 100644 --- a/synonyms/jieba/analyse/tfidf.py +++ b/synonyms/jieba/analyse/tfidf.py @@ -44,11 +44,12 @@ def __init__(self, idf_path=None): def set_new_path(self, new_idf_path): if self.path != new_idf_path: self.path = new_idf_path - content = open(new_idf_path, 'rb').read().decode('utf-8') - self.idf_freq = {} - for line in content.splitlines(): - word, freq = line.strip().split(' ') - self.idf_freq[word] = float(freq) + with open(new_idf_path, 'rb') as f: + content = f.read().decode('utf-8') + self.idf_freq = {} + for line in content.splitlines(): + word, freq = line.strip().split(' ') + self.idf_freq[word] = float(freq) self.median_idf = sorted( self.idf_freq.values())[len(self.idf_freq) // 2] diff --git a/synonyms/synonyms.py b/synonyms/synonyms.py index f1a3b0a..0344637 100755 --- a/synonyms/synonyms.py +++ b/synonyms/synonyms.py @@ -18,15 +18,15 @@ from __future__ import division __copyright__ = "Copyright (c) (2017-2020) Chatopera Inc. All Rights Reserved" -__author__ = "Hu Ying Xi<>, Hai Liang Wang" -__date__ = "2017-09-27" -__version__ = "3.12.0" +__author__ = "Hu Ying Xi<>, Hai Liang Wang" +__date__ = "2020-09-24" +__version__ = "3.13.0" import os import sys import numpy as np curdir = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(curdir) +sys.path.insert(0, curdir) PLT = 2 @@ -47,7 +47,6 @@ import json import gzip import shutil -from absl import logging from .word2vec import KeyedVectors from .utils import any2utf8 from .utils import any2unicode @@ -55,7 +54,7 @@ from .utils import cosine from .utils import is_digit import jieba -from .jieba import posseg as _tokenizer +from jieba import posseg as _tokenizer import wget ''' @@ -105,12 +104,12 @@ def _load_stopwords(file_path): print(">> Synonyms on loading stopwords [%s] ..." 
% _fin_stopwords_path) _load_stopwords(_fin_stopwords_path) -def _segment_words(sen): +def _segment_words(sen, HMM=True): ''' segment words with jieba ''' words, tags = [], [] - m = _tokenizer.cut(sen, HMM=True) # HMM更好的识别新词 + m = _tokenizer.cut(sen, HMM=HMM) # HMM更好的识别新词 for x in m: words.append(x.word) tags.append(x.flag) @@ -134,7 +133,7 @@ def _load_w2v(model_file=_f_model, binary=True): if not os.path.exists(model_file) and _download_model: print("\n[Synonyms] downloading data from %s to %s ... \n this only happens if SYNONYMS_WORD2VEC_BIN_URL_ZH_CN is not present and Synonyms initialization for the first time. \n It would take minutes that depends on network." % (_f_url, model_file)) wget.download(_f_url, out = model_file) - print("\n[Synonyms] download is done.\n") + print("\n[Synonyms] downloaded.\n") elif not os.path.exists(model_file): print("[Synonyms] os.path : ", os.path) raise Exception("Model file [%s] does not exist." % model_file) @@ -164,7 +163,7 @@ def _get_wv(sentence, ignore=False): if ignore: continue else: - logging.warning("not exist in w2v model: %s" % y_) + print("[Synonyms] not exist in w2v model: %s" % y_) # c.append(np.zeros((100,), dtype=float)) random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1))) c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,))) @@ -274,7 +273,6 @@ def _similarity_distance(s1, s2, ignore): except: pass u = _nearby_levenshtein_distance(s1, s2) - logging.debug("g: %s, u: %s" % (g, u)) if u >= 0.99: r = 1.0 elif u > 0.9: @@ -332,8 +330,8 @@ def compare(s1, s2, seg=True, ignore=False, stopwords=False): s2_words = [] if seg: - s1 = [x for x in jieba.cut(s1, cut_all=False, HMM=False)] - s2 = [x for x in jieba.cut(s2, cut_all=False, HMM=False)] + s1, _ = _segment_words(s1) + s2, _ = _segment_words(s2) else: s1 = s1.split() s2 = s2.split() diff --git a/synonyms/word2vec.py b/synonyms/word2vec.py index 93941fa..901ced7 100644 --- a/synonyms/word2vec.py +++ b/synonyms/word2vec.py @@ -28,8 +28,6 @@ 
else: xrange = range -from absl import logging - from .utils import smart_open, to_unicode, cosine from numpy import dot, zeros, dtype, float32 as REAL,\ double, array, vstack, fromstring, sqrt, newaxis,\ @@ -37,8 +35,6 @@ argmax from sklearn.neighbors import KDTree - - class Vocab(object): """ A single vocabulary item, used internally for collecting per-word frequency/sampling info, @@ -117,14 +113,14 @@ def load_word2vec_format( """ counts = None if fvocab is not None: - logging.debug("loading word counts from %s" % fvocab) + # print("loading word counts from %s" % fvocab) counts = {} with smart_open(fvocab) as fin: for line in fin: word, count = to_unicode(line).strip().split() counts[word] = int(count) - logging.debug("loading projection weights from %s" % fname) + # print("loading projection weights from %s" % fname) with smart_open(fname) as fin: header = to_unicode(fin.readline(), encoding=encoding) # throws for invalid file format @@ -137,11 +133,9 @@ def load_word2vec_format( def add_word(word, weights): word_id = len(result.vocab) - # logging.debug("word id: %d, word: %s, weights: %s" % (word_id, word, weights)) + # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights)) if word in result.vocab: - logging.debug( - "duplicate word '%s' in %s, ignoring all but first" % - (word, fname)) + # print( "duplicate word '%s' in %s, ignoring all but first" % (word, fname)) return if counts is None: # most common scenario: no vocab file given. just make up @@ -155,9 +149,7 @@ def add_word(word, weights): else: # vocab file given, but word is missing -- set count to # None (TODO: or raise?) 
- logging.debug( - "vocabulary file is incomplete: '%s' is missing" % - word) + # print( "vocabulary file is incomplete: '%s' is missing" % word) result.vocab[word] = Vocab(index=word_id, count=None) result.syn0[word_id] = weights result.index2word.append(word) @@ -199,9 +191,7 @@ def add_word(word, weights): word, weights = parts[0], [REAL(x) for x in parts[1:]] add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): - logging.debug( - "duplicate words detected, shrinking matrix size from %i to %i" % - (result.syn0.shape[0], len(result.vocab))) + # print( "duplicate words detected, shrinking matrix size from %i to %i" % (result.syn0.shape[0], len(result.vocab))) result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) assert (len(result.vocab), vector_size) == result.syn0.shape ''' @@ -210,7 +200,7 @@ def add_word(word, weights): http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree ''' result.kdt = KDTree(result.syn0, leaf_size=10, metric = "euclidean") - logging.debug("loaded %s matrix from %s" % (result.syn0.shape, fname)) + # print("loaded %s matrix from %s" % (result.syn0.shape, fname)) return result def word_vec(self, word, use_norm=False):