-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathtextlib.py
100 lines (79 loc) · 3.21 KB
/
textlib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
import os
import re
import sys
import numpy as np
import logging
# Module-level logger; note it is named after this file's path (__file__).
logger = logging.getLogger(__file__)
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)s] %(message)s",
    datefmt='%d %b %H:%M:%S',
    level=logging.INFO)

_IS_PY3 = 3 == sys.version_info[0]


def _read_stopwords(fname):
    """Return the set of whitespace-stripped lines found in ``fname``.

    The file is closed deterministically via ``with`` instead of being left
    to the garbage collector.
    """
    # Python 2's built-in open() has no ``encoding`` argument and yields raw
    # byte strings; on Python 3 decode explicitly so the result does not
    # depend on the locale's default encoding.
    # NOTE(review): assumes the stop-word files are UTF-8 encoded, like this
    # source file -- confirm.
    open_kwargs = {'encoding': 'utf-8'} if _IS_PY3 else {}
    with open(fname, **open_kwargs) as fin:
        return set(line.strip() for line in fin)


# The stop-word lists live next to this module.
en_stop_fname = os.path.join(os.path.dirname(__file__), 'stopwords_en.txt')
zh_stop_fname = os.path.join(os.path.dirname(__file__), 'stopwords_zh.txt')
ENGLISH_STOP_WORDS = _read_stopwords(en_stop_fname)
CHINESE_STOP_WORDS = _read_stopwords(zh_stop_fname)

# Punctuation marks stripped from Chinese sentences when cleaning is enabled.
# NOTE(review): ',', '!', ';' and '?' below are the ASCII characters; if the
# full-width forms were intended, this literal needs updating -- confirm.
_CHN_PUNCTUATION = ', 。 、 ! 《 》 “ ” ; ? ‘ ’ '.split()
if _IS_PY3:
    CHN_DEL_SET = _CHN_PUNCTUATION
else:
    # Python 2 string literals are byte strings; decode them so they can be
    # matched against unicode text.
    CHN_DEL_SET = [x.decode('utf-8') for x in _CHN_PUNCTUATION]
class TextTool:
    """Whitespace-based sentence tokenizer for English and Chinese text."""

    @staticmethod
    def tokenize(input_str, clean=True, language='en', remove_stopword=False):
        """Split a sentence into a list of tokens.

        Args:
            input_str: The sentence, as text or as UTF-8 encoded bytes.
            clean: If true, normalize the sentence before splitting. English:
                every character other than ASCII letters and digits becomes a
                space and the text is lower-cased. Chinese: every entry of
                the module-level ``CHN_DEL_SET`` is removed.
            language: ``'en'`` selects the English branch; any other value
                selects the Chinese branch.
            remove_stopword: If true, drop tokens found in the module-level
                ``ENGLISH_STOP_WORDS`` / ``CHINESE_STOP_WORDS`` set.

        Returns:
            The tokens obtained by splitting on whitespace. For Chinese input
            the tokens are native ``str`` objects (UTF-8 byte strings on
            Python 2, text on Python 3) and ASCII letters are always removed.
        """
        sent = input_str
        if 'en' == language:  # English
            # Only true for Python 3 ``bytes`` (on Python 2 bytes *is* str and
            # passes through untouched); the str-based calls below would
            # otherwise raise TypeError.
            if isinstance(sent, bytes) and not isinstance(sent, str):
                sent = sent.decode('utf-8')
            if clean:
                sent = sent.replace('\r', ' ')
                sent = re.sub(r"[^A-Za-z0-9]", " ", sent).strip().lower()
            tokens = sent.split()
        else:  # Chinese
            # Decode only real byte input: calling .decode() unconditionally
            # fails on Python 3, where str has no such method.
            if isinstance(sent, bytes):
                sent = sent.decode('utf-8')
            if clean:
                for elem in CHN_DEL_SET:
                    sent = sent.replace(elem, '')
            # Convert back to the native str type so tokens compare equal to
            # the stop-word entries: encodes Python 2 unicode to UTF-8 bytes
            # and is a no-op on Python 3.
            if not isinstance(sent, str):
                sent = sent.encode('utf-8')
            sent = re.sub("[A-Za-z]", "", sent)
            tokens = sent.split()

        if remove_stopword:
            stop_words = ENGLISH_STOP_WORDS if 'en' == language else CHINESE_STOP_WORDS
            tokens = [x for x in tokens if x not in stop_words]
        return tokens
class Vocabulary(object):
    """Two-way lookup table between words and consecutive integer indices."""

    def __init__(self, encoding):
        """Create an empty vocabulary.

        ``encoding`` is stored as-is; ``__call__`` checks whether it contains
        ``'gru'`` to decide how unknown words are handled.
        """
        self.encoding = encoding
        self.idx2word = {}
        self.word2idx = {}

    def add(self, word):
        """Register ``word`` under the next free index; known words are ignored."""
        if word in self.word2idx:
            return
        next_idx = len(self.word2idx)
        self.idx2word[next_idx] = word
        self.word2idx[word] = next_idx

    def find(self, word):
        """Return the index of ``word``, or -1 when it is not in the vocabulary."""
        return self.word2idx[word] if word in self.word2idx else -1

    def __getitem__(self, index):
        """Return the word stored at ``index`` (KeyError if there is none)."""
        return self.idx2word[index]

    def __call__(self, word):
        """Return the index of ``word``.

        For an unknown word: when ``'gru'`` occurs in ``self.encoding`` the
        index of ``'<unk>'`` is returned (KeyError if ``'<unk>'`` was never
        added); otherwise an ``Exception`` is raised.
        """
        if word in self.word2idx:
            return self.word2idx[word]
        if 'gru' not in self.encoding:
            raise Exception('word out of vocab: %s' % word)
        return self.word2idx['<unk>']

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self.word2idx)
if __name__ == '__main__':
    # Smoke test: print each sentence, its tokens, and its tokens with stop
    # words removed.
    #
    # print() is called with ONE pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 print function
    # (a bare Python 2 ``print a, b`` statement is a SyntaxError on Python 3).
    #
    # tokenize() is called with keyword arguments: its positional order is
    # (input_str, clean, language, remove_stopword), so passing the language
    # and the stop-word flag positionally would bind them to ``clean`` and
    # ``language`` instead.
    test_strs = '''a Dog is running
The dog runs
dogs-x runs'''.split('\n')
    for t in test_strs:
        plain = TextTool.tokenize(t, clean=True, language='en')
        no_stop = TextTool.tokenize(t, language='en', remove_stopword=True)
        print('%s -> %s -> %s' % (t, plain, no_stop))

    test_strs = '''一间 干净 整洁 的 房间 。
一只 黄色 的 小狗 趴在 长椅 上'''.split('\n')
    for t in test_strs:
        plain = ' '.join(TextTool.tokenize(t, clean=True, language='zh'))
        no_stop = ' '.join(TextTool.tokenize(t, language='zh', remove_stopword=True))
        print('%s -> %s -> %s' % (t, plain, no_stop))