test_multilingual_text_splitter.py
# pip install nltk langid jieba polyglot pyicu
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import langid
import jieba
# Download necessary NLTK data
nltk.download('punkt')
# Supported languages with fallback methods
supported_languages = {
    'ar_JO': 'arabic',
    'ca_ES': 'catalan',
    'cs_CZ': 'czech',
    'cy_GB': 'welsh',
    'da_DK': 'danish',
    'de_DE': 'german',
    'el_GR': 'greek',
    'en_GB': 'english',
    'en_US': 'english',
    'es_ES': 'spanish',
    'es_MX': 'spanish',
    'fi_FI': 'finnish',
    'fr_FR': 'french',
    'hu_HU': 'hungarian',
    'is_IS': 'icelandic',
    'it_IT': 'italian',
    'ka_GE': 'georgian',
    'kk_KZ': 'kazakh',
    'lb_LU': 'luxembourgish',
    'ne_NP': 'nepali',
    'nl_BE': 'dutch',
    'nl_NL': 'dutch',
    'no_NO': 'norwegian',
    'pl_PL': 'polish',
    'pt_BR': 'portuguese',
    'pt_PT': 'portuguese',
    'ro_RO': 'romanian',
    'ru_RU': 'russian',
    'sr_RS': 'serbian',
    'sv_SE': 'swedish',
    'sw_CD': 'swahili',
    'tr_TR': 'turkish',
    'uk_UA': 'ukrainian',
    'vi_VN': 'vietnamese',
    'zh_CN': 'chinese'
}
def split_text(text, lang_code=None, max_chunk_size=200):
    # Detect language if not provided. langid returns bare ISO 639-1 codes
    # such as 'en' or 'zh', so map the detected code onto a matching locale key.
    if lang_code is None:
        detected, _ = langid.classify(text)
        lang_code = next(
            (code for code in supported_languages if code.startswith(detected)),
            detected,
        )
    if lang_code not in supported_languages:
        raise ValueError(f"Language code {lang_code} is not supported.")
    lang = supported_languages[lang_code]
    if lang == 'chinese':
        # Chinese tokenization using jieba
        words = jieba.lcut(text)
    elif lang_code in ('ka_GE', 'kk_KZ', 'ne_NP', 'lb_LU', 'sw_CD'):
        # Languages without NLTK tokenizer models: fall back to basic word splitting
        words = text.split()
    else:
        # Standard NLTK processing: split into sentences, then into words
        sentences = sent_tokenize(text, language=lang)
        words = []
        for sentence in sentences:
            words.extend(word_tokenize(sentence, language=lang))
    # Greedily pack words into chunks of at most max_chunk_size characters
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(word)
        if current_length + word_length + 1 > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length + 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
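# Illustrative note (not part of the original script): the greedy packing
# above fills each chunk with as many whitespace-joined words as fit within
# max_chunk_size characters before starting a new chunk. For example, with
# lang_code='en_US' and max_chunk_size=20, the text "aaaa bbbb cccc dddd eeee"
# would come back as ['aaaa bbbb cccc dddd', 'eeee'].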
# Example usage
text = "Your input text here."
language_code = 'en_US' # You can specify or set to None for auto-detection
chunks = split_text(text, lang_code=language_code)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {chunk}\n")