import string

from chatterbot import languages


class LowercaseTagger(object):
    """
    Returns the text in lowercase.
    """

    def __init__(self, language=None):
        """
        :param language: Language object to associate with this tagger.
            Falls back to ``languages.CZE`` when omitted or falsy.
        """
        self.language = language or languages.CZE

    def get_text_index_string(self, text):
        """
        Return ``text`` converted to lowercase.
        """
        return text.lower()


class PosLemmaTagger(object):
    """
    Builds a search-index string from text using an NLTK Punkt tokenizer.

    NOTE: despite the class name, no part-of-speech or lemma information is
    produced by the code in this file; the index string is simply the
    lowercased tokens returned by the tokenizer.
    """

    def __init__(self, language=None):
        """
        Load the NLTK resources for the selected language.

        :param language: Language object exposing an ``ENGLISH_NAME``
            attribute. Falls back to ``languages.CZE`` when omitted or falsy.
        """
        # Imported lazily so that nltk is only required when this tagger
        # is actually instantiated.
        import nltk

        self.language = language or languages.CZE

        # NLTK resources are looked up by the lowercased English language name.
        language_name = self.language.ENGLISH_NAME.lower()

        # Maps every ASCII punctuation character to None so that
        # str.translate() deletes it.
        self.punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))

        # Not read by get_text_index_string(); kept so the instance still
        # exposes the same attributes as before.
        self.nltk_stopwords = nltk.corpus.stopwords.words(language_name)

        # Pre-trained Punkt model for the language.
        # NOTE(review): the Punkt pickles shipped with NLTK are sentence
        # tokenizers, so "tokens" below are presumably sentences rather
        # than words -- confirm this is intended.
        self.nlp = nltk.data.load(
            'nltk:tokenizers/punkt/' + language_name + '.pickle'
        )

    def get_text_index_string(self, text):
        """
        Return the lowercased tokens of ``text`` joined by single spaces.

        For input of at most two characters, punctuation is stripped first,
        unless stripping would leave an empty string (in which case the
        original text is tokenized as-is).

        :param text: The string to index.
        :returns: A space-separated string of lowercased tokens; empty if the
            tokenizer yields no tokens.
        """
        if len(text) <= 2:
            stripped = text.translate(self.punctuation_table)
            # Keep the original text when it consisted solely of punctuation.
            if stripped:
                text = stripped

        # NOTE: an earlier variant filtered stop words and emitted
        # "previous:current" bigram pairs. That pairing was disabled, which
        # left the filtered token list unused, so every input already
        # resolved to the plain lowercased tokens returned here.
        return ' '.join(token.lower() for token in self.nlp.tokenize(text))