GH-1711: Move tokenizer implementations to distinct file
Mario Sänger committed Jun 29, 2020
1 parent 272c8c5 commit b9923ec
Showing 7 changed files with 286 additions and 281 deletions.
272 changes: 11 additions & 261 deletions flair/data.py
@@ -11,17 +11,11 @@
from flair.file_utils import Tqdm
from operator import itemgetter

from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions
from segtok.tokenizer import word_tokenizer

from torch.utils.data import Dataset, random_split
from torch.utils.data import Dataset
from torch.utils.data.dataset import ConcatDataset, Subset

from typing import List, Dict, Union, Callable, Optional



log = logging.getLogger("flair")


@@ -485,272 +479,25 @@ def name(self) -> str:
        return self.__class__.__name__


class SpacyTokenizer(Tokenizer):
    """
    Implementation of :class:`Tokenizer`, using models from spaCy.

    :param model: a loaded spaCy v2 model or the name of the model to load.
    """

    def __init__(self, model):
        super(SpacyTokenizer, self).__init__()

        try:
            import spacy
            from spacy.language import Language
        except ImportError:
            raise ImportError(
                "Please install spaCy v2.0 or higher before using the SpacyTokenizer, "
                "otherwise you can use the SegtokTokenizer as an advanced tokenizer."
            )

        if isinstance(model, Language):
            self.model: Language = model
        elif isinstance(model, str):
            self.model: Language = spacy.load(model)
        else:
            raise AssertionError(
                "Unexpected type of parameter model. Please provide a loaded "
                "spaCy model or the name of the model to load."
            )

    def tokenize(self, text: str) -> List[Token]:
        from spacy.tokens.doc import Doc
        from spacy.tokens.token import Token as SpacyToken

        doc: Doc = self.model.make_doc(text)
        previous_token = None
        tokens: List[Token] = []
        for word in doc:
            word: SpacyToken = word
            token = Token(
                text=word.text, start_position=word.idx, whitespace_after=True
            )
            tokens.append(token)

            # if the current token directly follows the previous one (no gap in
            # character offsets), the previous token has no trailing whitespace
            if (previous_token is not None) and (
                token.start_pos == previous_token.start_pos + len(previous_token.text)
            ):
                previous_token.whitespace_after = False

            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.model.meta["name"]
            + "_"
            + self.model.meta["version"]
        )
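
A minimal usage sketch of the class above, not part of this commit: it assumes spaCy v2 and the "en_core_web_sm" model are installed (the model name is an assumption for illustration only).

# Hypothetical usage sketch (not part of this commit). Assumes spaCy v2 and
# the "en_core_web_sm" model are installed.
from flair.tokenization import SpacyTokenizer

tokenizer = SpacyTokenizer("en_core_web_sm")
tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog.")
print([token.text for token in tokens])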


class SegtokTokenizer(Tokenizer):
    """
    Tokenizer using segtok, a third-party library for rule-based tokenization
    of Indo-European languages.

    For further details see: https://github.com/fnl/segtok
    """

    def __init__(self):
        super(SegtokTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SegtokTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                # fall back to an estimated offset if the token cannot be
                # found verbatim in the original text
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            if word:
                token = Token(
                    text=word, start_position=start_position, whitespace_after=True
                )
                tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens
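
As a brief illustration (not part of this commit), the SegtokTokenizer needs no external models and can be used either as an instance or via its static helper:

# Hypothetical usage sketch (not part of this commit): segtok splits
# contractions and punctuation; offsets feed the whitespace_after field.
from flair.tokenization import SegtokTokenizer

tokens = SegtokTokenizer.run_tokenize("He said: \"I can't do it.\"")
print([(t.text, t.start_pos, t.whitespace_after) for t in tokens])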


class SpaceTokenizer(Tokenizer):
    """
    Tokenizer that splits on the space character only.
    """

    def __init__(self):
        super(SpaceTokenizer, self).__init__()

    def tokenize(self, text: str) -> List[Token]:
        return SpaceTokenizer.run_tokenize(text)

    @staticmethod
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        word = ""
        index = -1
        for index, char in enumerate(text):
            if char == " ":
                if len(word) > 0:
                    start_position = index - len(word)
                    tokens.append(
                        Token(
                            text=word, start_position=start_position, whitespace_after=True
                        )
                    )

                word = ""
            else:
                word += char

        # increment index to also capture the last token in the sentence
        # if it is not followed by whitespace
        index += 1
        if len(word) > 0:
            start_position = index - len(word)
            tokens.append(
                Token(text=word, start_position=start_position, whitespace_after=False)
            )

        return tokens
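
A small sketch (not part of this commit) of the offset logic above: because only the space character splits, punctuation stays attached to the preceding word unless it is space-separated.

# Hypothetical usage sketch (not part of this commit).
from flair.tokenization import SpaceTokenizer

tokens = SpaceTokenizer.run_tokenize("Hello world !")
# expected, given the offset logic above: [('Hello', 0), ('world', 6), ('!', 12)]
print([(t.text, t.start_pos) for t in tokens])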


class JapaneseTokenizer(Tokenizer):
    """
    Tokenizer using konoha, a third-party library which supports multiple
    Japanese tokenizers such as MeCab, KyTea and SudachiPy.

    For further details see:
        https://github.com/himkt/konoha
    """

    def __init__(self, tokenizer: str):
        super(JapaneseTokenizer, self).__init__()

        if tokenizer.lower() != "mecab":
            raise NotImplementedError("Currently, only MeCab is supported.")

        try:
            import konoha
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "konoha" is not installed!')
            log.warning(
                "To use the Japanese tokenizer, please install konoha with the following steps:"
            )
            log.warning(
                '- Install mecab with "sudo apt install mecab libmecab-dev mecab-ipadic"'
            )
            log.warning('- Install konoha with "pip install konoha[mecab]"')
            log.warning("-" * 100)
            pass

        self.tokenizer = tokenizer
        self.sentence_tokenizer = konoha.SentenceTokenizer()
        self.word_tokenizer = konoha.WordTokenizer(tokenizer)

    def tokenize(self, text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = self.sentence_tokenizer.tokenize(text)
        for sentence in sentences:
            konoha_tokens = self.word_tokenizer.tokenize(sentence)
            words.extend(list(map(str, konoha_tokens)))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except ValueError:
                # fall back to an estimated offset if the token cannot be
                # found verbatim in the original text
                word_offset = previous_word_offset + 1
                start_position = (
                    current_offset + 1 if current_offset > 0 else current_offset
                )

            token = Token(
                text=word, start_position=start_position, whitespace_after=True
            )
            tokens.append(token)

            if (previous_token is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens

    @property
    def name(self) -> str:
        return (
            self.__class__.__name__
            + "_"
            + self.tokenizer
        )
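
A usage sketch (not part of this commit); it assumes MeCab and konoha ("pip install konoha[mecab]") are installed, and the sample sentence is arbitrary.

# Hypothetical usage sketch (not part of this commit). Requires MeCab and
# konoha ("pip install konoha[mecab]") to be installed.
from flair.tokenization import JapaneseTokenizer

tokenizer = JapaneseTokenizer("mecab")
tokens = tokenizer.tokenize("私はベルリンが好きです。")
print([token.text for token in tokens])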


class TokenizerWrapper(Tokenizer):
    """
    Helper class to wrap arbitrary tokenizer functions into the class-based
    tokenizer interface.
    """

    def __init__(self, tokenizer_func: Callable[[str], List[Token]]):
        super(TokenizerWrapper, self).__init__()
        self.tokenizer_func = tokenizer_func

    def tokenize(self, text: str) -> List[Token]:
        return self.tokenizer_func(text)

    @property
    def name(self) -> str:
        return self.__class__.__name__ + "_" + self.tokenizer_func.__name__
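
A sketch (not part of this commit) of wrapping a plain tokenizer function; the helper function below is hypothetical and only for illustration.

# Hypothetical usage sketch (not part of this commit): wrap a plain function
# so it satisfies the class-based Tokenizer interface.
from typing import List

from flair.data import Token
from flair.tokenization import TokenizerWrapper

def my_whitespace_tokenizer(text: str) -> List[Token]:
    # naive whitespace split; offsets recovered via str.index
    tokens, offset = [], 0
    for word in text.split():
        offset = text.index(word, offset)
        tokens.append(Token(text=word, start_position=offset, whitespace_after=True))
        offset += len(word)
    return tokens

wrapper = TokenizerWrapper(my_whitespace_tokenizer)
print(wrapper.name)  # TokenizerWrapper_my_whitespace_tokenizer
print([t.text for t in wrapper.tokenize("I love Berlin .")])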


@deprecated(version="0.5", reason="Use 'flair.data.SpaceTokenizer' instead.")
@deprecated(version="0.5", reason="Use 'flair.tokenization.SpaceTokenizer' instead.")
def space_tokenizer(text: str) -> List[Token]:
    # We don't want to create a SpaceTokenizer object each time this function is called,
    # so delegate the call directly to the static run_tokenize method
    from flair.tokenization import SpaceTokenizer
    return SpaceTokenizer.run_tokenize(text)


@deprecated(version="0.5", reason="Use 'flair.data.SegtokTokenizer' instead.")
@deprecated(version="0.5", reason="Use 'flair.tokenization.SegtokTokenizer' instead.")
def segtok_tokenizer(text: str) -> List[Token]:
    # We don't want to create a SegtokTokenizer object each time this function is called,
    # so delegate the call directly to the static run_tokenize method
    from flair.tokenization import SegtokTokenizer
    return SegtokTokenizer.run_tokenize(text)


@deprecated(version="0.5", reason="Use 'flair.data.SpacyTokenizer' instead.")
@deprecated(version="0.5", reason="Use 'flair.tokenization.SpacyTokenizer' instead.")
def build_spacy_tokenizer(model) -> Callable[[str], List[Token]]:
    from flair.tokenization import SpacyTokenizer
    spacy_tokenizer = SpacyTokenizer(model)

    def tokenizer(text: str) -> List[Token]:
@@ -759,8 +506,9 @@ def tokenizer(text: str) -> List[Token]:
    return tokenizer


@deprecated(version="0.5", reason="Use 'flair.data.JapaneseTokenizer' instead.")
@deprecated(version="0.5", reason="Use 'flair.tokenization.JapaneseTokenizer' instead.")
def build_japanese_tokenizer(tokenizer: str = "MeCab"):
    from flair.tokenization import JapaneseTokenizer
    japanese_tokenizer = JapaneseTokenizer(tokenizer)

    def tokenizer(text: str) -> List[Token]:
@@ -801,8 +549,10 @@ def __init__(
        if isinstance(use_tokenizer, Tokenizer):
            tokenizer = use_tokenizer
        elif hasattr(use_tokenizer, "__call__"):
            from flair.tokenization import TokenizerWrapper
            tokenizer = TokenizerWrapper(use_tokenizer)
        elif type(use_tokenizer) == bool:
            from flair.tokenization import SegtokTokenizer, SpaceTokenizer
            tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer()
        else:
            raise AssertionError("Unexpected type of parameter 'use_tokenizer'. " +
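
For reference, a sketch (not part of this commit) of the accepted forms of the use_tokenizer parameter handled by the dispatch above; the Sentence signature is assumed to match flair 0.5.

# Hypothetical usage sketch (not part of this commit).
from flair.data import Sentence
from flair.tokenization import SegtokTokenizer

# bool: True selects SegtokTokenizer, False selects SpaceTokenizer
s1 = Sentence("I love Berlin.", use_tokenizer=True)
s2 = Sentence("I love Berlin.", use_tokenizer=False)
# a Tokenizer instance is used as-is
s3 = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer())
# a plain callable is wrapped in a TokenizerWrapper
s4 = Sentence("I love Berlin.", use_tokenizer=SegtokTokenizer.run_tokenize)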
5 changes: 2 additions & 3 deletions flair/data_fetcher.py
@@ -13,11 +13,10 @@
    Sentence,
    Corpus,
    Token,
    MultiCorpus,
    Tokenizer,
    SegtokTokenizer,
    SpaceTokenizer
    MultiCorpus
)
from flair.tokenization import SegtokTokenizer, SpaceTokenizer
from flair.file_utils import cached_path

log = logging.getLogger("flair")
6 changes: 2 additions & 4 deletions flair/datasets/base.py
@@ -9,12 +9,10 @@
from flair.data import (
    Sentence,
    Token,
    FlairDataset,
    Tokenizer,
    SegtokTokenizer,
    SpaceTokenizer
    FlairDataset
)

from flair.tokenization import SegtokTokenizer, SpaceTokenizer

log = logging.getLogger("flair")

5 changes: 2 additions & 3 deletions flair/datasets/document_classification.py
@@ -11,10 +11,9 @@
    Corpus,
    Token,
    FlairDataset,
    Tokenizer,
    SegtokTokenizer,
    SpaceTokenizer
    Tokenizer
)
from flair.tokenization import SegtokTokenizer, SpaceTokenizer
from flair.datasets.base import find_train_dev_test_files
from flair.file_utils import cached_path, unzip_file

