from ...language import BaseDefaults, Language from ...tokens import Doc from ...util import DummyTokenizer, load_config_from_str, registry from ...vocab import Vocab from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS DEFAULT_CONFIG = """ [nlp] [nlp.tokenizer] @tokenizers = "spacy.th.ThaiTokenizer" """ @registry.tokenizers("spacy.th.ThaiTokenizer") def create_thai_tokenizer(): def thai_tokenizer_factory(nlp): return ThaiTokenizer(nlp.vocab) return thai_tokenizer_factory class ThaiTokenizer(DummyTokenizer): def __init__(self, vocab: Vocab) -> None: try: from pythainlp.tokenize import word_tokenize except ImportError: raise ImportError( "The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/PyThaiNLP/pythainlp" ) from None self.word_tokenize = word_tokenize self.vocab = vocab def __call__(self, text: str) -> Doc: words = list(self.word_tokenize(text)) spaces = [False] * len(words) return Doc(self.vocab, words=words, spaces=spaces) class ThaiDefaults(BaseDefaults): config = load_config_from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS class Thai(Language): lang = "th" Defaults = ThaiDefaults __all__ = ["Thai"]