from thinc.api import Config from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer, registry DEFAULT_CONFIG = """ [nlp] [nlp.tokenizer] @tokenizers = "spacy.th.ThaiTokenizer" """ @registry.tokenizers("spacy.th.ThaiTokenizer") def create_thai_tokenizer(): def thai_tokenizer_factory(nlp): return ThaiTokenizer(nlp) return thai_tokenizer_factory class ThaiTokenizer(DummyTokenizer): def __init__(self, nlp: Language) -> None: try: from pythainlp.tokenize import word_tokenize except ImportError: raise ImportError( "The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/PyThaiNLP/pythainlp" ) from None self.word_tokenize = word_tokenize self.vocab = nlp.vocab def __call__(self, text: str) -> Doc: words = list(self.word_tokenize(text)) spaces = [False] * len(words) return Doc(self.vocab, words=words, spaces=spaces) class ThaiDefaults(Language.Defaults): config = Config().from_str(DEFAULT_CONFIG) lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS class Thai(Language): lang = "th" Defaults = ThaiDefaults __all__ = ["Thai"]