spaCy/spacy/lang/th/__init__.py
Adriane Boyd 2f981d5af1 Remove corpus-specific tag maps
Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag map for languages without custom tokenizers is now the
default tag map from `lang/tag_map.py`, UPOS -> UPOS.
2020-07-15 15:58:29 +02:00
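
For context, the default tag map referenced above is an identity mapping over the Universal POS tags. A rough, abridged sketch of its shape (the exact contents live in `spacy/lang/tag_map.py`):

    from ..symbols import POS, ADJ, ADV, NOUN, VERB

    # Abridged sketch: each UPOS tag string maps back to the same UPOS symbol.
    TAG_MAP = {
        "ADJ": {POS: ADJ},
        "ADV": {POS: ADV},
        "NOUN": {POS: NOUN},
        "VERB": {POS: VERB},
        # ... one entry per Universal POS tag
    }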

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer


class ThaiTokenizer(DummyTokenizer):
    """Custom tokenizer that delegates word segmentation to PyThaiNLP."""

    def __init__(self, cls, nlp=None):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError(
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp"
            )
        self.word_tokenize = word_tokenize
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)

    def __call__(self, text):
        words = list(self.word_tokenize(text))
        # Thai script is written without spaces between words, so no token
        # is followed by whitespace.
        spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return ThaiTokenizer(cls, nlp)


class Thai(Language):
    lang = "th"
    Defaults = ThaiDefaults

    def make_doc(self, text):
        # Bypass the default tokenization path and call the PyThaiNLP-backed
        # tokenizer directly.
        return self.tokenizer(text)


__all__ = ["Thai"]
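
A minimal usage sketch, assuming the pythainlp package is installed (the exact segmentation depends on PyThaiNLP's default engine):

    # Minimal usage sketch -- assumes pythainlp is installed.
    from spacy.lang.th import Thai

    nlp = Thai()
    # Calling nlp() invokes Thai.make_doc, which runs the PyThaiNLP tokenizer.
    doc = nlp("ผมรักคุณ")  # example Thai sentence
    print([token.text for token in doc])
    # Expected shape: a list of Thai word tokens, e.g. ['ผม', 'รัก', 'คุณ']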