Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00
2f981d5af1
Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. Languages without custom tokenizers now fall back to the default tag map from `lang/tag_map.py`, which maps UPOS -> UPOS.
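For reference, a UPOS -> UPOS default means each coarse tag simply maps onto itself. The following is a minimal sketch of what such an identity tag map looks like in the spaCy v2 style; it is an illustration, not the literal contents of `lang/tag_map.py`:

# Sketch of a UPOS -> UPOS identity tag map (spaCy v2 style); the real
# default lives in spacy/lang/tag_map.py and covers every UPOS tag.
from spacy.symbols import POS, ADJ, ADP, ADV, NOUN, VERB

TAG_MAP = {
    "ADJ": {POS: ADJ},
    "ADP": {POS: ADP},
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "VERB": {POS: VERB},
    # ... one entry per UPOS tag
}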
51 lines
1.4 KiB
Python
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer


class ThaiTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError(
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp"
            )

        self.word_tokenize = word_tokenize
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)

    def __call__(self, text):
        # Segment the text with PyThaiNLP; Thai is written without spaces
        # between words, so no token is marked as followed by whitespace.
        words = list(self.word_tokenize(text))
        spaces = [False] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return ThaiTokenizer(cls, nlp)


class Thai(Language):
    lang = "th"
    Defaults = ThaiDefaults

    def make_doc(self, text):
        return self.tokenizer(text)


__all__ = ["Thai"]
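A minimal usage sketch, separate from the file above and assuming PyThaiNLP is installed (pip install pythainlp):

# Instantiating the Thai class builds a ThaiTokenizer via
# ThaiDefaults.create_tokenizer, so calling nlp() segments text with PyThaiNLP.
from spacy.lang.th import Thai

nlp = Thai()
doc = nlp("ผมรักคุณ")  # tokenized by pythainlp's word_tokenize
print([token.text for token in doc])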