Making lang/th/test_tokenizer.py pass by creating ThaiTokenizer (#3078)

This commit is contained in:
Kirill Bulygin 2019-01-10 19:40:37 +05:00 committed by Matthew Honnibal
parent 1cd8f9823f
commit 7b064542f7
3 changed files with 75 additions and 46 deletions

View File

@@ -1,20 +1,22 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
from ...language import Language
from ...attrs import LANG
from ...tokens import Doc, Token
from ...tokenizer import Tokenizer
from .tag_map import TAG_MAP
import re
from collections import namedtuple
from .tag_map import TAG_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer
# Lightweight record for one MeCab token: the surface form, the lemma
# (dictionary base form) and the comma-joined part-of-speech string.
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
# XXX Is this the right place for this?
# Register a custom extension attribute so every Token can carry the raw
# MeCab POS tag alongside the mapped spaCy tag.
Token.set_extension('mecab_tag', default=None)
def try_mecab_import():
"""Mecab is required for Japanese support, so check for it.
@@ -26,6 +28,7 @@ def try_mecab_import():
raise ImportError("Japanese support requires MeCab: "
"https://github.com/SamuraiT/mecab-python3")
def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping.
@@ -40,81 +43,76 @@ def resolve_pos(token):
# PoS mappings.
if token.pos == '連体詞,*,*,*':
if re.match('^[こそあど此其彼]の', token.surface):
if re.match(r'[こそあど此其彼]の', token.surface):
return token.pos + ',DET'
if re.match('^[こそあど此其彼]', token.surface):
if re.match(r'[こそあど此其彼]', token.surface):
return token.pos + ',PRON'
else:
return token.pos + ',ADJ'
return token.pos + ',ADJ'
return token.pos
def detailed_tokens(tokenizer, text):
    """Convert MeCab's node list for *text* into ShortUnitWord tuples
    (data layout based on Janome)."""
    cursor = tokenizer.parseToNode(text)
    # The first node is the (empty) beginning-of-sentence marker; skip it.
    cursor = cursor.next
    results = []
    while cursor.posid != 0:
        surface = cursor.surface
        feature_parts = cursor.feature.split(',')
        pos = ','.join(feature_parts[:4])
        # Default the lemma to the surface form; the dictionary base form
        # is only present for words found in the tokenizer dictionary.
        lemma = surface
        if len(feature_parts) > 7:
            lemma = feature_parts[7]
        results.append(ShortUnitWord(surface, lemma, pos))
        cursor = cursor.next
    return results
class JapaneseTokenizer(DummyTokenizer):
    """Custom tokenizer that delegates Japanese segmentation to MeCab."""

    def __init__(self, cls, nlp=None):
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        self.tokenizer = try_mecab_import().Tagger()
        # Prime the tagger with an empty parse; see #2901.
        self.tokenizer.parseToNode('')

    def __call__(self, text):
        mecab_tokens = detailed_tokens(self.tokenizer, text)
        surfaces = [t.surface for t in mecab_tokens]
        doc = Doc(self.vocab, words=surfaces, spaces=[False] * len(surfaces))
        # Copy the MeCab analysis onto the spaCy tokens.
        for spacy_token, mecab_token in zip(doc, mecab_tokens):
            spacy_token._.mecab_tag = mecab_token.pos
            spacy_token.tag_ = resolve_pos(mecab_token)
            spacy_token.lemma_ = mecab_token.lemma
        return doc
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese."""
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: 'ja'
    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        # Use the MeCab-backed tokenizer instead of spaCy's default one.
        return JapaneseTokenizer(cls, nlp)
class Japanese(Language):
    """The Japanese language class."""
    lang = 'ja'
    Defaults = JapaneseDefaults
    Tokenizer = JapaneseTokenizer

    def make_doc(self, text):
        # The custom tokenizer already returns a finished Doc.
        return self.tokenizer(text)


__all__ = ['Japanese']

View File

@@ -5,34 +5,49 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...tokens import Doc
from ..norm_exceptions import BASE_NORMS
from ...attrs import LANG
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from ...tokens import Doc
from ...util import DummyTokenizer
class ThaiTokenizer(DummyTokenizer):
    """Tokenizer for Thai, backed by PyThaiNLP's word_tokenize."""

    def __init__(self, cls, nlp=None):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError(
                "The Thai tokenizer requires the PyThaiNLP library: "
                "https://github.com/PyThaiNLP/pythainlp")
        self.word_tokenize = word_tokenize
        if nlp is not None:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)

    def __call__(self, text):
        # Segment with the "newmm" engine; Thai script has no word spaces,
        # so every token is marked as having no trailing whitespace.
        words = list(self.word_tokenize(text, "newmm"))
        return Doc(self.vocab, words=words, spaces=[False] * len(words))
class ThaiDefaults(Language.Defaults):
    """Language defaults for Thai."""
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: 'th'
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        # Replace spaCy's default tokenizer with the PyThaiNLP-backed one.
        return ThaiTokenizer(cls, nlp)
class Thai(Language):
    """The Thai language class."""
    lang = 'th'
    Defaults = ThaiDefaults

    def make_doc(self, text):
        # The custom tokenizer already returns a finished Doc.
        return self.tokenizer(text)


__all__ = ['Thai']

View File

@@ -635,3 +635,19 @@ class SimpleFrozenDict(dict):
def update(self, other):
    """Always raise: this dict is frozen and must not be mutated."""
    raise NotImplementedError(Errors.E095)
class DummyTokenizer(object):
    """Base class providing no-op serialization hooks (to/from bytes and
    disk) so custom tokenizers can take part in pipeline serialization
    (see #1557)."""

    def to_bytes(self, **exclude):
        # Nothing to serialize.
        return b''

    def from_bytes(self, _bytes_data, **exclude):
        # Nothing to restore; keep the current instance.
        return self

    def to_disk(self, _path, **exclude):
        # Nothing to write.
        return None

    def from_disk(self, _path, **exclude):
        # Nothing to read; keep the current instance.
        return self