Making lang/th/test_tokenizer.py pass by creating ThaiTokenizer (#3078)

Authored by Kirill Bulygin on 2019-01-10 19:40:37 +05:00; committed by Matthew Honnibal
parent 1cd8f9823f
commit 7b064542f7
3 changed files with 75 additions and 46 deletions

spacy/lang/ja/__init__.py

@@ -1,20 +1,22 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from ...language import Language
-from ...attrs import LANG
-from ...tokens import Doc, Token
-from ...tokenizer import Tokenizer
-from .tag_map import TAG_MAP
-
 import re
 from collections import namedtuple
+
+from .tag_map import TAG_MAP
+
+from ...attrs import LANG
+from ...language import Language
+from ...tokens import Doc, Token
+from ...util import DummyTokenizer
 
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
 
 # XXX Is this the right place for this?
 Token.set_extension('mecab_tag', default=None)
 
 
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -26,6 +28,7 @@ def try_mecab_import():
         raise ImportError("Japanese support requires MeCab: "
                           "https://github.com/SamuraiT/mecab-python3")
 
+
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -40,81 +43,76 @@ def resolve_pos(token):
     # PoS mappings.
     if token.pos == '連体詞,*,*,*':
-        if re.match('^[こそあど此其彼]の', token.surface):
+        if re.match(r'[こそあど此其彼]の', token.surface):
             return token.pos + ',DET'
-        if re.match('^[こそあど此其彼]', token.surface):
+        if re.match(r'[こそあど此其彼]', token.surface):
             return token.pos + ',PRON'
-        else:
-            return token.pos + ',ADJ'
+        return token.pos + ',ADJ'
     return token.pos
 
 
 def detailed_tokens(tokenizer, text):
     """Format Mecab output into a nice data structure, based on Janome."""
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
     while node.posid != 0:
         surface = node.surface
         base = surface  # a default value. Updated if available later.
         parts = node.feature.split(',')
         pos = ','.join(parts[0:4])
         if len(parts) > 7:
-            # this information is only available for words in the tokenizer dictionary
+            # this information is only available for words in the tokenizer
+            # dictionary
             base = parts[7]
-        words.append( ShortUnitWord(surface, base, pos) )
+        words.append(ShortUnitWord(surface, base, pos))
         node = node.next
     return words
 
 
-class JapaneseTokenizer(object):
+class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        MeCab = try_mecab_import()
-        self.tokenizer = MeCab.Tagger()
+        self.tokenizer = try_mecab_import().Tagger()
         self.tokenizer.parseToNode('')  # see #2901
 
     def __call__(self, text):
         dtokens = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        spaces = [False] * len(words)
+        doc = Doc(self.vocab, words=words, spaces=spaces)
         for token, dtoken in zip(doc, dtokens):
             token._.mecab_tag = dtoken.pos
             token.tag_ = resolve_pos(dtoken)
             token.lemma_ = dtoken.lemma
         return doc
 
-    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
-    # allow serialization (see #1557)
-    def to_bytes(self, **exclude):
-        return b''
-
-    def from_bytes(self, bytes_data, **exclude):
-        return self
-
-    def to_disk(self, path, **exclude):
-        return None
-
-    def from_disk(self, path, **exclude):
-        return self
 
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'ja'
+    lex_attr_getters[LANG] = lambda _text: 'ja'
     tag_map = TAG_MAP
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)
 
 
 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults
+    Tokenizer = JapaneseTokenizer
 
     def make_doc(self, text):
         return self.tokenizer(text)
 
 
 __all__ = ['Japanese']
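
The Japanese pipeline behaves the same after this refactor: the tokenizer is still built through JapaneseDefaults.create_tokenizer, and only the serialization stubs now come from DummyTokenizer. A minimal usage sketch, assuming mecab-python3 and a MeCab dictionary are installed; the example sentence and printed attributes are illustrative, not part of the diff:

    from spacy.lang.ja import Japanese

    nlp = Japanese()  # raises ImportError if MeCab is missing (see try_mecab_import)
    doc = nlp.make_doc('すもももももももものうち')
    for token in doc:
        # token.tag_ comes from resolve_pos(); token._.mecab_tag keeps the raw MeCab POS string
        print(token.text, token.tag_, token._.mecab_tag, token.lemma_)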

spacy/lang/th/__init__.py

@@ -5,34 +5,49 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...tokens import Doc
-from ..norm_exceptions import BASE_NORMS
+from ...attrs import LANG
 from ...language import Language
-from ...attrs import LANG, NORM
-from ...util import update_exc, add_lookups
+from ...tokens import Doc
+from ...util import DummyTokenizer
+
+
+class ThaiTokenizer(DummyTokenizer):
+    def __init__(self, cls, nlp=None):
+        try:
+            from pythainlp.tokenize import word_tokenize
+        except ImportError:
+            raise ImportError(
+                "The Thai tokenizer requires the PyThaiNLP library: "
+                "https://github.com/PyThaiNLP/pythainlp")
+        self.word_tokenize = word_tokenize
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+
+    def __call__(self, text):
+        words = list(self.word_tokenize(text, "newmm"))
+        spaces = [False] * len(words)
+        return Doc(self.vocab, words=words, spaces=spaces)
 
 
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'th'
+    lex_attr_getters[LANG] = lambda _text: 'th'
     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
 
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ThaiTokenizer(cls, nlp)
+
 
 class Thai(Language):
     lang = 'th'
     Defaults = ThaiDefaults
 
     def make_doc(self, text):
-        try:
-            from pythainlp.tokenize import word_tokenize
-        except ImportError:
-            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
-                              "https://github.com/PyThaiNLP/pythainlp")
-        words = [x for x in list(word_tokenize(text,"newmm"))]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        return self.tokenizer(text)
 
 
 __all__ = ['Thai']
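
With ThaiTokenizer wired in through ThaiDefaults.create_tokenizer, the pythainlp import now happens once when the tokenizer is constructed rather than on every make_doc call. A minimal usage sketch, assuming PyThaiNLP is installed; the sample sentence is illustrative, not part of the diff:

    from spacy.lang.th import Thai

    nlp = Thai()  # ThaiDefaults.create_tokenizer builds the ThaiTokenizer
    doc = nlp.make_doc('สวัสดีครับ นี่คือการทดสอบ')
    print([token.text for token in doc])  # words segmented by pythainlp's "newmm" engine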

spacy/util.py

@@ -635,3 +635,19 @@ class SimpleFrozenDict(dict):
 
     def update(self, other):
         raise NotImplementedError(Errors.E095)
+
+
+class DummyTokenizer(object):
+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization (see #1557)
+    def to_bytes(self, **exclude):
+        return b''
+
+    def from_bytes(self, _bytes_data, **exclude):
+        return self
+
+    def to_disk(self, _path, **exclude):
+        return None
+
+    def from_disk(self, _path, **exclude):
+        return self
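
DummyTokenizer exists so that custom tokenizers with no state worth saving still satisfy the to_bytes/from_bytes/to_disk/from_disk interface that pipeline serialization expects (see #1557), instead of each language redefining the stubs as the Japanese tokenizer did before. A hypothetical sketch of reusing it outside this commit; WhitespaceTokenizer is an illustrative name and not part of the diff:

    from spacy.tokens import Doc
    from spacy.util import DummyTokenizer


    class WhitespaceTokenizer(DummyTokenizer):
        """Illustrative only: splits on single spaces and, like the Thai and
        Japanese tokenizers above, records no trailing whitespace on tokens."""

        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            spaces = [False] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)

Because the no-op serialization methods are inherited, a pipeline using such a tokenizer can still be passed through nlp.to_bytes() or nlp.to_disk() without crashing.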