Replace mecab-python3 with fugashi for Japanese (#4621)

* Switch from mecab-python3 to fugashi

mecab-python3 has been the best MeCab binding for a long time, but it's not
very actively maintained, and because it's based on old SWIG code distributed
with MeCab, there's a limit to how effectively it can be maintained.

Fugashi is a new Cython-based MeCab wrapper I wrote. Since it's not based on
the old SWIG code, it's easier to keep current and to make small deviations
from the MeCab C/C++ API where that makes sense.
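
As a rough sketch of the interface the new code relies on (assuming fugashi and
a MeCab/UniDic dictionary are installed; the attribute names below are the ones
used by the code in this commit):

    import fugashi

    tagger = fugashi.Tagger()
    for node in tagger.parseToNodeList("日本語 です"):
        # Each node exposes the surface form, a comma-joined UniDic POS string,
        # any preceding ASCII whitespace, and the full UniDic feature bundle.
        print(node.surface, node.pos, repr(node.white_space), node.feature.lemma)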

* Change mecab-python3 to fugashi in setup.cfg

* Change "mecab tags" to "unidic tags"

The tags come from MeCab, but the tag schema is specified by Unidic, so it's
more accurate to refer to them that way.
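
In practice this just changes the key that downstream code reads from
doc.user_data (a hedged before/after sketch; doc is any Doc produced by the
Japanese tokenizer):

    # before this change
    tags = doc.user_data["mecab_tags"]
    # after this change
    tags = doc.user_data["unidic_tags"]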

* Update conftest

* Add fugashi link to external deps list for Japanese
Authored by Paul O'Leary McCann on 2019-11-23 22:31:04 +09:00; committed by Matthew Honnibal
parent a0fb1acb10
commit f0e3e606a6
4 changed files with 39 additions and 47 deletions

setup.cfg

@@ -73,7 +73,7 @@ cuda100 =
     cupy-cuda100>=5.0.0b4
 # Language tokenizers with external dependencies
 ja =
-    mecab-python3==0.7
+    fugashi>=0.1.3
 ko =
     natto-py==0.9.0
 th =

spacy/lang/ja/__init__.py

@@ -12,21 +12,23 @@ from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer
 
-ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
+# Handling for multiple spaces in a row is somewhat awkward, this simplifies
+# the flow by creating a dummy with the same interface.
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
+DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
+DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
 
 
-def try_mecab_import():
-    """Mecab is required for Japanese support, so check for it.
+def try_fugashi_import():
+    """Fugashi is required for Japanese support, so check for it.
     It it's not available blow up and explain how to fix it."""
     try:
-        import MeCab
-        return MeCab
+        import fugashi
+        return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires MeCab: "
-            "https://github.com/SamuraiT/mecab-python3"
+            "Japanese support requires Fugashi: "
+            "https://github.com/polm/fugashi"
         )
@@ -39,7 +41,7 @@ def resolve_pos(token):
     """
     # this is only used for consecutive ascii spaces
-    if token.pos == "空白":
+    if token.surface == " ":
         return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
@@ -53,55 +55,45 @@ def resolve_pos(token):
         return token.pos + ",ADJ"
     return token.pos
 
-def detailed_tokens(tokenizer, text):
-    """Format Mecab output into a nice data structure, based on Janome."""
-    node = tokenizer.parseToNode(text)
-    node = node.next  # first node is beginning of sentence and empty, skip it
+def get_words_and_spaces(tokenizer, text):
+    """Get the individual tokens that make up the sentence and handle white space.
+    Japanese doesn't usually use white space, and MeCab's handling of it for
+    multiple spaces in a row is somewhat awkward.
+    """
+
+    tokens = tokenizer.parseToNodeList(text)
+
     words = []
     spaces = []
-    while node.posid != 0:
-        surface = node.surface
-        base = surface  # a default value. Updated if available later.
-        parts = node.feature.split(",")
-        pos = ",".join(parts[0:4])
-        if len(parts) > 7:
-            # this information is only available for words in the tokenizer
-            # dictionary
-            base = parts[7]
-        words.append(ShortUnitWord(surface, base, pos))
-        # The way MeCab stores spaces is that the rlength of the next token is
-        # the length of that token plus any preceding whitespace, **in bytes**.
-        # also note that this is only for half-width / ascii spaces. Full width
-        # spaces just become tokens.
-        scount = node.next.rlength - node.next.length
-        spaces.append(bool(scount))
-        while scount > 1:
-            words.append(ShortUnitWord(" ", " ", "空白"))
-            spaces.append(False)
-            scount -= 1
-        node = node.next
+    for token in tokens:
+        # If there's more than one space, spaces after the first become tokens
+        for ii in range(len(token.white_space) - 1):
+            words.append(DummySpace)
+            spaces.append(False)
+
+        words.append(token)
+        spaces.append(bool(token.white_space))
     return words, spaces
 
 
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.tokenizer = try_mecab_import().Tagger()
-        self.tokenizer.parseToNode("")  # see #2901
+        self.tokenizer = try_fugashi_import().Tagger()
+        self.tokenizer.parseToNodeList("")  # see #2901
 
     def __call__(self, text):
-        dtokens, spaces = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
         words = [x.surface for x in dtokens]
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        mecab_tags = []
+        unidic_tags = []
         for token, dtoken in zip(doc, dtokens):
-            mecab_tags.append(dtoken.pos)
+            unidic_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
-            token.lemma_ = dtoken.lemma
-        doc.user_data["mecab_tags"] = mecab_tags
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.feature.lemma or dtoken.surface
+        doc.user_data["unidic_tags"] = unidic_tags
+
         return doc
@@ -131,5 +123,4 @@ def pickle_japanese(instance):
 copy_reg.pickle(Japanese, pickle_japanese)
 
-
 __all__ = ["Japanese"]

spacy/tests/conftest.py

@@ -125,7 +125,7 @@ def it_tokenizer():
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    pytest.importorskip("MeCab")
+    pytest.importorskip("fugashi")
     return get_lang_class("ja").Defaults.create_tokenizer()

website/meta/languages.json

@@ -155,7 +155,8 @@
     "name": "Japanese",
     "dependencies": [
         { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" },
-        { "name": "Mecab", "url": "https://github.com/taku910/mecab" }
+        { "name": "Mecab", "url": "https://github.com/taku910/mecab" },
+        { "name": "fugashi", "url": "https://github.com/polm/fugashi" }
     ],
     "example": "これは文章です。",
     "has_examples": true