Mirror of https://github.com/explosion/spaCy.git
Replace mecab-python3 with fugashi for Japanese (#4621)
* Switch from mecab-python3 to fugashi

  mecab-python3 has been the best MeCab binding for a long time, but it's not very actively maintained, and since it's based on old SWIG code distributed with MeCab there's a limit to how effectively it can be maintained.

  Fugashi is a new Cython-based MeCab wrapper I wrote. Since it's not based on the old SWIG code, it's easier to keep it current and to make small deviations from the MeCab C/C++ API where that makes sense.

* Change mecab-python3 to fugashi in setup.cfg

* Change "mecab tags" to "unidic tags"

  The tags come from MeCab, but the tag schema is specified by Unidic, so it's more proper to refer to it that way.

* Update conftest

* Add fugashi link to the external dependencies list for Japanese
parent: a0fb1acb10
commit: f0e3e606a6
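The rest of the diff leans on fugashi exposing each MeCab token as a Python object with surface, pos, feature, and white_space attributes, and on Tagger.parseToNodeList() returning those objects directly. A minimal sketch of that interface (an illustration, not part of the commit; it assumes fugashi and a MeCab dictionary such as UniDic are installed, and the exact feature fields depend on the dictionary):

import fugashi

# Tagger wraps MeCab; with no arguments it uses the default dictionary.
tagger = fugashi.Tagger()

# parseToNodeList() yields one node per token, as used by the new tokenizer below.
for node in tagger.parseToNodeList("これは文章です。"):
    # surface: the token text
    # pos: comma-joined UniDic part-of-speech fields
    # feature.lemma: the dictionary lemma (may be missing for unknown words)
    # white_space: any ASCII whitespace that preceded the token
    print(node.surface, node.pos, node.feature.lemma, repr(node.white_space))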
setup.cfg

@@ -73,7 +73,7 @@ cuda100 =
     cupy-cuda100>=5.0.0b4
 # Language tokenizers with external dependencies
 ja =
-    mecab-python3==0.7
+    fugashi>=0.1.3
 ko =
     natto-py==0.9.0
 th =
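With this change, installing spaCy's ja extra (for example via pip install "spacy[ja]") pulls in fugashi rather than mecab-python3. A quick sanity check that the new binding and a MeCab dictionary are usable might look like the following sketch (not part of the commit; it assumes fugashi and a dictionary are installed):

import fugashi

# Constructing a Tagger confirms that both the wrapper and the underlying
# MeCab dictionary load; this mirrors what spaCy's tokenizer does via
# try_fugashi_import().Tagger().
tagger = fugashi.Tagger()
print("fugashi Tagger loaded:", type(tagger).__name__)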
spacy/lang/ja/__init__.py

@@ -12,21 +12,23 @@ from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer
 
-
-ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
-
-
-def try_mecab_import():
-    """Mecab is required for Japanese support, so check for it.
+# Handling for multiple spaces in a row is somewhat awkward, this simplifies
+# the flow by creating a dummy with the same interface.
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
+DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
+DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
+
+def try_fugashi_import():
+    """Fugashi is required for Japanese support, so check for it.
 
     It it's not available blow up and explain how to fix it."""
     try:
-        import MeCab
+        import fugashi
 
-        return MeCab
+        return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires MeCab: "
-            "https://github.com/SamuraiT/mecab-python3"
+            "Japanese support requires Fugashi: "
+            "https://github.com/polm/fugashi"
         )
 
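The dummy namedtuples above exist so that runs of extra ASCII spaces can be injected into the token stream without special-casing the downstream code: DummySpace exposes the same surface, pos, and feature.lemma attributes that the tokenizer later reads off a real fugashi node. A small illustration (the describe helper is hypothetical, not part of the commit):

from collections import namedtuple

DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))

def describe(node):
    # Works the same way for a real fugashi node or for the space placeholder.
    return (node.surface, node.pos, node.feature.lemma)

print(describe(DummySpace))  # (' ', ' ', ' ')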
@@ -39,7 +41,7 @@ def resolve_pos(token):
     """
 
     # this is only used for consecutive ascii spaces
-    if token.pos == "空白":
+    if token.surface == " ":
         return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
@@ -53,55 +55,45 @@ def resolve_pos(token):
         return token.pos + ",ADJ"
     return token.pos
 
-
-def detailed_tokens(tokenizer, text):
-    """Format Mecab output into a nice data structure, based on Janome."""
-    node = tokenizer.parseToNode(text)
-    node = node.next  # first node is beginning of sentence and empty, skip it
+def get_words_and_spaces(tokenizer, text):
+    """Get the individual tokens that make up the sentence and handle white space.
+
+    Japanese doesn't usually use white space, and MeCab's handling of it for
+    multiple spaces in a row is somewhat awkward.
+    """
+
+    tokens = tokenizer.parseToNodeList(text)
+
     words = []
     spaces = []
-    while node.posid != 0:
-        surface = node.surface
-        base = surface  # a default value. Updated if available later.
-        parts = node.feature.split(",")
-        pos = ",".join(parts[0:4])
-        if len(parts) > 7:
-            # this information is only available for words in the tokenizer
-            # dictionary
-            base = parts[7]
-        words.append(ShortUnitWord(surface, base, pos))
-
-        # The way MeCab stores spaces is that the rlength of the next token is
-        # the length of that token plus any preceding whitespace, **in bytes**.
-        # also note that this is only for half-width / ascii spaces. Full width
-        # spaces just become tokens.
-        scount = node.next.rlength - node.next.length
-        spaces.append(bool(scount))
-        while scount > 1:
-            words.append(ShortUnitWord(" ", " ", "空白"))
+    for token in tokens:
+        # If there's more than one space, spaces after the first become tokens
+        for ii in range(len(token.white_space) - 1):
+            words.append(DummySpace)
             spaces.append(False)
-            scount -= 1
 
-        node = node.next
+        words.append(token)
+        spaces.append(bool(token.white_space))
     return words, spaces
 
 
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.tokenizer = try_mecab_import().Tagger()
-        self.tokenizer.parseToNode("")  # see #2901
+        self.tokenizer = try_fugashi_import().Tagger()
+        self.tokenizer.parseToNodeList("")  # see #2901
 
     def __call__(self, text):
-        dtokens, spaces = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
         words = [x.surface for x in dtokens]
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        mecab_tags = []
+        unidic_tags = []
         for token, dtoken in zip(doc, dtokens):
-            mecab_tags.append(dtoken.pos)
+            unidic_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
-            token.lemma_ = dtoken.lemma
-        doc.user_data["mecab_tags"] = mecab_tags
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.feature.lemma or dtoken.surface
+        doc.user_data["unidic_tags"] = unidic_tags
         return doc
 
 
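Taken together, the tokenizer now builds a Doc straight from fugashi nodes, injecting DummySpace tokens for runs of ASCII spaces and recording the raw UniDic tags in doc.user_data. A usage sketch (assuming a spaCy build containing this change, plus fugashi and a MeCab dictionary installed; the example text is arbitrary):

import spacy

# spacy.blank("ja") builds the Japanese pipeline, whose tokenizer is the
# fugashi-backed JapaneseTokenizer defined above.
nlp = spacy.blank("ja")

# With more than one ASCII space in a row, the extra spaces become their own
# tokens, which resolve_pos tags as 空白.
doc = nlp("これは  文章です。")
for token in doc:
    print(repr(token.text), token.tag_, token.lemma_, repr(token.whitespace_))

# The raw UniDic POS strings are also kept alongside the Doc.
print(doc.user_data["unidic_tags"])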
@@ -131,5 +123,4 @@ def pickle_japanese(instance):
 
 copy_reg.pickle(Japanese, pickle_japanese)
 
-
 __all__ = ["Japanese"]
spacy/tests/conftest.py

@@ -125,7 +125,7 @@ def it_tokenizer():
 
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    pytest.importorskip("MeCab")
+    pytest.importorskip("fugashi")
     return get_lang_class("ja").Defaults.create_tokenizer()
 
 
website/meta/languages.json

@@ -155,7 +155,8 @@
         "name": "Japanese",
         "dependencies": [
             { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" },
-            { "name": "Mecab", "url": "https://github.com/taku910/mecab" }
+            { "name": "Mecab", "url": "https://github.com/taku910/mecab" },
+            { "name": "fugashi", "url": "https://github.com/polm/fugashi" }
         ],
         "example": "これは文章です。",
         "has_examples": true