diff --git a/setup.cfg b/setup.cfg
index 940066a9e..3101209e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ cuda100 =
     cupy-cuda100>=5.0.0b4
 # Language tokenizers with external dependencies
 ja =
-    mecab-python3==0.7
+    fugashi>=0.1.3
 ko =
     natto-py==0.9.0
 th =
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 056a6893b..0538461a3 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -12,21 +12,23 @@ from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer
 
+# Handling for multiple spaces in a row is somewhat awkward, this simplifies
+# the flow by creating a dummy with the same interface.
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
+DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
+DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
 
-ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
-
-
-def try_mecab_import():
-    """Mecab is required for Japanese support, so check for it.
+def try_fugashi_import():
+    """Fugashi is required for Japanese support, so check for it.
 
     It it's not available blow up and explain how to fix it."""
     try:
-        import MeCab
+        import fugashi
 
-        return MeCab
+        return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires MeCab: "
-            "https://github.com/SamuraiT/mecab-python3"
+            "Japanese support requires Fugashi: "
+            "https://github.com/polm/fugashi"
         )
 
@@ -39,7 +41,7 @@ def resolve_pos(token):
     """
 
     # this is only used for consecutive ascii spaces
-    if token.pos == "空白":
+    if token.surface == " ":
         return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
@@ -53,55 +55,45 @@ def resolve_pos(token):
         return token.pos + ",ADJ"
     return token.pos
 
+def get_words_and_spaces(tokenizer, text):
+    """Get the individual tokens that make up the sentence and handle white space.
+
+    Japanese doesn't usually use white space, and MeCab's handling of it for
+    multiple spaces in a row is somewhat awkward.
+    """
+
+    tokens = tokenizer.parseToNodeList(text)
 
-def detailed_tokens(tokenizer, text):
-    """Format Mecab output into a nice data structure, based on Janome."""
-    node = tokenizer.parseToNode(text)
-    node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
     spaces = []
-    while node.posid != 0:
-        surface = node.surface
-        base = surface  # a default value. Updated if available later.
-        parts = node.feature.split(",")
-        pos = ",".join(parts[0:4])
-        if len(parts) > 7:
-            # this information is only available for words in the tokenizer
-            # dictionary
-            base = parts[7]
-        words.append(ShortUnitWord(surface, base, pos))
-
-        # The way MeCab stores spaces is that the rlength of the next token is
-        # the length of that token plus any preceding whitespace, **in bytes**.
-        # also note that this is only for half-width / ascii spaces. Full width
-        # spaces just become tokens.
-        scount = node.next.rlength - node.next.length
-        spaces.append(bool(scount))
-        while scount > 1:
-            words.append(ShortUnitWord(" ", " ", "空白"))
+    for token in tokens:
+        # If there's more than one space, spaces after the first become tokens
+        for ii in range(len(token.white_space) - 1):
+            words.append(DummySpace)
             spaces.append(False)
-            scount -= 1
-        node = node.next
+
+        words.append(token)
+        spaces.append(bool(token.white_space))
     return words, spaces
 
-
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.tokenizer = try_mecab_import().Tagger()
-        self.tokenizer.parseToNode("")  # see #2901
+        self.tokenizer = try_fugashi_import().Tagger()
+        self.tokenizer.parseToNodeList("")  # see #2901
 
     def __call__(self, text):
-        dtokens, spaces = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
         words = [x.surface for x in dtokens]
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        mecab_tags = []
+        unidic_tags = []
        for token, dtoken in zip(doc, dtokens):
-            mecab_tags.append(dtoken.pos)
+            unidic_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
-            token.lemma_ = dtoken.lemma
-        doc.user_data["mecab_tags"] = mecab_tags
+
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.feature.lemma or dtoken.surface
+        doc.user_data["unidic_tags"] = unidic_tags
         return doc
 
 
@@ -131,5 +123,4 @@ def pickle_japanese(instance):
 
 copy_reg.pickle(Japanese, pickle_japanese)
 
-
 __all__ = ["Japanese"]
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d6b9ba11f..959a6b670 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -125,7 +125,7 @@ def it_tokenizer():
 
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    pytest.importorskip("MeCab")
+    pytest.importorskip("fugashi")
     return get_lang_class("ja").Defaults.create_tokenizer()
 
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index dbb300fbf..9b8c56bc6 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -155,7 +155,8 @@
             "name": "Japanese",
             "dependencies": [
                 { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" },
-                { "name": "Mecab", "url": "https://github.com/taku910/mecab" }
+                { "name": "Mecab", "url": "https://github.com/taku910/mecab" },
+                { "name": "fugashi", "url": "https://github.com/polm/fugashi" }
             ],
             "example": "これは文章です。",
             "has_examples": true
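For reference, here is a minimal standalone sketch of the new flow: it mirrors `get_words_and_spaces()` and the lemma fallback from the patch, without the rest of the spaCy pipeline. It assumes fugashi (with a UniDic dictionary for it to load) is installed; the sample text and the final print loop are illustrative only and are not part of the patch.

```python
# Sketch of the fugashi-based tokenization flow from this patch.
# Assumes fugashi and a UniDic dictionary are installed.
from collections import namedtuple

import fugashi

# Same dummy stand-ins the patch uses for extra ASCII spaces.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))

tagger = fugashi.Tagger()
text = "これは  文章です。"  # sample text with two ASCII spaces in a row

words = []
spaces = []
for token in tagger.parseToNodeList(text):
    # As in the patch: if there's more than one space, spaces after the
    # first become standalone dummy tokens.
    for _ in range(len(token.white_space) - 1):
        words.append(DummySpace)
        spaces.append(False)
    words.append(token)
    spaces.append(bool(token.white_space))

# words and spaces line up one-to-one, which is what
# Doc(vocab, words=[w.surface for w in words], spaces=spaces) expects.
for word, space in zip(words, spaces):
    # Unknown words have no lemma info, so fall back to the surface form.
    lemma = word.feature.lemma or word.surface
    print(repr(word.surface), lemma, space)
```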