From f0e3e606a67a978f5450db4ba22913c199055710 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Sat, 23 Nov 2019 22:31:04 +0900
Subject: [PATCH] Replace python-mecab3 with fugashi for Japanese (#4621)

* Switch from mecab-python3 to fugashi

mecab-python3 has been the best MeCab binding for a long time but it's
not very actively maintained, and since it's based on old SWIG code
distributed with MeCab there's a limit to how effectively it can be
maintained.

Fugashi is a new Cython-based MeCab wrapper I wrote. Since it's not
based on the old SWIG code it's easier to keep it current and make
small deviations from the MeCab C/C++ API where that makes sense.

* Change mecab-python3 to fugashi in setup.cfg

* Change "mecab tags" to "unidic tags"

The tags come from MeCab, but the tag schema is specified by Unidic,
so it's more proper to refer to it that way.

* Update conftest

* Add fugashi link to external deps list for Japanese
---
 setup.cfg                   |  2 +-
 spacy/lang/ja/__init__.py   | 79 ++++++++++++++++---------------------
 spacy/tests/conftest.py     |  2 +-
 website/meta/languages.json |  3 +-
 4 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 940066a9e..3101209e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -73,7 +73,7 @@ cuda100 =
     cupy-cuda100>=5.0.0b4
 # Language tokenizers with external dependencies
 ja =
-    mecab-python3==0.7
+    fugashi>=0.1.3
 ko =
     natto-py==0.9.0
 th =
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 056a6893b..0538461a3 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -12,21 +12,23 @@ from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer
 
+# Handling for multiple spaces in a row is somewhat awkward, this simplifies
+# the flow by creating a dummy with the same interface.
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
+DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
+DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
 
-ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
-
-
-def try_mecab_import():
-    """Mecab is required for Japanese support, so check for it.
+def try_fugashi_import():
+    """Fugashi is required for Japanese support, so check for it.
 
     It it's not available blow up and explain how to fix it."""
     try:
-        import MeCab
+        import fugashi
 
-        return MeCab
+        return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires MeCab: "
-            "https://github.com/SamuraiT/mecab-python3"
+            "Japanese support requires Fugashi: "
+            "https://github.com/polm/fugashi"
         )
@@ -39,7 +41,7 @@
     """
 
     # this is only used for consecutive ascii spaces
-    if token.pos == "空白":
+    if token.surface == " ":
         return "空白"
 
     # TODO: This is a first take. The rules here are crude approximations.
@@ -53,55 +55,45 @@
         return token.pos + ",ADJ"
     return token.pos
 
+def get_words_and_spaces(tokenizer, text):
+    """Get the individual tokens that make up the sentence and handle white space.
+
+    Japanese doesn't usually use white space, and MeCab's handling of it for
+    multiple spaces in a row is somewhat awkward.
+    """
+
+    tokens = tokenizer.parseToNodeList(text)
 
-def detailed_tokens(tokenizer, text):
-    """Format Mecab output into a nice data structure, based on Janome."""
-    node = tokenizer.parseToNode(text)
-    node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
     spaces = []
-    while node.posid != 0:
-        surface = node.surface
-        base = surface  # a default value. Updated if available later.
-        parts = node.feature.split(",")
-        pos = ",".join(parts[0:4])
-        if len(parts) > 7:
-            # this information is only available for words in the tokenizer
-            # dictionary
-            base = parts[7]
-        words.append(ShortUnitWord(surface, base, pos))
-
-        # The way MeCab stores spaces is that the rlength of the next token is
-        # the length of that token plus any preceding whitespace, **in bytes**.
-        # also note that this is only for half-width / ascii spaces. Full width
-        # spaces just become tokens.
-        scount = node.next.rlength - node.next.length
-        spaces.append(bool(scount))
-        while scount > 1:
-            words.append(ShortUnitWord(" ", " ", "空白"))
+    for token in tokens:
+        # If there's more than one space, spaces after the first become tokens
+        for ii in range(len(token.white_space) - 1):
+            words.append(DummySpace)
             spaces.append(False)
-            scount -= 1
-        node = node.next
+
+        words.append(token)
+        spaces.append(bool(token.white_space))
     return words, spaces
 
-
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.tokenizer = try_mecab_import().Tagger()
-        self.tokenizer.parseToNode("")  # see #2901
+        self.tokenizer = try_fugashi_import().Tagger()
+        self.tokenizer.parseToNodeList("")  # see #2901
 
     def __call__(self, text):
-        dtokens, spaces = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
         words = [x.surface for x in dtokens]
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        mecab_tags = []
+        unidic_tags = []
        for token, dtoken in zip(doc, dtokens):
-            mecab_tags.append(dtoken.pos)
+            unidic_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
-            token.lemma_ = dtoken.lemma
-        doc.user_data["mecab_tags"] = mecab_tags
+
+            # if there's no lemma info (it's an unk) just use the surface
+            token.lemma_ = dtoken.feature.lemma or dtoken.surface
+        doc.user_data["unidic_tags"] = unidic_tags
         return doc
@@ -131,5 +123,4 @@ def pickle_japanese(instance):
 
 copy_reg.pickle(Japanese, pickle_japanese)
 
-
 __all__ = ["Japanese"]
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index d6b9ba11f..959a6b670 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -125,7 +125,7 @@ def it_tokenizer():
 
 
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    pytest.importorskip("MeCab")
+    pytest.importorskip("fugashi")
     return get_lang_class("ja").Defaults.create_tokenizer()
diff --git a/website/meta/languages.json b/website/meta/languages.json
index dbb300fbf..9b8c56bc6 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -155,7 +155,8 @@
             "name": "Japanese",
             "dependencies": [
                 { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" },
-                { "name": "Mecab", "url": "https://github.com/taku910/mecab" }
+                { "name": "Mecab", "url": "https://github.com/taku910/mecab" },
+                { "name": "fugashi", "url": "https://github.com/polm/fugashi" }
             ],
             "example": "これは文章です。",
             "has_examples": true
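
Note: the node attributes the new code reads off fugashi (surface, white_space, pos,
feature.lemma) can be inspected directly. A rough sketch, assuming fugashi>=0.1.3 and a
MeCab/UniDic dictionary are installed; this is illustrative and not part of the patch:

    import fugashi

    tagger = fugashi.Tagger()
    for node in tagger.parseToNodeList("これは 文章 です。"):
        # surface text, any attached ASCII whitespace, UniDic POS fields, lemma
        print(repr(node.surface), repr(node.white_space), node.pos, node.feature.lemma)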
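
End to end, the same pieces are exercised through the language class exported above. Again
a minimal sketch under the same assumptions; the sample sentence is the one from
languages.json:

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("これは文章です。")
    for token in doc:
        # tag_ carries the UniDic POS resolved by resolve_pos();
        # lemma_ falls back to the surface form for unknown tokens
        print(token.text, token.tag_, token.lemma_)

    # the raw UniDic tags are also stored on the Doc by __call__ above
    print(doc.user_data["unidic_tags"])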