Add option to not use Janome for Japanese tokenization

2025-12-15 06:04:33 +03:00 · 2018-02-26 09:39:46 +01:00 · 2018-02-26 09:39:46 +01:00 · 5faae803c6
commit 5faae803c6
parent 9b406181cd
2 changed files with 20 additions and 1 deletions
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@ -29,6 +29,7 @@ import conll17_ud_eval
 import spacy.lang.zh

 spacy.lang.zh.Chinese.Defaults.use_jieba = False
+spacy.lang.ja.Chinese.Defaults.use_janome = False

 random.seed(0)
 numpy.random.seed(0)
--- a/spacy/lang/ja/init.py
+++ b/spacy/lang/ja/init.py
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
    def from_disk(self, path, **exclude):
        return self

+class JapaneseCharacterSegmenter(object):
+    def __init__(self, vocab):
+        self.vocab = vocab
+
+    def __call__(self, text):
+        words = []
+        spaces = []
+        doc = self.tokenizer(text)
+        for token in self.tokenizer(text):
+            words.extend(list(token.text))
+            spaces.extend([False]*len(token.text))
+            spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+

 class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
+    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
-        return JapaneseTokenizer(cls, nlp)
+        if cls.use_janome:
+            return JapaneseTokenizer(cls, nlp)
+        else:
+            return JapaneseCharacterSegmenter(cls, nlp.vocab)


 class Japanese(Language):