Merge branch 'master' into master

Grégory Howard 2017-05-03 11:04:51 +02:00 committed by GitHub
commit f9d7144224
9 changed files with 94 additions and 11 deletions

CONTRIBUTORS.md

@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)

README.rst

@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.
 
 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.

setup.py

@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
     'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',

spacy/__init__.py

@@ -9,7 +9,7 @@
 from .cli.info import info
 
 _languages_name = set(["en", "de", "es", "pt", "fr",
                        "it", "hu", "zh", "nl", "sv",
-                       "fi", "bn", "he", "nb"])
+                       "fi", "bn", "he", "nb", "ja"])
 
 def load(name, **overrides):
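
A hedged usage sketch, not part of this commit: with "ja" in the set above, spacy.load accepts the Japanese language code. This assumes the spaCy 1.x API shown in this diff, and that loading works without model data (no statistical model ships for Japanese at this point, so only tokenization is expected).

    import spacy

    # 'ja' now resolves to the Japanese class added below; only the
    # Janome-backed tokenizer is available, no tagger or parser.
    nlp = spacy.load('ja')
    doc = nlp.make_doc(u'これは日本語の文章です。')
    print([token.text for token in doc])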

spacy/ja/__init__.py (new file)

@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
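
For reference, a minimal sketch of what make_doc gets from Janome (assumes `pip install janome`): each Janome token exposes its surface form, which becomes one spaCy token.

    from janome.tokenizer import Tokenizer

    # Janome segments the text; .surface is the raw token string.
    tokens = Tokenizer().tokenize(u'すもももももももものうち')
    print([t.surface for t in tokens])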

spacy/ja/language_data.py (new file)

@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# import base language data
+from .. import language_data as base
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]

spacy/ja/stop_words.py (new file)

@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+""".split())

spacy/ja/tag_map.py (new file)

@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
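
A quick sanity check of the table above, as a hedged sketch: the keys are Universal POS tag strings, and each entry just maps the POS attribute to the matching coarse tag symbol.

    from spacy.ja.tag_map import TAG_MAP
    from spacy.symbols import POS, NOUN

    # Every entry maps a Universal tag string to its own coarse POS.
    assert TAG_MAP["NOUN"][POS] == NOUN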

website/docs/api/language-models.jade

@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.
 
+    +aside("Dependencies")
+        | Some language tokenizers require external dependencies. To use #[strong Chinese],
+        | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+        | The #[strong Japanese] tokenizer requires
+        | #[+a("https://github.com/mocobeta/janome") Janome].
+
     +table([ "Language", "Source" ])
-        each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+        each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
             +row
                 +cell #{language} #[code=code]
                 +cell
                     +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-    p
-        | Chinese tokenization requires the
-        | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-        | models are coming soon.
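
To make the dependency note concrete, a minimal sketch of Chinese word segmentation with Jieba (assumes `pip install jieba`; this mirrors what the Janome-backed make_doc above does for Japanese):

    import jieba

    # jieba.cut returns a generator of segmented words.
    words = list(jieba.cut(u'我爱自然语言处理'))
    print(words)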