From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:21 +0900
Subject: [PATCH 01/10] Add basic japanese support

---
 setup.py                  |  3 ++-
 spacy/__init__.py         |  4 ++--
 spacy/ja/__init__.py      | 19 +++++++++++++++++++
 spacy/ja/language_data.py | 23 +++++++++++++++++++++++
 spacy/ja/stop_words.py    |  9 +++++++++
 spacy/ja/tag_map.py       | 24 ++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 spacy/ja/__init__.py
 create mode 100644 spacy/ja/language_data.py
 create mode 100644 spacy/ja/stop_words.py
 create mode 100644 spacy/ja/tag_map.py

diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info

-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja

 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)


 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..f9ab7b560
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,19 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        from janome.tokenizer import Tokenizer
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
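The `make_doc` override above hands segmentation entirely to Janome and marks every token as having no trailing space, since Japanese text carries no inter-token whitespace. A minimal sketch of that flow, assuming Janome is installed (the example sentence and its segmentation are illustrative, not taken from the patch):

    # Mirrors the body of Japanese.make_doc(): Janome segments the text and
    # each token's surface form becomes one word of the Doc.
    from janome.tokenizer import Tokenizer

    text = 'すもももももももものうち'
    words = [x.surface for x in Tokenizer().tokenize(text)]
    print(words)  # e.g. ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    # spaces=[False]*len(words) in make_doc() is right because there is no
    # whitespace between tokens to reconstruct.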
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..2e8dfbafb
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..b2120b30d
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
\ No newline at end of file
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..2196ff397
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
\ No newline at end of file
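A quick sanity check of the language data above (a sketch, not part of the patch): each `TAG_MAP` entry maps a coarse-grained tag string to a `{POS: symbol}` dict built from spaCy's Universal POS symbols, and `STOP_WORDS` so far holds only the ideographic full stop and comma:

    from spacy.symbols import POS, VERB
    from spacy.ja.tag_map import TAG_MAP
    from spacy.ja.stop_words import STOP_WORDS

    assert TAG_MAP['VERB'] == {POS: VERB}  # one {POS: symbol} mapping per tag
    assert STOP_WORDS == {'。', '、'}       # just the two punctuation marks for now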
+table([ "Language", "Source" ])
-    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
         +row
             +cell #{language} #[code=code]
             +cell

From 8676cd013593444324f101af2f3c0b8c680777bc Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:07 +0200
Subject: [PATCH 03/10] Add newline

---
 spacy/ja/language_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
index 2e8dfbafb..007ed2b4e 100644
--- a/spacy/ja/language_data.py
+++ b/spacy/ja/language_data.py
@@ -20,4 +20,4 @@ STOP_WORDS = set(STOP_WORDS)


 # export
-__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
+__all__ = ["TAG_MAP", "STOP_WORDS"]

From d12ca587eababb75601078c4761e6a9d78fefecc Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:29 +0200
Subject: [PATCH 04/10] Add newline

---
 spacy/ja/stop_words.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
index b2120b30d..45bb7a4d8 100644
--- a/spacy/ja/stop_words.py
+++ b/spacy/ja/stop_words.py
@@ -6,4 +6,4 @@ from __future__ import unicode_literals
 STOP_WORDS = set("""
 。
 、
-""".split())
\ No newline at end of file
+""".split())

From 949ad6594b759ebd91da142187cbb6f675117eea Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:43 +0200
Subject: [PATCH 05/10] Add newline

---
 spacy/ja/tag_map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
index 2196ff397..f5b6b5040 100644
--- a/spacy/ja/tag_map.py
+++ b/spacy/ja/tag_map.py
@@ -21,4 +21,4 @@ TAG_MAP = {
     "CONJ": {POS: CONJ},
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB}
-}
\ No newline at end of file
+}

From d730eb0c0df2fb6784f7adcce479c4c9588764b9 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:43:29 +0200
Subject: [PATCH 06/10] Raise custom ImportError if importing janome fails

---
 spacy/ja/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index f9ab7b560..2915d6330 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -14,6 +14,9 @@ class Japanese(Language):
     lang = 'ja'

     def make_doc(self, text):
-        from janome.tokenizer import Tokenizer
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome")
         words = [x.surface for x in Tokenizer().tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

From 3ea23a3f4db561f800a21bed9b25ced648b826d4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:44:38 +0200
Subject: [PATCH 07/10] Fix formatting

---
 spacy/ja/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index 2915d6330..07e40ada6 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -17,6 +17,7 @@ class Japanese(Language):
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome")
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
         words = [x.surface for x in Tokenizer().tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
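A note on the formatting fix in patch 07: Python concatenates adjacent string literals at compile time, so splitting the message over two lines leaves the raised error text identical. A self-contained check:

    original = "The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome"
    reformatted = ("The Japanese tokenizer requires the Janome library: "
                   "https://github.com/mocobeta/janome")
    assert original == reformatted  # adjacent literals are joined at compile time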
From f9384b0fbd5a555d688b353f2847d4ca32242a76 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 3 May 2017 09:58:31 +0200
Subject: [PATCH 08/10] Update alpha languages and add aside for tokenizer
 dependencies

---
 website/docs/api/language-models.jade | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index 40105b85c..3bce7272f 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.

++aside("Dependencies")
+    | Some language tokenizers require external dependencies. To use #[strong Chinese],
+    | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+    | The #[strong Japanese] tokenizer requires
+    | #[+a("https://github.com/mocobeta/janome") Janome].
+
 +table([ "Language", "Source" ])
-    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language} #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-p
-    | Chinese tokenization requires the
-    | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-    | models are coming soon.

From e2380d87891a2591790f5873ad44a028a06f8540 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 3 May 2017 10:00:04 +0200
Subject: [PATCH 09/10] Update README.rst

---
 README.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 9b8438ce8..24b0c232a 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.

 📊 **Help us improve the library!** `Take the spaCy user survey `_.
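Tying the docs changes back to the registration in patch 01: once the patched package is importable, the bundled languages can be listed from the `_languages` tuple updated in `spacy/__init__.py`. A sketch for illustration only, since `_languages` is a private detail of this codebase:

    import spacy

    # Each entry is a Language subclass carrying a `lang` code, e.g. ja.Japanese.
    print(sorted(lang.lang for lang in spacy._languages))
    # The output should now include 'ja' alongside 'zh', 'nb', 'he' and the rest.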
From 6e1fad92a1c26ddf1f73a31b7b09f2e7f7cac093 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 10:01:40 +0200
Subject: [PATCH 10/10] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 69a562e48..b64dc8db3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)