From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:21 +0900
Subject: [PATCH 1/7] Add basic japanese support

---
 setup.py                  |  3 ++-
 spacy/__init__.py         |  4 ++--
 spacy/ja/__init__.py      | 19 +++++++++++++++++++
 spacy/ja/language_data.py | 23 +++++++++++++++++++++++
 spacy/ja/stop_words.py    |  9 +++++++++
 spacy/ja/tag_map.py       | 24 ++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 spacy/ja/__init__.py
 create mode 100644 spacy/ja/language_data.py
 create mode 100644 spacy/ja/stop_words.py
 create mode 100644 spacy/ja/tag_map.py

diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info

-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja


 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)


 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..f9ab7b560
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,19 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        from janome.tokenizer import Tokenizer
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..2e8dfbafb
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..b2120b30d
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
\ No newline at end of file
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..2196ff397
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
\ No newline at end of file

From 0e7a9b9facdcdc24f5064070971653f8a75e51ad Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:45 +0900
Subject: [PATCH 2/7] Add Japanese to 'Alpha support' section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 website/docs/api/language-models.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index a2ad9b9eb..40105b85c 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -36,7 +36,7 @@ p
     | the existing language data and extending the tokenization patterns.
+table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell From 8676cd013593444324f101af2f3c0b8c680777bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:07 +0200 Subject: [PATCH 3/7] Add newline --- spacy/ja/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py index 2e8dfbafb..007ed2b4e 100644 --- a/spacy/ja/language_data.py +++ b/spacy/ja/language_data.py @@ -20,4 +20,4 @@ STOP_WORDS = set(STOP_WORDS) # export -__all__ = ["TAG_MAP", "STOP_WORDS"] \ No newline at end of file +__all__ = ["TAG_MAP", "STOP_WORDS"] From d12ca587eababb75601078c4761e6a9d78fefecc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:29 +0200 Subject: [PATCH 4/7] Add newline --- spacy/ja/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py index b2120b30d..45bb7a4d8 100644 --- a/spacy/ja/stop_words.py +++ b/spacy/ja/stop_words.py @@ -6,4 +6,4 @@ from __future__ import unicode_literals STOP_WORDS = set(""" 。 、 -""".split()) \ No newline at end of file +""".split()) From 949ad6594b759ebd91da142187cbb6f675117eea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:43 +0200 Subject: [PATCH 5/7] Add newline --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 2196ff397..f5b6b5040 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -21,4 +21,4 @@ TAG_MAP = { "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, "VERB": {POS: VERB} -} \ No newline at end of file +} From d730eb0c0df2fb6784f7adcce479c4c9588764b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:43:29 +0200 Subject: [PATCH 6/7] Raise custom ImportError if importing janome fails --- spacy/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index f9ab7b560..2915d6330 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -14,6 +14,9 @@ class Japanese(Language): lang = 'ja' def make_doc(self, text): - from janome.tokenizer import Tokenizer + try: + from janome.tokenizer import Tokenizer + except ImportError: + raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 3ea23a3f4db561f800a21bed9b25ced648b826d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:44:38 +0200 Subject: [PATCH 7/7] Fix formatting --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 2915d6330..07e40ada6 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -17,6 +17,7 @@ class Japanese(Language): try: from janome.tokenizer import Tokenizer except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome") + 
raise ImportError("The Japanese tokenizer requires the Janome library: " + "https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words))