From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:21 +0900
Subject: [PATCH 1/7] Add basic japanese support

---
 setup.py                  |  3 ++-
 spacy/__init__.py         |  4 ++--
 spacy/ja/__init__.py      | 19 +++++++++++++++++++
 spacy/ja/language_data.py | 23 +++++++++++++++++++++++
 spacy/ja/stop_words.py    |  9 +++++++++
 spacy/ja/tag_map.py       | 24 ++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 spacy/ja/__init__.py
 create mode 100644 spacy/ja/language_data.py
 create mode 100644 spacy/ja/stop_words.py
 create mode 100644 spacy/ja/tag_map.py

diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info

-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja


 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)


 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..f9ab7b560
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,19 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        from janome.tokenizer import Tokenizer
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..2e8dfbafb
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..b2120b30d
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
\ No newline at end of file
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..2196ff397
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
\ No newline at end of file

From 0e7a9b9facdcdc24f5064070971653f8a75e51ad Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:45 +0900
Subject: [PATCH 2/7] Add Japanese to 'Alpha support' section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 website/docs/api/language-models.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index a2ad9b9eb..40105b85c 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -36,7 +36,7 @@ p
     | the existing language data and extending the tokenization patterns.
+table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell From 8676cd013593444324f101af2f3c0b8c680777bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:07 +0200 Subject: [PATCH 3/7] Add newline --- spacy/ja/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py index 2e8dfbafb..007ed2b4e 100644 --- a/spacy/ja/language_data.py +++ b/spacy/ja/language_data.py @@ -20,4 +20,4 @@ STOP_WORDS = set(STOP_WORDS) # export -__all__ = ["TAG_MAP", "STOP_WORDS"] \ No newline at end of file +__all__ = ["TAG_MAP", "STOP_WORDS"] From d12ca587eababb75601078c4761e6a9d78fefecc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:29 +0200 Subject: [PATCH 4/7] Add newline --- spacy/ja/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py index b2120b30d..45bb7a4d8 100644 --- a/spacy/ja/stop_words.py +++ b/spacy/ja/stop_words.py @@ -6,4 +6,4 @@ from __future__ import unicode_literals STOP_WORDS = set(""" 。 、 -""".split()) \ No newline at end of file +""".split()) From 949ad6594b759ebd91da142187cbb6f675117eea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:43 +0200 Subject: [PATCH 5/7] Add newline --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 2196ff397..f5b6b5040 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -21,4 +21,4 @@ TAG_MAP = { "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, "VERB": {POS: VERB} -} \ No newline at end of file +} From d730eb0c0df2fb6784f7adcce479c4c9588764b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:43:29 +0200 Subject: [PATCH 6/7] Raise custom ImportError if importing janome fails --- spacy/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index f9ab7b560..2915d6330 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -14,6 +14,9 @@ class Japanese(Language): lang = 'ja' def make_doc(self, text): - from janome.tokenizer import Tokenizer + try: + from janome.tokenizer import Tokenizer + except ImportError: + raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 3ea23a3f4db561f800a21bed9b25ced648b826d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:44:38 +0200 Subject: [PATCH 7/7] Fix formatting --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 2915d6330..07e40ada6 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -17,6 +17,7 @@ class Japanese(Language): try: from janome.tokenizer import Tokenizer except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome") + 
raise ImportError("The Japanese tokenizer requires the Janome library: " + "https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words))