From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001
From: Yasuaki Uechi
Date: Wed, 3 May 2017 13:56:21 +0900
Subject: [PATCH 01/10] Add basic japanese support

---
 setup.py                  |  3 ++-
 spacy/__init__.py         |  4 ++--
 spacy/ja/__init__.py      | 19 +++++++++++++++++++
 spacy/ja/language_data.py | 23 +++++++++++++++++++++++
 spacy/ja/stop_words.py    |  9 +++++++++
 spacy/ja/tag_map.py       | 24 ++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 spacy/ja/__init__.py
 create mode 100644 spacy/ja/language_data.py
 create mode 100644 spacy/ja/stop_words.py
 create mode 100644 spacy/ja/tag_map.py

diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info

-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja

 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)


 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..f9ab7b560
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,19 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        from janome.tokenizer import Tokenizer
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
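The `make_doc` override above hands segmentation entirely to Janome and marks every token as having no trailing space, since Japanese text carries no inter-token whitespace. A minimal sketch of that flow, assuming Janome is installed (the example sentence and its segmentation are illustrative, not taken from the patch):

    # Mirrors the body of Japanese.make_doc(): Janome segments the text and
    # each token's surface form becomes one word of the Doc.
    from janome.tokenizer import Tokenizer

    text = 'すもももももももものうち'
    words = [x.surface for x in Tokenizer().tokenize(text)]
    print(words)  # e.g. ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    # spaces=[False]*len(words) in make_doc() is right because there is no
    # whitespace between tokens to reconstruct.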
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..2e8dfbafb
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..b2120b30d
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
\ No newline at end of file
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..2196ff397
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
\ No newline at end of file
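A quick sanity check of the language data above (a sketch, not part of the patch): each `TAG_MAP` entry maps a coarse-grained tag string to a `{POS: symbol}` dict built from spaCy's Universal POS symbols, and `STOP_WORDS` so far holds only the ideographic full stop and comma:

    from spacy.symbols import POS, VERB
    from spacy.ja.tag_map import TAG_MAP
    from spacy.ja.stop_words import STOP_WORDS

    assert TAG_MAP['VERB'] == {POS: VERB}  # one {POS: symbol} mapping per tag
    assert STOP_WORDS == {'。', '、'}       # just the two punctuation marks for now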
+table([ "Language", "Source" ])
-    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
         +row
             +cell #{language} #[code=code]
             +cell

From 8676cd013593444324f101af2f3c0b8c680777bc Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:07 +0200
Subject: [PATCH 03/10] Add newline

---
 spacy/ja/language_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
index 2e8dfbafb..007ed2b4e 100644
--- a/spacy/ja/language_data.py
+++ b/spacy/ja/language_data.py
@@ -20,4 +20,4 @@ STOP_WORDS = set(STOP_WORDS)


 # export
-__all__ = ["TAG_MAP", "STOP_WORDS"]
\ No newline at end of file
+__all__ = ["TAG_MAP", "STOP_WORDS"]

From d12ca587eababb75601078c4761e6a9d78fefecc Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:29 +0200
Subject: [PATCH 04/10] Add newline

---
 spacy/ja/stop_words.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
index b2120b30d..45bb7a4d8 100644
--- a/spacy/ja/stop_words.py
+++ b/spacy/ja/stop_words.py
@@ -6,4 +6,4 @@ from __future__ import unicode_literals
 STOP_WORDS = set("""
 。
 、
-""".split())
\ No newline at end of file
+""".split())

From 949ad6594b759ebd91da142187cbb6f675117eea Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:38:43 +0200
Subject: [PATCH 05/10] Add newline

---
 spacy/ja/tag_map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
index 2196ff397..f5b6b5040 100644
--- a/spacy/ja/tag_map.py
+++ b/spacy/ja/tag_map.py
@@ -21,4 +21,4 @@ TAG_MAP = {
     "CONJ": {POS: CONJ},
     "ADJ": {POS: ADJ},
     "VERB": {POS: VERB}
-}
\ No newline at end of file
+}

From d730eb0c0df2fb6784f7adcce479c4c9588764b9 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:43:29 +0200
Subject: [PATCH 06/10] Raise custom ImportError if importing janome fails

---
 spacy/ja/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index f9ab7b560..2915d6330 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -14,6 +14,9 @@ class Japanese(Language):
     lang = 'ja'

     def make_doc(self, text):
-        from janome.tokenizer import Tokenizer
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome")
         words = [x.surface for x in Tokenizer().tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

From 3ea23a3f4db561f800a21bed9b25ced648b826d4 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 09:44:38 +0200
Subject: [PATCH 07/10] Fix formatting

---
 spacy/ja/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index 2915d6330..07e40ada6 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -17,6 +17,7 @@ class Japanese(Language):
         try:
             from janome.tokenizer import Tokenizer
         except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome")
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
         words = [x.surface for x in Tokenizer().tokenize(text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
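A note on the formatting fix in patch 07: Python concatenates adjacent string literals at compile time, so splitting the message over two lines leaves the raised error text identical. A self-contained check:

    original = "The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome"
    reformatted = ("The Japanese tokenizer requires the Janome library: "
                   "https://github.com/mocobeta/janome")
    assert original == reformatted  # adjacent literals are joined at compile time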
From f9384b0fbd5a555d688b353f2847d4ca32242a76 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 3 May 2017 09:58:31 +0200
Subject: [PATCH 08/10] Update alpha languages and add aside for tokenizer
 dependencies

---
 website/docs/api/language-models.jade | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index 40105b85c..3bce7272f 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.

++aside("Dependencies")
+    | Some language tokenizers require external dependencies. To use #[strong Chinese],
+    | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+    | The #[strong Japanese] tokenizer requires
+    | #[+a("https://github.com/mocobeta/janome") Janome].
+
 +table([ "Language", "Source" ])
-    each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language} #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-p
-    | Chinese tokenization requires the
-    | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-    | models are coming soon.

From e2380d87891a2591790f5873ad44a028a06f8540 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 3 May 2017 10:00:04 +0200
Subject: [PATCH 09/10] Update README.rst

---
 README.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 9b8438ce8..24b0c232a 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.

 📊 **Help us improve the library!** `Take the spaCy user survey `_.
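Tying the docs changes back to the registration in patch 01: once the patched package is importable, the bundled languages can be listed from the `_languages` tuple updated in `spacy/__init__.py`. A sketch for illustration only, since `_languages` is a private detail of this codebase:

    import spacy

    # Each entry is a Language subclass carrying a `lang` code, e.g. ja.Japanese.
    print(sorted(lang.lang for lang in spacy._languages))
    # The output should now include 'ja' alongside 'zh', 'nb', 'he' and the rest.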
From 6e1fad92a1c26ddf1f73a31b7b09f2e7f7cac093 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 3 May 2017 10:01:40 +0200
Subject: [PATCH 10/10] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 69a562e48..b64dc8db3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)