diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 69a562e48..b64dc8db3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)
diff --git a/README.rst b/README.rst
index 9b8438ce8..24b0c232a 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.
 
 📊 **Help us improve the library!** `Take the spaCy user survey `_.
 
diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 24ac28dfc..bcac2471c 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -9,7 +9,7 @@ from .cli.info import info
 
 
 _languages_name = set(["en", "de", "es", "pt", "fr", "it", "hu", "zh", "nl", "sv",
-                       "fi", "bn", "he", "nb"])
+                       "fi", "bn", "he", "nb", "ja"])
 
 
 def load(name, **overrides):
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..07e40ada6
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..007ed2b4e
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..45bb7a4d8
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..f5b6b5040
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade
index a2ad9b9eb..3bce7272f 100644
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.
 
++aside("Dependencies")
+    | Some language tokenizers require external dependencies. To use #[strong Chinese],
+    | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+    | The #[strong Japanese] tokenizer requires
+    | #[+a("https://github.com/mocobeta/janome") Janome].
+
 +table([ "Language", "Source" ])
-    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language} #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-p
-    | Chinese tokenization requires the
-    | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-    | models are coming soon.
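For reference, a minimal sketch of the tokenization path this patch adds, assuming Janome is installed (`pip install janome`). The list comprehension mirrors `Japanese.make_doc` above; the sample sentence and the printed segmentation are illustrative, not from the patch:

```python
# encoding: utf8
from __future__ import unicode_literals, print_function

from janome.tokenizer import Tokenizer

# Janome segments the raw text; each token's surface form becomes one word.
# In Japanese.make_doc these words then back a Doc with spaces=[False]*len(words),
# since Japanese is written without inter-word whitespace.
words = [token.surface for token in Tokenizer().tokenize('日本語の文章を解析します。')]
print(words)  # e.g. ['日本語', 'の', '文章', 'を', '解析', 'し', 'ます', '。']
```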