diff --git a/setup.py b/setup.py
index 1f13747dc..52ce06843 100755
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
-    'spacy.nb',
+    'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
diff --git a/spacy/__init__.py b/spacy/__init__.py
index f71d3addd..f5912e13e 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -5,12 +5,12 @@ from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info
 
-from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
 
 
 _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
               it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
-              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
+              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
 
 
 for _lang in _languages:
diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
new file mode 100644
index 000000000..07e40ada6
--- /dev/null
+++ b/spacy/ja/__init__.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py
new file mode 100644
index 000000000..007ed2b4e
--- /dev/null
+++ b/spacy/ja/language_data.py
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py
new file mode 100644
index 000000000..45bb7a4d8
--- /dev/null
+++ b/spacy/ja/stop_words.py
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py
new file mode 100644
index 000000000..f5b6b5040
--- /dev/null
+++ b/spacy/ja/tag_map.py
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
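Note on `make_doc` above: all segmentation is delegated to Janome, and every token is created with `spaces=False`, since Japanese text is written without word separators. Below is a minimal sketch of the Janome call the method wraps (an illustration, not part of this diff; it assumes `pip install janome` with its bundled default dictionary, and the sample sentence and its segmentation are only examples):

```python
# Sketch of what Japanese.make_doc relies on (illustrative only).
from janome.tokenizer import Tokenizer

tokenizer = Tokenizer()
# Token.surface is the raw text of each token as segmented by Janome.
words = [token.surface for token in tokenizer.tokenize("日本語の文章です。")]
print(words)  # e.g. ['日本語', 'の', '文章', 'です', '。']
```
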
+table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell
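For completeness, end-to-end usage of the new language class would look roughly like this (a minimal sketch; it assumes this branch is installed together with Janome, and that `Japanese()` can be instantiated without bundled model data, as the other small language classes in this tree can):

```python
# Illustrative usage of the Japanese stub added above (not part of this diff).
from spacy.ja import Japanese

nlp = Japanese()
doc = nlp.make_doc("日本語の文章をトークン化します。")
print([token.text for token in doc])
```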