Add basic Japanese support

This commit is contained in:
Yasuaki Uechi 2017-05-03 13:56:21 +09:00
parent f26a3b5a50
commit c8f83aeb87
6 changed files with 79 additions and 3 deletions

View File

@ -36,7 +36,8 @@ PACKAGES = [
'spacy.fi',
'spacy.bn',
'spacy.he',
'spacy.nb',
'spacy.nb',
'spacy.ja',
'spacy.en.lemmatizer',
'spacy.cli.converters',
'spacy.language_data',

View File

@ -5,12 +5,12 @@ from . import util
from .deprecated import resolve_model_name
from .cli.info import info
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian)
fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)
for _lang in _languages:

19
spacy/ja/__init__.py Normal file
View File

@ -0,0 +1,19 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from ..attrs import LANG
from ..tokens import Doc
from .language_data import *
class Japanese(Language):
    """Japanese language class that segments text with Janome.

    Overrides ``make_doc`` so that the words of a document come from
    ``janome.tokenizer.Tokenizer`` rather than from the inherited
    implementation.
    """
    lang = 'ja'

    def make_doc(self, text):
        """Segment ``text`` with Janome and wrap the result in a ``Doc``.

        text: The string to segment.
        RETURNS: A ``Doc`` built on ``self.vocab`` with one token per Janome
            token surface; every token is flagged as not followed by a
            space.
        RAISES: ImportError, with installation guidance, when the optional
            Janome dependency is not installed.
        """
        # Janome is imported lazily so that importing this module does not
        # require it; fail with an actionable message if it is missing.
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError(
                "The Japanese tokenizer requires the Janome library: "
                "https://github.com/mocobeta/janome")
        # Build the Janome tokenizer once per language object and reuse it,
        # instead of constructing a fresh one for every document.
        tokenizer = getattr(self, '_janome_tokenizer', None)
        if tokenizer is None:
            tokenizer = self._janome_tokenizer = Tokenizer()
        words = [token.surface for token in tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

23
spacy/ja/language_data.py Normal file
View File

@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals
# import base language data
from .. import language_data as base
# import util functions
from ..language_data import update_exc, strings_to_exc
# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
# Rebind both names to fresh shallow copies, so adding or removing entries
# through this module does not alter the containers defined in the
# tag_map / stop_words modules.
STOP_WORDS = set(STOP_WORDS)
TAG_MAP = dict(TAG_MAP)

# Names exposed by `from .language_data import *`.
__all__ = ["TAG_MAP", "STOP_WORDS"]

9
spacy/ja/stop_words.py Normal file
View File

@ -0,0 +1,9 @@
# encoding: utf8
from __future__ import unicode_literals
# Stop words live in the literal below, separated by any whitespace.
# The literal currently holds no entries, so the resulting set is empty.
_STOP_WORDS_TEXT = """
"""
STOP_WORDS = set(_STOP_WORDS_TEXT.split())

24
spacy/ja/tag_map.py Normal file
View File

@ -0,0 +1,24 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
# (tag name, symbol) pairs, in the order the entries are added to TAG_MAP.
# Every tag name is paired with the symbol of the same name.
_TAG_SYMBOL_PAIRS = (
    ("ADV", ADV),
    ("NOUN", NOUN),
    ("ADP", ADP),
    ("PRON", PRON),
    ("SCONJ", SCONJ),
    ("PROPN", PROPN),
    ("DET", DET),
    ("SYM", SYM),
    ("INTJ", INTJ),
    ("PUNCT", PUNCT),
    ("NUM", NUM),
    ("AUX", AUX),
    ("X", X),
    ("CONJ", CONJ),
    ("ADJ", ADJ),
    ("VERB", VERB),
)

# Maps each tag name to a fresh single-entry dict {POS: <symbol>}.
TAG_MAP = {tag: {POS: symbol} for tag, symbol in _TAG_SYMBOL_PAIRS}