Merge branch 'master' into master

Grégory Howard 2017-05-03 11:04:51 +02:00 committed by GitHub
commit f9d7144224
9 changed files with 94 additions and 11 deletions

CONTRIBUTORS.md

@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)

README.rst

@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.
 
 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.

setup.py

@@ -36,7 +36,8 @@ PACKAGES = [
     'spacy.fi',
     'spacy.bn',
     'spacy.he',
     'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',

spacy/__init__.py

@@ -9,7 +9,7 @@
 from .cli.info import info
 
 _languages_name = set(["en", "de", "es", "pt", "fr",
                        "it", "hu", "zh", "nl", "sv",
-                       "fi", "bn", "he", "nb"])
+                       "fi", "bn", "he", "nb", "ja"])
 
 def load(name, **overrides):
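
A hedged usage sketch, not part of this commit: with "ja" in the set above, spacy.load accepts the Japanese language code. This assumes the spaCy 1.x API shown in this diff, and that loading works without model data (no statistical model ships for Japanese at this point, so only tokenization is expected).

    import spacy

    # 'ja' now resolves to the Japanese class added below; only the
    # Janome-backed tokenizer is available, no tagger or parser.
    nlp = spacy.load('ja')
    doc = nlp.make_doc(u'これは日本語の文章です。')
    print([token.text for token in doc])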

spacy/ja/__init__.py (new file)

@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
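
For reference, a minimal sketch of what make_doc gets from Janome (assumes `pip install janome`): each Janome token exposes its surface form, which becomes one spaCy token.

    from janome.tokenizer import Tokenizer

    # Janome segments the text; .surface is the raw token string.
    tokens = Tokenizer().tokenize(u'すもももももももものうち')
    print([t.surface for t in tokens])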

spacy/ja/language_data.py (new file)

@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# import base language data
+from .. import language_data as base
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]

spacy/ja/stop_words.py (new file)

@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+""".split())

spacy/ja/tag_map.py (new file)

@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
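
A quick sanity check of the table above, as a hedged sketch: the keys are Universal POS tag strings, and each entry just maps the POS attribute to the matching coarse tag symbol.

    from spacy.ja.tag_map import TAG_MAP
    from spacy.symbols import POS, NOUN

    # Every entry maps a Universal tag string to its own coarse POS.
    assert TAG_MAP["NOUN"][POS] == NOUN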

website/docs/api/language-models.jade

@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.
 
+    +aside("Dependencies")
+        | Some language tokenizers require external dependencies. To use #[strong Chinese],
+        | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+        | The #[strong Japanese] tokenizer requires
+        | #[+a("https://github.com/mocobeta/janome") Janome].
+
     +table([ "Language", "Source" ])
-        each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+        each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
             +row
                 +cell #{language} #[code=code]
                 +cell
                     +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-    p
-        | Chinese tokenization requires the
-        | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-        | models are coming soon.
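
To make the dependency note concrete, a minimal sketch of Chinese word segmentation with Jieba (assumes `pip install jieba`; this mirrors what the Janome-backed make_doc above does for Japanese):

    import jieba

    # jieba.cut returns a generator of segmented words.
    words = list(jieba.cut(u'我爱自然语言处理'))
    print(words)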