Mirror of https://github.com/explosion/spaCy.git

Commit f9d7144224: Merge branch 'master' into master
CONTRIBUTORS.md
@@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Willem van Hage, [@wrvhage](https://github.com/wrvhage)
 * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker)
 * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang)
+* Yasuaki Uechi, [@uetchy](https://github.com/uetchy)
 * Yubing Dong, [@tomtung](https://github.com/tomtung)
README.rst
@@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Chinese, Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's
-commercial open-source software, released under the MIT license.
+English, German and French, as well as tokenization for Spanish, Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
+Chinese and Japanese. It's commercial open-source software, released under the
+MIT license.

 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
setup.py
@@ -37,6 +37,7 @@ PACKAGES = [
     'spacy.bn',
     'spacy.he',
     'spacy.nb',
+    'spacy.ja',
     'spacy.en.lemmatizer',
     'spacy.cli.converters',
     'spacy.language_data',
spacy/__init__.py
@@ -9,7 +9,7 @@ from .cli.info import info

 _languages_name = set(["en", "de", "es", "pt", "fr",
                        "it", "hu", "zh", "nl", "sv",
-                       "fi", "bn", "he", "nb"])
+                       "fi", "bn", "he", "nb", "ja"])


 def load(name, **overrides):
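With "ja" added to _languages_name, the new language code is recognised alongside the existing ones. A minimal sanity-check sketch (hedged: it only uses names that appear in this diff, and assumes a spaCy 1.x install that includes these changes):

    # Hedged sanity check: "ja" is now an accepted language id, and it resolves
    # to the Japanese subclass added in spacy/ja/__init__.py below.
    import spacy
    import spacy.ja

    assert "ja" in spacy._languages_name
    assert spacy.ja.Japanese.lang == 'ja'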
spacy/ja/__init__.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+from ..attrs import LANG
+from ..tokens import Doc
+
+from .language_data import *
+
+
+class Japanese(Language):
+    lang = 'ja'
+
+    def make_doc(self, text):
+        try:
+            from janome.tokenizer import Tokenizer
+        except ImportError:
+            raise ImportError("The Japanese tokenizer requires the Janome library: "
+                              "https://github.com/mocobeta/janome")
+        words = [x.surface for x in Tokenizer().tokenize(text)]
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
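A minimal usage sketch for the class above (hedged: it assumes Janome is installed, e.g. via `pip install janome`, and that the blank Japanese() class can be constructed without a trained model; the sample sentence is only illustrative):

    # Hedged usage sketch for spacy.ja.Japanese; requires `pip install janome`.
    from spacy.ja import Japanese

    nlp = Japanese()                       # tokenizer only, no statistical model
    doc = nlp.make_doc(u"日本語の文章を解析します。")  # illustrative sentence
    print([token.text for token in doc])   # surface forms from Janome, no trailing spaces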
spacy/ja/language_data.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# export
+__all__ = ["TAG_MAP", "STOP_WORDS"]
spacy/ja/stop_words.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# stop words as whitespace-separated list
+STOP_WORDS = set("""
+。
+、
+""".split())
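Since STOP_WORDS is a plain Python set, the usual membership checks and project-specific additions apply. A small hedged illustration (the added entry is hypothetical, not part of this commit):

    # Hedged illustration of the STOP_WORDS set defined above.
    from spacy.ja.stop_words import STOP_WORDS

    print("。" in STOP_WORDS)   # True: the ideographic full stop is listed
    STOP_WORDS.add("です")       # hypothetical project-specific addition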
spacy/ja/tag_map.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
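Each TAG_MAP entry maps a tag string to a dict keyed by the POS attribute id, whose value is the universal part-of-speech symbol; in this initial version the keys are simply the universal tags themselves. A small hedged lookup sketch:

    # Hedged lookup sketch: TAG_MAP comes from the file above, POS and NOUN from
    # spacy.symbols (the module star-imported there).
    from spacy.ja.tag_map import TAG_MAP
    from spacy.symbols import POS, NOUN

    assert TAG_MAP["NOUN"][POS] == NOUN  # tag "NOUN" maps to the NOUN symbol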
Website docs (Jade template)
@@ -35,14 +35,15 @@ p
     | Work has started on the following languages. You can help by improving
     | the existing language data and extending the tokenization patterns.

++aside("Dependencies")
+    | Some language tokenizers require external dependencies. To use #[strong Chinese],
+    | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed.
+    | The #[strong Japanese] tokenizer requires
+    | #[+a("https://github.com/mocobeta/janome") Janome].
+
 +table([ "Language", "Source" ])
-    each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" }
+    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
         +row
             +cell #{language} #[code=code]
             +cell
                 +src(gh("spaCy", "spacy/" + code)) spacy/#{code}
-
-p
-    | Chinese tokenization requires the
-    | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical
-    | models are coming soon.