Port BenDerPan's Chinese changes to v2 (finally) (#2591)

* add template files for Chinese

* add template files for Chinese, and test directory.
Matthew Honnibal 2018-07-25 02:47:23 +02:00 committed by GitHub
parent f2e3e039b7
commit 66983d8412
6 changed files with 2006 additions and 1 deletion

spacy/lang/zh/__init__.py

@@ -4,12 +4,16 @@ from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS


class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS


class Chinese(Language):
    lang = 'zh'
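
Not part of the diff, but as a usage sketch: once these defaults land, the Chinese class can be instantiated directly. The sketch assumes spaCy with this commit applied and the jieba package installed, since the Chinese tokenizer delegates word segmentation to jieba.

# Usage sketch (not part of this commit). Assumes jieba is installed,
# because spaCy's Chinese tokenizer delegates segmentation to it.
from spacy.lang.zh import Chinese

nlp = Chinese()
doc = nlp(u'这是一个测试。')                # "This is a test."
print([t.text for t in doc])              # jieba-segmented tokens
print(len(nlp.Defaults.stop_words) > 0)   # STOP_WORDS now ship as defaults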

1901 spacy/lang/zh/stop_words.py Normal file

File diff suppressed because it is too large
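
The 1,901-line stop-word list is not rendered here. As a rough illustration of the file's shape only (the entries below are a handful of common Chinese function words, not the committed list):

# Illustrative shape only; the committed spacy/lang/zh/stop_words.py
# defines a much larger set of Chinese stop words.
STOP_WORDS = set("""
的 了 和 是 在 我 有 不 这 也 就 都 而 及 与 着 或 一个
""".split())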

24 spacy/lang/zh/tag_map.py Normal file

@@ -0,0 +1,24 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB}
}
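
A quick sketch (not from this commit) of how the mapping is read: each fine-grained tag name maps to a dict whose POS key holds the coarse universal part-of-speech symbol from spacy.symbols.

# Sketch: resolving a fine-grained tag to its universal POS symbol.
from spacy.symbols import POS, NOUN, VERB

TAG_MAP = {
    "NOUN": {POS: NOUN},
    "VERB": {POS: VERB},
}

assert TAG_MAP["NOUN"][POS] == NOUN   # tag "NOUN" -> universal NOUN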

46 spacy/zh/tokenizer_exceptions.py Normal file

@@ -0,0 +1,46 @@
# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *
from ..language_data import PRON_LEMMA


TOKENIZER_EXCEPTIONS = {
    "Jan.": [
        {ORTH: "Jan.", LEMMA: "January"}
    ]
}


# exceptions mapped to a single token containing only ORTH property
# example: {"string": [{ORTH: "string"}]}
# converted using strings_to_exc() util
ORTH_ONLY = [
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z."
]
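
The comment above refers to strings_to_exc(); as a conceptual sketch of what that conversion and the later update_exc() merge do (stand-in helpers, not spaCy's actual implementations):

# Stand-in helpers (not spaCy's real implementations) showing how the
# ORTH_ONLY strings get folded into the exceptions dict.
ORTH = 'orth'  # placeholder for the real ORTH attribute constant

def strings_to_exc(strings):
    # each plain string becomes an exception carrying only ORTH
    return {s: [{ORTH: s}] for s in strings}

def update_exc(base, additions):
    # merge additional exceptions into the base dict in place
    base.update(additions)

exc = {"Jan.": [{ORTH: "Jan."}]}
update_exc(exc, strings_to_exc(["a.", "b.", "z."]))
print(sorted(exc))   # ['Jan.', 'a.', 'b.', 'z.']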


30 spacy/zh/language_data.py Normal file

@@ -0,0 +1,30 @@
# encoding: utf8
from __future__ import unicode_literals

# import base language data
from .. import language_data as base

# import util functions
from ..language_data import update_exc, strings_to_exc

# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


# export
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
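
For orientation, a sketch of how a v1-style package __init__ could consume these exports (illustrative only; the class below is not part of this commit):

# Illustrative consumer of the exported data (not part of this commit).
from spacy.language import Language
from spacy.zh.language_data import TOKENIZER_EXCEPTIONS, TAG_MAP, STOP_WORDS


class Chinese(Language):
    lang = 'zh'

    class Defaults(Language.Defaults):
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        tag_map = TAG_MAP
        stop_words = STOP_WORDS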