Port BenDerPan's Chinese changes to v2 (finally) (#2591)
* add template files for Chinese
* add template files for Chinese, and test directory
parent f2e3e039b7
commit 66983d8412
spacy/lang/zh/__init__.py
@@ -4,12 +4,16 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS


 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS


 class Chinese(Language):
     lang = 'zh'
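The hunk above wires the new tag map, stop words and tokenizer exceptions into ChineseDefaults. A rough usage sketch (not part of the commit; it assumes a spaCy v2 install plus the third-party jieba package, which spaCy's Chinese support relied on for word segmentation at the time):

    from spacy.lang.zh import Chinese

    # hypothetical smoke test for the class patched above
    nlp = Chinese()                         # picks up ChineseDefaults
    doc = nlp(u'这是一个测试')               # actual segmentation needs jieba
    print([token.text for token in doc])
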
spacy/lang/zh/stop_words.py (1901 lines)
File diff suppressed because it is too large.

spacy/lang/zh/tag_map.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+
+
+TAG_MAP = {
+    "ADV": {POS: ADV},
+    "NOUN": {POS: NOUN},
+    "ADP": {POS: ADP},
+    "PRON": {POS: PRON},
+    "SCONJ": {POS: SCONJ},
+    "PROPN": {POS: PROPN},
+    "DET": {POS: DET},
+    "SYM": {POS: SYM},
+    "INTJ": {POS: INTJ},
+    "PUNCT": {POS: PUNCT},
+    "NUM": {POS: NUM},
+    "AUX": {POS: AUX},
+    "X": {POS: X},
+    "CONJ": {POS: CONJ},
+    "ADJ": {POS: ADJ},
+    "VERB": {POS: VERB}
+}
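Each key in TAG_MAP is a tag string the tagger can emit; the value assigns the matching universal part-of-speech symbol to the token. A minimal lookup sketch (hypothetical, assuming the module lands at spacy/lang/zh/tag_map.py as in this commit):

    from spacy.symbols import POS, NOUN
    from spacy.lang.zh.tag_map import TAG_MAP

    # every entry in this tag map carries only the POS attribute
    assert TAG_MAP["NOUN"][POS] == NOUN
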
spacy/lang/zh/tokenizer_exceptions.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+from ..language_data import PRON_LEMMA
+
+
+TOKENIZER_EXCEPTIONS = {
+    "Jan.": [
+        {ORTH: "Jan.", LEMMA: "January"}
+    ]
+}
+
+
+# exceptions mapped to a single token containing only ORTH property
+# example: {"string": [{ORTH: "string"}]}
+# converted using strings_to_exc() util
+
+ORTH_ONLY = [
+    "a.",
+    "b.",
+    "c.",
+    "d.",
+    "e.",
+    "f.",
+    "g.",
+    "h.",
+    "i.",
+    "j.",
+    "k.",
+    "l.",
+    "m.",
+    "n.",
+    "o.",
+    "p.",
+    "q.",
+    "r.",
+    "s.",
+    "t.",
+    "u.",
+    "v.",
+    "w.",
+    "x.",
+    "y.",
+    "z."
+]
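As the comments in the file note, each ORTH_ONLY string is later converted into a single-token exception that sets only ORTH. A sketch of what that conversion amounts to (an illustration of the mapping described in the comment, not spaCy's own strings_to_exc() implementation):

    from spacy.symbols import ORTH

    # "a." -> {"a.": [{ORTH: "a."}]}, ready to be merged into TOKENIZER_EXCEPTIONS
    orth_only_exc = {s: [{ORTH: s}] for s in ["a.", "b.", "z."]}
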
spacy/tests/zh/__init__.py (new empty file, 0 lines)

spacy/zh/language_data.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# import base language data
+from .. import language_data as base
+
+
+# import util functions
+from ..language_data import update_exc, strings_to_exc
+
+
+# import language-specific data from files
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
+
+
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+TAG_MAP = dict(TAG_MAP)
+STOP_WORDS = set(STOP_WORDS)
+
+
+# customize tokenizer exceptions
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+
+
+# export
+__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
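language_data.py copies the raw data, folds the ORTH_ONLY strings and the shared emoticon list into TOKENIZER_EXCEPTIONS in place, and exports three names. A self-contained sketch of what the two update_exc() calls effectively do, assuming update_exc() merges the extra exception dicts into its first argument (plain string keys stand in for the symbol constants so the snippet runs without spaCy):

    # stand-ins for the real data; entries are illustrative only
    tokenizer_exceptions = {"Jan.": [{"orth": "Jan.", "lemma": "January"}]}
    orth_only = ["a.", "b."]
    emoticons = [":)", ":("]

    # roughly what update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(...)) does
    for strings in (orth_only, emoticons):
        tokenizer_exceptions.update({s: [{"orth": s}] for s in strings})

    print(sorted(tokenizer_exceptions))  # [':(', ':)', 'Jan.', 'a.', 'b.']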