Fix Chinese language related bugs (#2634)

Xiaoquan Kong 2018-08-07 17:26:31 +08:00 committed by Matthew Honnibal
parent 664cfc29bc
commit 87fa847e6e
4 changed files with 6 additions and 33 deletions

spacy/lang/zh/__init__.py

@@ -6,6 +6,9 @@ from ...language import Language
 from ...tokens import Doc
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class ChineseDefaults(Language.Defaults):
@@ -15,6 +18,7 @@ class ChineseDefaults(Language.Defaults):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)


 class Chinese(Language):
     lang = 'zh'
     Defaults = ChineseDefaults  # override defaults
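
With the three new imports and the tokenizer_exceptions attribute in place, the zh package imports cleanly again. A minimal smoke test, assuming spaCy 2.x of this commit's era with the jieba package installed (the sentence and snippet are illustrative, not part of the commit):

# Minimal check that the fixed module now loads and tokenizes.
# spaCy 2.x delegates zh word segmentation to jieba inside make_doc().
from spacy.lang.zh import Chinese   # this import is what the bad paths broke

nlp = Chinese()
doc = nlp(u'我爱北京天安门')   # 'I love Beijing Tiananmen'; jieba segments it
print([token.text for token in doc])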

spacy/lang/zh/tag_map.py

@@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
+from ...symbols import *


 TAG_MAP = {
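
The added dot matters because tag_map.py lives in spacy/lang/zh/, three package levels below spacy/symbols; the old two-level form pointed at a spacy/lang/symbols module that does not exist. For orientation, a spaCy tag map pairs each treebank tag with token attributes; a short sketch with hypothetical entries (not the ones in the zh file, which this hunk does not show):

# Sketch of the TAG_MAP structure; the coarse POS values come from
# spacy.symbols, which is why the import depth had to be fixed.
from spacy.symbols import POS, NOUN, VERB

TAG_MAP = {
    'NN': {POS: NOUN},   # hypothetical entry: common noun
    'VV': {POS: VERB},   # hypothetical entry: verb
}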

spacy/lang/zh/tokenizer_exceptions.py

@@ -1,8 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from ..symbols import *
-from ..language_data import PRON_LEMMA
+from ...symbols import *


 TOKENIZER_EXCEPTIONS = {
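
Besides deepening the symbols import, this drops from ..language_data import PRON_LEMMA, a leftover from spaCy 1.x whose language_data module no longer exists. For orientation, a spaCy 2.x tokenizer exception maps a surface string to the list of tokens it should split into; an illustrative English entry (the zh file's own entries are not shown in this hunk):

# Illustrative spaCy 2.x exception entry: one dict per resulting token.
from spacy.symbols import ORTH, LEMMA

TOKENIZER_EXCEPTIONS = {
    "don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}],
}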

spacy/lang/zh/language_data.py (deleted)

@@ -1,30 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-# import base language data
-from .. import language_data as base
-
-# import util functions
-from ..language_data import update_exc, strings_to_exc
-
-# import language-specific data from files
-from .tag_map import TAG_MAP
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
-
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-TAG_MAP = dict(TAG_MAP)
-STOP_WORDS = set(STOP_WORDS)
-
-# customize tokenizer exceptions
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-# export
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
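
This deleted file was the spaCy 1.x-style aggregator for the Chinese data; its merging job is now done by spacy.util.update_exc, called from the package __init__.py shown above. A simplified sketch of that merge, assuming the 2.x helper (the real function also validates that each entry's ORTH pieces re-join to the exception key):

# Simplified sketch of spacy.util.update_exc: start from the shared base
# exceptions and let language-specific dicts override or extend them.
def update_exc(base_exceptions, *addition_dicts):
    exc = dict(base_exceptions)   # copy so the shared base stays untouched
    for additions in addition_dicts:
        exc.update(additions)     # later dicts take precedence
    return exc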