Fix Chinese language related bugs (#2634)

This commit is contained in:
Xiaoquan Kong 2018-08-07 17:26:31 +08:00 committed by Matthew Honnibal
parent 664cfc29bc
commit 87fa847e6e
4 changed files with 6 additions and 33 deletions

View File

@ -6,6 +6,9 @@ from ...language import Language
from ...tokens import Doc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ChineseDefaults(Language.Defaults):
@ -15,6 +18,7 @@ class ChineseDefaults(Language.Defaults):
tag_map = TAG_MAP
stop_words = STOP_WORDS
class Chinese(Language):
    lang = 'zh'
    Defaults = ChineseDefaults  # override defaults

View File

@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import *
TAG_MAP = {

View File

@ -1,8 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {

View File

@ -1,30 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals
# import base language data
from .. import language_data as base
# import util functions
from ..language_data import update_exc, strings_to_exc
# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
# export
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]