From 87fa847e6e93c677775766dc46b226124b8e1388 Mon Sep 17 00:00:00 2001 From: Xiaoquan Kong Date: Tue, 7 Aug 2018 17:26:31 +0800 Subject: [PATCH] Fix Chinese language related bugs (#2634) --- spacy/lang/zh/__init__.py | 4 ++++ spacy/lang/zh/tag_map.py | 2 +- spacy/lang/zh/tokenizer_exceptions.py | 3 +-- spacy/zh/language_data.py | 30 --------------------------- 4 files changed, 6 insertions(+), 33 deletions(-) delete mode 100644 spacy/zh/language_data.py diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 399925a8a..04b9dcab3 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -6,6 +6,9 @@ from ...language import Language from ...tokens import Doc from .tag_map import TAG_MAP from .stop_words import STOP_WORDS +from ...util import update_exc +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS class ChineseDefaults(Language.Defaults): @@ -15,6 +18,7 @@ class ChineseDefaults(Language.Defaults): tag_map = TAG_MAP stop_words = STOP_WORDS + class Chinese(Language): lang = 'zh' Defaults = ChineseDefaults # override defaults diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index f5b6b5040..33829f27f 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * +from ...symbols import * TAG_MAP = { diff --git a/spacy/lang/zh/tokenizer_exceptions.py b/spacy/lang/zh/tokenizer_exceptions.py index ba592e84c..26a3ea908 100644 --- a/spacy/lang/zh/tokenizer_exceptions.py +++ b/spacy/lang/zh/tokenizer_exceptions.py @@ -1,8 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA +from ...symbols import * TOKENIZER_EXCEPTIONS = { diff --git a/spacy/zh/language_data.py b/spacy/zh/language_data.py deleted file mode 100644 index 61f3e6b61..000000000 --- a/spacy/zh/language_data.py +++ /dev/null @@ -1,30 +0,0 @@ -# encoding: utf8 -from __future__ import unicode_literals - - -# import base language data -from .. import language_data as base - - -# import util functions -from ..language_data import update_exc, strings_to_exc - - -# import language-specific data from files -from .tag_map import TAG_MAP -from .stop_words import STOP_WORDS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) -TAG_MAP = dict(TAG_MAP) -STOP_WORDS = set(STOP_WORDS) - - -# customize tokenizer exceptions -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) - - -# export -__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]