Mirror of https://github.com/explosion/spaCy.git
Port BenDerPan's Chinese changes to v2 (finally) (#2591)

* add template files for Chinese
* add template files for Chinese, and test directory
parent f2e3e039b7
commit 66983d8412
spacy/lang/zh/__init__.py

@@ -4,12 +4,16 @@ from __future__ import unicode_literals

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS


class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS

class Chinese(Language):
    lang = 'zh'
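Not part of the diff, but for orientation: with the files above in place, the new language class can be tried directly. A minimal usage sketch, assuming a spaCy v2 checkout where this zh package is importable; spaCy's Chinese tokenizer is expected to call out to the external jieba segmenter, which is not shown in this hunk and would need to be installed separately.

# Usage sketch only (assumptions: spaCy v2 with this zh package on the path,
# and the jieba segmenter installed for Chinese word segmentation).
from spacy.lang.zh import Chinese

nlp = Chinese()                        # picks up ChineseDefaults defined above
doc = nlp(u"这是一个简单的测试")         # "This is a simple test."
print([token.text for token in doc])   # segmented tokens
print(len(nlp.Defaults.stop_words))    # stop word list loaded from stop_words.py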
spacy/lang/zh/stop_words.py (new file, 1901 lines)

File diff suppressed because it is too large.
spacy/lang/zh/tag_map.py (new file, 24 lines)

@@ -0,0 +1,24 @@

# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB}
}
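Aside (not in the commit): each TAG_MAP entry maps a coarse tag string to spaCy's corresponding POS symbol, so a tagger emitting e.g. "NOUN" resolves to the NOUN attribute value. A small lookup sketch, assuming the package layout shown in this commit makes the module importable as spacy.lang.zh.tag_map:

# Illustration only; assumes the package layout introduced by this commit.
from spacy.lang.zh.tag_map import TAG_MAP
from spacy.symbols import POS, NOUN

assert TAG_MAP["NOUN"][POS] == NOUN    # coarse tag string -> POS symbol id
print(sorted(TAG_MAP))                 # the 16 coarse tags listed above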
spacy/lang/zh/tokenizer_exceptions.py (new file, 46 lines)

@@ -0,0 +1,46 @@

# encoding: utf8
from __future__ import unicode_literals

from ..symbols import *
from ..language_data import PRON_LEMMA


TOKENIZER_EXCEPTIONS = {
    "Jan.": [
        {ORTH: "Jan.", LEMMA: "January"}
    ]
}


# exceptions mapped to a single token containing only ORTH property
# example: {"string": [{ORTH: "string"}]}
# converted using strings_to_exc() util

ORTH_ONLY = [
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z."
]
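The comment in the file spells out the intent: each ORTH_ONLY string becomes a single-token exception of the form {"string": [{ORTH: "string"}]} via the strings_to_exc() util. That helper lives in spaCy's shared language data and is not part of this diff; a rough sketch of the behaviour described above:

# Rough sketch of what strings_to_exc() is documented to do above; the real
# helper in spaCy's language_data utilities is not reproduced in this diff.
from spacy.symbols import ORTH

def strings_to_exc(orths):
    # one single-token exception per string, keyed by the raw string
    return {orth: [{ORTH: orth}] for orth in orths}

exc = strings_to_exc(["a.", "b."])
assert exc["a."] == [{ORTH: "a."}]     # matches the example in the comment above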
spacy/tests/zh/__init__.py (new file, 0 lines)
spacy/zh/language_data.py (new file, 30 lines)

@@ -0,0 +1,30 @@

# encoding: utf8
from __future__ import unicode_literals


# import base language data
from .. import language_data as base


# import util functions
from ..language_data import update_exc, strings_to_exc


# import language-specific data from files
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)


# customize tokenizer exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


# export
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
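For context (not part of the diff): the two update_exc() calls fold the generated ORTH_ONLY exceptions and the base emoticon exceptions into TOKENIZER_EXCEPTIONS in place before the module exports it. A rough sketch of that merge, under the assumption that update_exc() mutates its first argument as the call pattern above suggests (the real helper may also guard against conflicting keys):

# Sketch of the merge step; the real update_exc() lives in spaCy's shared
# language_data utilities and may perform extra validation.
def update_exc(exc, additions):
    for orth, token_attrs in additions.items():
        exc[orth] = token_attrs        # record the exception for this string

TOKENIZER_EXCEPTIONS = {"Jan.": [{"ORTH": "Jan.", "LEMMA": "January"}]}
update_exc(TOKENIZER_EXCEPTIONS, {"a.": [{"ORTH": "a."}]})   # from strings_to_exc(ORTH_ONLY)
update_exc(TOKENIZER_EXCEPTIONS, {":)": [{"ORTH": ":)"}]})   # from strings_to_exc(base.EMOTICONS)
print(sorted(TOKENIZER_EXCEPTIONS))    # [':)', 'Jan.', 'a.']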