From ada4fc0f09189eb32935fd0d17cb8b78e8ed51b7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sat, 14 Nov 2020 09:20:42 +0100 Subject: [PATCH] Update v2.2.x for bugfix release (#6384) * Fix on_match callback and remove empty patterns (#6312) For the `DependencyMatcher`: * Fix on_match callback so that it is called once per matched pattern * Fix results so that patterns with empty match lists are not returned * Add --prefer-binary for python 3.5 * Add version pins for pyrsistent * Use backwards-compatible super() * Try to fix tests on Travis (2.7) * Fix naming conflict and formatting * Update pkuseg version in Chinese tokenizer warnings * Some changes for Armenian (#5616) * Fixing numerals * We need an Armenian question sign to make the sentence a question * Update lex_attrs.py (#5608) * Fix compat * Update Armenian from v2.3.x Co-authored-by: Ines Montani Co-authored-by: Karen Hambardzumyan Co-authored-by: Marat M. Yavrumyan --- azure-pipelines.yml | 6 +- requirements.txt | 1 + spacy/errors.py | 4 +- spacy/lang/hy/__init__.py | 5 +- spacy/lang/hy/examples.py | 4 +- spacy/lang/hy/lex_attrs.py | 26 +- spacy/lang/hy/stop_words.py | 4 +- spacy/lang/hy/tag_map.py | 431 +++++++---------------- spacy/lang/pl/lemmatizer.py | 2 +- spacy/lang/zh/__init__.py | 34 +- spacy/matcher/dependencymatcher.pyx | 12 +- spacy/tests/lang/hy/test_text.py | 1 + spacy/tests/lang/hy/test_tokenizer.py | 1 + spacy/tests/matcher/test_matcher_api.py | 59 ++-- spacy/tests/regression/test_issue5152.py | 2 + 15 files changed, 209 insertions(+), 383 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 147d2e903..54489615c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -80,8 +80,8 @@ jobs: architecture: 'x64' - script: | - python -m pip install -U setuptools - pip install -r requirements.txt + python -m pip install -U pip setuptools + pip install -r requirements.txt --prefer-binary displayName: 'Install dependencies' - script: | @@ -96,7 +96,7 @@ jobs: - bash: | 
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - pip install dist/$SDIST + pip install dist/$SDIST --prefer-binary displayName: 'Install from sdist' - script: python -m pytest --pyargs spacy diff --git a/requirements.txt b/requirements.txt index ec30efc16..f76a8db10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0 pathlib==1.0.1; python_version < "3.4" tqdm>=4.38.0,<5.0.0 # Optional dependencies +pyrsistent<0.17.0 jsonschema>=2.6.0,<3.1.0 # Development dependencies cython>=0.25 diff --git a/spacy/errors.py b/spacy/errors.py index d99c96922..f6b457345 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -7,8 +7,8 @@ def add_codes(err_cls): class ErrorsWithCodes(err_cls): def __getattribute__(self, code): - msg = super().__getattribute__(code) - if code.startswith('__'): # python system attributes like __class__ + msg = super(ErrorsWithCodes, self).__getattribute__(code) + if code.startswith("__"): # python system attributes like __class__ return msg else: return "[{code}] {msg}".format(code=code, msg=msg) diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py index 3320edb6c..6aaa965bb 100644 --- a/spacy/lang/hy/__init__.py +++ b/spacy/lang/hy/__init__.py @@ -1,11 +1,12 @@ +# coding: utf8 +from __future__ import unicode_literals + from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP - from ...attrs import LANG from ...language import Language -from ...tokens import Doc class ArmenianDefaults(Language.Defaults): diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index b0df31aae..8a00fd243 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,6 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. >>> from spacy.lang.hy.examples import sentences @@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models. 
sentences = [ "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", "Ո՞վ է Ֆրանսիայի նախագահը։", - "Որն է Միացյալ Նահանգների մայրաքաղաքը։", + "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։", "Ե՞րբ է ծնվել Բարաք Օբաման։", ] diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 7c1b9592f..dea3c0e97 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,11 +1,12 @@ +# coding: utf8 from __future__ import unicode_literals from ...attrs import LIKE_NUM _num_words = [ - "զրօ", - "մէկ", + "զրո", + "մեկ", "երկու", "երեք", "չորս", @@ -17,20 +18,21 @@ _num_words = [ "տասը", "տասնմեկ", "տասներկու", - "տասն­երեք", - "տասն­չորս", - "տասն­հինգ", - "տասն­վեց", - "տասն­յոթ", - "տասն­ութ", - "տասն­ինը", - "քսան" "երեսուն", + "տասներեք", + "տասնչորս", + "տասնհինգ", + "տասնվեց", + "տասնյոթ", + "տասնութ", + "տասնինը", + "քսան", + "երեսուն", "քառասուն", "հիսուն", - "վաթցսուն", + "վաթսուն", "յոթանասուն", "ութսուն", - "ինիսուն", + "իննսուն", "հարյուր", "հազար", "միլիոն", diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index c671956a4..d75aad6e2 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,6 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals - STOP_WORDS = set( """ նա @@ -105,6 +105,6 @@ STOP_WORDS = set( յուրաքանչյուր այս մեջ -թ +թ """.split() ) diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py index 90690c22e..4d5b6e918 100644 --- a/spacy/lang/hy/tag_map.py +++ b/spacy/lang/hy/tag_map.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import POS, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN +from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ TAG_MAP = { @@ -27,7 +27,7 @@ TAG_MAP = { POS: ADP, "AdpType": "Post", "Number": "Plur", - "Person": "3", + "Person": "three", }, "ADP_AdpType=Post": {POS: ADP, "AdpType": "Post"}, "ADP_AdpType=Prep": {POS: ADP, 
"AdpType": "Prep"}, @@ -35,12 +35,11 @@ TAG_MAP = { "ADV_Degree=Cmp": {POS: ADV, "Degree": "Cmp"}, "ADV_Degree=Pos": {POS: ADV, "Degree": "Pos"}, "ADV_Degree=Sup": {POS: ADV, "Degree": "Sup"}, - "ADV_Distance=Dist|PronType=Dem": {POS: ADV, "Distance": "Dist", "PronType": "Dem"}, - "ADV_Distance=Dist|PronType=Exc": {POS: ADV, "Distance": "Dist", "PronType": "Exc"}, - "ADV_Distance=Med|PronType=Dem": {POS: ADV, "Distance": "Med", "PronType": "Dem"}, + "ADV_Distance=Dist|PronType=Dem": {POS: ADV, "PronType": "Dem"}, + "ADV_Distance=Dist|PronType=Exc": {POS: ADV, "PronType": "Exc"}, + "ADV_Distance=Med|PronType=Dem": {POS: ADV, "PronType": "Dem"}, "ADV_Distance=Med|PronType=Dem|Style=Coll": { POS: ADV, - "Distance": "Med", "PronType": "Dem", "Style": "Coll", }, @@ -63,7 +62,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -73,7 +72,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -83,7 +82,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Imp", "VerbForm": "Fin", @@ -93,7 +92,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -103,7 +102,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Imp", "VerbForm": "Fin", @@ -113,7 +112,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -123,7 +122,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Imp", "VerbForm": "Fin", @@ -133,7 +132,7 @@ 
TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -143,7 +142,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Imp", "VerbForm": "Fin", @@ -153,7 +152,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -163,7 +162,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -173,7 +172,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -183,7 +182,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Imp", "VerbForm": "Fin", @@ -193,7 +192,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Tense": "Pres", "VerbForm": "Fin", @@ -203,7 +202,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Imp", "VerbForm": "Fin", @@ -213,7 +212,7 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Tense": "Pres", "VerbForm": "Fin", @@ -227,7 +226,6 @@ TAG_MAP = { "DET_Case=Gen|Distance=Med|Number=Plur|Poss=Yes|PronType=Dem": { POS: DET, "Case": "Gen", - "Distance": "Med", "Number": "Plur", "Poss": "Yes", "PronType": "Dem", @@ -235,7 +233,6 @@ TAG_MAP = { "DET_Case=Gen|Distance=Med|Number=Sing|Poss=Yes|PronType=Dem": { POS: DET, "Case": "Gen", - "Distance": "Med", "Number": "Sing", "Poss": "Yes", "PronType": "Dem", @@ -244,7 +241,7 @@ TAG_MAP = { POS: DET, "Case": "Gen", "Number": 
"Plur", - "Person": "1", + "Person": "one", "Poss": "Yes", "PronType": "Prs", }, @@ -252,8 +249,7 @@ TAG_MAP = { POS: DET, "Case": "Gen", "Number": "Plur", - "Person": "2", - "Polite": "Infm", + "Person": "two", "Poss": "Yes", "PronType": "Prs", }, @@ -261,24 +257,22 @@ TAG_MAP = { POS: DET, "Case": "Gen", "Number": "Plur", - "Person": "3", + "Person": "three", "Poss": "Yes", - "PronType": "Emp", }, "DET_Case=Gen|Number=Plur|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { POS: DET, "Case": "Gen", "Number": "Plur", - "Person": "3", + "Person": "three", "Poss": "Yes", - "PronType": "Emp", "Reflex": "Yes", }, "DET_Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs": { POS: DET, "Case": "Gen", "Number": "Sing", - "Person": "1", + "Person": "one", "Poss": "Yes", "PronType": "Prs", }, @@ -286,8 +280,7 @@ TAG_MAP = { POS: DET, "Case": "Gen", "Number": "Sing", - "Person": "2", - "Polite": "Infm", + "Person": "two", "Poss": "Yes", "PronType": "Prs", }, @@ -295,24 +288,22 @@ TAG_MAP = { POS: DET, "Case": "Gen", "Number": "Sing", - "Person": "3", + "Person": "three", "Poss": "Yes", - "PronType": "Emp", }, "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Emp|Reflex=Yes": { POS: DET, "Case": "Gen", "Number": "Sing", - "Person": "3", + "Person": "three", "Poss": "Yes", - "PronType": "Emp", "Reflex": "Yes", }, "DET_Case=Gen|Number=Sing|Person=3|Poss=Yes|PronType=Prs": { POS: DET, "Case": "Gen", "Number": "Sing", - "Person": "3", + "Person": "three", "Poss": "Yes", "PronType": "Prs", }, @@ -323,30 +314,26 @@ TAG_MAP = { "Poss": "Yes", "PronType": "Rel", }, - "DET_Distance=Dist|PronType=Dem": {POS: DET, "Distance": "Dist", "PronType": "Dem"}, + "DET_Distance=Dist|PronType=Dem": {POS: DET, "PronType": "Dem"}, "DET_Distance=Dist|PronType=Dem|Style=Coll": { POS: DET, - "Distance": "Dist", "PronType": "Dem", "Style": "Coll", }, "DET_Distance=Dist|PronType=Dem|Style=Vrnc": { POS: DET, - "Distance": "Dist", "PronType": "Dem", "Style": "Vrnc", }, - "DET_Distance=Med|PronType=Dem": 
{POS: DET, "Distance": "Med", "PronType": "Dem"}, + "DET_Distance=Med|PronType=Dem": {POS: DET, "PronType": "Dem"}, "DET_Distance=Med|PronType=Dem|Style=Coll": { POS: DET, - "Distance": "Med", "PronType": "Dem", "Style": "Coll", }, - "DET_Distance=Prox|PronType=Dem": {POS: DET, "Distance": "Prox", "PronType": "Dem"}, + "DET_Distance=Prox|PronType=Dem": {POS: DET, "PronType": "Dem"}, "DET_Distance=Prox|PronType=Dem|Style=Coll": { POS: DET, - "Distance": "Prox", "PronType": "Dem", "Style": "Coll", }, @@ -386,7 +373,6 @@ TAG_MAP = { "Case": "Abl", "Definite": "Ind", "Number": "Plur", - "Style": "Slng", }, "NOUN_Animacy=Hum|Case=Abl|Definite=Ind|Number=Sing": { POS: NOUN, @@ -415,14 +401,12 @@ TAG_MAP = { "Case": "Dat", "Definite": "Def", "Number": "Sing", - "Style": "Slng", }, "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Assoc": { POS: NOUN, "Animacy": "Hum", "Case": "Dat", "Definite": "Ind", - "Number": "Assoc", }, "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Plur": { POS: NOUN, @@ -445,7 +429,6 @@ TAG_MAP = { "Case": "Dat", "Definite": "Ind", "Number": "Plur", - "Style": "Slng", }, "NOUN_Animacy=Hum|Case=Dat|Definite=Ind|Number=Sing": { POS: NOUN, @@ -468,7 +451,7 @@ TAG_MAP = { "Case": "Dat", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Hum|Case=Dat|Number=Sing|Number=Sing|Person=1|Style=Coll": { POS: NOUN, @@ -476,7 +459,7 @@ TAG_MAP = { "Case": "Dat", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", "Style": "Coll", }, "NOUN_Animacy=Hum|Case=Ins|Definite=Ind|Number=Sing": { @@ -499,7 +482,6 @@ TAG_MAP = { "Case": "Nom", "Definite": "Def", "Number": "Plur", - "Style": "Slng", }, "NOUN_Animacy=Hum|Case=Nom|Definite=Def|Number=Sing": { POS: NOUN, @@ -521,7 +503,6 @@ TAG_MAP = { "Animacy": "Hum", "Case": "Nom", "Definite": "Ind", - "Number": "Assoc", }, "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur": { POS: NOUN, @@ -544,7 +525,6 @@ TAG_MAP = { "Case": "Nom", "Definite": "Ind", "Number": 
"Plur", - "Style": "Slng", }, "NOUN_Animacy=Hum|Case=Nom|Definite=Ind|Number=Plur|Typo=Yes": { POS: NOUN, @@ -575,14 +555,13 @@ TAG_MAP = { "Case": "Nom", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Coll": { POS: NOUN, "Animacy": "Nhum", "Case": "Abl", "Definite": "Ind", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Abl|Definite=Ind|Number=Plur": { POS: NOUN, @@ -612,14 +591,13 @@ TAG_MAP = { "Case": "Abl", "Number": "Sing", "Number": "Sing", - "Person": "2", + "Person": "two", }, "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Coll": { POS: NOUN, "Animacy": "Nhum", "Case": "Dat", "Definite": "Def", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Dat|Definite=Def|Number=Plur": { POS: NOUN, @@ -672,7 +650,6 @@ TAG_MAP = { "Animacy": "Nhum", "Case": "Dat", "Definite": "Ind", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Dat|Definite=Ind|Number=Plur": { POS: NOUN, @@ -716,9 +693,9 @@ TAG_MAP = { POS: NOUN, "Animacy": "Nhum", "Case": "Dat", - "Number": "Coll", + # "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=1": { POS: NOUN, @@ -726,7 +703,7 @@ TAG_MAP = { "Case": "Dat", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Nhum|Case=Dat|Number=Sing|Number=Sing|Person=2": { POS: NOUN, @@ -734,7 +711,7 @@ TAG_MAP = { "Case": "Dat", "Number": "Sing", "Number": "Sing", - "Person": "2", + "Person": "two", }, "NOUN_Animacy=Nhum|Case=Gen|Definite=Ind|Number=Sing|Style=Arch": { POS: NOUN, @@ -749,7 +726,6 @@ TAG_MAP = { "Animacy": "Nhum", "Case": "Ins", "Definite": "Ind", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Ins|Definite=Ind|Number=Plur": { POS: NOUN, @@ -779,7 +755,7 @@ TAG_MAP = { "Case": "Ins", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Nhum|Case=Loc|Definite=Ind|Number=Plur": { POS: NOUN, @@ -801,21 +777,20 @@ TAG_MAP = { "Case": 
"Loc", "Number": "Sing", "Number": "Sing", - "Person": "2", + "Person": "two", }, "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Coll": { POS: NOUN, "Animacy": "Nhum", "Case": "Nom", "Definite": "Def", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Nom|Definite=Def|Number=Plur|Number=Sing|Poss=Yes": { POS: NOUN, "Animacy": "Nhum", "Case": "Nom", "Definite": "Def", - "Number": "Plur", + # "Number": "Plur", "Number": "Sing", "Poss": "Yes", }, @@ -846,14 +821,12 @@ TAG_MAP = { "Animacy": "Nhum", "Case": "Nom", "Definite": "Ind", - "Number": "Coll", }, "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Coll|Typo=Yes": { POS: NOUN, "Animacy": "Nhum", "Case": "Nom", "Definite": "Ind", - "Number": "Coll", "Typo": "Yes", }, "NOUN_Animacy=Nhum|Case=Nom|Definite=Ind|Number=Plur": { @@ -880,9 +853,9 @@ TAG_MAP = { POS: NOUN, "Animacy": "Nhum", "Case": "Nom", - "Number": "Plur", + # "Number": "Plur", "Number": "Sing", - "Person": "2", + "Person": "two", }, "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=1": { POS: NOUN, @@ -890,7 +863,7 @@ TAG_MAP = { "Case": "Nom", "Number": "Sing", "Number": "Sing", - "Person": "1", + "Person": "one", }, "NOUN_Animacy=Nhum|Case=Nom|Number=Sing|Number=Sing|Person=2": { POS: NOUN, @@ -898,7 +871,7 @@ TAG_MAP = { "Case": "Nom", "Number": "Sing", "Number": "Sing", - "Person": "2", + "Person": "two", }, "NUM_NumForm=Digit|NumType=Card": {POS: NUM, "NumForm": "Digit", "NumType": "Card"}, "NUM_NumForm=Digit|NumType=Frac|Typo=Yes": { @@ -907,43 +880,37 @@ TAG_MAP = { "NumType": "Frac", "Typo": "Yes", }, - "NUM_NumForm=Digit|NumType=Range": { - POS: NUM, - "NumForm": "Digit", - "NumType": "Range", - }, + "NUM_NumForm=Digit|NumType=Range": {POS: NUM, "NumForm": "Digit",}, "NUM_NumForm=Word|NumType=Card": {POS: NUM, "NumForm": "Word", "NumType": "Card"}, "NUM_NumForm=Word|NumType=Dist": {POS: NUM, "NumForm": "Word", "NumType": "Dist"}, - "NUM_NumForm=Word|NumType=Range": {POS: NUM, "NumForm": "Word", "NumType": "Range"}, + 
"NUM_NumForm=Word|NumType=Range": {POS: NUM, "NumForm": "Word",}, "PART_Polarity=Neg": {POS: PART, "Polarity": "Neg"}, "PRON_Case=Abl|Definite=Ind|Number=Sing|Person=3|PronType=Prs": { POS: PRON, "Case": "Abl", "Definite": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Abl|Number=Plur|Person=3|PronType=Prs": { POS: PRON, "Case": "Abl", "Number": "Plur", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Abl|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { POS: PRON, "Case": "Abl", "Number": "Sing", - "Person": "2", - "Polite": "Infm", + "Person": "two", "PronType": "Prs", }, "PRON_Case=Dat|Definite=Def|Distance=Dist|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Dat", "Definite": "Def", - "Distance": "Dist", "Number": "Sing", "PronType": "Dem", }, @@ -952,7 +919,7 @@ TAG_MAP = { "Case": "Dat", "Definite": "Def", "Number": "Sing", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Dat|Definite=Ind|Number=Sing|PronType=Int": { @@ -965,14 +932,12 @@ TAG_MAP = { "PRON_Case=Dat|Distance=Dist|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Dat", - "Distance": "Dist", "Number": "Sing", "PronType": "Dem", }, "PRON_Case=Dat|Distance=Med|Number=Plur|PronType=Dem": { POS: PRON, "Case": "Dat", - "Distance": "Med", "Number": "Plur", "PronType": "Dem", }, @@ -980,30 +945,28 @@ TAG_MAP = { POS: PRON, "Case": "Dat", "Number": "Plur", - "Person": "1", + "Person": "one", "PronType": "Prs", }, "PRON_Case=Dat|Number=Plur|Person=2|Polite=Infm|PronType=Prs": { POS: PRON, "Case": "Dat", "Number": "Plur", - "Person": "2", - "Polite": "Infm", + "Person": "two", "PronType": "Prs", }, "PRON_Case=Dat|Number=Plur|Person=3|PronType=Emp|Reflex=Yes": { POS: PRON, "Case": "Dat", "Number": "Plur", - "Person": "3", - "PronType": "Emp", + "Person": "three", "Reflex": "Yes", }, "PRON_Case=Dat|Number=Plur|Person=3|PronType=Prs": { POS: PRON, "Case": "Dat", "Number": "Plur", - "Person": "3", + "Person": "three", 
"PronType": "Prs", }, "PRON_Case=Dat|Number=Plur|PronType=Rcp": { @@ -1016,30 +979,27 @@ TAG_MAP = { POS: PRON, "Case": "Dat", "Number": "Sing", - "Person": "1", + "Person": "one", "PronType": "Prs", }, "PRON_Case=Dat|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { POS: PRON, "Case": "Dat", "Number": "Sing", - "Person": "2", - "Polite": "Infm", + "Person": "two", "PronType": "Prs", }, "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp": { POS: PRON, "Case": "Dat", "Number": "Sing", - "Person": "3", - "PronType": "Emp", + "Person": "three", }, "PRON_Case=Dat|Number=Sing|Person=3|PronType=Emp|Reflex=Yes": { POS: PRON, "Case": "Dat", "Number": "Sing", - "Person": "3", - "PronType": "Emp", + "Person": "three", "Reflex": "Yes", }, "PRON_Case=Dat|Number=Sing|PronType=Int": { @@ -1058,7 +1018,6 @@ TAG_MAP = { "PRON_Case=Gen|Distance=Med|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Gen", - "Distance": "Med", "Number": "Sing", "PronType": "Dem", }, @@ -1066,21 +1025,21 @@ TAG_MAP = { POS: PRON, "Case": "Gen", "Number": "Plur", - "Person": "1", + "Person": "one", "PronType": "Prs", }, "PRON_Case=Gen|Number=Sing|Person=2|PronType=Prs": { POS: PRON, "Case": "Gen", "Number": "Sing", - "Person": "2", + "Person": "two", "PronType": "Prs", }, "PRON_Case=Gen|Number=Sing|Person=3|PronType=Prs": { POS: PRON, "Case": "Gen", "Number": "Sing", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Gen|PronType=Tot": {POS: PRON, "Case": "Gen", "PronType": "Tot"}, @@ -1094,7 +1053,6 @@ TAG_MAP = { "PRON_Case=Ins|Distance=Med|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Ins", - "Distance": "Med", "Number": "Sing", "PronType": "Dem", }, @@ -1108,7 +1066,6 @@ TAG_MAP = { "PRON_Case=Loc|Distance=Med|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Loc", - "Distance": "Med", "Number": "Sing", "PronType": "Dem", }, @@ -1116,7 +1073,6 @@ TAG_MAP = { POS: PRON, "Case": "Nom", "Definite": "Def", - "Distance": "Dist", "Number": "Plur", "PronType": "Dem", }, @@ -1124,7 
+1080,6 @@ TAG_MAP = { POS: PRON, "Case": "Nom", "Definite": "Def", - "Distance": "Med", "Number": "Sing", "PronType": "Dem", "Style": "Coll", @@ -1167,29 +1122,25 @@ TAG_MAP = { "PRON_Case=Nom|Distance=Dist|Number=Plur|Person=1|PronType=Dem": { POS: PRON, "Case": "Nom", - "Distance": "Dist", "Number": "Plur", - "Person": "1", + "Person": "one", "PronType": "Dem", }, "PRON_Case=Nom|Distance=Med|Number=Plur|PronType=Dem": { POS: PRON, "Case": "Nom", - "Distance": "Med", "Number": "Plur", "PronType": "Dem", }, "PRON_Case=Nom|Distance=Med|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Nom", - "Distance": "Med", "Number": "Sing", "PronType": "Dem", }, "PRON_Case=Nom|Distance=Prox|Number=Sing|PronType=Dem": { POS: PRON, "Case": "Nom", - "Distance": "Prox", "Number": "Sing", "PronType": "Dem", }, @@ -1197,21 +1148,20 @@ TAG_MAP = { POS: PRON, "Case": "Nom", "Number": "Plur", - "Person": "1", + "Person": "one", "PronType": "Prs", }, "PRON_Case=Nom|Number=Plur|Person=3|PronType=Emp": { POS: PRON, "Case": "Nom", "Number": "Plur", - "Person": "3", - "PronType": "Emp", + "Person": "three", }, "PRON_Case=Nom|Number=Plur|Person=3|PronType=Prs": { POS: PRON, "Case": "Nom", "Number": "Plur", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Nom|Number=Plur|PronType=Rel": { @@ -1223,46 +1173,43 @@ TAG_MAP = { "PRON_Case=Nom|Number=Sing|Number=Plur|Person=3|Person=1|PronType=Emp": { POS: PRON, "Case": "Nom", - "Number": "Sing", + # "Number": "Sing", "Number": "Plur", - "Person": "3", - "Person": "1", - "PronType": "Emp", + # "Person": "three", + "Person": "one", }, "PRON_Case=Nom|Number=Sing|Person=1|PronType=Int": { POS: PRON, "Case": "Nom", "Number": "Sing", - "Person": "1", + "Person": "one", "PronType": "Int", }, "PRON_Case=Nom|Number=Sing|Person=1|PronType=Prs": { POS: PRON, "Case": "Nom", "Number": "Sing", - "Person": "1", + "Person": "one", "PronType": "Prs", }, "PRON_Case=Nom|Number=Sing|Person=2|Polite=Infm|PronType=Prs": { POS: PRON, "Case": "Nom", 
"Number": "Sing", - "Person": "2", - "Polite": "Infm", + "Person": "two", "PronType": "Prs", }, "PRON_Case=Nom|Number=Sing|Person=3|PronType=Emp": { POS: PRON, "Case": "Nom", "Number": "Sing", - "Person": "3", - "PronType": "Emp", + "Person": "three", }, "PRON_Case=Nom|Number=Sing|Person=3|PronType=Prs": { POS: PRON, "Case": "Nom", "Number": "Sing", - "Person": "3", + "Person": "three", "PronType": "Prs", }, "PRON_Case=Nom|Number=Sing|PronType=Int": { @@ -1280,26 +1227,23 @@ TAG_MAP = { "PRON_Case=Nom|Person=1|PronType=Tot": { POS: PRON, "Case": "Nom", - "Person": "1", + "Person": "one", "PronType": "Tot", }, "PRON_Case=Nom|PronType=Ind": {POS: PRON, "Case": "Nom", "PronType": "Ind"}, "PRON_Case=Nom|PronType=Tot": {POS: PRON, "Case": "Nom", "PronType": "Tot"}, "PRON_Distance=Dist|Number=Sing|PronType=Dem": { POS: PRON, - "Distance": "Dist", "Number": "Sing", "PronType": "Dem", }, "PRON_Distance=Med|PronType=Dem|Style=Coll": { POS: PRON, - "Distance": "Med", "PronType": "Dem", "Style": "Coll", }, "PRON_Distance=Prox|PronType=Dem|Style=Coll": { POS: PRON, - "Distance": "Prox", "PronType": "Dem", "Style": "Coll", }, @@ -1384,7 +1328,6 @@ TAG_MAP = { "Case": "Abl", "Definite": "Ind", "NameType": "Geo", - "Number": "Coll", }, "PROPN_Animacy=Nhum|Case=Abl|Definite=Ind|NameType=Geo|Number=Sing": { POS: PROPN, @@ -1449,7 +1392,6 @@ TAG_MAP = { "Case": "Nom", "Definite": "Ind", "NameType": "Geo", - "Number": "Coll", }, "PROPN_Animacy=Nhum|Case=Nom|Definite=Ind|NameType=Geo|Number=Sing": { POS: PROPN, @@ -1471,41 +1413,31 @@ TAG_MAP = { "SCONJ_Style=Coll": {POS: SCONJ, "Style": "Coll"}, "VERB_Aspect=Dur|Polarity=Neg|Subcat=Intr|VerbForm=Part|Voice=Pass": { POS: VERB, - "Aspect": "Dur", "Polarity": "Neg", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, "VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Mid": { POS: VERB, - "Aspect": "Dur", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, 
"VERB_Aspect=Dur|Polarity=Pos|Subcat=Intr|VerbForm=Part|Voice=Pass": { POS: VERB, - "Aspect": "Dur", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Act": { POS: VERB, - "Aspect": "Dur", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, "VERB_Aspect=Dur|Polarity=Pos|Subcat=Tran|VerbForm=Part|Voice=Mid": { POS: VERB, - "Aspect": "Dur", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Mid", }, @@ -1514,9 +1446,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1526,9 +1457,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1538,9 +1468,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -1550,9 +1479,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -1562,9 +1490,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Act", @@ -1574,9 +1501,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1586,9 +1512,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": 
"Fin", "Voice": "Mid", @@ -1598,9 +1523,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1610,9 +1534,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -1622,9 +1545,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Act", @@ -1634,9 +1556,8 @@ TAG_MAP = { "Aspect": "Imp", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1645,7 +1566,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Imp", "Style": "Coll", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, @@ -1653,48 +1573,41 @@ TAG_MAP = { POS: VERB, "Aspect": "Imp", "Style": "Vrnc", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part": { POS: VERB, "Aspect": "Imp", - "Subcat": "Intr", "VerbForm": "Part", }, "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Act": { POS: VERB, "Aspect": "Imp", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Act", }, "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Mid": { POS: VERB, "Aspect": "Imp", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, "VERB_Aspect=Imp|Subcat=Intr|VerbForm=Part|Voice=Pass": { POS: VERB, "Aspect": "Imp", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Act": { POS: VERB, "Aspect": "Imp", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, "VERB_Aspect=Imp|Subcat=Tran|VerbForm=Part|Voice=Cau": { POS: VERB, "Aspect": "Imp", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Cau", }, @@ 
-1703,9 +1616,7 @@ TAG_MAP = { "Aspect": "Iter", "Case": "Ins", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -1714,9 +1625,7 @@ TAG_MAP = { "Aspect": "Iter", "Case": "Ins", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Gdv", "Voice": "Act", }, @@ -1726,9 +1635,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1738,9 +1646,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1750,9 +1657,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1762,9 +1668,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1774,10 +1679,9 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", "Style": "Vrnc", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1787,9 +1691,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1799,9 +1702,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1811,9 +1713,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "2", + "Person": "two", 
"Polarity": "Pos", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1823,10 +1724,9 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", "Style": "Vrnc", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1836,9 +1736,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1848,9 +1747,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1860,9 +1758,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Act", @@ -1872,9 +1769,8 @@ TAG_MAP = { "Aspect": "Perf", "Mood": "Ind", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Past", "VerbForm": "Fin", "Voice": "Mid", @@ -1883,7 +1779,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Perf", "Polarity": "Neg", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, @@ -1891,7 +1786,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Perf", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, @@ -1899,7 +1793,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Perf", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, @@ -1907,7 +1800,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Perf", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, @@ -1915,7 +1807,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Perf", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Pass", }, @@ -1929,35 +1820,30 @@ TAG_MAP = { "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Mid": { POS: VERB, 
"Aspect": "Perf", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, "VERB_Aspect=Perf|Subcat=Intr|VerbForm=Part|Voice=Pass": { POS: VERB, "Aspect": "Perf", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Act": { POS: VERB, "Aspect": "Perf", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, "VERB_Aspect=Perf|Subcat=Tran|VerbForm=Part|Voice=Cau": { POS: VERB, "Aspect": "Perf", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Cau", }, "VERB_Aspect=Prog|Subcat=Intr|VerbForm=Conv|Voice=Mid": { POS: VERB, "Aspect": "Prog", - "Subcat": "Intr", "VerbForm": "Conv", "Voice": "Mid", }, @@ -1966,7 +1852,6 @@ TAG_MAP = { "Aspect": "Prosp", "Connegative": "Yes", "Mood": "Cnd", - "Subcat": "Tran", "VerbForm": "Fin", "Voice": "Act", }, @@ -1975,10 +1860,9 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", "Style": "Vrnc", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -1988,9 +1872,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2000,9 +1883,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2012,9 +1894,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2024,9 +1905,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2036,9 +1916,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": 
"3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Pass", @@ -2048,9 +1927,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Act", @@ -2060,9 +1938,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Cnd", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2072,8 +1949,7 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Imp", "Number": "Sing", - "Person": "2", - "Subcat": "Intr", + "Person": "two", "VerbForm": "Fin", "Voice": "Mid", }, @@ -2082,8 +1958,7 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Imp", "Number": "Sing", - "Person": "2", - "Subcat": "Tran", + "Person": "two", "VerbForm": "Fin", "Voice": "Act", }, @@ -2092,9 +1967,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Plur", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2104,9 +1978,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2116,9 +1989,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Plur", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2128,9 +2000,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2140,9 +2011,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2152,9 +2022,8 @@ 
TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2164,9 +2033,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "1", + "Person": "one", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2176,9 +2044,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Act", @@ -2188,9 +2055,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "2", + "Person": "two", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2200,9 +2066,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Mid", @@ -2212,9 +2077,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Mid", @@ -2224,9 +2088,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Fin", "Voice": "Pass", }, @@ -2235,9 +2098,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Imp", "VerbForm": "Fin", "Voice": "Act", @@ -2247,9 +2109,8 @@ TAG_MAP = { "Aspect": "Prosp", "Mood": "Sub", "Number": "Sing", - "Person": "3", + "Person": "three", "Polarity": "Pos", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2258,9 +2119,8 @@ TAG_MAP = { POS: VERB, "Aspect": "Prosp", "Mood": "Sub", - "Person": "1", + "Person": 
"one", "Polarity": "Neg", - "Subcat": "Tran", "Tense": "Pres", "VerbForm": "Fin", "Voice": "Act", @@ -2269,7 +2129,6 @@ TAG_MAP = { POS: VERB, "Aspect": "Prosp", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, @@ -2277,28 +2136,24 @@ TAG_MAP = { POS: VERB, "Aspect": "Prosp", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Mid": { POS: VERB, "Aspect": "Prosp", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Mid", }, "VERB_Aspect=Prosp|Subcat=Intr|VerbForm=Part|Voice=Pass": { POS: VERB, "Aspect": "Prosp", - "Subcat": "Intr", "VerbForm": "Part", "Voice": "Pass", }, "VERB_Aspect=Prosp|Subcat=Tran|VerbForm=Part|Voice=Act": { POS: VERB, "Aspect": "Prosp", - "Subcat": "Tran", "VerbForm": "Part", "Voice": "Act", }, @@ -2306,9 +2161,7 @@ TAG_MAP = { POS: VERB, "Case": "Abl", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -2316,9 +2169,7 @@ TAG_MAP = { POS: VERB, "Case": "Abl", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Pass", }, @@ -2326,9 +2177,7 @@ TAG_MAP = { POS: VERB, "Case": "Abl", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Gdv", "Voice": "Act", }, @@ -2336,9 +2185,7 @@ TAG_MAP = { POS: VERB, "Case": "Dat", "Definite": "Def", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -2346,9 +2193,7 @@ TAG_MAP = { POS: VERB, "Case": "Dat", "Definite": "Ind", - "Number": "Coll", "Polarity": "Neg", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Pass", }, @@ -2356,9 +2201,7 @@ TAG_MAP = { POS: VERB, "Case": "Dat", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -2366,9 +2209,7 @@ TAG_MAP = { POS: VERB, "Case": "Dat", "Definite": "Ind", - "Number": "Coll", "Polarity": 
"Pos", - "Subcat": "Tran", "VerbForm": "Gdv", "Voice": "Act", }, @@ -2376,9 +2217,7 @@ TAG_MAP = { POS: VERB, "Case": "Ins", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -2386,9 +2225,7 @@ TAG_MAP = { POS: VERB, "Case": "Ins", "Definite": "Ind", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Gdv", "Voice": "Act", }, @@ -2396,9 +2233,7 @@ TAG_MAP = { POS: VERB, "Case": "Nom", "Definite": "Def", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Gdv", "Voice": "Mid", }, @@ -2406,9 +2241,7 @@ TAG_MAP = { POS: VERB, "Case": "Nom", "Definite": "Def", - "Number": "Coll", "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Gdv", "Voice": "Act", }, @@ -2416,15 +2249,13 @@ TAG_MAP = { POS: VERB, "Mood": "Imp", "Number": "Sing", - "Person": "2", - "Subcat": "Intr", + "Person": "two", "VerbForm": "Fin", "Voice": "Mid", }, "VERB_Polarity=Neg|Subcat=Intr|VerbForm=Inf|Voice=Mid": { POS: VERB, "Polarity": "Neg", - "Subcat": "Intr", "VerbForm": "Inf", "Voice": "Mid", }, @@ -2432,7 +2263,6 @@ TAG_MAP = { POS: VERB, "Polarity": "Pos", "Style": "Coll", - "Subcat": "Tran", "VerbForm": "Inf", "Voice": "Act", }, @@ -2440,28 +2270,24 @@ TAG_MAP = { POS: VERB, "Polarity": "Pos", "Style": "Vrnc", - "Subcat": "Tran", "VerbForm": "Inf", "Voice": "Act", }, "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Mid": { POS: VERB, "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Inf", "Voice": "Mid", }, "VERB_Polarity=Pos|Subcat=Intr|VerbForm=Inf|Voice=Pass": { POS: VERB, "Polarity": "Pos", - "Subcat": "Intr", "VerbForm": "Inf", "Voice": "Pass", }, "VERB_Polarity=Pos|Subcat=Tran|Typo=Yes|VerbForm=Inf|Voice=Act": { POS: VERB, "Polarity": "Pos", - "Subcat": "Tran", "Typo": "Yes", "VerbForm": "Inf", "Voice": "Act", @@ -2469,7 +2295,6 @@ TAG_MAP = { "VERB_Polarity=Pos|Subcat=Tran|VerbForm=Inf|Voice=Act": { POS: VERB, "Polarity": "Pos", - "Subcat": "Tran", "VerbForm": "Inf", "Voice": 
"Act", }, diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 2be4b0fb7..2ceb940c3 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -14,7 +14,7 @@ class PolishLemmatizer(Lemmatizer): # lemmatization for nouns def __init__(self, lookups, *args, **kwargs): # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super().__init__(lookups) + super(PolishLemmatizer, self).__init__(lookups) self.lemma_lookups = {} for tag in [ "ADJ", diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ed0b3eb74..9f8a82c10 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -16,7 +16,7 @@ from .tag_map import TAG_MAP from ... import util -_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" +_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python" def try_jieba_import(use_jieba): @@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer): if reset: try: import pkuseg + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) except ImportError: if self.use_pkuseg: @@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer): ) raise ImportError(msg) for word in words: - self.pkuseg_seg.preprocesser.insert(word.strip(), '') + self.pkuseg_seg.preprocesser.insert(word.strip(), "") def _get_config(self): config = OrderedDict( @@ -168,21 +169,16 @@ class ChineseTokenizer(DummyTokenizer): return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): - pkuseg_features_b = b"" - pkuseg_weights_b = b"" - pkuseg_processors_data = None + pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None} def deserialize_pkuseg_features(b): - nonlocal pkuseg_features_b - pkuseg_features_b = b + pkuseg_data["features_b"] = b def deserialize_pkuseg_weights(b): - nonlocal pkuseg_weights_b - pkuseg_weights_b = b + 
pkuseg_data["weights_b"] = b def deserialize_pkuseg_processors(b): - nonlocal pkuseg_processors_data - pkuseg_processors_data = srsly.msgpack_loads(b) + pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = OrderedDict( ( @@ -194,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer): ) util.from_bytes(data, deserializers, []) - if pkuseg_features_b and pkuseg_weights_b: + if pkuseg_data["features_b"] and pkuseg_data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) with open(tempdir / "features.pkl", "wb") as fileh: - fileh.write(pkuseg_features_b) + fileh.write(pkuseg_data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: - fileh.write(pkuseg_weights_b) + fileh.write(pkuseg_data["weights_b"]) try: import pkuseg except ImportError: @@ -209,13 +205,9 @@ class ChineseTokenizer(DummyTokenizer): + _PKUSEG_INSTALL_MSG ) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) - if pkuseg_processors_data: - ( - user_dict, - do_process, - common_words, - other_words, - ) = pkuseg_processors_data + if pkuseg_data["processors_data"]: + processors_data = pkuseg_data["processors_data"] + (user_dict, do_process, common_words, other_words) = processors_data self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 56d27024d..e93416043 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -235,12 +235,12 @@ cdef class DependencyMatcher: matched_trees = [] self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees) - matched_key_trees.append((key,matched_trees)) - - for i, (ent_id, nodes) in enumerate(matched_key_trees): - on_match = self._callbacks.get(ent_id) - if on_match is not None: - on_match(self, doc, i, matched_key_trees) + if len(matched_trees) > 0: + 
matched_key_trees.append((key,matched_trees)) + for i, (ent_id, nodes) in enumerate(matched_key_trees): + on_match = self._callbacks.get(ent_id) + if on_match is not None: + on_match(self, doc, i, matched_key_trees) return matched_key_trees def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees): diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py index 6b785bdfc..cbdb77e4e 100644 --- a/spacy/tests/lang/hy/test_text.py +++ b/spacy/tests/lang/hy/test_text.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals import pytest diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py index 424fb886f..5043273f9 100644 --- a/spacy/tests/lang/hy/test_tokenizer.py +++ b/spacy/tests/lang/hy/test_tokenizer.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import pytest diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 0295ada82..a2ca69111 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -7,6 +7,7 @@ from mock import Mock from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token from ..doc.test_underscore import clean_underscore # noqa: F401 +from ..util import get_doc @pytest.fixture @@ -301,22 +302,6 @@ def test_matcher_extension_set_membership(en_vocab): assert len(matches) == 0 -@pytest.fixture -def text(): - return "The quick brown fox jumped over the lazy fox" - - -@pytest.fixture -def heads(): - return [3, 2, 1, 1, 0, -1, 2, 1, -3] - - -@pytest.fixture -def deps(): - return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] - - -@pytest.fixture def dependency_matcher(en_vocab): def is_brown_yellow(text): return bool(re.compile(r"brown|yellow|over").match(text)) @@ -359,24 +344,40 @@ def dependency_matcher(en_vocab): }, ] + # pattern that doesn't match + pattern4 = [ + 
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "brown"}, + }, + ] + matcher = DependencyMatcher(en_vocab) - matcher.add("pattern1", [pattern1]) - matcher.add("pattern2", [pattern2]) - matcher.add("pattern3", [pattern3]) + on_match = Mock() + matcher.add("pattern1", [pattern1], on_match=on_match) + matcher.add("pattern2", [pattern2], on_match=on_match) + matcher.add("pattern3", [pattern3], on_match=on_match) + matcher.add("pattern4", [pattern4], on_match=on_match) - return matcher + assert len(dependency_matcher) == 4 + text = "The quick brown fox jumped over the lazy fox" + heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"] -def test_dependency_matcher_compile(dependency_matcher): - assert len(dependency_matcher) == 3 + doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) + matches = dependency_matcher(doc) - -# def test_dependency_matcher(dependency_matcher, text, heads, deps): -# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) -# matches = dependency_matcher(doc) -# assert matches[0][1] == [[3, 1, 2]] -# assert matches[1][1] == [[4, 3, 3]] -# assert matches[2][1] == [[4, 3, 2]] + assert len(matches) == 3 + assert matches[0][1] == [[3, 1, 2]] + assert matches[1][1] == [[4, 3, 3]] + assert matches[2][1] == [[4, 3, 2]] + assert on_match.call_count == 3 def test_matcher_basic_check(en_vocab): diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..d2956d6e8 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,3 +1,5 @@ +# coding: utf8 +from __future__ import unicode_literals from spacy.lang.en import English