diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py
index 30816dbe4..b346299bc 100644
--- a/spacy/lang/el/tag_map.py
+++ b/spacy/lang/el/tag_map.py
@@ -4249,20 +4249,20 @@ TAG_MAP = {
         "Voice": "Act",
         "Case": "Nom|Gen|Dat|Acc|Voc",
     },
-    'ADJ': {POS: ADJ},
-    'ADP': {POS: ADP},
-    'ADV': {POS: ADV},
-    'AtDf': {POS: DET},
-    'AUX': {POS: AUX},
-    'CCONJ': {POS: CCONJ},
-    'DET': {POS: DET},
-    'NOUN': {POS: NOUN},
-    'NUM': {POS: NUM},
-    'PART': {POS: PART},
-    'PRON': {POS: PRON},
-    'PROPN': {POS: PROPN},
-    'SCONJ': {POS: SCONJ},
-    'SYM': {POS: SYM},
-    'VERB': {POS: VERB},
-    'X': {POS: X},
+    "ADJ": {POS: ADJ},
+    "ADP": {POS: ADP},
+    "ADV": {POS: ADV},
+    "AtDf": {POS: DET},
+    "AUX": {POS: AUX},
+    "CCONJ": {POS: CCONJ},
+    "DET": {POS: DET},
+    "NOUN": {POS: NOUN},
+    "NUM": {POS: NUM},
+    "PART": {POS: PART},
+    "PRON": {POS: PRON},
+    "PROPN": {POS: PROPN},
+    "SCONJ": {POS: SCONJ},
+    "SYM": {POS: SYM},
+    "VERB": {POS: VERB},
+    "X": {POS: X},
 }
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 0538461a3..22590043f 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -16,7 +16,8 @@ from ...util import DummyTokenizer
 # the flow by creating a dummy with the same interface.
 DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
 DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
-DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
+DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
+
 
 def try_fugashi_import():
     """Fugashi is required for Japanese support, so check for it.
@@ -27,8 +28,7 @@ def try_fugashi_import():
         return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires Fugashi: "
-            "https://github.com/polm/fugashi"
+            "Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
         )
 
 
@@ -55,13 +55,14 @@ def resolve_pos(token):
         return token.pos + ",ADJ"
     return token.pos
 
+
 def get_words_and_spaces(tokenizer, text):
     """Get the individual tokens that make up the sentence and handle white
     space.
 
     Japanese doesn't usually use white space, and MeCab's handling of it for
     multiple spaces in a row is somewhat awkward.
     """
-    
+
     tokens = tokenizer.parseToNodeList(text)
     words = []
@@ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text):
         spaces.append(bool(token.white_space))
     return words, spaces
 
+
 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
"Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET}, - "DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET}, + "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": { + "morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Case=Gen|Number=Plur|PronType=Tot": { + "morph": "Case=Gen|Number=Plur|PronType=Tot", + POS: DET, + }, "DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET}, - "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET}, - "DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET}, - "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET}, - "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET}, - "DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET}, - "DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET}, + "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": { + "morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Art": { + "morph": "Gender=Fem|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Ind": { + "morph": "Gender=Fem|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Prs": { + "morph": "Gender=Fem|Number=Sing|PronType=Prs", + POS: DET, + }, + "DET__Gender=Fem|Number=Sing|PronType=Tot": { + "morph": "Gender=Fem|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Art": { + "morph": 
"Gender=Masc|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Ind": { + "morph": "Gender=Masc|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Masc|Number=Sing|PronType=Tot": { + "morph": "Gender=Masc|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Art": { + "morph": "Gender=Neut|Number=Sing|PronType=Art", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": { + "morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Ind": { + "morph": "Gender=Neut|Number=Sing|PronType=Ind", + POS: DET, + }, + "DET__Gender=Neut|Number=Sing|PronType=Tot": { + "morph": "Gender=Neut|Number=Sing|PronType=Tot", + POS: DET, + }, + "DET__Number=Plur|Polarity=Neg|PronType=Neg": { + "morph": "Number=Plur|Polarity=Neg|PronType=Neg", + POS: DET, + }, "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET}, "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET}, "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET}, @@ -507,57 +579,183 @@ TAG_MAP = { "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET}, "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN}, "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN}, - "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN}, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", + POS: NOUN, + }, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", + POS: NOUN, + }, + "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": { + "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", + POS: NOUN, + }, "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN}, - "NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM}, - "NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Case=Gen|Number=Plur|NumType=Card": { + "morph": "Case=Gen|Number=Plur|NumType=Card", + POS: NUM, + }, + "NUM__Definite=Def|Number=Sing|NumType=Card": { + "morph": "Definite=Def|Number=Sing|NumType=Card", + POS: NUM, + }, "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM}, - "NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM}, - "NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM}, - "NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM}, + "NUM__Gender=Fem|Number=Sing|NumType=Card": { + "morph": "Gender=Fem|Number=Sing|NumType=Card", + POS: NUM, + }, + "NUM__Gender=Masc|Number=Sing|NumType=Card": { + "morph": "Gender=Masc|Number=Sing|NumType=Card", + POS: NUM, + }, + 
"NUM__Gender=Neut|Number=Sing|NumType=Card": { + "morph": "Gender=Neut|Number=Sing|NumType=Card", + POS: NUM, + }, "NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM}, "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM}, "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM}, "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART}, - "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON}, - "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON}, - "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON}, - "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON}, + "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", + POS: PRON, + }, + 
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": { + "morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": { + "morph": "Animacy=Hum|Number=Plur|PronType=Rcp", + POS: PRON, + }, + "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": { + "morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", + POS: PRON, + }, + "PRON__Animacy=Hum|Poss=Yes|PronType=Int": { + "morph": "Animacy=Hum|Poss=Yes|PronType=Int", + POS: PRON, + }, "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON}, - "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON}, - "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON}, - 
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON}, - "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON}, - "PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON}, - "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON}, - "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON}, + "PRON__Case=Acc|PronType=Prs|Reflex=Yes": { + "morph": "Case=Acc|PronType=Prs|Reflex=Yes", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": { + "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", + POS: PRON, + }, + "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": { + "morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": { + "morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": { + "morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|PronType=Ind,Prs": { + "morph": "Number=Plur|Person=3|PronType=Ind,Prs", + POS: PRON, + }, + "PRON__Number=Plur|Person=3|PronType=Prs,Tot": { + "morph": "Number=Plur|Person=3|PronType=Prs,Tot", + POS: PRON, + }, + "PRON__Number=Plur|Poss=Yes|PronType=Prs": { + "morph": "Number=Plur|Poss=Yes|PronType=Prs", + POS: PRON, + }, + "PRON__Number=Plur|Poss=Yes|PronType=Rcp": { + "morph": "Number=Plur|Poss=Yes|PronType=Rcp", + POS: PRON, + }, + "PRON__Number=Sing|Polarity=Neg|PronType=Neg": { + "morph": "Number=Sing|Polarity=Neg|PronType=Neg", + POS: PRON, + }, "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON}, "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON}, "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN}, "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN}, - "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB}, - "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB}, + "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": { + "morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", + POS: VERB, + }, + "VERB__Definite=Ind|Number=Sing|VerbForm=Part": { + "morph": "Definite=Ind|Number=Sing|VerbForm=Part", + POS: VERB, + }, } diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 205697637..2db312d64 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -295,10 +295,9 @@ class EntityRuler(object): deserializers_patterns = { "patterns": lambda p: self.add_patterns( srsly.read_jsonl(p.with_suffix(".jsonl")) - )} - deserializers_cfg = { - "cfg": lambda p: 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 1a33221c2..816970e61 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -219,14 +219,13 @@ def uk_tokenizer():
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
 
-    
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo").Defaults.create_tokenizer()
 
-    
+
 @pytest.fixture(scope="session")
 def zh_tokenizer():
     pytest.importorskip("jieba")
     return get_lang_class("zh").Defaults.create_tokenizer()
-
diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py
index cbbebcf28..17f6f0ccc 100644
--- a/spacy/tests/lang/fi/test_tokenizer.py
+++ b/spacy/tests/lang/fi/test_tokenizer.py
@@ -15,7 +15,7 @@ ABBREVIATION_TESTS = [
 
 HYPHENATED_TESTS = [
     (
         "1700-luvulle sijoittuva taide-elokuva",
-        ["1700-luvulle", "sijoittuva", "taide-elokuva"]
+        ["1700-luvulle", "sijoittuva", "taide-elokuva"],
     )
 ]
diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py
index 57541fc26..7ca2394b7 100644
--- a/spacy/tests/lang/lb/test_exceptions.py
+++ b/spacy/tests/lang/lb/test_exceptions.py
@@ -3,16 +3,19 @@ from __future__ import unicode_literals
 
 import pytest
 
+
 @pytest.mark.parametrize("text", ["z.B.", "Jan."])
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1
 
+
 @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
 def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 2
 
+
 def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
     text = "Mee 't ass net evident, d'Liewen."
     tokens = lb_tokenizer(text)
@@ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
     assert tokens[1].text == "'t"
     assert tokens[1].lemma_ == "et"
 
+
 @pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
 def test_lb_norm_exceptions(lb_tokenizer, text, norm):
     tokens = lb_tokenizer(text)
diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py
index 2284ff794..36464b379 100644
--- a/spacy/tests/lang/lb/test_text.py
+++ b/spacy/tests/lang/lb/test_text.py
@@ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
     [
         ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
         ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen.”", 15),
-        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
+        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14),
     ],
 )
 def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index ff630f0fa..4bb5aac70 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match):
     ],
 )
 def test_lex_attrs_word_shape(text, shape):
-    assert word_shape(text) == shape
\ No newline at end of file
+    assert word_shape(text) == shape
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 384f14dad..fb5301718 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 
 
 def test_parser_set_sent_starts(en_vocab):
+    # fmt: off
     words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
     heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
     deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
-    doc = get_doc(
-        en_vocab, words=words, deps=deps, heads=heads
-    )
+    # fmt: on
+    doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
     for i in range(len(words)):
         if i == 0 or i == 3:
-            assert doc[i].is_sent_start == True
+            assert doc[i].is_sent_start is True
         else:
-            assert doc[i].is_sent_start == None
+            assert doc[i].is_sent_start is None
     for sent in doc.sents:
         for token in sent:
             assert token.head in sent
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index d0331602c..a5bda9090 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 import pytest
 from spacy.language import Language
-from spacy.pipeline import Tagger
 
 
 def test_label_types():
diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py
index 36e9f02c1..5f8d1573f 100644
--- a/spacy/tests/regression/test_issue4674.py
+++ b/spacy/tests/regression/test_issue4674.py
@@ -15,7 +15,9 @@ def test_issue4674():
     vector1 = [0.9, 1.1, 1.01]
     vector2 = [1.8, 2.25, 2.01]
 
-    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
+    kb.set_entities(
+        entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
+    )
 
     assert kb.get_size_entities() == 1
 
@@ -31,4 +33,3 @@ def test_issue4674():
         kb2.load_bulk(str(file_path))
 
     assert kb2.get_size_entities() == 1
-