Merge branch 'master' into develop

Ines Montani 2019-12-21 19:04:43 +01:00
commit 947dba7141
14 changed files with 308 additions and 107 deletions

View File

@@ -4249,20 +4249,20 @@ TAG_MAP = {
         "Voice": "Act",
         "Case": "Nom|Gen|Dat|Acc|Voc",
     },
-    'ADJ': {POS: ADJ},
-    'ADP': {POS: ADP},
-    'ADV': {POS: ADV},
-    'AtDf': {POS: DET},
-    'AUX': {POS: AUX},
-    'CCONJ': {POS: CCONJ},
-    'DET': {POS: DET},
-    'NOUN': {POS: NOUN},
-    'NUM': {POS: NUM},
-    'PART': {POS: PART},
-    'PRON': {POS: PRON},
-    'PROPN': {POS: PROPN},
-    'SCONJ': {POS: SCONJ},
-    'SYM': {POS: SYM},
-    'VERB': {POS: VERB},
-    'X': {POS: X},
+    "ADJ": {POS: ADJ},
+    "ADP": {POS: ADP},
+    "ADV": {POS: ADV},
+    "AtDf": {POS: DET},
+    "AUX": {POS: AUX},
+    "CCONJ": {POS: CCONJ},
+    "DET": {POS: DET},
+    "NOUN": {POS: NOUN},
+    "NUM": {POS: NUM},
+    "PART": {POS: PART},
+    "PRON": {POS: PRON},
+    "PROPN": {POS: PROPN},
+    "SCONJ": {POS: SCONJ},
+    "SYM": {POS: SYM},
+    "VERB": {POS: VERB},
+    "X": {POS: X},
 }
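
Note: this hunk only normalizes quoting (single to double quotes); the mapping itself is unchanged. Each entry maps a language-specific fine-grained tag to a universal part-of-speech constant. A minimal sketch of the lookup, using the real spacy.symbols constants but a hypothetical helper name:

    from spacy.symbols import POS, ADJ

    TAG_MAP = {"ADJ": {POS: ADJ}}

    def coarse_pos(tag):
        # Return the universal POS id for a fine-grained tag, if mapped.
        return TAG_MAP.get(tag, {}).get(POS)

    assert coarse_pos("ADJ") == ADJ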

View File

@@ -16,7 +16,8 @@ from ...util import DummyTokenizer
 # the flow by creating a dummy with the same interface.
 DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
 DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
-DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
+DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
+

 def try_fugashi_import():
     """Fugashi is required for Japanese support, so check for it.
@@ -27,8 +28,7 @@ def try_fugashi_import():
         return fugashi
     except ImportError:
         raise ImportError(
-            "Japanese support requires Fugashi: "
-            "https://github.com/polm/fugashi"
+            "Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
         )
@@ -55,6 +55,7 @@ def resolve_pos(token):
         return token.pos + ",ADJ"
     return token.pos
+

 def get_words_and_spaces(tokenizer, text):
     """Get the individual tokens that make up the sentence and handle white space.
@@ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text):
         spaces.append(bool(token.white_space))
     return words, spaces
+

 class JapaneseTokenizer(DummyTokenizer):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
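
The collapsed error message relies on Python's implicit concatenation of adjacent string literals, so the one-line form builds exactly the same string as the old two-line form. A minimal sketch:

    msg = "Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
    assert msg == "Japanese support requires Fugashi: https://github.com/polm/fugashi"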

View File

@@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

 ELISION = " ' ".strip().replace(" ", "")
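
For context, the ELISION expression kept below the imports reduces to a bare apostrophe once the padding spaces are stripped; a quick check:

    ELISION = " ' ".strip().replace(" ", "")
    assert ELISION == "'"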

View File

@@ -20,7 +20,7 @@ for exc_data in [
     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
+    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
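
Each exception maps the exact orth string to a list of per-token attribute dicts, so the abbreviation survives tokenization as a single token with its own lemma and norm. A minimal self-contained sketch with one entry:

    from spacy.symbols import ORTH, LEMMA, NORM

    _exc = {}
    for exc_data in [{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"}]:
        _exc[exc_data[ORTH]] = [exc_data]

    assert _exc["etc."][0][LEMMA] == "et cetera"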

View File

@@ -467,38 +467,110 @@ TAG_MAP = {
     "VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB},
     "VERB___": {"morph": "_", POS: VERB},
     "X___": {"morph": "_", POS: X},
-    'CCONJ___': {"morph": "_", POS: CCONJ},
+    "CCONJ___": {"morph": "_", POS: CCONJ},
     "ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ},
     "ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ},
-    "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
-    "ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
-    "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ},
-    "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ},
-    "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ},
+    "ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {
+        "morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part",
+        POS: ADJ,
+    },
+    "ADJ__Definite=Def|Number=Sing|VerbForm=Part": {
+        "morph": "Definite=Def|Number=Sing|VerbForm=Part",
+        POS: ADJ,
+    },
+    "ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {
+        "morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part",
+        POS: ADJ,
+    },
+    "ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {
+        "morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part",
+        POS: ADJ,
+    },
+    "ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {
+        "morph": "Definite=Ind|Number=Sing|VerbForm=Part",
+        POS: ADJ,
+    },
     "ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ},
     "ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ},
     "ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP},
     "ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV},
-    "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET},
-    "DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET},
+    "DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {
+        "morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art",
+        POS: DET,
+    },
+    "DET__Case=Gen|Number=Plur|PronType=Tot": {
+        "morph": "Case=Gen|Number=Plur|PronType=Tot",
+        POS: DET,
+    },
     "DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET},
-    "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
-    "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET},
-    "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET},
-    "DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET},
-    "DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET},
-    "DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
-    "DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET},
-    "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
-    "DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET},
-    "DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET},
-    "DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET},
-    "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
-    "DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET},
-    "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET},
-    "DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET},
-    "DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET},
-    "DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET},
+    "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {
+        "morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs",
+        POS: DET,
+    },
+    "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {
+        "morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs",
+        POS: DET,
+    },
+    "DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {
+        "morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
+        POS: DET,
+    },
+    "DET__Gender=Fem|Number=Sing|PronType=Art": {
+        "morph": "Gender=Fem|Number=Sing|PronType=Art",
+        POS: DET,
+    },
+    "DET__Gender=Fem|Number=Sing|PronType=Ind": {
+        "morph": "Gender=Fem|Number=Sing|PronType=Ind",
+        POS: DET,
+    },
+    "DET__Gender=Fem|Number=Sing|PronType=Prs": {
+        "morph": "Gender=Fem|Number=Sing|PronType=Prs",
+        POS: DET,
+    },
+    "DET__Gender=Fem|Number=Sing|PronType=Tot": {
+        "morph": "Gender=Fem|Number=Sing|PronType=Tot",
+        POS: DET,
+    },
+    "DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {
+        "morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg",
+        POS: DET,
+    },
+    "DET__Gender=Masc|Number=Sing|PronType=Art": {
+        "morph": "Gender=Masc|Number=Sing|PronType=Art",
+        POS: DET,
+    },
+    "DET__Gender=Masc|Number=Sing|PronType=Ind": {
+        "morph": "Gender=Masc|Number=Sing|PronType=Ind",
+        POS: DET,
+    },
+    "DET__Gender=Masc|Number=Sing|PronType=Tot": {
+        "morph": "Gender=Masc|Number=Sing|PronType=Tot",
+        POS: DET,
+    },
+    "DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {
+        "morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg",
+        POS: DET,
+    },
+    "DET__Gender=Neut|Number=Sing|PronType=Art": {
+        "morph": "Gender=Neut|Number=Sing|PronType=Art",
+        POS: DET,
+    },
+    "DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {
+        "morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind",
+        POS: DET,
+    },
+    "DET__Gender=Neut|Number=Sing|PronType=Ind": {
+        "morph": "Gender=Neut|Number=Sing|PronType=Ind",
+        POS: DET,
+    },
+    "DET__Gender=Neut|Number=Sing|PronType=Tot": {
+        "morph": "Gender=Neut|Number=Sing|PronType=Tot",
+        POS: DET,
+    },
+    "DET__Number=Plur|Polarity=Neg|PronType=Neg": {
+        "morph": "Number=Plur|Polarity=Neg|PronType=Neg",
+        POS: DET,
+    },
     "DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET},
     "DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET},
     "DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET},
@@ -507,57 +579,183 @@ TAG_MAP = {
     "DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET},
     "NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN},
     "NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN},
-    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN},
-    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN},
-    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN},
+    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {
+        "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing",
+        POS: NOUN,
+    },
+    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {
+        "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing",
+        POS: NOUN,
+    },
+    "NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {
+        "morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing",
+        POS: NOUN,
+    },
     "NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN},
-    "NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM},
-    "NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM},
+    "NUM__Case=Gen|Number=Plur|NumType=Card": {
+        "morph": "Case=Gen|Number=Plur|NumType=Card",
+        POS: NUM,
+    },
+    "NUM__Definite=Def|Number=Sing|NumType=Card": {
+        "morph": "Definite=Def|Number=Sing|NumType=Card",
+        POS: NUM,
+    },
     "NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM},
-    "NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM},
-    "NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM},
-    "NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM},
+    "NUM__Gender=Fem|Number=Sing|NumType=Card": {
+        "morph": "Gender=Fem|Number=Sing|NumType=Card",
+        POS: NUM,
+    },
+    "NUM__Gender=Masc|Number=Sing|NumType=Card": {
+        "morph": "Gender=Masc|Number=Sing|NumType=Card",
+        POS: NUM,
+    },
+    "NUM__Gender=Neut|Number=Sing|NumType=Card": {
+        "morph": "Gender=Neut|Number=Sing|NumType=Card",
+        POS: NUM,
+    },
     "NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM},
     "NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM},
     "NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM},
     "PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART},
-    "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON},
-    "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON},
-    "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON},
-    "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON},
+    "PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {
+        "morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {
+        "morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {
+        "morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {
+        "morph": "Animacy=Hum|Number=Plur|PronType=Rcp",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {
+        "morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs",
+        POS: PRON,
+    },
+    "PRON__Animacy=Hum|Poss=Yes|PronType=Int": {
+        "morph": "Animacy=Hum|Poss=Yes|PronType=Int",
+        POS: PRON,
+    },
     "PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON},
-    "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON},
-    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
-    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
-    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON},
-    "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
-    "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
-    "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
-    "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
-    "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
-    "PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON},
-    "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON},
-    "PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON},
-    "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON},
-    "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON},
+    "PRON__Case=Acc|PronType=Prs|Reflex=Yes": {
+        "morph": "Case=Acc|PronType=Prs|Reflex=Yes",
+        POS: PRON,
+    },
+    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": {
+        "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs",
+        POS: PRON,
+    },
+    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {
+        "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs",
+        POS: PRON,
+    },
+    "PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {
+        "morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot",
+        POS: PRON,
+    },
+    "PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {
+        "morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {
+        "morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {
+        "morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs",
+        POS: PRON,
+    },
+    "PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {
+        "morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {
+        "morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs",
+        POS: PRON,
+    },
+    "PRON__Number=Plur|Person=3|PronType=Ind,Prs": {
+        "morph": "Number=Plur|Person=3|PronType=Ind,Prs",
+        POS: PRON,
+    },
+    "PRON__Number=Plur|Person=3|PronType=Prs,Tot": {
+        "morph": "Number=Plur|Person=3|PronType=Prs,Tot",
+        POS: PRON,
+    },
+    "PRON__Number=Plur|Poss=Yes|PronType=Prs": {
+        "morph": "Number=Plur|Poss=Yes|PronType=Prs",
+        POS: PRON,
+    },
+    "PRON__Number=Plur|Poss=Yes|PronType=Rcp": {
+        "morph": "Number=Plur|Poss=Yes|PronType=Rcp",
+        POS: PRON,
+    },
+    "PRON__Number=Sing|Polarity=Neg|PronType=Neg": {
+        "morph": "Number=Sing|Polarity=Neg|PronType=Neg",
+        POS: PRON,
+    },
     "PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON},
     "PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON},
     "PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN},
     "PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN},
-    "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB},
-    "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB},
+    "VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {
+        "morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin",
+        POS: VERB,
+    },
+    "VERB__Definite=Ind|Number=Sing|VerbForm=Part": {
+        "morph": "Definite=Ind|Number=Sing|VerbForm=Part",
+        POS: VERB,
+    },
 }
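
All of the rewrapping above is mechanical: entries whose single-line form exceeded the formatter's line length are split across lines with trailing commas, which leaves the resulting dict value unchanged. A minimal equivalence check, with a plain string standing in for the spacy.symbols POS constant:

    POS = "pos"  # stand-in for spacy.symbols.POS
    single = {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: "VERB"}
    wrapped = {
        "morph": "Definite=Ind|Number=Sing|VerbForm=Part",
        POS: "VERB",
    }
    assert single == wrapped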

View File

@@ -295,10 +295,9 @@ class EntityRuler(object):
         deserializers_patterns = {
             "patterns": lambda p: self.add_patterns(
                 srsly.read_jsonl(p.with_suffix(".jsonl"))
-            )}
-        deserializers_cfg = {
-            "cfg": lambda p: cfg.update(srsly.read_json(p))
-        }
+            )
+        }
+        deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
         from_disk(path, deserializers_cfg, {})
         self.overwrite = cfg.get("overwrite", False)
         self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
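
Functionally nothing changes here: the cfg deserializer still runs first via from_disk, so overwrite and phrase_matcher_attr are known before the patterns deserializer is applied in the following step. For reference, a usage sketch against the v2-era public API (assuming spaCy 2.x is installed):

    from spacy.lang.en import English
    from spacy.pipeline import EntityRuler

    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "ORG", "pattern": "ACME"}])
    nlp.add_pipe(ruler)
    doc = nlp("ACME makes widgets")
    assert [(ent.text, ent.label_) for ent in doc.ents] == [("ACME", "ORG")]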

View File

@@ -229,4 +229,3 @@ def yo_tokenizer():
 def zh_tokenizer():
     pytest.importorskip("jieba")
     return get_lang_class("zh").Defaults.create_tokenizer()
-

View File

@@ -15,7 +15,7 @@ ABBREVIATION_TESTS = [
 HYPHENATED_TESTS = [
     (
         "1700-luvulle sijoittuva taide-elokuva",
-        ["1700-luvulle", "sijoittuva", "taide-elokuva"]
+        ["1700-luvulle", "sijoittuva", "taide-elokuva"],
     )
 ]

View File

@@ -3,16 +3,19 @@ from __future__ import unicode_literals

 import pytest

+
 @pytest.mark.parametrize("text", ["z.B.", "Jan."])
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1

+
 @pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "dWelt", "dSuen"])
 def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 2

+
 def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
     text = "Mee 't ass net evident, d'Liewen."
     tokens = lb_tokenizer(text)
@@ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
     assert tokens[1].text == "'t"
     assert tokens[1].lemma_ == "et"

+
 @pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
 def test_lb_norm_exceptions(lb_tokenizer, text, norm):
     tokens = lb_tokenizer(text)

View File

@@ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
     [
         ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
         ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
-        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
+        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14),
     ],
 )
 def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):

View File

@@ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
 def test_parser_set_sent_starts(en_vocab):
+    # fmt: off
     words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
     heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
     deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
-    doc = get_doc(
-        en_vocab, words=words, deps=deps, heads=heads
-    )
+    # fmt: on
+    doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
     for i in range(len(words)):
         if i == 0 or i == 3:
-            assert doc[i].is_sent_start == True
+            assert doc[i].is_sent_start is True
         else:
-            assert doc[i].is_sent_start == None
+            assert doc[i].is_sent_start is None
     for sent in doc.sents:
         for token in sent:
             assert token.head in sent
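
Besides the # fmt: off / # fmt: on pragmas that shield the long word, head, and dep literals from the formatter, the assertions now use identity checks, the idiomatic comparison for a value that can be True, False, or None. A minimal sketch:

    start = None
    assert start is None   # preferred over "start == None" (flake8 E711)
    start = True
    assert start is True   # preferred over "start == True" (flake8 E712)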

View File

@@ -3,7 +3,6 @@ from __future__ import unicode_literals

 import pytest
 from spacy.language import Language
-from spacy.pipeline import Tagger


 def test_label_types():

View File

@@ -15,7 +15,9 @@ def test_issue4674():
     vector1 = [0.9, 1.1, 1.01]
     vector2 = [1.8, 2.25, 2.01]
-    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
+    kb.set_entities(
+        entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
+    )
     assert kb.get_size_entities() == 1
@@ -31,4 +33,3 @@ def test_issue4674():
     kb2.load_bulk(str(file_path))
     assert kb2.get_size_entities() == 1
-
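
The test pins down deduplication: two entries sharing the entity ID "Q1" collapse to a single entity. A self-contained sketch of the same setup, using the v2-era KnowledgeBase API this test targets:

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)
    kb.set_entities(
        entity_list=["Q1", "Q1"],
        freq_list=[32, 111],
        vector_list=[[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]],
    )
    assert kb.get_size_entities() == 1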