Merge branch 'master' into develop

This commit is contained in:
Ines Montani 2019-12-21 19:04:43 +01:00
commit 947dba7141
14 changed files with 308 additions and 107 deletions

View File

@ -4249,20 +4249,20 @@ TAG_MAP = {
"Voice": "Act",
"Case": "Nom|Gen|Dat|Acc|Voc",
},
'ADJ': {POS: ADJ},
'ADP': {POS: ADP},
'ADV': {POS: ADV},
'AtDf': {POS: DET},
'AUX': {POS: AUX},
'CCONJ': {POS: CCONJ},
'DET': {POS: DET},
'NOUN': {POS: NOUN},
'NUM': {POS: NUM},
'PART': {POS: PART},
'PRON': {POS: PRON},
'PROPN': {POS: PROPN},
'SCONJ': {POS: SCONJ},
'SYM': {POS: SYM},
'VERB': {POS: VERB},
'X': {POS: X},
"ADJ": {POS: ADJ},
"ADP": {POS: ADP},
"ADV": {POS: ADV},
"AtDf": {POS: DET},
"AUX": {POS: AUX},
"CCONJ": {POS: CCONJ},
"DET": {POS: DET},
"NOUN": {POS: NOUN},
"NUM": {POS: NUM},
"PART": {POS: PART},
"PRON": {POS: PRON},
"PROPN": {POS: PROPN},
"SCONJ": {POS: SCONJ},
"SYM": {POS: SYM},
"VERB": {POS: VERB},
"X": {POS: X},
}

View File

@ -16,7 +16,8 @@ from ...util import DummyTokenizer
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
def try_fugashi_import():
"""Fugashi is required for Japanese support, so check for it.
@ -27,8 +28,7 @@ def try_fugashi_import():
return fugashi
except ImportError:
raise ImportError(
"Japanese support requires Fugashi: "
"https://github.com/polm/fugashi"
"Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
)
@ -55,13 +55,14 @@ def resolve_pos(token):
return token.pos + ",ADJ"
return token.pos
def get_words_and_spaces(tokenizer, text):
"""Get the individual tokens that make up the sentence and handle white space.
Japanese doesn't usually use white space, and MeCab's handling of it for
multiple spaces in a row is somewhat awkward.
"""
tokens = tokenizer.parseToNodeList(text)
words = []
@ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text):
spaces.append(bool(token.white_space))
return words, spaces
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)

View File

@ -1,8 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
ELISION = " ' ".strip().replace(" ", "")

View File

@ -20,7 +20,7 @@ for exc_data in [
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
]:
_exc[exc_data[ORTH]] = [exc_data]

View File

@ -467,38 +467,110 @@ TAG_MAP = {
"VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB},
"VERB___": {"morph": "_", POS: VERB},
"X___": {"morph": "_", POS: X},
'CCONJ___': {"morph": "_", POS: CCONJ},
"CCONJ___": {"morph": "_", POS: CCONJ},
"ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ},
"ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ},
"ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {
"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part",
POS: ADJ,
},
"ADJ__Definite=Def|Number=Sing|VerbForm=Part": {
"morph": "Definite=Def|Number=Sing|VerbForm=Part",
POS: ADJ,
},
"ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {
"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part",
POS: ADJ,
},
"ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {
"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part",
POS: ADJ,
},
"ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {
"morph": "Definite=Ind|Number=Sing|VerbForm=Part",
POS: ADJ,
},
"ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ},
"ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ},
"ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP},
"ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV},
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET},
"DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET},
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {
"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art",
POS: DET,
},
"DET__Case=Gen|Number=Plur|PronType=Tot": {
"morph": "Case=Gen|Number=Plur|PronType=Tot",
POS: DET,
},
"DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET},
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET},
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET},
"DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET},
"DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET},
"DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
"DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET},
"DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
"DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET},
"DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET},
"DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET},
"DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
"DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET},
"DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET},
"DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET},
"DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET},
"DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET},
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {
"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs",
POS: DET,
},
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {
"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs",
POS: DET,
},
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {
"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
POS: DET,
},
"DET__Gender=Fem|Number=Sing|PronType=Art": {
"morph": "Gender=Fem|Number=Sing|PronType=Art",
POS: DET,
},
"DET__Gender=Fem|Number=Sing|PronType=Ind": {
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
POS: DET,
},
"DET__Gender=Fem|Number=Sing|PronType=Prs": {
"morph": "Gender=Fem|Number=Sing|PronType=Prs",
POS: DET,
},
"DET__Gender=Fem|Number=Sing|PronType=Tot": {
"morph": "Gender=Fem|Number=Sing|PronType=Tot",
POS: DET,
},
"DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {
"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg",
POS: DET,
},
"DET__Gender=Masc|Number=Sing|PronType=Art": {
"morph": "Gender=Masc|Number=Sing|PronType=Art",
POS: DET,
},
"DET__Gender=Masc|Number=Sing|PronType=Ind": {
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
POS: DET,
},
"DET__Gender=Masc|Number=Sing|PronType=Tot": {
"morph": "Gender=Masc|Number=Sing|PronType=Tot",
POS: DET,
},
"DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {
"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg",
POS: DET,
},
"DET__Gender=Neut|Number=Sing|PronType=Art": {
"morph": "Gender=Neut|Number=Sing|PronType=Art",
POS: DET,
},
"DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {
"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind",
POS: DET,
},
"DET__Gender=Neut|Number=Sing|PronType=Ind": {
"morph": "Gender=Neut|Number=Sing|PronType=Ind",
POS: DET,
},
"DET__Gender=Neut|Number=Sing|PronType=Tot": {
"morph": "Gender=Neut|Number=Sing|PronType=Tot",
POS: DET,
},
"DET__Number=Plur|Polarity=Neg|PronType=Neg": {
"morph": "Number=Plur|Polarity=Neg|PronType=Neg",
POS: DET,
},
"DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET},
"DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET},
"DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET},
@ -507,57 +579,183 @@ TAG_MAP = {
"DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET},
"NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN},
"NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing",
POS: NOUN,
},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing",
POS: NOUN,
},
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing",
POS: NOUN,
},
"NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN},
"NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM},
"NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM},
"NUM__Case=Gen|Number=Plur|NumType=Card": {
"morph": "Case=Gen|Number=Plur|NumType=Card",
POS: NUM,
},
"NUM__Definite=Def|Number=Sing|NumType=Card": {
"morph": "Definite=Def|Number=Sing|NumType=Card",
POS: NUM,
},
"NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM},
"NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM},
"NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM},
"NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM},
"NUM__Gender=Fem|Number=Sing|NumType=Card": {
"morph": "Gender=Fem|Number=Sing|NumType=Card",
POS: NUM,
},
"NUM__Gender=Masc|Number=Sing|NumType=Card": {
"morph": "Gender=Masc|Number=Sing|NumType=Card",
POS: NUM,
},
"NUM__Gender=Neut|Number=Sing|NumType=Card": {
"morph": "Gender=Neut|Number=Sing|NumType=Card",
POS: NUM,
},
"NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM},
"NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM},
"NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM},
"PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART},
"PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON},
"PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON},
"PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON},
"PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON},
"PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {
"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {
"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {
"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {
"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {
"morph": "Animacy=Hum|Number=Plur|PronType=Rcp",
POS: PRON,
},
"PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {
"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs",
POS: PRON,
},
"PRON__Animacy=Hum|Poss=Yes|PronType=Int": {
"morph": "Animacy=Hum|Poss=Yes|PronType=Int",
POS: PRON,
},
"PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON},
"PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON},
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
"PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
"PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
"PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON},
"PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON},
"PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON},
"PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON},
"PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON},
"PRON__Case=Acc|PronType=Prs|Reflex=Yes": {
"morph": "Case=Acc|PronType=Prs|Reflex=Yes",
POS: PRON,
},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": {
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs",
POS: PRON,
},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs",
POS: PRON,
},
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot",
POS: PRON,
},
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {
"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs",
POS: PRON,
},
"PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {
"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs",
POS: PRON,
},
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {
"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs",
POS: PRON,
},
"PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {
"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
POS: PRON,
},
"PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {
"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs",
POS: PRON,
},
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {
"morph": "Number=Plur|Person=3|PronType=Ind,Prs",
POS: PRON,
},
"PRON__Number=Plur|Person=3|PronType=Prs,Tot": {
"morph": "Number=Plur|Person=3|PronType=Prs,Tot",
POS: PRON,
},
"PRON__Number=Plur|Poss=Yes|PronType=Prs": {
"morph": "Number=Plur|Poss=Yes|PronType=Prs",
POS: PRON,
},
"PRON__Number=Plur|Poss=Yes|PronType=Rcp": {
"morph": "Number=Plur|Poss=Yes|PronType=Rcp",
POS: PRON,
},
"PRON__Number=Sing|Polarity=Neg|PronType=Neg": {
"morph": "Number=Sing|Polarity=Neg|PronType=Neg",
POS: PRON,
},
"PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON},
"PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON},
"PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN},
"PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN},
"VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB},
"VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB},
"VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {
"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin",
POS: VERB,
},
"VERB__Definite=Ind|Number=Sing|VerbForm=Part": {
"morph": "Definite=Ind|Number=Sing|VerbForm=Part",
POS: VERB,
},
}

View File

@ -295,10 +295,9 @@ class EntityRuler(object):
deserializers_patterns = {
"patterns": lambda p: self.add_patterns(
srsly.read_jsonl(p.with_suffix(".jsonl"))
)}
deserializers_cfg = {
"cfg": lambda p: cfg.update(srsly.read_json(p))
)
}
deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
from_disk(path, deserializers_cfg, {})
self.overwrite = cfg.get("overwrite", False)
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")

View File

@ -219,14 +219,13 @@ def uk_tokenizer():
def ur_tokenizer():
return get_lang_class("ur").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def yo_tokenizer():
return get_lang_class("yo").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def zh_tokenizer():
pytest.importorskip("jieba")
return get_lang_class("zh").Defaults.create_tokenizer()

View File

@ -15,7 +15,7 @@ ABBREVIATION_TESTS = [
HYPHENATED_TESTS = [
(
"1700-luvulle sijoittuva taide-elokuva",
["1700-luvulle", "sijoittuva", "taide-elokuva"]
["1700-luvulle", "sijoittuva", "taide-elokuva"],
)
]

View File

@ -3,16 +3,19 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize("text", ["z.B.", "Jan."])
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
tokens = lb_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "dWelt", "dSuen"])
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
tokens = lb_tokenizer(text)
assert len(tokens) == 2
def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
text = "Mee 't ass net evident, d'Liewen."
tokens = lb_tokenizer(text)
@ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
assert tokens[1].text == "'t"
assert tokens[1].lemma_ == "et"
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
tokens = lb_tokenizer(text)

View File

@ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
[
("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14),
],
)
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):

View File

@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match):
],
)
def test_lex_attrs_word_shape(text, shape):
assert word_shape(text) == shape
assert word_shape(text) == shape

View File

@ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
def test_parser_set_sent_starts(en_vocab):
# fmt: off
words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
doc = get_doc(
en_vocab, words=words, deps=deps, heads=heads
)
# fmt: on
doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
for i in range(len(words)):
if i == 0 or i == 3:
assert doc[i].is_sent_start == True
assert doc[i].is_sent_start is True
else:
assert doc[i].is_sent_start == None
assert doc[i].is_sent_start is None
for sent in doc.sents:
for token in sent:
assert token.head in sent

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
import pytest
from spacy.language import Language
from spacy.pipeline import Tagger
def test_label_types():

View File

@ -15,7 +15,9 @@ def test_issue4674():
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
kb.set_entities(
entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
)
assert kb.get_size_entities() == 1
@ -31,4 +33,3 @@ def test_issue4674():
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1