mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'master' into develop
This commit is contained in:
commit
947dba7141
|
@ -4249,20 +4249,20 @@ TAG_MAP = {
|
||||||
"Voice": "Act",
|
"Voice": "Act",
|
||||||
"Case": "Nom|Gen|Dat|Acc|Voc",
|
"Case": "Nom|Gen|Dat|Acc|Voc",
|
||||||
},
|
},
|
||||||
'ADJ': {POS: ADJ},
|
"ADJ": {POS: ADJ},
|
||||||
'ADP': {POS: ADP},
|
"ADP": {POS: ADP},
|
||||||
'ADV': {POS: ADV},
|
"ADV": {POS: ADV},
|
||||||
'AtDf': {POS: DET},
|
"AtDf": {POS: DET},
|
||||||
'AUX': {POS: AUX},
|
"AUX": {POS: AUX},
|
||||||
'CCONJ': {POS: CCONJ},
|
"CCONJ": {POS: CCONJ},
|
||||||
'DET': {POS: DET},
|
"DET": {POS: DET},
|
||||||
'NOUN': {POS: NOUN},
|
"NOUN": {POS: NOUN},
|
||||||
'NUM': {POS: NUM},
|
"NUM": {POS: NUM},
|
||||||
'PART': {POS: PART},
|
"PART": {POS: PART},
|
||||||
'PRON': {POS: PRON},
|
"PRON": {POS: PRON},
|
||||||
'PROPN': {POS: PROPN},
|
"PROPN": {POS: PROPN},
|
||||||
'SCONJ': {POS: SCONJ},
|
"SCONJ": {POS: SCONJ},
|
||||||
'SYM': {POS: SYM},
|
"SYM": {POS: SYM},
|
||||||
'VERB': {POS: VERB},
|
"VERB": {POS: VERB},
|
||||||
'X': {POS: X},
|
"X": {POS: X},
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,8 @@ from ...util import DummyTokenizer
|
||||||
# the flow by creating a dummy with the same interface.
|
# the flow by creating a dummy with the same interface.
|
||||||
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
|
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
|
||||||
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
|
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
|
||||||
DummySpace = DummyNode(' ', ' ', DummyNodeFeatures(' '))
|
DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
|
||||||
|
|
||||||
|
|
||||||
def try_fugashi_import():
|
def try_fugashi_import():
|
||||||
"""Fugashi is required for Japanese support, so check for it.
|
"""Fugashi is required for Japanese support, so check for it.
|
||||||
|
@ -27,8 +28,7 @@ def try_fugashi_import():
|
||||||
return fugashi
|
return fugashi
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"Japanese support requires Fugashi: "
|
"Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
|
||||||
"https://github.com/polm/fugashi"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -55,13 +55,14 @@ def resolve_pos(token):
|
||||||
return token.pos + ",ADJ"
|
return token.pos + ",ADJ"
|
||||||
return token.pos
|
return token.pos
|
||||||
|
|
||||||
|
|
||||||
def get_words_and_spaces(tokenizer, text):
|
def get_words_and_spaces(tokenizer, text):
|
||||||
"""Get the individual tokens that make up the sentence and handle white space.
|
"""Get the individual tokens that make up the sentence and handle white space.
|
||||||
|
|
||||||
Japanese doesn't usually use white space, and MeCab's handling of it for
|
Japanese doesn't usually use white space, and MeCab's handling of it for
|
||||||
multiple spaces in a row is somewhat awkward.
|
multiple spaces in a row is somewhat awkward.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokens = tokenizer.parseToNodeList(text)
|
tokens = tokenizer.parseToNodeList(text)
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
|
@ -76,6 +77,7 @@ def get_words_and_spaces(tokenizer, text):
|
||||||
spaces.append(bool(token.white_space))
|
spaces.append(bool(token.white_space))
|
||||||
return words, spaces
|
return words, spaces
|
||||||
|
|
||||||
|
|
||||||
class JapaneseTokenizer(DummyTokenizer):
|
class JapaneseTokenizer(DummyTokenizer):
|
||||||
def __init__(self, cls, nlp=None):
|
def __init__(self, cls, nlp=None):
|
||||||
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "")
|
ELISION = " ' ’ ".strip().replace(" ", "")
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ for exc_data in [
|
||||||
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
|
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
|
||||||
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
|
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
|
||||||
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
|
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
|
||||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
|
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
|
||||||
]:
|
]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
|
|
|
@ -467,38 +467,110 @@ TAG_MAP = {
|
||||||
"VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB},
|
"VERB__VerbForm=Part": {"morph": "VerbForm=Part", POS: VERB},
|
||||||
"VERB___": {"morph": "_", POS: VERB},
|
"VERB___": {"morph": "_", POS: VERB},
|
||||||
"X___": {"morph": "_", POS: X},
|
"X___": {"morph": "_", POS: X},
|
||||||
'CCONJ___': {"morph": "_", POS: CCONJ},
|
"CCONJ___": {"morph": "_", POS: CCONJ},
|
||||||
"ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ},
|
"ADJ__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADJ},
|
||||||
"ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ},
|
"ADJ__Abbr=Yes|Degree=Pos": {"morph": "Abbr=Yes|Degree=Pos", POS: ADJ},
|
||||||
"ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
|
"ADJ__Case=Gen|Definite=Def|Number=Sing|VerbForm=Part": {
|
||||||
"ADJ__Definite=Def|Number=Sing|VerbForm=Part": {"morph": "Definite=Def|Number=Sing|VerbForm=Part", POS: ADJ},
|
"morph": "Case=Gen|Definite=Def|Number=Sing|VerbForm=Part",
|
||||||
"ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part", POS: ADJ},
|
POS: ADJ,
|
||||||
"ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part", POS: ADJ},
|
},
|
||||||
"ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: ADJ},
|
"ADJ__Definite=Def|Number=Sing|VerbForm=Part": {
|
||||||
|
"morph": "Definite=Def|Number=Sing|VerbForm=Part",
|
||||||
|
POS: ADJ,
|
||||||
|
},
|
||||||
|
"ADJ__Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part": {
|
||||||
|
"morph": "Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part",
|
||||||
|
POS: ADJ,
|
||||||
|
},
|
||||||
|
"ADJ__Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part": {
|
||||||
|
"morph": "Definite=Ind|Gender=Neut|Number=Sing|VerbForm=Part",
|
||||||
|
POS: ADJ,
|
||||||
|
},
|
||||||
|
"ADJ__Definite=Ind|Number=Sing|VerbForm=Part": {
|
||||||
|
"morph": "Definite=Ind|Number=Sing|VerbForm=Part",
|
||||||
|
POS: ADJ,
|
||||||
|
},
|
||||||
"ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ},
|
"ADJ__Number=Sing|VerbForm=Part": {"morph": "Number=Sing|VerbForm=Part", POS: ADJ},
|
||||||
"ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ},
|
"ADJ__VerbForm=Part": {"morph": "VerbForm=Part", POS: ADJ},
|
||||||
"ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP},
|
"ADP__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADP},
|
||||||
"ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV},
|
"ADV__Abbr=Yes": {"morph": "Abbr=Yes", POS: ADV},
|
||||||
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art", POS: DET},
|
"DET__Case=Gen|Gender=Masc|Number=Sing|PronType=Art": {
|
||||||
"DET__Case=Gen|Number=Plur|PronType=Tot": {"morph": "Case=Gen|Number=Plur|PronType=Tot", POS: DET},
|
"morph": "Case=Gen|Gender=Masc|Number=Sing|PronType=Art",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Case=Gen|Number=Plur|PronType=Tot": {
|
||||||
|
"morph": "Case=Gen|Number=Plur|PronType=Tot",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
"DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET},
|
"DET__Definite=Def|PronType=Prs": {"morph": "Definite=Def|PronType=Prs", POS: DET},
|
||||||
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
|
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs": {
|
||||||
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs", POS: DET},
|
"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Prs",
|
||||||
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs", POS: DET},
|
POS: DET,
|
||||||
"DET__Gender=Fem|Number=Sing|PronType=Art": {"morph": "Gender=Fem|Number=Sing|PronType=Art", POS: DET},
|
},
|
||||||
"DET__Gender=Fem|Number=Sing|PronType=Ind": {"morph": "Gender=Fem|Number=Sing|PronType=Ind", POS: DET},
|
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs": {
|
||||||
"DET__Gender=Fem|Number=Sing|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|PronType=Prs", POS: DET},
|
"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Prs",
|
||||||
"DET__Gender=Fem|Number=Sing|PronType=Tot": {"morph": "Gender=Fem|Number=Sing|PronType=Tot", POS: DET},
|
POS: DET,
|
||||||
"DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
|
},
|
||||||
"DET__Gender=Masc|Number=Sing|PronType=Art": {"morph": "Gender=Masc|Number=Sing|PronType=Art", POS: DET},
|
"DET__Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs": {
|
||||||
"DET__Gender=Masc|Number=Sing|PronType=Ind": {"morph": "Gender=Masc|Number=Sing|PronType=Ind", POS: DET},
|
"morph": "Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
|
||||||
"DET__Gender=Masc|Number=Sing|PronType=Tot": {"morph": "Gender=Masc|Number=Sing|PronType=Tot", POS: DET},
|
POS: DET,
|
||||||
"DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg", POS: DET},
|
},
|
||||||
"DET__Gender=Neut|Number=Sing|PronType=Art": {"morph": "Gender=Neut|Number=Sing|PronType=Art", POS: DET},
|
"DET__Gender=Fem|Number=Sing|PronType=Art": {
|
||||||
"DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind", POS: DET},
|
"morph": "Gender=Fem|Number=Sing|PronType=Art",
|
||||||
"DET__Gender=Neut|Number=Sing|PronType=Ind": {"morph": "Gender=Neut|Number=Sing|PronType=Ind", POS: DET},
|
POS: DET,
|
||||||
"DET__Gender=Neut|Number=Sing|PronType=Tot": {"morph": "Gender=Neut|Number=Sing|PronType=Tot", POS: DET},
|
},
|
||||||
"DET__Number=Plur|Polarity=Neg|PronType=Neg": {"morph": "Number=Plur|Polarity=Neg|PronType=Neg", POS: DET},
|
"DET__Gender=Fem|Number=Sing|PronType=Ind": {
|
||||||
|
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Fem|Number=Sing|PronType=Prs": {
|
||||||
|
"morph": "Gender=Fem|Number=Sing|PronType=Prs",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Fem|Number=Sing|PronType=Tot": {
|
||||||
|
"morph": "Gender=Fem|Number=Sing|PronType=Tot",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|Polarity=Neg|PronType=Neg",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Masc|Number=Sing|PronType=Art": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|PronType=Art",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Masc|Number=Sing|PronType=Ind": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|PronType=Tot",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|Polarity=Neg|PronType=Neg",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Neut|Number=Sing|PronType=Art": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|PronType=Art",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Neut|Number=Sing|PronType=Dem,Ind": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|PronType=Dem,Ind",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Neut|Number=Sing|PronType=Ind": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|PronType=Ind",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Gender=Neut|Number=Sing|PronType=Tot": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|PronType=Tot",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
|
"DET__Number=Plur|Polarity=Neg|PronType=Neg": {
|
||||||
|
"morph": "Number=Plur|Polarity=Neg|PronType=Neg",
|
||||||
|
POS: DET,
|
||||||
|
},
|
||||||
"DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET},
|
"DET__Number=Plur|PronType=Art": {"morph": "Number=Plur|PronType=Art", POS: DET},
|
||||||
"DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET},
|
"DET__Number=Plur|PronType=Ind": {"morph": "Number=Plur|PronType=Ind", POS: DET},
|
||||||
"DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET},
|
"DET__Number=Plur|PronType=Prs": {"morph": "Number=Plur|PronType=Prs", POS: DET},
|
||||||
|
@ -507,57 +579,183 @@ TAG_MAP = {
|
||||||
"DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET},
|
"DET__PronType=Prs": {"morph": "PronType=Prs", POS: DET},
|
||||||
"NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN},
|
"NOUN__Abbr=Yes": {"morph": "Abbr=Yes", POS: NOUN},
|
||||||
"NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN},
|
"NOUN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: NOUN},
|
||||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing", POS: NOUN},
|
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing": {
|
||||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing", POS: NOUN},
|
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Plur,Sing",
|
||||||
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing", POS: NOUN},
|
POS: NOUN,
|
||||||
|
},
|
||||||
|
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing": {
|
||||||
|
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Masc|Number=Sing",
|
||||||
|
POS: NOUN,
|
||||||
|
},
|
||||||
|
"NOUN__Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing": {
|
||||||
|
"morph": "Abbr=Yes|Definite=Def,Ind|Gender=Neut|Number=Plur,Sing",
|
||||||
|
POS: NOUN,
|
||||||
|
},
|
||||||
"NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN},
|
"NOUN__Abbr=Yes|Gender=Masc": {"morph": "Abbr=Yes|Gender=Masc", POS: NOUN},
|
||||||
"NUM__Case=Gen|Number=Plur|NumType=Card": {"morph": "Case=Gen|Number=Plur|NumType=Card", POS: NUM},
|
"NUM__Case=Gen|Number=Plur|NumType=Card": {
|
||||||
"NUM__Definite=Def|Number=Sing|NumType=Card": {"morph": "Definite=Def|Number=Sing|NumType=Card", POS: NUM},
|
"morph": "Case=Gen|Number=Plur|NumType=Card",
|
||||||
|
POS: NUM,
|
||||||
|
},
|
||||||
|
"NUM__Definite=Def|Number=Sing|NumType=Card": {
|
||||||
|
"morph": "Definite=Def|Number=Sing|NumType=Card",
|
||||||
|
POS: NUM,
|
||||||
|
},
|
||||||
"NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM},
|
"NUM__Definite=Def|NumType=Card": {"morph": "Definite=Def|NumType=Card", POS: NUM},
|
||||||
"NUM__Gender=Fem|Number=Sing|NumType=Card": {"morph": "Gender=Fem|Number=Sing|NumType=Card", POS: NUM},
|
"NUM__Gender=Fem|Number=Sing|NumType=Card": {
|
||||||
"NUM__Gender=Masc|Number=Sing|NumType=Card": {"morph": "Gender=Masc|Number=Sing|NumType=Card", POS: NUM},
|
"morph": "Gender=Fem|Number=Sing|NumType=Card",
|
||||||
"NUM__Gender=Neut|Number=Sing|NumType=Card": {"morph": "Gender=Neut|Number=Sing|NumType=Card", POS: NUM},
|
POS: NUM,
|
||||||
|
},
|
||||||
|
"NUM__Gender=Masc|Number=Sing|NumType=Card": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|NumType=Card",
|
||||||
|
POS: NUM,
|
||||||
|
},
|
||||||
|
"NUM__Gender=Neut|Number=Sing|NumType=Card": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|NumType=Card",
|
||||||
|
POS: NUM,
|
||||||
|
},
|
||||||
"NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM},
|
"NUM__Number=Plur|NumType=Card": {"morph": "Number=Plur|NumType=Card", POS: NUM},
|
||||||
"NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM},
|
"NUM__Number=Sing|NumType=Card": {"morph": "Number=Sing|NumType=Card", POS: NUM},
|
||||||
"NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM},
|
"NUM__NumType=Card": {"morph": "NumType=Card", POS: NUM},
|
||||||
"PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART},
|
"PART__Polarity=Neg": {"morph": "Polarity=Neg", POS: PART},
|
||||||
"PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
"PRON__Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
|
||||||
"PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
"morph": "Animacy=Hum|Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
|
||||||
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs", POS: PRON},
|
},
|
||||||
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs", POS: PRON},
|
"PRON__Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
|
||||||
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs", POS: PRON},
|
"morph": "Animacy=Hum|Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
|
||||||
"PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs", POS: PRON},
|
},
|
||||||
"PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs": {
|
||||||
"PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { "morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs", POS: PRON},
|
"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=1|PronType=Prs",
|
||||||
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs", POS: PRON},
|
},
|
||||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs", POS: PRON},
|
"PRON__Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs": {
|
||||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs", POS: PRON},
|
"morph": "Animacy=Hum|Case=Acc|Number=Plur|Person=2|PronType=Prs",
|
||||||
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {"morph": "Animacy=Hum|Number=Plur|PronType=Rcp", POS: PRON},
|
},
|
||||||
"PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs", POS: PRON},
|
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs": {
|
||||||
"PRON__Animacy=Hum|Poss=Yes|PronType=Int": {"morph": "Animacy=Hum|Poss=Yes|PronType=Int", POS: PRON},
|
"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=1|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Acc|Number=Sing|Person=2|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Gen,Nom|Number=Sing|PronType=Art,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Gen|Number=Sing|PronType=Art,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=1|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Number=Plur|Person=2|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=1|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Number=Sing|Person=2|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs": {
|
||||||
|
"morph": "Animacy=Hum|Case=Nom|Number=Sing|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Number=Plur|PronType=Rcp": {
|
||||||
|
"morph": "Animacy=Hum|Number=Plur|PronType=Rcp",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Number=Sing|PronType=Art,Prs": {
|
||||||
|
"morph": "Animacy=Hum|Number=Sing|PronType=Art,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Animacy=Hum|Poss=Yes|PronType=Int": {
|
||||||
|
"morph": "Animacy=Hum|Poss=Yes|PronType=Int",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
"PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON},
|
"PRON__Animacy=Hum|PronType=Int": {"morph": "Animacy=Hum|PronType=Int", POS: PRON},
|
||||||
"PRON__Case=Acc|PronType=Prs|Reflex=Yes": {"morph": "Case=Acc|PronType=Prs|Reflex=Yes", POS: PRON},
|
"PRON__Case=Acc|PronType=Prs|Reflex=Yes": {
|
||||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": { "morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
|
"morph": "Case=Acc|PronType=Prs|Reflex=Yes",
|
||||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot", POS: PRON},
|
},
|
||||||
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs": {
|
||||||
"PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|Polarity=Neg|PronType=Neg,Prs",
|
||||||
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs", POS: PRON},
|
},
|
||||||
"PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs", POS: PRON},
|
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs": {
|
||||||
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {"morph": "Number=Plur|Person=3|PronType=Ind,Prs", POS: PRON},
|
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Ind,Prs",
|
||||||
"PRON__Number=Plur|Person=3|PronType=Prs,Tot": {"morph": "Number=Plur|Person=3|PronType=Prs,Tot", POS: PRON},
|
POS: PRON,
|
||||||
"PRON__Number=Plur|Poss=Yes|PronType=Prs": {"morph": "Number=Plur|Poss=Yes|PronType=Prs", POS: PRON},
|
},
|
||||||
"PRON__Number=Plur|Poss=Yes|PronType=Rcp": {"morph": "Number=Plur|Poss=Yes|PronType=Rcp", POS: PRON},
|
"PRON__Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot": {
|
||||||
"PRON__Number=Sing|Polarity=Neg|PronType=Neg": {"morph": "Number=Sing|Polarity=Neg|PronType=Neg", POS: PRON},
|
"morph": "Gender=Fem,Masc|Number=Sing|Person=3|PronType=Prs,Tot",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs": {
|
||||||
|
"morph": "Gender=Fem|Number=Sing|Poss=Yes|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs": {
|
||||||
|
"morph": "Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|Person=3|PronType=Ind,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs": {
|
||||||
|
"morph": "Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs": {
|
||||||
|
"morph": "Number=Plur|Person=3|Polarity=Neg|PronType=Neg,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Plur|Person=3|PronType=Ind,Prs": {
|
||||||
|
"morph": "Number=Plur|Person=3|PronType=Ind,Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Plur|Person=3|PronType=Prs,Tot": {
|
||||||
|
"morph": "Number=Plur|Person=3|PronType=Prs,Tot",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Plur|Poss=Yes|PronType=Prs": {
|
||||||
|
"morph": "Number=Plur|Poss=Yes|PronType=Prs",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Plur|Poss=Yes|PronType=Rcp": {
|
||||||
|
"morph": "Number=Plur|Poss=Yes|PronType=Rcp",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
|
"PRON__Number=Sing|Polarity=Neg|PronType=Neg": {
|
||||||
|
"morph": "Number=Sing|Polarity=Neg|PronType=Neg",
|
||||||
|
POS: PRON,
|
||||||
|
},
|
||||||
"PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON},
|
"PRON__PronType=Prs": {"morph": "PronType=Prs", POS: PRON},
|
||||||
"PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON},
|
"PRON__PronType=Rel": {"morph": "PronType=Rel", POS: PRON},
|
||||||
"PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN},
|
"PROPN__Abbr=Yes": {"morph": "Abbr=Yes", POS: PROPN},
|
||||||
"PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN},
|
"PROPN__Abbr=Yes|Case=Gen": {"morph": "Abbr=Yes|Case=Gen", POS: PROPN},
|
||||||
"VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin", POS: VERB},
|
"VERB__Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin": {
|
||||||
"VERB__Definite=Ind|Number=Sing|VerbForm=Part": {"morph": "Definite=Ind|Number=Sing|VerbForm=Part", POS: VERB},
|
"morph": "Abbr=Yes|Mood=Ind|Tense=Pres|VerbForm=Fin",
|
||||||
|
POS: VERB,
|
||||||
|
},
|
||||||
|
"VERB__Definite=Ind|Number=Sing|VerbForm=Part": {
|
||||||
|
"morph": "Definite=Ind|Number=Sing|VerbForm=Part",
|
||||||
|
POS: VERB,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -295,10 +295,9 @@ class EntityRuler(object):
|
||||||
deserializers_patterns = {
|
deserializers_patterns = {
|
||||||
"patterns": lambda p: self.add_patterns(
|
"patterns": lambda p: self.add_patterns(
|
||||||
srsly.read_jsonl(p.with_suffix(".jsonl"))
|
srsly.read_jsonl(p.with_suffix(".jsonl"))
|
||||||
)}
|
)
|
||||||
deserializers_cfg = {
|
|
||||||
"cfg": lambda p: cfg.update(srsly.read_json(p))
|
|
||||||
}
|
}
|
||||||
|
deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
|
||||||
from_disk(path, deserializers_cfg, {})
|
from_disk(path, deserializers_cfg, {})
|
||||||
self.overwrite = cfg.get("overwrite", False)
|
self.overwrite = cfg.get("overwrite", False)
|
||||||
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
|
||||||
|
|
|
@ -219,14 +219,13 @@ def uk_tokenizer():
|
||||||
def ur_tokenizer():
|
def ur_tokenizer():
|
||||||
return get_lang_class("ur").Defaults.create_tokenizer()
|
return get_lang_class("ur").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def yo_tokenizer():
|
def yo_tokenizer():
|
||||||
return get_lang_class("yo").Defaults.create_tokenizer()
|
return get_lang_class("yo").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def zh_tokenizer():
|
def zh_tokenizer():
|
||||||
pytest.importorskip("jieba")
|
pytest.importorskip("jieba")
|
||||||
return get_lang_class("zh").Defaults.create_tokenizer()
|
return get_lang_class("zh").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ ABBREVIATION_TESTS = [
|
||||||
HYPHENATED_TESTS = [
|
HYPHENATED_TESTS = [
|
||||||
(
|
(
|
||||||
"1700-luvulle sijoittuva taide-elokuva",
|
"1700-luvulle sijoittuva taide-elokuva",
|
||||||
["1700-luvulle", "sijoittuva", "taide-elokuva"]
|
["1700-luvulle", "sijoittuva", "taide-elokuva"],
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -3,16 +3,19 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", ["z.B.", "Jan."])
|
@pytest.mark.parametrize("text", ["z.B.", "Jan."])
|
||||||
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
|
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
|
||||||
tokens = lb_tokenizer(text)
|
tokens = lb_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
|
@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "d’Welt", "d’Suen"])
|
||||||
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
|
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
|
||||||
tokens = lb_tokenizer(text)
|
tokens = lb_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
||||||
text = "Mee 't ass net evident, d'Liewen."
|
text = "Mee 't ass net evident, d'Liewen."
|
||||||
tokens = lb_tokenizer(text)
|
tokens = lb_tokenizer(text)
|
||||||
|
@ -20,6 +23,7 @@ def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
|
||||||
assert tokens[1].text == "'t"
|
assert tokens[1].text == "'t"
|
||||||
assert tokens[1].lemma_ == "et"
|
assert tokens[1].lemma_ == "et"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
|
@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
|
||||||
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
|
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
|
||||||
tokens = lb_tokenizer(text)
|
tokens = lb_tokenizer(text)
|
||||||
|
|
|
@ -16,7 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
|
||||||
[
|
[
|
||||||
("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
|
("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
|
||||||
("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
|
("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
|
||||||
("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
|
("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):
|
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):
|
||||||
|
|
|
@ -87,4 +87,4 @@ def test_lex_attrs_like_url(text, match):
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_lex_attrs_word_shape(text, shape):
|
def test_lex_attrs_word_shape(text, shape):
|
||||||
assert word_shape(text) == shape
|
assert word_shape(text) == shape
|
||||||
|
|
|
@ -151,17 +151,17 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
||||||
|
|
||||||
|
|
||||||
def test_parser_set_sent_starts(en_vocab):
|
def test_parser_set_sent_starts(en_vocab):
|
||||||
|
# fmt: off
|
||||||
words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
|
words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n']
|
||||||
heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
|
heads = [1, 0, -1, 27, 0, -1, 1, -3, -1, 8, 4, 3, -1, 1, 3, 1, 1, -11, -1, 1, -9, -1, 4, -1, 2, 1, -6, -1, 1, 2, 1, -6, -1, -1, -17, -31, -32, -1]
|
||||||
deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
|
deps = ['nk', 'ROOT', 'punct', 'mo', 'ROOT', 'sb', 'op', 'pd', 'punct', 'cp', 'mo', 'nk', '', 'nk', 'sb', 'nk', 'oa', 're', 'punct', 'mo', 'app', 'punct', 'sb', '', 'oa', 'op', 'rc', 'punct', 'nk', 'sb', 'oc', 're', 'cd', '', 'oa', 'ng', 'punct', '']
|
||||||
doc = get_doc(
|
# fmt: on
|
||||||
en_vocab, words=words, deps=deps, heads=heads
|
doc = get_doc(en_vocab, words=words, deps=deps, heads=heads)
|
||||||
)
|
|
||||||
for i in range(len(words)):
|
for i in range(len(words)):
|
||||||
if i == 0 or i == 3:
|
if i == 0 or i == 3:
|
||||||
assert doc[i].is_sent_start == True
|
assert doc[i].is_sent_start is True
|
||||||
else:
|
else:
|
||||||
assert doc[i].is_sent_start == None
|
assert doc[i].is_sent_start is None
|
||||||
for sent in doc.sents:
|
for sent in doc.sents:
|
||||||
for token in sent:
|
for token in sent:
|
||||||
assert token.head in sent
|
assert token.head in sent
|
||||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipeline import Tagger
|
|
||||||
|
|
||||||
|
|
||||||
def test_label_types():
|
def test_label_types():
|
||||||
|
|
|
@ -15,7 +15,9 @@ def test_issue4674():
|
||||||
|
|
||||||
vector1 = [0.9, 1.1, 1.01]
|
vector1 = [0.9, 1.1, 1.01]
|
||||||
vector2 = [1.8, 2.25, 2.01]
|
vector2 = [1.8, 2.25, 2.01]
|
||||||
kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
kb.set_entities(
|
||||||
|
entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
|
||||||
|
)
|
||||||
|
|
||||||
assert kb.get_size_entities() == 1
|
assert kb.get_size_entities() == 1
|
||||||
|
|
||||||
|
@ -31,4 +33,3 @@ def test_issue4674():
|
||||||
kb2.load_bulk(str(file_path))
|
kb2.load_bulk(str(file_path))
|
||||||
|
|
||||||
assert kb2.get_size_entities() == 1
|
assert kb2.get_size_entities() == 1
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user