Refactor Token morph setting (#6175)

* Refactor Token morph setting

* Remove `Token.morph_`
* Add `Token.set_morph()`
  * `0` resets `token.c.morph` to unset
  * Any other values are passed to `Morphology.add`

* Add token.morph setter to set from MorphAnalysis
This commit is contained in:
Adriane Boyd 2020-10-01 22:21:46 +02:00 committed by GitHub
parent da30701cd1
commit 86c3ec9c2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 118 additions and 91 deletions

View File

@ -710,6 +710,9 @@ class Errors:
"options: {modes}") "options: {modes}")
E1012 = ("Entity spans and blocked/missing/outside spans should be " E1012 = ("Entity spans and blocked/missing/outside spans should be "
"provided to doc.set_ents as lists of `Span` objects.") "provided to doc.set_ents as lists of `Span` objects.")
E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the "
"token itself. To set the morph from this MorphAnalysis, set from "
"the string value with: `token.set_morph(str(other_morph))`.")
@add_codes @add_codes

View File

@ -149,7 +149,7 @@ class Morphologizer(Tagger):
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
# create and add the combined morph+POS label # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
@ -167,7 +167,7 @@ class Morphologizer(Tagger):
gold_array = [] gold_array = []
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph_ morph = str(token.morph)
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
morph_dict[self.POS_FEAT] = pos morph_dict[self.POS_FEAT] = pos

View File

@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab):
words = ["Eat", "blue", "ham"] words = ["Eat", "blue", "ham"]
morph = ["Feat=V", "Feat=J", "Feat=N"] morph = ["Feat=V", "Feat=J", "Feat=N"]
doc = Doc(en_vocab, words=words, morphs=morph) doc = Doc(en_vocab, words=words, morphs=morph)
assert morph[0] == doc[0].morph_ assert morph[0] == str(doc[0].morph)
assert morph[1] == doc[1].morph_ assert morph[1] == str(doc[1].morph)
assert morph[2] == doc[2].morph_ assert morph[2] == str(doc[2].morph)
feats_array = doc.to_array((ORTH, MORPH)) feats_array = doc.to_array((ORTH, MORPH))
assert feats_array[0][1] == doc[0].morph.key assert feats_array[0][1] == doc[0].morph.key

View File

@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab):
words = ["I", "live", "in", "New", "York", "."] words = ["I", "live", "in", "New", "York", "."]
morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
# fmt: on # fmt: on
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words, morphs=morphs)
for i, morph in enumerate(morphs):
doc[i].morph_ = morph
attrs = [MORPH] attrs = [MORPH]
arr = doc.to_array(attrs) arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words) new_doc = Doc(en_vocab, words=words)
new_doc.from_array(attrs, arr) new_doc.from_array(attrs, arr)
assert [t.morph_ for t in new_doc] == morphs assert [str(t.morph) for t in new_doc] == morphs
assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
def test_doc_api_from_docs(en_tokenizer, de_tokenizer): def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
@ -423,7 +421,7 @@ def test_has_annotation(en_vocab):
doc[0].tag_ = "A" doc[0].tag_ = "A"
doc[0].pos_ = "X" doc[0].pos_ = "X"
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
doc[0].lemma_ = "a" doc[0].lemma_ = "a"
doc[0].dep_ = "dep" doc[0].dep_ = "dep"
doc[0].head = doc[1] doc[0].head = doc[1]
@ -435,7 +433,7 @@ def test_has_annotation(en_vocab):
doc[1].tag_ = "A" doc[1].tag_ = "A"
doc[1].pos_ = "X" doc[1].pos_ = "X"
doc[1].morph_ = "" doc[1].set_morph("")
doc[1].lemma_ = "a" doc[1].lemma_ = "a"
doc[1].dep_ = "dep" doc[1].dep_ = "dep"
doc.ents = [Span(doc, 0, 2, label="HELLO")] doc.ents = [Span(doc, 0, 2, label="HELLO")]
@ -538,6 +536,32 @@ def test_doc_ents_setter():
assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
def test_doc_morph_setter(en_tokenizer, de_tokenizer):
    """Test assigning a MorphAnalysis to `token.morph`.

    Covers the setter added in this commit: copying unset values, copying
    within a doc, copying across docs that share a vocab, and the ValueError
    raised when the MorphAnalysis comes from a different vocab (E1013).
    """
    doc1 = en_tokenizer("a b")
    doc1b = en_tokenizer("c d")
    doc2 = de_tokenizer("a b")
    # unset values can be copied
    doc1[0].morph = doc1[1].morph
    # key == 0 means the morph is unset on both tokens
    assert doc1[0].morph.key == 0
    assert doc1[1].morph.key == 0
    # morph values from the same vocab can be copied
    doc1[0].set_morph("Feat=Val")
    doc1[1].morph = doc1[0].morph
    assert doc1[0].morph == doc1[1].morph
    # ... also across docs
    doc1b[0].morph = doc1[0].morph
    assert doc1[0].morph == doc1b[0].morph
    doc2[0].set_morph("Feat2=Val2")
    # the morph value must come from the same vocab
    # (doc2 uses the German tokenizer's vocab, so assignment must fail)
    with pytest.raises(ValueError):
        doc1[0].morph = doc2[0].morph
def test_doc_init_iob(): def test_doc_init_iob():
"""Test ents validation/normalization in Doc.__init__""" """Test ents validation/normalization in Doc.__init__"""
words = ["a", "b", "c", "d", "e"] words = ["a", "b", "c", "d", "e"]

View File

@ -4,13 +4,13 @@ import pytest
@pytest.fixture @pytest.fixture
def i_has(en_tokenizer): def i_has(en_tokenizer):
doc = en_tokenizer("I has") doc = en_tokenizer("I has")
doc[0].morph_ = {"PronType": "prs"} doc[0].set_morph({"PronType": "prs"})
doc[1].morph_ = { doc[1].set_morph({
"VerbForm": "fin", "VerbForm": "fin",
"Tense": "pres", "Tense": "pres",
"Number": "sing", "Number": "sing",
"Person": "three", "Person": "three",
} })
return doc return doc
@ -47,20 +47,20 @@ def test_morph_get(i_has):
def test_morph_set(i_has): def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[0].morph.get("PronType") == ["prs"]
# set by string # set by string
i_has[0].morph_ = "PronType=unk" i_has[0].set_morph("PronType=unk")
assert i_has[0].morph.get("PronType") == ["unk"] assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized # set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk" i_has[0].set_morph("PronType=123|NounType=unk")
assert i_has[0].morph_ == "NounType=unk|PronType=123" assert str(i_has[0].morph) == "NounType=unk|PronType=123"
# set by dict # set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk"} i_has[0].set_morph({"AType": "123", "BType": "unk"})
assert i_has[0].morph_ == "AType=123|BType=unk" assert str(i_has[0].morph) == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized # set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a" i_has[0].set_morph("BType=c|AType=b,a")
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
# set by dict with multiple values, fields and values are alphabetized # set by dict with multiple values, fields and values are alphabetized
i_has[0].morph_ = {"AType": "b,a", "BType": "c"} i_has[0].set_morph({"AType": "b,a", "BType": "c"})
assert i_has[0].morph_ == "AType=a,b|BType=c" assert str(i_has[0].morph) == "AType=a,b|BType=c"
def test_morph_str(i_has): def test_morph_str(i_has):
@ -72,25 +72,25 @@ def test_morph_property(tokenizer):
doc = tokenizer("a dog") doc = tokenizer("a dog")
# set through token.morph_ # set through token.morph_
doc[0].morph_ = "PronType=prs" doc[0].set_morph("PronType=prs")
assert doc[0].morph_ == "PronType=prs" assert str(doc[0].morph) == "PronType=prs"
assert doc.to_array(["MORPH"])[0] != 0 assert doc.to_array(["MORPH"])[0] != 0
# unset with token.morph # unset with token.morph
doc[0].morph = 0 doc[0].set_morph(0)
assert doc.to_array(["MORPH"])[0] == 0 assert doc.to_array(["MORPH"])[0] == 0
# empty morph is equivalent to "_" # empty morph is equivalent to "_"
doc[0].morph_ = "" doc[0].set_morph("")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# "_" morph is also equivalent to empty morph # "_" morph is also equivalent to empty morph
doc[0].morph_ = "_" doc[0].set_morph("_")
assert doc[0].morph_ == "" assert str(doc[0].morph) == ""
assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
# set through existing hash with token.morph # set through existing hash with token.morph
tokenizer.vocab.strings.add("Feat=Val") tokenizer.vocab.strings.add("Feat=Val")
doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val"))
assert doc[0].morph_ == "Feat=Val" assert str(doc[0].morph) == "Feat=Val"

View File

@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer):
assert doc[4].text == "the beach boys" assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys " assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED" assert doc[4].tag_ == "NAMED"
assert doc[4].morph_ == "Number=Plur" assert str(doc[4].morph) == "Number=Plur"
assert doc[5].text == "all night" assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night" assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED" assert doc[5].tag_ == "NAMED"
assert doc[5].morph_ == "Number=Plur" assert str(doc[5].morph) == "Number=Plur"
def test_doc_retokenize_merge_children(en_tokenizer): def test_doc_retokenize_merge_children(en_tokenizer):

View File

@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab):
assert doc[0].text == "Los" assert doc[0].text == "Los"
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[0].idx == 0 assert doc[0].idx == 0
assert doc[0].morph_ == "Number=Sing" assert str(doc[0].morph) == "Number=Sing"
assert doc[1].idx == 3 assert doc[1].idx == 3
assert doc[1].text == "Angeles" assert doc[1].text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
assert doc[1].morph_ == "Number=Sing" assert str(doc[1].morph) == "Number=Sing"
assert doc[2].text == "start" assert doc[2].text == "start"
assert doc[2].head.text == "." assert doc[2].head.text == "."
assert doc[3].text == "." assert doc[3].text == "."

View File

@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val" doc[0].set_morph("Feat=Val")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 3 assert len(matcher(doc)) == 3
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# IS_SUBSET acts like "IN" for attrs other than MORPH # IS_SUBSET acts like "IN" for attrs other than MORPH
@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab):
matcher.add("M", [pattern]) matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2" doc[0].set_morph("Feat=Val|Feat2=Val2")
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
# IS_SUPERSET with more than one value only matches for MORPH # IS_SUPERSET with more than one value only matches for MORPH
@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2|Feat1=Val1" doc[0].set_morph("Feat2=Val2|Feat1=Val1")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
doc[0].morph_ = "Feat1=Val1|Feat2=Val2" doc[0].set_morph("Feat1=Val1|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
# multiple values are split # multiple values are split
@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab):
doc = Doc(en_vocab, words=["a", "b", "c"]) doc = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc)) == 0 assert len(matcher(doc)) == 0
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1")
assert len(matcher(doc)) == 1 assert len(matcher(doc)) == 1
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2")
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True) matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab):
doc2 = Doc(en_vocab, words=["Test"]) doc2 = Doc(en_vocab, words=["Test"])
doc2[0].tag_ = "TAG" doc2[0].tag_ = "TAG"
doc2[0].pos_ = "X" doc2[0].pos_ = "X"
doc2[0].morph_ = "Feat=Val" doc2[0].set_morph("Feat=Val")
doc2[0].lemma_ = "LEMMA" doc2[0].lemma_ = "LEMMA"
doc3 = Doc(en_vocab, words=["Test"]) doc3 = Doc(en_vocab, words=["Test"])
# DEP requires DEP # DEP requires DEP

View File

@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts):
a.add(**p) a.add(**p)
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
) )
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA") assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH") assert doc.has_annotation("MORPH")
@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts):
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
doc = nlp("This is a test.") doc = nlp("This is a test.")
assert doc[2].lemma_ == "the" assert doc[2].lemma_ == "the"
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert str(doc[3].morph) == "Case=Nom|Number=Sing"
dev_examples = [ dev_examples = [
Example.from_dict( Example.from_dict(
@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map):
for i in range(len(doc)): for i in range(len(doc)):
if i == 4: if i == 4:
assert doc[i].pos_ == "PUNCT" assert doc[i].pos_ == "PUNCT"
assert doc[i].morph_ == "PunctType=peri" assert str(doc[i].morph) == "PunctType=peri"
else: else:
assert doc[i].pos_ == "" assert doc[i].pos_ == ""
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
def test_attributeruler_morph_rules(nlp, morph_rules): def test_attributeruler_morph_rules(nlp, morph_rules):
@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules):
for i in range(len(doc)): for i in range(len(doc)):
if i != 2: if i != 2:
assert doc[i].pos_ == "" assert doc[i].pos_ == ""
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
else: else:
assert doc[2].pos_ == "DET" assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a" assert doc[2].lemma_ == "a"
assert doc[2].morph_ == "Case=Nom" assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_indices(nlp): def test_attributeruler_indices(nlp):
@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp):
for i in range(len(doc)): for i in range(len(doc)):
if i == 1: if i == 1:
assert doc[i].lemma_ == "was" assert doc[i].lemma_ == "was"
assert doc[i].morph_ == "Case=Nom|Number=Sing" assert str(doc[i].morph) == "Case=Nom|Number=Sing"
elif i == 2: elif i == 2:
assert doc[i].lemma_ == "the" assert doc[i].lemma_ == "the"
assert doc[i].morph_ == "Case=Nom|Number=Plur" assert str(doc[i].morph) == "Case=Nom|Number=Plur"
elif i == 3: elif i == 3:
assert doc[i].lemma_ == "cat" assert doc[i].lemma_ == "cat"
else: else:
assert doc[i].morph_ == "" assert str(doc[i].morph) == ""
# raises an error when trying to modify a token outside of the match # raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):

View File

@ -91,7 +91,7 @@ def test_overfitting_IO():
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = ["Feat=N", "Feat=V", "", ""] gold_morphs = ["Feat=N", "Feat=V", "", ""]
gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
assert [t.morph_ for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Also test the results are still the same after IO # Also test the results are still the same after IO
@ -99,5 +99,5 @@ def test_overfitting_IO():
nlp.to_disk(tmp_dir) nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text) doc2 = nlp2(test_text)
assert [t.morph_ for t in doc2] == gold_morphs assert [str(t.morph) for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags assert [t.pos_ for t in doc2] == gold_pos_tags

View File

@ -76,7 +76,7 @@ def tagged_doc():
for i in range(len(tags)): for i in range(len(tags)):
doc[i].tag_ = tags[i] doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i] doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i] doc[i].set_morph(morphs[i])
if i > 0: if i > 0:
doc[i].is_sent_start = False doc[i].is_sent_start = False
return doc return doc
@ -242,7 +242,7 @@ def test_tag_score(tagged_doc):
gold = { gold = {
"tags": [t.tag_ for t in tagged_doc], "tags": [t.tag_ for t in tagged_doc],
"pos": [t.pos_ for t in tagged_doc], "pos": [t.pos_ for t in tagged_doc],
"morphs": [t.morph_ for t in tagged_doc], "morphs": [str(t.morph) for t in tagged_doc],
"sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc], "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc],
} }
example = Example.from_dict(tagged_doc, gold) example = Example.from_dict(tagged_doc, gold)
@ -259,7 +259,7 @@ def test_tag_score(tagged_doc):
tags[0] = "NN" tags[0] = "NN"
pos = [t.pos_ for t in tagged_doc] pos = [t.pos_ for t in tagged_doc]
pos[1] = "X" pos[1] = "X"
morphs = [t.morph_ for t in tagged_doc] morphs = [str(t.morph) for t in tagged_doc]
morphs[1] = "Number=sing" morphs[1] = "Number=sing"
morphs[2] = "Number=plur" morphs[2] = "Number=plur"
gold = { gold = {

View File

@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots):
predicted = Doc(vocab, words=annots["words"]) predicted = Doc(vocab, words=annots["words"])
example = Example.from_dict(predicted, annots) example = Example.from_dict(predicted, annots)
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
assert token.morph_ == annots["morphs"][i] assert str(token.morph) == annots["morphs"][i]
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -460,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc):
idx = [t.idx for t in doc] idx = [t.idx for t in doc]
tags = [t.tag_ for t in doc] tags = [t.tag_ for t in doc]
pos = [t.pos_ for t in doc] pos = [t.pos_ for t in doc]
morphs = [t.morph_ for t in doc] morphs = [str(t.morph) for t in doc]
lemmas = [t.lemma_ for t in doc] lemmas = [t.lemma_ for t in doc]
deps = [t.dep_ for t in doc] deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc] heads = [t.head.i for t in doc]
@ -482,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc):
assert idx == [t.idx for t in reloaded_example.reference] assert idx == [t.idx for t in reloaded_example.reference]
assert tags == [t.tag_ for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference]
assert morphs == [t.morph_ for t in reloaded_example.reference] assert morphs == [str(t.morph) for t in reloaded_example.reference]
assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference]

View File

@ -101,7 +101,7 @@ class DocBin:
self.strings.add(token.text) self.strings.add(token.text)
self.strings.add(token.tag_) self.strings.add(token.tag_)
self.strings.add(token.lemma_) self.strings.add(token.lemma_)
self.strings.add(token.morph_) self.strings.add(str(token.morph))
self.strings.add(token.dep_) self.strings.add(token.dep_)
self.strings.add(token.ent_type_) self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_) self.strings.add(token.ent_kb_id_)

View File

@ -1248,7 +1248,7 @@ cdef class Doc:
for token in self: for token in self:
strings.add(token.tag_) strings.add(token.tag_)
strings.add(token.lemma_) strings.add(token.lemma_)
strings.add(token.morph_) strings.add(str(token.morph))
strings.add(token.dep_) strings.add(token.dep_)
strings.add(token.ent_type_) strings.add(token.ent_type_)
strings.add(token.ent_kb_id_) strings.add(token.ent_kb_id_)

View File

@ -215,20 +215,20 @@ cdef class Token:
def __get__(self): def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph) return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, attr_t morph): def __set__(self, MorphAnalysis morph):
if morph == 0: # Check that the morph has the same vocab
self.c.morph = morph if self.vocab != morph.vocab:
elif morph in self.vocab.strings: raise ValueError(Errors.E1013)
self.morph_ = self.vocab.strings[morph] self.c.morph = morph.c.key
def set_morph(self, features):
cdef hash_t key
if features is 0:
self.c.morph = 0
else: else:
raise ValueError(Errors.E1009.format(val=morph)) if isinstance(features, int):
features = self.vocab.strings[features]
property morph_: key = self.vocab.morphology.add(features)
def __get__(self):
return str(MorphAnalysis.from_id(self.vocab, self.c.morph))
def __set__(self, features):
cdef hash_t key = self.vocab.morphology.add(features)
self.c.morph = key self.c.morph = key
@property @property

View File

@ -226,7 +226,7 @@ cdef class Example:
"TAG": [t.tag_ for t in self.reference], "TAG": [t.tag_ for t in self.reference],
"LEMMA": [t.lemma_ for t in self.reference], "LEMMA": [t.lemma_ for t in self.reference],
"POS": [t.pos_ for t in self.reference], "POS": [t.pos_ for t in self.reference],
"MORPH": [t.morph_ for t in self.reference], "MORPH": [str(t.morph) for t in self.reference],
"HEAD": [t.head.i for t in self.reference], "HEAD": [t.head.i for t in self.reference],
"DEP": [t.dep_ for t in self.reference], "DEP": [t.dep_ for t in self.reference],
"SENT_START": [int(bool(t.is_sent_start)) for t in self.reference] "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]

View File

@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if include_annotation["POS"]: if include_annotation["POS"]:
json_token["pos"] = token.pos_ json_token["pos"] = token.pos_
if include_annotation["MORPH"]: if include_annotation["MORPH"]:
json_token["morph"] = token.morph_ json_token["morph"] = str(token.morph)
if include_annotation["LEMMA"]: if include_annotation["LEMMA"]:
json_token["lemma"] = token.lemma_ json_token["lemma"] = token.lemma_
if include_annotation["DEP"]: if include_annotation["DEP"]: