From 86c3ec9c2b3ad28797b26fb75b808bf573087b35 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 1 Oct 2020 22:21:46 +0200 Subject: [PATCH 1/3] Refactor Token morph setting (#6175) * Refactor Token morph setting * Remove `Token.morph_` * Add `Token.set_morph()` * `0` resets `token.c.morph` to unset * Any other values are passed to `Morphology.add` * Add token.morph setter to set from MorphAnalysis --- spacy/errors.py | 3 ++ spacy/pipeline/morphologizer.pyx | 4 +- spacy/tests/doc/test_array.py | 6 +-- spacy/tests/doc/test_doc_api.py | 38 +++++++++++++++---- spacy/tests/doc/test_morphanalysis.py | 42 ++++++++++----------- spacy/tests/doc/test_retokenize_merge.py | 4 +- spacy/tests/doc/test_retokenize_split.py | 4 +- spacy/tests/matcher/test_matcher_api.py | 24 ++++++------ spacy/tests/matcher/test_phrase_matcher.py | 4 +- spacy/tests/pipeline/test_attributeruler.py | 30 +++++++-------- spacy/tests/pipeline/test_morphologizer.py | 4 +- spacy/tests/test_scorer.py | 6 +-- spacy/tests/training/test_new_example.py | 2 +- spacy/tests/training/test_training.py | 4 +- spacy/tokens/_serialize.py | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/tokens/token.pyx | 26 ++++++------- spacy/training/example.pyx | 2 +- spacy/training/gold_io.pyx | 2 +- 19 files changed, 118 insertions(+), 91 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 1c934d188..5236992e9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -710,6 +710,9 @@ class Errors: "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " "provided to doc.set_ents as lists of `Span` objects.") + E1013 = ("Invalid morph: the MorphAnalysis must have the same vocab as the " + "token itself. To set the morph from this MorphAnalysis, set from " + "the string value with: `token.set_morph(str(other_morph))`.") @add_codes diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 60ad10a2b..ab0554692 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -149,7 +149,7 @@ class Morphologizer(Tagger): for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ - morph = token.morph_ + morph = str(token.morph) # create and add the combined morph+POS label morph_dict = Morphology.feats_to_dict(morph) if pos: @@ -167,7 +167,7 @@ class Morphologizer(Tagger): gold_array = [] for i, token in enumerate(example.reference): pos = token.pos_ - morph = token.morph_ + morph = str(token.morph) morph_dict = Morphology.feats_to_dict(morph) if pos: morph_dict[self.POS_FEAT] = pos diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 9c050f740..ef54c581c 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -46,9 +46,9 @@ def test_doc_array_morph(en_vocab): words = ["Eat", "blue", "ham"] morph = ["Feat=V", "Feat=J", "Feat=N"] doc = Doc(en_vocab, words=words, morphs=morph) - assert morph[0] == doc[0].morph_ - assert morph[1] == doc[1].morph_ - assert morph[2] == doc[2].morph_ + assert morph[0] == str(doc[0].morph) + assert morph[1] == str(doc[1].morph) + assert morph[2] == str(doc[2].morph) feats_array = doc.to_array((ORTH, MORPH)) assert feats_array[0][1] == doc[0].morph.key diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 55a1c1ad2..e3e056685 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -319,15 +319,13 @@ def test_doc_from_array_morph(en_vocab): words = ["I", "live", "in", "New", "York", "."] morphs = 
["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"] # fmt: on - doc = Doc(en_vocab, words=words) - for i, morph in enumerate(morphs): - doc[i].morph_ = morph + doc = Doc(en_vocab, words=words, morphs=morphs) attrs = [MORPH] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) - assert [t.morph_ for t in new_doc] == morphs - assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc] + assert [str(t.morph) for t in new_doc] == morphs + assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc] def test_doc_api_from_docs(en_tokenizer, de_tokenizer): @@ -423,7 +421,7 @@ def test_has_annotation(en_vocab): doc[0].tag_ = "A" doc[0].pos_ = "X" - doc[0].morph_ = "Feat=Val" + doc[0].set_morph("Feat=Val") doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] @@ -435,7 +433,7 @@ def test_has_annotation(en_vocab): doc[1].tag_ = "A" doc[1].pos_ = "X" - doc[1].morph_ = "" + doc[1].set_morph("") doc[1].lemma_ = "a" doc[1].dep_ = "dep" doc.ents = [Span(doc, 0, 2, label="HELLO")] @@ -538,6 +536,32 @@ def test_doc_ents_setter(): assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] +def test_doc_morph_setter(en_tokenizer, de_tokenizer): + doc1 = en_tokenizer("a b") + doc1b = en_tokenizer("c d") + doc2 = de_tokenizer("a b") + + # unset values can be copied + doc1[0].morph = doc1[1].morph + assert doc1[0].morph.key == 0 + assert doc1[1].morph.key == 0 + + # morph values from the same vocab can be copied + doc1[0].set_morph("Feat=Val") + doc1[1].morph = doc1[0].morph + assert doc1[0].morph == doc1[1].morph + + # ... also across docs + doc1b[0].morph = doc1[0].morph + assert doc1[0].morph == doc1b[0].morph + + doc2[0].set_morph("Feat2=Val2") + + # the morph value must come from the same vocab + with pytest.raises(ValueError): + doc1[0].morph = doc2[0].morph + + def test_doc_init_iob(): """Test ents validation/normalization in Doc.__init__""" words = ["a", "b", "c", "d", "e"] diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index f378ce042..56c80dd66 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -4,13 +4,13 @@ import pytest @pytest.fixture def i_has(en_tokenizer): doc = en_tokenizer("I has") - doc[0].morph_ = {"PronType": "prs"} - doc[1].morph_ = { + doc[0].set_morph({"PronType": "prs"}) + doc[1].set_morph({ "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": "three", - } + }) return doc @@ -47,20 +47,20 @@ def test_morph_get(i_has): def test_morph_set(i_has): assert i_has[0].morph.get("PronType") == ["prs"] # set by string - i_has[0].morph_ = "PronType=unk" + i_has[0].set_morph("PronType=unk") assert i_has[0].morph.get("PronType") == ["unk"] # set by string, fields are alphabetized - i_has[0].morph_ = "PronType=123|NounType=unk" - assert i_has[0].morph_ == "NounType=unk|PronType=123" + i_has[0].set_morph("PronType=123|NounType=unk") + assert str(i_has[0].morph) == "NounType=unk|PronType=123" # set by dict - i_has[0].morph_ = {"AType": "123", "BType": "unk"} - assert i_has[0].morph_ == "AType=123|BType=unk" + i_has[0].set_morph({"AType": "123", "BType": "unk"}) + assert str(i_has[0].morph) == "AType=123|BType=unk" # set by string with multiple values, fields and values are alphabetized - i_has[0].morph_ = "BType=c|AType=b,a" - assert i_has[0].morph_ == "AType=a,b|BType=c" + i_has[0].set_morph("BType=c|AType=b,a") + assert str(i_has[0].morph) == "AType=a,b|BType=c" # set by dict with multiple values, fields and values are 
alphabetized - i_has[0].morph_ = {"AType": "b,a", "BType": "c"} - assert i_has[0].morph_ == "AType=a,b|BType=c" + i_has[0].set_morph({"AType": "b,a", "BType": "c"}) + assert str(i_has[0].morph) == "AType=a,b|BType=c" def test_morph_str(i_has): @@ -72,25 +72,25 @@ def test_morph_property(tokenizer): doc = tokenizer("a dog") # set through token.morph_ - doc[0].morph_ = "PronType=prs" - assert doc[0].morph_ == "PronType=prs" + doc[0].set_morph("PronType=prs") + assert str(doc[0].morph) == "PronType=prs" assert doc.to_array(["MORPH"])[0] != 0 # unset with token.morph - doc[0].morph = 0 + doc[0].set_morph(0) assert doc.to_array(["MORPH"])[0] == 0 # empty morph is equivalent to "_" - doc[0].morph_ = "" - assert doc[0].morph_ == "" + doc[0].set_morph("") + assert str(doc[0].morph) == "" assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] # "_" morph is also equivalent to empty morph - doc[0].morph_ = "_" - assert doc[0].morph_ == "" + doc[0].set_morph("_") + assert str(doc[0].morph) == "" assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"] # set through existing hash with token.morph tokenizer.vocab.strings.add("Feat=Val") - doc[0].morph = tokenizer.vocab.strings.add("Feat=Val") - assert doc[0].morph_ == "Feat=Val" + doc[0].set_morph(tokenizer.vocab.strings.add("Feat=Val")) + assert str(doc[0].morph) == "Feat=Val" diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index ab186b062..cb886545a 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -21,11 +21,11 @@ def test_doc_retokenize_merge(en_tokenizer): assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " assert doc[4].tag_ == "NAMED" - assert doc[4].morph_ == "Number=Plur" + assert str(doc[4].morph) == "Number=Plur" assert doc[5].text == "all night" assert doc[5].text_with_ws == "all night" assert doc[5].tag_ == "NAMED" - assert doc[5].morph_ == "Number=Plur" + assert str(doc[5].morph) == "Number=Plur" def test_doc_retokenize_merge_children(en_tokenizer): diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 4d4b170f9..238e36d59 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -27,11 +27,11 @@ def test_doc_retokenize_split(en_vocab): assert doc[0].text == "Los" assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 - assert doc[0].morph_ == "Number=Sing" + assert str(doc[0].morph) == "Number=Sing" assert doc[1].idx == 3 assert doc[1].text == "Angeles" assert doc[1].head.text == "start" - assert doc[1].morph_ == "Number=Sing" + assert str(doc[1].morph) == "Number=Sing" assert doc[2].text == "start" assert doc[2].head.text == "." assert doc[3].text == "." 
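
The tests above exercise every input the new `Token.set_morph()` accepts: a FEATS string, a dict of features, the integer hash of an interned FEATS string, and `0` (or `""`/`"_"`) to clear the analysis. As a minimal standalone sketch of the same API — assuming a spaCy build that includes this patch, with a blank pipeline standing in for the test fixtures:

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("I has")

    # Set from a FEATS string.
    doc[0].set_morph("PronType=prs")
    assert str(doc[0].morph) == "PronType=prs"

    # Set from a dict of features; fields are stored alphabetized.
    doc[1].set_morph({"VerbForm": "fin", "Number": "sing"})
    assert str(doc[1].morph) == "Number=sing|VerbForm=fin"

    # Any int other than 0 is looked up as the hash of an interned string.
    key = nlp.vocab.strings.add("Feat=Val")
    doc[0].set_morph(key)
    assert str(doc[0].morph) == "Feat=Val"

    # 0 resets token.c.morph to unset, per the commit message above.
    doc[0].set_morph(0)
    assert str(doc[0].morph) == ""
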
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 627110cdd..77b09f376 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -236,13 +236,13 @@ def test_matcher_subset_value_operator(en_vocab): matcher.add("M", [pattern]) doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val" + doc[0].set_morph("Feat=Val") assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val|Feat2=Val2" + doc[0].set_morph("Feat=Val|Feat2=Val2") assert len(matcher(doc)) == 3 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") assert len(matcher(doc)) == 2 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") assert len(matcher(doc)) == 2 # IS_SUBSET acts like "IN" for attrs other than MORPH @@ -268,11 +268,11 @@ def test_matcher_superset_value_operator(en_vocab): matcher.add("M", [pattern]) doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat=Val|Feat2=Val2" + doc[0].set_morph("Feat=Val|Feat2=Val2") assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") assert len(matcher(doc)) == 1 - doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") assert len(matcher(doc)) == 1 # IS_SUPERSET with more than one value only matches for MORPH @@ -310,9 +310,9 @@ def test_matcher_morph_handling(en_vocab): doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat2=Val2|Feat1=Val1" + doc[0].set_morph("Feat2=Val2|Feat1=Val1") assert len(matcher(doc)) == 2 - doc[0].morph_ = "Feat1=Val1|Feat2=Val2" + doc[0].set_morph("Feat1=Val1|Feat2=Val2") assert len(matcher(doc)) == 2 # multiple values are split @@ -324,9 +324,9 @@ def test_matcher_morph_handling(en_vocab): doc = Doc(en_vocab, words=["a", "b", "c"]) assert len(matcher(doc)) == 0 - doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" + doc[0].set_morph("Feat2=Val2,Val3|Feat1=Val1") assert len(matcher(doc)) == 1 - doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" + doc[0].set_morph("Feat1=Val1,Val3|Feat2=Val2") assert len(matcher(doc)) == 2 @@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) # DEP requires DEP diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 522356ffc..1b81fd780 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -190,7 +190,7 @@ def test_phrase_matcher_validation(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): @@ -217,7 +217,7 @@ def test_attr_pipeline_checks(en_vocab): doc2 = Doc(en_vocab, words=["Test"]) doc2[0].tag_ = "TAG" doc2[0].pos_ = "X" - doc2[0].morph_ = "Feat=Val" + doc2[0].set_morph("Feat=Val") doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) # DEP requires DEP diff --git a/spacy/tests/pipeline/test_attributeruler.py 
b/spacy/tests/pipeline/test_attributeruler.py index b9e5894dd..5773127af 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -69,9 +69,9 @@ def test_attributeruler_init(nlp, pattern_dicts): a.add(**p) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") @@ -81,9 +81,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") @@ -94,9 +94,9 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): ) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") @@ -106,9 +106,9 @@ def test_attributeruler_score(nlp, pattern_dicts): nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts}) doc = nlp("This is a test.") assert doc[2].lemma_ == "the" - assert doc[2].morph_ == "Case=Nom|Number=Plur" + assert str(doc[2].morph) == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" - assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert str(doc[3].morph) == "Case=Nom|Number=Sing" dev_examples = [ Example.from_dict( @@ -150,10 +150,10 @@ def test_attributeruler_tag_map(nlp, tag_map): for i in range(len(doc)): if i == 4: assert doc[i].pos_ == "PUNCT" - assert doc[i].morph_ == "PunctType=peri" + assert str(doc[i].morph) == "PunctType=peri" else: assert doc[i].pos_ == "" - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" def test_attributeruler_morph_rules(nlp, morph_rules): @@ -168,11 +168,11 @@ def test_attributeruler_morph_rules(nlp, morph_rules): for i in range(len(doc)): if i != 2: assert doc[i].pos_ == "" - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" else: assert doc[2].pos_ == "DET" assert doc[2].lemma_ == "a" - assert doc[2].morph_ == "Case=Nom" + assert str(doc[2].morph) == "Case=Nom" def test_attributeruler_indices(nlp): @@ -194,14 +194,14 @@ def test_attributeruler_indices(nlp): for i in range(len(doc)): if i == 1: assert doc[i].lemma_ == "was" - assert doc[i].morph_ == "Case=Nom|Number=Sing" + assert str(doc[i].morph) == "Case=Nom|Number=Sing" elif i == 2: assert doc[i].lemma_ == "the" - assert doc[i].morph_ == "Case=Nom|Number=Plur" + assert str(doc[i].morph) == "Case=Nom|Number=Plur" elif i == 3: assert doc[i].lemma_ == "cat" else: - assert doc[i].morph_ == "" + assert str(doc[i].morph) == "" # raises an error when trying to modify a token outside of the match a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2) with pytest.raises(ValueError): diff --git a/spacy/tests/pipeline/test_morphologizer.py 
b/spacy/tests/pipeline/test_morphologizer.py index 5d605f4e6..af81129c0 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -91,7 +91,7 @@ def test_overfitting_IO(): doc = nlp(test_text) gold_morphs = ["Feat=N", "Feat=V", "", ""] gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] - assert [t.morph_ for t in doc] == gold_morphs + assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags # Also test the results are still the same after IO @@ -99,5 +99,5 @@ def test_overfitting_IO(): nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2(test_text) - assert [t.morph_ for t in doc2] == gold_morphs + assert [str(t.morph) for t in doc2] == gold_morphs assert [t.pos_ for t in doc2] == gold_pos_tags diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 187aa1b52..039f3d4d8 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -76,7 +76,7 @@ def tagged_doc(): for i in range(len(tags)): doc[i].tag_ = tags[i] doc[i].pos_ = pos[i] - doc[i].morph_ = morphs[i] + doc[i].set_morph(morphs[i]) if i > 0: doc[i].is_sent_start = False return doc @@ -242,7 +242,7 @@ def test_tag_score(tagged_doc): gold = { "tags": [t.tag_ for t in tagged_doc], "pos": [t.pos_ for t in tagged_doc], - "morphs": [t.morph_ for t in tagged_doc], + "morphs": [str(t.morph) for t in tagged_doc], "sent_starts": [1 if t.is_sent_start else -1 for t in tagged_doc], } example = Example.from_dict(tagged_doc, gold) @@ -259,7 +259,7 @@ def test_tag_score(tagged_doc): tags[0] = "NN" pos = [t.pos_ for t in tagged_doc] pos[1] = "X" - morphs = [t.morph_ for t in tagged_doc] + morphs = [str(t.morph) for t in tagged_doc] morphs[1] = "Number=sing" morphs[2] = "Number=plur" gold = { diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py index 81207b640..06db86a12 100644 --- a/spacy/tests/training/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -113,7 +113,7 @@ def test_Example_from_dict_with_morphology(annots): predicted = Doc(vocab, words=annots["words"]) example = Example.from_dict(predicted, annots) for i, token in enumerate(example.reference): - assert token.morph_ == annots["morphs"][i] + assert str(token.morph) == annots["morphs"][i] @pytest.mark.parametrize( diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 28a411e6d..405801f62 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -460,7 +460,7 @@ def test_roundtrip_docs_to_docbin(doc): idx = [t.idx for t in doc] tags = [t.tag_ for t in doc] pos = [t.pos_ for t in doc] - morphs = [t.morph_ for t in doc] + morphs = [str(t.morph) for t in doc] lemmas = [t.lemma_ for t in doc] deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] @@ -482,7 +482,7 @@ def test_roundtrip_docs_to_docbin(doc): assert idx == [t.idx for t in reloaded_example.reference] assert tags == [t.tag_ for t in reloaded_example.reference] assert pos == [t.pos_ for t in reloaded_example.reference] - assert morphs == [t.morph_ for t in reloaded_example.reference] + assert morphs == [str(t.morph) for t in reloaded_example.reference] assert lemmas == [t.lemma_ for t in reloaded_example.reference] assert deps == [t.dep_ for t in reloaded_example.reference] assert heads == [t.head.i for t in reloaded_example.reference] diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 2d4e9af9d..ed283a86b 100644 --- 
a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -101,7 +101,7 @@ class DocBin: self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) - self.strings.add(token.morph_) + self.strings.add(str(token.morph)) self.strings.add(token.dep_) self.strings.add(token.ent_type_) self.strings.add(token.ent_kb_id_) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 29fbb6076..9dfa6e714 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1248,7 +1248,7 @@ cdef class Doc: for token in self: strings.add(token.tag_) strings.add(token.lemma_) - strings.add(token.morph_) + strings.add(str(token.morph)) strings.add(token.dep_) strings.add(token.ent_type_) strings.add(token.ent_kb_id_) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 239de4559..8099abd92 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -215,20 +215,20 @@ cdef class Token: def __get__(self): return MorphAnalysis.from_id(self.vocab, self.c.morph) - def __set__(self, attr_t morph): - if morph == 0: - self.c.morph = morph - elif morph in self.vocab.strings: - self.morph_ = self.vocab.strings[morph] - else: - raise ValueError(Errors.E1009.format(val=morph)) + def __set__(self, MorphAnalysis morph): + # Check that the morph has the same vocab + if self.vocab != morph.vocab: + raise ValueError(Errors.E1013) + self.c.morph = morph.c.key - property morph_: - def __get__(self): - return str(MorphAnalysis.from_id(self.vocab, self.c.morph)) - - def __set__(self, features): - cdef hash_t key = self.vocab.morphology.add(features) + def set_morph(self, features): + cdef hash_t key + if features is 0: + self.c.morph = 0 + else: + if isinstance(features, int): + features = self.vocab.strings[features] + key = self.vocab.morphology.add(features) self.c.morph = key @property diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index ca93b6464..f6225135c 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -226,7 +226,7 @@ cdef class Example: "TAG": [t.tag_ for t in self.reference], "LEMMA": [t.lemma_ for t in self.reference], "POS": [t.pos_ for t in self.reference], - "MORPH": [t.morph_ for t in self.reference], + "MORPH": [str(t.morph) for t in self.reference], "HEAD": [t.head.i for t in self.reference], "DEP": [t.dep_ for t in self.reference], "SENT_START": [int(bool(t.is_sent_start)) for t in self.reference] diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 8b9f5ab2b..8fb6b8565 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -44,7 +44,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): if include_annotation["POS"]: json_token["pos"] = token.pos_ if include_annotation["MORPH"]: - json_token["morph"] = token.morph_ + json_token["morph"] = str(token.morph) if include_annotation["LEMMA"]: json_token["lemma"] = token.lemma_ if include_annotation["DEP"]: From 5762876dcc1be42e982ad989335f9a485a7c3be3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 1 Oct 2020 22:27:37 +0200 Subject: [PATCH 2/3] Update default config [ci skip] --- spacy/default_config.cfg | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 6bd1ed24d..d7fc46ea0 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -2,7 +2,6 @@ train = null dev = null vectors = null -vocab_data = null init_tok2vec = null [system] @@ -11,8 +10,13 @@ gpu_allocator = null [nlp] lang = null +# 
List of pipeline component names, in order. The names should correspond to +# components defined in the [components block] pipeline = [] +# Components that are loaded but disabled by default disabled = [] +# Optional callbacks to modify the nlp object before it's initialized, after +# it's created and after the pipeline has been set up before_creation = null after_creation = null after_pipeline_creation = null @@ -20,6 +24,7 @@ after_pipeline_creation = null [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" +# The pipeline components and their models [components] # Readers for corpora like dev and train. @@ -38,8 +43,7 @@ max_length = 0 limit = 0 # Apply some simply data augmentation, where we replace tokens with variations. # This is especially useful for punctuation and case replacement, to help -# generalize beyond corpora that don't have smart-quotes, or only have smart -# quotes, etc. +# generalize beyond corpora that don't/only have smart quotes etc. augmenter = null [corpora.dev] @@ -53,6 +57,7 @@ gold_preproc = false max_length = 0 # Limitation on number of training examples limit = 0 +# Optional callback for data augmentation augmenter = null # Training hyper-parameters and additional features. @@ -102,17 +107,18 @@ use_averages = false eps = 1e-8 learn_rate = 0.001 -# The 'initialize' step is run before training or pretraining. Components and -# the tokenizer can each define their own arguments via their .initialize -# methods that are populated by the config. This lets them gather resources like -# lookup tables and build label sets, construct vocabularies, etc. +# These settings are used when nlp.initialize() is called (typically before +# training or pretraining). Components and the tokenizer can each define their +# own arguments via their initialize methods that are populated by the config. +# This lets them gather data resources, build label sets etc. [initialize] -vocab_data = ${paths.vocab_data} -lookups = null vectors = ${paths.vectors} # Extra resources for transfer-learning or pseudo-rehearsal init_tok2vec = ${paths.init_tok2vec} +# Data and lookups for vocabulary +vocab_data = null +lookups = null # Arguments passed to the tokenizer's initialize method tokenizer = {} -# Arguments passed to the initialize methods of the components (keyed by component name) +# Arguments for initialize methods of the components (keyed by component) components = {} From 50162b8726641248802bfa43d4d63ca26f8efd09 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 1 Oct 2020 22:27:45 +0200 Subject: [PATCH 3/3] Try to work around Sharp build issue [ci skip] --- website/gatsby-config.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index c1a2f9ab9..4650711ac 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -1,6 +1,11 @@ const autoprefixer = require('autoprefixer') const path = require('path') +// https://florian.ec/blog/gatsby-build-netlify-segmentation-fault/ +const sharp = require('sharp') +sharp.cache(false) +sharp.simd(false) + // Markdown plugins const wrapSectionPlugin = require('./src/plugins/remark-wrap-section.js') const customAttrsPlugin = require('./src/plugins/remark-custom-attrs.js')
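
Taken together, the token-morph changes in PATCH 1/3 reduce to one mechanical migration: reads move from `token.morph_` to `str(token.morph)`, writes move from assigning `token.morph_` to calling `token.set_morph()`, and plain assignment to `token.morph` is reserved for copying a `MorphAnalysis` between tokens that share a vocab (otherwise the new E1013 error is raised). A hedged before/after sketch of that migration, assuming two blank pipelines with distinct vocabs:

    import spacy

    nlp_en = spacy.blank("en")
    nlp_de = spacy.blank("de")
    doc_en = nlp_en("a b")
    doc_de = nlp_de("a b")

    # Formerly: doc_en[0].morph_ = "Feat=Val"
    doc_en[0].set_morph("Feat=Val")
    assert str(doc_en[0].morph) == "Feat=Val"  # formerly doc_en[0].morph_

    # Copying a MorphAnalysis directly works while the vocabs match ...
    doc_en[1].morph = doc_en[0].morph
    assert doc_en[0].morph == doc_en[1].morph

    # ... but raises E1013 across vocabs; round-trip through the string
    # value instead, as the new error message suggests.
    try:
        doc_de[0].morph = doc_en[0].morph
    except ValueError:
        doc_de[0].set_morph(str(doc_en[0].morph))
    assert str(doc_de[0].morph) == "Feat=Val"
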