	Raise error if deps not provided with heads (#8335)
* Fill in deps if not provided with heads

  Before this change, if heads were passed without deps they would be
  silently ignored, which could be confusing. See #8334.

* Use "dep" instead of a blank string

  This is the customary placeholder dep. It might be better to show an
  error here instead, though.

* Throw error on heads without deps
* Add a test
* Fix tests
* Formatting
* Fix all tests
* Fix a test I missed
* Revise error message
* Clean up whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
parent 0fd0d949c4
commit 2c105cdbce
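For reference, a minimal sketch of the behaviour this commit introduces, assuming a spaCy v3.x build that includes the change (the vocab and the three-token sentence below are only illustrative):

import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = "I like ginger".split()
heads = [1, 1, 1]  # attach every token to "like"; "like" is its own head (root)

# Heads without deps now raise E1017 instead of being silently ignored.
with pytest.raises(ValueError):
    Doc(vocab, words=words, heads=heads)

# If no dependency labels are available, pass the customary placeholder dep.
doc = Doc(vocab, words=words, heads=heads, deps=["dep"] * len(heads))
assert doc[0].head.text == "like"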
@@ -843,6 +843,9 @@ class Errors:
              "DependencyMatcher token patterns. The token pattern in "
              "RIGHT_ATTR should return matches that are each exactly one token "
              "long. Invalid pattern:\n{node}")
+    E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
+             "parses. If no dependency labels are available, provide "
+             "placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
@@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
         words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
         text = "  'dogs'\n\nrun  "
         (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
+
+
+def test_create_with_heads_and_no_deps(vocab):
+    words = "I like ginger".split()
+    heads = list(range(len(words)))
+    with pytest.raises(ValueError):
+        doc = Doc(vocab, words=words, heads=heads)
@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
     lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
     assert doc[0].lemma_ == "the players"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
     heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
@@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
+    deps = ["dep"] * len(heads)
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
     ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
     )
     assert len(doc) == 17
     with doc.retokenize() as retokenizer:
@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are not set, leave unset
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         retokenizer.split(
             doc[0],
@@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are set, use split orth as default lemma
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for t in doc:
         t.lemma_ = "a"
     with doc.retokenize() as retokenizer:
@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
     # the structure of this sentence depends on the English annotation scheme
     words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
    heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
     assert [t.text for t in doc[1].ancestors] == ["saw"]
     assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
     assert doc[4].left_edge.i == 0
     assert doc[2].left_edge.i == 0
     # head token must be from the same document
-    doc2 = Doc(en_vocab, words=words, heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
     # test sentence starts when two sentences are joined
@@ -69,7 +69,7 @@ def heads():
 
 
 def test_parser_parse_navigate_consistency(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for head in doc:
         for child in head.lefts:
             assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
 
 
 def test_parser_parse_navigate_edges(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for token in doc:
         subtree = list(token.subtree)
         debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
@@ -275,6 +275,8 @@ cdef class Doc:
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
+        if heads and not deps:
+            raise ValueError(Errors.E1017)
         if sent_starts is not None:
             for i in range(len(sent_starts)):
                 if sent_starts[i] is True: