Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

	Raise error if deps not provided with heads (#8335)
* Fill in deps if not provided with heads

  Before this change, if heads were passed without deps they would be silently
  ignored, which could be confusing. See #8334.

* Use "dep" instead of a blank string

  This is the customary placeholder dep. It might be better to show an error
  here instead though.

* Throw error on heads without deps
* Add a test
* Fix tests
* Formatting
* Fix all tests
* Fix a test I missed
* Revise error message
* Clean up whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
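Concretely, the constructor call below now fails fast instead of silently dropping the heads. A minimal sketch, assuming a spaCy build that includes this commit (the sentence and the all-"dep" labels are only illustrative):

    import pytest
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = ["I", "like", "ginger"]
    heads = [1, 1, 1]  # absolute head indices; token 1 ("like") is the root

    # heads without deps: previously ignored, now a ValueError (E1017)
    with pytest.raises(ValueError):
        Doc(vocab, words=words, heads=heads)

    # workaround suggested by the new error message: placeholder "dep" labels
    doc = Doc(vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    assert doc[0].head.text == "like"
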
This commit is contained in:
parent 0fd0d949c4
commit 2c105cdbce

@@ -843,6 +843,9 @@ class Errors:
              "DependencyMatcher token patterns. The token pattern in "
              "RIGHT_ATTR should return matches that are each exactly one token "
              "long. Invalid pattern:\n{node}")
+    E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
+             "parses. If no dependency labels are available, provide "
+             "placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings

@@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
         words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
         text = "  'dogs'\n\nrun  "
         (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
+
+
+def test_create_with_heads_and_no_deps(vocab):
+    words = "I like ginger".split()
+    heads = list(range(len(words)))
+    with pytest.raises(ValueError):
+        doc = Doc(vocab, words=words, heads=heads)

@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
     lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
     assert doc[0].lemma_ == "the players"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
     heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
@@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
+    deps = ["dep"] * len(heads)
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
     ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
     )
     assert len(doc) == 17
     with doc.retokenize() as retokenizer:

@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are not set, leave unset
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         retokenizer.split(
             doc[0],
@@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are set, use split orth as default lemma
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for t in doc:
         t.lemma_ = "a"
     with doc.retokenize() as retokenizer:

@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
     # the structure of this sentence depends on the English annotation scheme
     words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
     heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
     assert [t.text for t in doc[1].ancestors] == ["saw"]
     assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
     assert doc[4].left_edge.i == 0
     assert doc[2].left_edge.i == 0
     # head token must be from the same document
-    doc2 = Doc(en_vocab, words=words, heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
     # test sentence starts when two sentences are joined

@@ -69,7 +69,7 @@ def heads():
 
 
 def test_parser_parse_navigate_consistency(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for head in doc:
         for child in head.lefts:
             assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
 
 
 def test_parser_parse_navigate_edges(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for token in doc:
         subtree = list(token.subtree)
         debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))

@@ -275,6 +275,8 @@ cdef class Doc:
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
+        if heads and not deps:
+            raise ValueError(Errors.E1017)
         if sent_starts is not None:
             for i in range(len(sent_starts)):
                 if sent_starts[i] is True:

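To recap the guard added above: deps alone are still accepted (heads are filled in by the `heads = [0] * len(deps)` branch), heads alone now raise E1017, and passing both behaves as before. A small usage sketch under the same assumptions as earlier (illustrative sentence, placeholder "dep" labels):

    import pytest
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = ["Stewart", "Lee", "laughs"]
    heads = [1, 2, 2]
    deps = ["dep"] * len(words)

    Doc(vocab, words=words, heads=heads, deps=deps)  # heads + deps: accepted
    Doc(vocab, words=words, deps=deps)               # deps only: heads filled in
    with pytest.raises(ValueError):                  # heads only: raises E1017
        Doc(vocab, words=words, heads=heads)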