diff --git a/spacy/errors.py b/spacy/errors.py
index f26558327..e8eccaece 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -843,6 +843,9 @@ class Errors:
             "DependencyMatcher token patterns. The token pattern in "
             "RIGHT_ATTR should return matches that are each exactly one token "
             "long. Invalid pattern:\n{node}")
+    E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
+             "parses. If no dependency labels are available, provide "
+             "placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py
index 0dc6c4866..6c9de8f07 100644
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
     words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
     text = " 'dogs'\n\nrun "
     (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
+
+
+def test_create_with_heads_and_no_deps(vocab):
+    words = "I like ginger".split()
+    heads = list(range(len(words)))
+    with pytest.raises(ValueError):
+        doc = Doc(vocab, words=words, heads=heads)
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 36fa3c15d..6b53088b5 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
     lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
     assert doc[0].lemma_ == "the players"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
     heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
@@ -199,8 +205,9 @@
     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
+    deps = ["dep"] * len(heads)
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
     ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
     )
     assert len(doc) == 17
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 6bfd508bc..16df1713d 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are not set, leave unset
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         retokenizer.split(
             doc[0],
@@ -57,7 +58,8 @@
     # If lemmas are set, use split orth as default lemma
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for t in doc:
         t.lemma_ = "a"
     with doc.retokenize() as retokenizer:
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 1e13882c5..8338a7290 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
     # the structure of this sentence depends on the English annotation scheme
     words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
     heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
     assert [t.text for t in doc[1].ancestors] == ["saw"]
     assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
     assert doc[4].left_edge.i == 0
     assert doc[2].left_edge.i == 0
     # head token must be from the same document
-    doc2 = Doc(en_vocab, words=words, heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
     # test sentence starts when two sentences are joined
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index 8ca4039a2..50da60594 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -69,7 +69,7 @@ def heads():
 
 
 def test_parser_parse_navigate_consistency(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for head in doc:
         for child in head.lefts:
             assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
 
 
 def test_parser_parse_navigate_edges(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for token in doc:
         subtree = list(token.subtree)
         debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 4ab8562c3..7a50d3d53 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -275,6 +275,8 @@ cdef class Doc:
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
+        if heads and not deps:
+            raise ValueError(Errors.E1017)
         if sent_starts is not None:
             for i in range(len(sent_starts)):
                 if sent_starts[i] is True:
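
For context (not part of the patch), a minimal sketch of the constructor behavior this change enforces, assuming spaCy v3's `Doc(vocab, words=..., heads=..., deps=...)` keyword arguments: passing `heads` without `deps` now raises a `ValueError` with E1017, and the placeholder deps suggested by the error message satisfy the check.

```python
# Minimal sketch (assumes spaCy v3 with this patch applied).
import spacy
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
words = ["I", "like", "ginger"]
heads = [1, 1, 1]  # head token indices; token 1 is its own head (the root)

# heads without deps now raises ValueError(E1017):
#     Doc(vocab, words=words, heads=heads)

# Placeholder deps, as suggested by the error message, are accepted:
doc = Doc(vocab, words=words, heads=heads, deps=["dep"] * len(heads))
print([(t.text, t.head.text, t.dep_) for t in doc])
```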