mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Raise error if deps not provided with heads (#8335)
* Fill in deps if not provided with heads
  Before this change, if heads were passed without deps, they would be silently ignored, which could be confusing. See #8334.
* Use "dep" instead of a blank string
  This is the customary placeholder dep. It might be better to show an error here instead, though.
* Throw error on heads without deps
* Add a test
* Fix tests
* Formatting
* Fix all tests
* Fix a test I missed
* Revise error message
* Clean up whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
0fd0d949c4
commit
2c105cdbce
|
@ -843,6 +843,9 @@ class Errors:
|
||||||
"DependencyMatcher token patterns. The token pattern in "
|
"DependencyMatcher token patterns. The token pattern in "
|
||||||
"RIGHT_ATTR should return matches that are each exactly one token "
|
"RIGHT_ATTR should return matches that are each exactly one token "
|
||||||
"long. Invalid pattern:\n{node}")
|
"long. Invalid pattern:\n{node}")
|
||||||
|
E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
|
||||||
|
"parses. If no dependency labels are available, provide "
|
||||||
|
"placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
|
||||||
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
|
||||||
text = " 'dogs'\n\nrun "
|
text = " 'dogs'\n\nrun "
|
||||||
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)
|
(words, spaces) = util.get_words_and_spaces(words + ["away"], text)
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_with_heads_and_no_deps(vocab):
|
||||||
|
words = "I like ginger".split()
|
||||||
|
heads = list(range(len(words)))
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
doc = Doc(vocab, words=words, heads=heads)
|
||||||
|
|
|
@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
|
||||||
words = ["The", "players", "start", "."]
|
words = ["The", "players", "start", "."]
|
||||||
lemmas = [t.lower() for t in words]
|
lemmas = [t.lower() for t in words]
|
||||||
heads = [1, 2, 2, 2]
|
heads = [1, 2, 2, 2]
|
||||||
|
deps = ["dep"] * len(heads)
|
||||||
tags = ["DT", "NN", "VBZ", "."]
|
tags = ["DT", "NN", "VBZ", "."]
|
||||||
pos = ["DET", "NOUN", "VERB", "PUNCT"]
|
pos = ["DET", "NOUN", "VERB", "PUNCT"]
|
||||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
|
doc = Doc(
|
||||||
|
en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
|
||||||
|
)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].text == "The"
|
assert doc[0].text == "The"
|
||||||
assert doc[0].tag_ == "DT"
|
assert doc[0].tag_ == "DT"
|
||||||
|
@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
|
||||||
assert doc[0].tag_ == "NN"
|
assert doc[0].tag_ == "NN"
|
||||||
assert doc[0].pos_ == "NOUN"
|
assert doc[0].pos_ == "NOUN"
|
||||||
assert doc[0].lemma_ == "the players"
|
assert doc[0].lemma_ == "the players"
|
||||||
doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
|
doc = Doc(
|
||||||
|
en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
|
||||||
|
)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].text == "The"
|
assert doc[0].text == "The"
|
||||||
assert doc[0].tag_ == "DT"
|
assert doc[0].tag_ == "DT"
|
||||||
|
@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
|
||||||
|
|
||||||
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
||||||
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
|
heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
|
||||||
|
deps = ["dep"] * len(heads)
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
|
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
|
||||||
|
@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
|
||||||
|
|
||||||
text = "One test with entities like New York City so the ents list is not void"
|
text = "One test with entities like New York City so the ents list is not void"
|
||||||
heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
|
heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
|
||||||
|
deps = ["dep"] * len(heads)
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
retokenizer.merge(ent)
|
retokenizer.merge(ent)
|
||||||
|
@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
|
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
|
||||||
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
|
||||||
|
deps = ["dep"] * len(heads)
|
||||||
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
|
||||||
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
|
||||||
ents = ["O"] * len(heads)
|
ents = ["O"] * len(heads)
|
||||||
|
@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
||||||
# fmt: on
|
# fmt: on
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = Doc(
|
doc = Doc(
|
||||||
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
|
tokens.vocab,
|
||||||
|
words=[t.text for t in tokens],
|
||||||
|
heads=heads,
|
||||||
|
deps=deps,
|
||||||
|
tags=tags,
|
||||||
|
ents=ents,
|
||||||
)
|
)
|
||||||
assert len(doc) == 17
|
assert len(doc) == 17
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
|
||||||
# If lemmas are not set, leave unset
|
# If lemmas are not set, leave unset
|
||||||
words = ["LosAngeles", "start", "."]
|
words = ["LosAngeles", "start", "."]
|
||||||
heads = [1, 2, 2]
|
heads = [1, 2, 2]
|
||||||
doc = Doc(en_vocab, words=words, heads=heads)
|
deps = ["dep"] * len(heads)
|
||||||
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.split(
|
retokenizer.split(
|
||||||
doc[0],
|
doc[0],
|
||||||
|
@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
|
||||||
# If lemmas are set, use split orth as default lemma
|
# If lemmas are set, use split orth as default lemma
|
||||||
words = ["LosAngeles", "start", "."]
|
words = ["LosAngeles", "start", "."]
|
||||||
heads = [1, 2, 2]
|
heads = [1, 2, 2]
|
||||||
doc = Doc(en_vocab, words=words, heads=heads)
|
deps = ["dep"] * len(heads)
|
||||||
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
for t in doc:
|
for t in doc:
|
||||||
t.lemma_ = "a"
|
t.lemma_ = "a"
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
|
|
@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
|
||||||
# the structure of this sentence depends on the English annotation scheme
|
# the structure of this sentence depends on the English annotation scheme
|
||||||
words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
|
words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
|
||||||
heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
|
heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
|
||||||
doc = Doc(en_vocab, words=words, heads=heads)
|
deps = ["dep"] * len(heads)
|
||||||
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
|
assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
|
||||||
assert [t.text for t in doc[1].ancestors] == ["saw"]
|
assert [t.text for t in doc[1].ancestors] == ["saw"]
|
||||||
assert [t.text for t in doc[2].ancestors] == []
|
assert [t.text for t in doc[2].ancestors] == []
|
||||||
|
@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
|
||||||
assert doc[4].left_edge.i == 0
|
assert doc[4].left_edge.i == 0
|
||||||
assert doc[2].left_edge.i == 0
|
assert doc[2].left_edge.i == 0
|
||||||
# head token must be from the same document
|
# head token must be from the same document
|
||||||
doc2 = Doc(en_vocab, words=words, heads=heads)
|
doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
doc[0].head = doc2[0]
|
doc[0].head = doc2[0]
|
||||||
# test sentence starts when two sentences are joined
|
# test sentence starts when two sentences are joined
|
||||||
|
|
|
@ -69,7 +69,7 @@ def heads():
|
||||||
|
|
||||||
|
|
||||||
def test_parser_parse_navigate_consistency(en_vocab, words, heads):
|
def test_parser_parse_navigate_consistency(en_vocab, words, heads):
|
||||||
doc = Doc(en_vocab, words=words, heads=heads)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
|
||||||
for head in doc:
|
for head in doc:
|
||||||
for child in head.lefts:
|
for child in head.lefts:
|
||||||
assert child.head == head
|
assert child.head == head
|
||||||
|
@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
|
||||||
|
|
||||||
|
|
||||||
def test_parser_parse_navigate_edges(en_vocab, words, heads):
|
def test_parser_parse_navigate_edges(en_vocab, words, heads):
|
||||||
doc = Doc(en_vocab, words=words, heads=heads)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
|
||||||
for token in doc:
|
for token in doc:
|
||||||
subtree = list(token.subtree)
|
subtree = list(token.subtree)
|
||||||
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
|
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
|
||||||
|
|
|
@ -275,6 +275,8 @@ cdef class Doc:
|
||||||
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
|
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
|
||||||
if deps and not heads:
|
if deps and not heads:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
|
if heads and not deps:
|
||||||
|
raise ValueError(Errors.E1017)
|
||||||
if sent_starts is not None:
|
if sent_starts is not None:
|
||||||
for i in range(len(sent_starts)):
|
for i in range(len(sent_starts)):
|
||||||
if sent_starts[i] is True:
|
if sent_starts[i] is True:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user