Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

	Raise error if deps not provided with heads (#8335)
* Fill in deps if not provided with heads

  Before this change, if heads were passed without deps they would be silently
  ignored, which could be confusing. See #8334.

* Use "dep" instead of a blank string

  This is the customary placeholder dep. It might be better to show an error
  here instead though.

* Throw error on heads without deps
* Add a test
* Fix tests
* Formatting
* Fix all tests
* Fix a test I missed
* Revise error message
* Clean up whitespace

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
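Concretely, the constructor call below now fails fast instead of silently dropping the heads. A minimal sketch, assuming a spaCy build that includes this commit (the sentence and the all-"dep" labels are only illustrative):

    import pytest
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = ["I", "like", "ginger"]
    heads = [1, 1, 1]  # absolute head indices; token 1 ("like") is the root

    # heads without deps: previously ignored, now a ValueError (E1017)
    with pytest.raises(ValueError):
        Doc(vocab, words=words, heads=heads)

    # workaround suggested by the new error message: placeholder "dep" labels
    doc = Doc(vocab, words=words, heads=heads, deps=["dep"] * len(heads))
    assert doc[0].head.text == "like"
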
This commit is contained in:
parent 0fd0d949c4
commit 2c105cdbce

@@ -843,6 +843,9 @@ class Errors:
              "DependencyMatcher token patterns. The token pattern in "
              "RIGHT_ATTR should return matches that are each exactly one token "
              "long. Invalid pattern:\n{node}")
+    E1017 = ("A Doc object requires both 'deps' and 'heads' for dependency "
+             "parses. If no dependency labels are available, provide "
+             "placeholder deps such as `deps=[\"dep\"]*len(heads)`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings

@@ -63,3 +63,10 @@ def test_create_from_words_and_text(vocab):
         words = [" ", " ", "'", "dogs", "'", "\n\n", "run"]
         text = "  'dogs'\n\nrun  "
         (words, spaces) = util.get_words_and_spaces(words + ["away"], text)
+
+
+def test_create_with_heads_and_no_deps(vocab):
+    words = "I like ginger".split()
+    heads = list(range(len(words)))
+    with pytest.raises(ValueError):
+        doc = Doc(vocab, words=words, heads=heads)

@@ -108,9 +108,12 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
     lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -123,7 +126,9 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
     assert doc[0].lemma_ == "the players"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
+    doc = Doc(
+        en_vocab, words=words, tags=tags, pos=pos, heads=heads, deps=deps, lemmas=lemmas
+    )
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
@@ -190,8 +195,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
     heads = [1, 1, 10, 7, 3, 3, 7, 10, 9, 10, 1, 10, 11, 12, 13, 13, 1]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
@@ -199,8 +205,9 @@ def test_doc_retokenize_span_np_merges(en_tokenizer):
 
     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 1, 1, 2, 3, 6, 7, 4, 12, 11, 11, 12, 1, 12, 12]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         for ent in doc.ents:
             retokenizer.merge(ent)
@@ -210,6 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
     heads = [1, 2, 2, 4, 6, 4, 2, 8, 6, 8, 9, 8, 8, 14, 12, 2, 15]
+    deps = ["dep"] * len(heads)
     tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
     ents = [("PERSON", 0, 2), ("GPE", 10, 11), ("PERSON", 13, 15)]
     ents = ["O"] * len(heads)
@@ -221,7 +229,12 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
     # fmt: on
     tokens = en_tokenizer(text)
     doc = Doc(
-        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=deps,
+        tags=tags,
+        ents=ents,
     )
     assert len(doc) == 17
     with doc.retokenize() as retokenizer:

@@ -44,7 +44,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are not set, leave unset
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     with doc.retokenize() as retokenizer:
         retokenizer.split(
             doc[0],
@@ -57,7 +58,8 @@ def test_doc_retokenize_split_lemmas(en_vocab):
     # If lemmas are set, use split orth as default lemma
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     for t in doc:
         t.lemma_ = "a"
     with doc.retokenize() as retokenizer:

@@ -95,7 +95,8 @@ def test_doc_token_api_ancestors(en_vocab):
     # the structure of this sentence depends on the English annotation scheme
     words = ["Yesterday", "I", "saw", "a", "dog", "that", "barked", "loudly", "."]
     heads = [2, 2, 2, 4, 2, 6, 4, 6, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [t.text for t in doc[6].ancestors] == ["dog", "saw"]
     assert [t.text for t in doc[1].ancestors] == ["saw"]
     assert [t.text for t in doc[2].ancestors] == []
@@ -146,7 +147,7 @@ def test_doc_token_api_head_setter(en_vocab):
     assert doc[4].left_edge.i == 0
     assert doc[2].left_edge.i == 0
     # head token must be from the same document
-    doc2 = Doc(en_vocab, words=words, heads=heads)
+    doc2 = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     with pytest.raises(ValueError):
         doc[0].head = doc2[0]
     # test sentence starts when two sentences are joined

@@ -69,7 +69,7 @@ def heads():
 
 
 def test_parser_parse_navigate_consistency(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for head in doc:
         for child in head.lefts:
             assert child.head == head
@@ -109,7 +109,7 @@ def test_parser_parse_navigate_child_consistency(en_vocab, words, heads):
 
 
 def test_parser_parse_navigate_edges(en_vocab, words, heads):
-    doc = Doc(en_vocab, words=words, heads=heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=["dep"] * len(heads))
     for token in doc:
         subtree = list(token.subtree)
         debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))

@@ -275,6 +275,8 @@ cdef class Doc:
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
+        if heads and not deps:
+            raise ValueError(Errors.E1017)
         if sent_starts is not None:
             for i in range(len(sent_starts)):
                 if sent_starts[i] is True:

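To recap the guard added above: deps alone are still accepted (heads are filled in by the `heads = [0] * len(deps)` branch), heads alone now raise E1017, and passing both behaves as before. A small usage sketch under the same assumptions as earlier (illustrative sentence, placeholder "dep" labels):

    import pytest
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = ["Stewart", "Lee", "laughs"]
    heads = [1, 2, 2]
    deps = ["dep"] * len(words)

    Doc(vocab, words=words, heads=heads, deps=deps)  # heads + deps: accepted
    Doc(vocab, words=words, deps=deps)               # deps only: heads filled in
    with pytest.raises(ValueError):                  # heads only: raises E1017
        Doc(vocab, words=words, heads=heads)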