mirror of https://github.com/explosion/spaCy.git
synced 2025-10-31 07:57:35 +03:00

introduce token.has_head and refer to MISSING_DEP_ (WIP)

This commit is contained in:
parent dd12c6c8fd
commit a581d82f33
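In short, this diff drops the old missing-value sentinels (a head offset of -1 plus the MISSING_LABEL symbol) in favor of a Token.has_head() method backed by an empty dependency label, MISSING_DEP_. A minimal sketch of the intended behavior, assuming the in-progress API exactly as it appears in the hunks below:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    # A None head/dep marks the annotation as missing; the token then
    # reports has_head() == False and acts as its own head.
    doc = Doc(Vocab(), words=["I", "like", "Berlin", "."],
              heads=[1, 1, 1, None], deps=["nsubj", "ROOT", "dobj", None])
    assert [t.has_head() for t in doc] == [True, True, True, False]
    assert doc[3].head.i == 3  # falls back to the token itself, not None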
spacy/symbols.pxd
@@ -467,4 +467,3 @@ cdef enum symbol_t:
 
     IDX
     _
-    MISSING_LABEL
spacy/symbols.pyx
@@ -466,7 +466,6 @@ IDS = {
     "LAW": LAW,
     "MORPH": MORPH,
     "_": _,
-    "MISSING_LABEL": MISSING_LABEL,
 }
 
 
spacy/tests/doc/test_array.py
@@ -98,16 +98,10 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
     doc_from_array = Doc(en_vocab, words=words)
     doc_from_array.from_array(["HEAD"], arr)
 
-    # head before start is used to denote a missing value
+    # a head before the start of the doc is invalid
     arr = doc.to_array(["HEAD"])
     arr[0] = -1
     doc_from_array = Doc(en_vocab, words=words)
-    doc_from_array.from_array(["HEAD"], arr)
-
-    # other negative values are invalid
-    arr = doc.to_array(["HEAD"])
-    arr[0] = -2
-    doc_from_array = Doc(en_vocab, words=words)
     with pytest.raises(ValueError):
         doc_from_array.from_array(["HEAD"], arr)
spacy/tests/doc/test_retokenize_merge.py
@@ -89,8 +89,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
 def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
     text = "Los Angeles start."
     heads = [1, 2, 2, 2]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert len(doc) == 4
     assert doc[0].head.text == "Angeles"
     assert doc[1].head.text == "start"
@@ -145,7 +146,8 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
 def test_doc_retokenize_spans_merge_heads(en_vocab):
     words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
     heads = [1, 1, 4, 6, 1, 4, 5, 1]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 8
     with doc.retokenize() as retokenizer:
         attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
@@ -177,8 +179,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
 def test_doc_retokenize_span_np_merges(en_tokenizer):
     text = "displaCy is a parse tool built with Javascript"
     heads = [1, 1, 4, 4, 1, 4, 5, 6]
+    deps = ["dep"] * len(heads)
     tokens = en_tokenizer(text)
-    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     assert doc[4].head.i == 1
     with doc.retokenize() as retokenizer:
         attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
spacy/tests/doc/test_retokenize_split.py
@@ -6,7 +6,8 @@ from spacy.tokens import Doc, Token
 def test_doc_retokenize_split(en_vocab):
     words = ["LosAngeles", "start", "."]
     heads = [1, 2, 2]
-    doc = Doc(en_vocab, words=words, heads=heads)
+    deps = ["dep"] * len(heads)
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert len(doc) == 3
     assert len(str(doc)) == 19
     assert doc[0].head.text == "start"
spacy/tests/doc/test_token_api.py
@@ -4,6 +4,8 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
+from spacy.tokens.token import MISSING_DEP_
+from spacy.training import Example
 
 
 @pytest.fixture
@@ -250,3 +252,23 @@ def test_token_api_non_conjuncts(en_vocab):
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[0].conjuncts] == []
     assert [w.text for w in doc[1].conjuncts] == []
+
+
+def test_missing_head_dep(en_vocab):
+    heads = [1, 1, 1, 1, 2, None]
+    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
+    pred_has_heads = [t.has_head() for t in doc]
+    pred_deps = [t.dep_ for t in doc]
+    assert pred_has_heads == [True, True, True, True, True, False]
+    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_heads = [t.head.i for t in example.reference]
+    ref_deps = [t.dep_ for t in example.reference]
+    ref_has_heads = [t.has_head() for t in example.reference]
+    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert ref_has_heads == [True, True, True, True, True, False]
+    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_heads[5] == ref_heads[5]
+    assert aligned_deps[5] == MISSING_DEP_
spacy/tests/parser/test_nonproj.py
@@ -121,7 +121,7 @@ def test_parser_pseudoprojectivity(en_vocab):
     assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
                              "nsubj", "acl", "punct"]
     # if there are two potential new heads, the first one is chosen even if
-    # it"s wrong
+    # it's wrong
     proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
     deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
                    "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
spacy/tests/training/test_new_example.py
@@ -263,3 +263,22 @@ def test_Example_from_dict_sentences():
     annots = {"sent_starts": [1, -1, 0, 0, 0]}
     ex = Example.from_dict(predicted, annots)
     assert len(list(ex.reference.sents)) == 1
+
+
+def test_Example_from_dict_with_parse():
+    vocab = Vocab()
+    words = ["I", "like", "London", "and", "Berlin", "."]
+    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
+    heads = [1, 1, 1, 2, 2, 1]
+    annots_head_only = {"words": words, "heads": heads}
+    annots_head_dep = {"words": words, "heads": heads, "deps": deps}
+    predicted = Doc(vocab, words=words)
+
+    # when deps are not provided, the head information is treated as missing;
+    # in that case, each token's head refers to the token itself
+    example_1 = Example.from_dict(predicted, annots_head_only)
+    assert [t.head.i for t in example_1.reference] == [0, 1, 2, 3, 4, 5]
+
+    # when deps are provided, the head information is used
+    example_2 = Example.from_dict(predicted, annots_head_dep)
+    assert [t.head.i for t in example_2.reference] == heads
spacy/tests/training/test_training.py
@@ -436,7 +436,8 @@ def test_gold_ner_missing_tags(en_tokenizer):
 def test_projectivize(en_tokenizer):
     doc = en_tokenizer("He pretty quickly walks away")
     heads = [3, 2, 3, 0, 2]
-    example = Example.from_dict(doc, {"heads": heads})
+    deps = ["dep"] * len(heads)
+    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
     proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
     nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
     assert proj_heads == [3, 2, 3, 0, 3]
spacy/tokens/doc.pyx
@@ -16,6 +16,7 @@ from thinc.util import copy_array
 import warnings
 
 from .span cimport Span
+from .token import MISSING_DEP_
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -266,7 +267,9 @@ cdef class Doc:
             self.push_back(lexeme, has_space)
 
         if heads is not None:
-            heads = [head - i if head is not None else None for i, head in enumerate(heads)]
+            heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
+        if deps is not None:
+            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
         if sent_starts is not None:
@@ -1040,8 +1043,7 @@ cdef class Doc:
                 # cast index to signed int
                 abs_head_index = <int32_t>values[col * stride + i]
                 abs_head_index += i
-                # abs_head_index -1 refers to missing value
-                if abs_head_index < -1 or abs_head_index >= length:
+                if abs_head_index < 0 or abs_head_index >= length:
                     raise ValueError(
                         Errors.E190.format(
                             index=i,
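Taken together, the doc.pyx hunks above change the storage scheme: a missing head becomes a self-offset of 0 at construction time, and from_array no longer accepts -1 (or any other negative absolute index) as a missing-value marker. A sketch of the resulting round-trip, assuming the constructor semantics shown in this diff:

    import pytest
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    words = ["a", "b"]
    doc = Doc(vocab, words=words, heads=[1, None], deps=["dep", None])
    arr = doc.to_array(["HEAD"])  # the missing head was stored as offset 0

    # round-trips cleanly
    Doc(vocab, words=words).from_array(["HEAD"], arr)

    # every negative absolute head index is now rejected
    arr[0] = -1
    with pytest.raises(ValueError):
        Doc(vocab, words=words).from_array(["HEAD"], arr)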
spacy/tokens/token.pyx
@@ -22,6 +22,8 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
 
+MISSING_DEP_ = ""
+
 
 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -638,17 +640,25 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)
 
+    def has_head(self):
+        """Check whether the token has annotated head information.
+
+        RETURNS (bool): Whether the head annotation is valid or not.
+        """
+        return self.dep_ != MISSING_DEP_
+
     property head:
         """The syntactic parent, or "governor", of this token.
+        If token.has_head() is `False`, this property returns the token itself.
 
         RETURNS (Token): The token predicted by the parser to be the head of
-            the current token. Returns None if unknown.
+            the current token.
         """
         def __get__(self):
-            head_i = self.i + self.c.head
-            if head_i == -1:
-                return None
-            return self.doc[head_i]
+            if not self.has_head():
+                return self
+            else:
+                return self.doc[self.i + self.c.head]
 
         def __set__(self, Token new_head):
             # This function sets the head of self to new_head and updates the
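Note that has_head() is defined in terms of the dependency label rather than the stored head offset: a token counts as having a head iff its dep_ is not the empty MISSING_DEP_ string, so a head supplied without a label is also treated as missing (which the Example tests above rely on). A short usage sketch under the API from this hunk:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.tokens.token import MISSING_DEP_

    doc = Doc(Vocab(), words=["Hello", "."], heads=[0, None], deps=["ROOT", None])
    token = doc[1]
    assert token.dep_ == MISSING_DEP_  # i.e. the empty string
    assert not token.has_head()
    assert token.head.i == token.i  # head now yields the token itself, not None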
spacy/training/example.pyx
@@ -11,8 +11,8 @@ from .alignment import Alignment
 from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
-from ..symbols import MISSING_LABEL
 from ..pipeline._parser_internals import nonproj
+from ..tokens.token import MISSING_DEP_
 from ..util import logger
 
 
@@ -180,18 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
-        heads = [token.head.i if token.head is not None else -1 for token in self.y]
+        has_heads = [token.has_head() for token in self.y]
+        heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
-            proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # don't touch the missing data
-            heads = [h if heads[i] != -1 else -1 for i, h in enumerate(proj_heads)]
-            MISSING = self.x.vocab.strings[MISSING_LABEL]
-            deps = [d if deps[i] != MISSING else MISSING for i, d in enumerate(proj_deps)]
+            heads, deps = nonproj.projectivize(heads, deps)
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
-                if heads[gold_i] != -1 and gold_to_cand.lengths[heads[gold_i]] == 1:
+                if gold_to_cand.lengths[heads[gold_i]] == 1:
                     aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
                     aligned_deps[cand_i] = deps[gold_i]
         return aligned_heads, aligned_deps
@@ -334,10 +331,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             pass
         elif key == "HEAD":
             attrs.append(key)
-            values.append([h-i if h is not None else -(i+1) for i, h in enumerate(value)])
+            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
         elif key == "DEP":
             attrs.append(key)
-            values.append([vocab.strings.add(h) if h is not None else MISSING_LABEL for h in value])
+            value = [v if v is not None else MISSING_DEP_ for v in value]
+            values.append([vocab.strings.add(h) for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)
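With missing values encoded as self-offsets and empty labels, get_aligned_parse no longer special-cases -1 around projectivization. A sketch of the end-to-end behavior this enables (the same invariant test_missing_head_dep asserts above):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    words = ["I", "like", "Berlin", "."]
    predicted = Doc(Vocab(), words=words)
    annots = {"heads": [1, 1, 1, None], "deps": ["nsubj", "ROOT", "dobj", None]}
    example = Example.from_dict(predicted, annots)
    heads, deps = example.get_aligned_parse(projectivize=True)
    assert heads[3] == 3  # the missing head survives projectivization as a self-loop
    assert deps[3] == ""  # i.e. MISSING_DEP_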