introduce token.has_head and refer to MISSING_DEP_ (WIP)

This commit is contained in:
svlandeg 2021-01-12 17:17:06 +01:00
parent dd12c6c8fd
commit a581d82f33
12 changed files with 83 additions and 33 deletions

View File

@ -467,4 +467,3 @@ cdef enum symbol_t:
IDX IDX
_ _
MISSING_LABEL

View File

@ -466,7 +466,6 @@ IDS = {
"LAW": LAW, "LAW": LAW,
"MORPH": MORPH, "MORPH": MORPH,
"_": _, "_": _,
"MISSING_LABEL": MISSING_LABEL,
} }

View File

@ -98,16 +98,10 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
doc_from_array = Doc(en_vocab, words=words) doc_from_array = Doc(en_vocab, words=words)
doc_from_array.from_array(["HEAD"], arr) doc_from_array.from_array(["HEAD"], arr)
# head before start is used to denote a missing value # head before start
arr = doc.to_array(["HEAD"]) arr = doc.to_array(["HEAD"])
arr[0] = -1 arr[0] = -1
doc_from_array = Doc(en_vocab, words=words) doc_from_array = Doc(en_vocab, words=words)
doc_from_array.from_array(["HEAD"], arr)
# other negative values are invalid
arr = doc.to_array(["HEAD"])
arr[0] = -2
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError): with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr) doc_from_array.from_array(["HEAD"], arr)

View File

@ -89,8 +89,9 @@ def test_doc_retokenize_lex_attrs(en_tokenizer):
def test_doc_retokenize_spans_merge_tokens(en_tokenizer): def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
text = "Los Angeles start." text = "Los Angeles start."
heads = [1, 2, 2, 2] heads = [1, 2, 2, 2]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert len(doc) == 4 assert len(doc) == 4
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
@ -145,7 +146,8 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
def test_doc_retokenize_spans_merge_heads(en_vocab): def test_doc_retokenize_spans_merge_heads(en_vocab):
words = ["I", "found", "a", "pilates", "class", "near", "work", "."] words = ["I", "found", "a", "pilates", "class", "near", "work", "."]
heads = [1, 1, 4, 6, 1, 4, 5, 1] heads = [1, 1, 4, 6, 1, 4, 5, 1]
doc = Doc(en_vocab, words=words, heads=heads) deps = ["dep"] * len(heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert len(doc) == 8 assert len(doc) == 8
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"} attrs = {"tag": doc[4].tag_, "lemma": "pilates class", "ent_type": "O"}
@ -177,8 +179,9 @@ def test_doc_retokenize_spans_merge_non_disjoint(en_tokenizer):
def test_doc_retokenize_span_np_merges(en_tokenizer): def test_doc_retokenize_span_np_merges(en_tokenizer):
text = "displaCy is a parse tool built with Javascript" text = "displaCy is a parse tool built with Javascript"
heads = [1, 1, 4, 4, 1, 4, 5, 6] heads = [1, 1, 4, 4, 1, 4, 5, 6]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert doc[4].head.i == 1 assert doc[4].head.i == 1
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"} attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}

View File

@ -6,7 +6,8 @@ from spacy.tokens import Doc, Token
def test_doc_retokenize_split(en_vocab): def test_doc_retokenize_split(en_vocab):
words = ["LosAngeles", "start", "."] words = ["LosAngeles", "start", "."]
heads = [1, 2, 2] heads = [1, 2, 2]
doc = Doc(en_vocab, words=words, heads=heads) deps = ["dep"] * len(heads)
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert len(doc) == 3 assert len(doc) == 3
assert len(str(doc)) == 19 assert len(str(doc)) == 19
assert doc[0].head.text == "start" assert doc[0].head.text == "start"

View File

@ -4,6 +4,8 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
from spacy.symbols import VERB from spacy.symbols import VERB
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.tokens.token import MISSING_DEP_
from spacy.training import Example
@pytest.fixture @pytest.fixture
@ -250,3 +252,23 @@ def test_token_api_non_conjuncts(en_vocab):
doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
assert [w.text for w in doc[0].conjuncts] == [] assert [w.text for w in doc[0].conjuncts] == []
assert [w.text for w in doc[1].conjuncts] == [] assert [w.text for w in doc[1].conjuncts] == []
def test_missing_head_dep(en_vocab):
    """Check that a ``None`` head/dep annotation is exposed as missing.

    The last token is annotated with ``None`` for both its head and its
    dependency label. The test verifies, for a Doc built directly and for the
    reference Doc of an Example built from the same annotations, that
    ``has_head()`` is False and ``dep_`` equals ``MISSING_DEP_`` for that token
    only, and that the aligned parse keeps the missing label.
    """
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
    expected_has_heads = [True, True, True, True, True, False]
    expected_deps = ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
    missing_i = 5  # index of the token without head/dep annotation

    # Annotations handed straight to the Doc constructor.
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    assert [token.has_head() for token in doc] == expected_has_heads
    assert [token.dep_ for token in doc] == expected_deps

    # The same annotations supplied through Example.from_dict.
    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
    reference_head_indices = [token.head.i for token in example.reference]
    assert [token.dep_ for token in example.reference] == expected_deps
    assert [token.has_head() for token in example.reference] == expected_has_heads

    # The aligned parse must agree with the reference for the missing token.
    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    assert aligned_heads[missing_i] == reference_head_indices[missing_i]
    assert aligned_deps[missing_i] == MISSING_DEP_

View File

@ -121,7 +121,7 @@ def test_parser_pseudoprojectivity(en_vocab):
assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl", "punct"] "nsubj", "acl", "punct"]
# if there are two potential new heads, the first one is chosen even if # if there are two potential new heads, the first one is chosen even if
# it"s wrong # it's wrong
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det", deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
"dobj", "det", "nmod", "aux", "nmod||dobj", "advmod", "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",

View File

@ -263,3 +263,22 @@ def test_Example_from_dict_sentences():
annots = {"sent_starts": [1, -1, 0, 0, 0]} annots = {"sent_starts": [1, -1, 0, 0, 0]}
ex = Example.from_dict(predicted, annots) ex = Example.from_dict(predicted, annots)
assert len(list(ex.reference.sents)) == 1 assert len(list(ex.reference.sents)) == 1
def test_Example_from_dict_with_parse():
    """Check how Example.from_dict treats heads with and without deps.

    With only "heads" in the annotations, every reference token is expected
    to be its own head; with both "heads" and "deps", the reference heads are
    expected to match the annotated heads.
    """
    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 2, 2, 1]
    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"]
    annots_head_only = {"words": words, "heads": heads}
    annots_head_dep = {"words": words, "heads": heads, "deps": deps}
    predicted = Doc(Vocab(), words=words)

    def reference_head_indices(annots):
        # Build an Example and read back the head index of each reference token.
        example = Example.from_dict(predicted, annots)
        return [token.head.i for token in example.reference]

    # when not providing deps, the head information is considered to be missing
    # in this case, the token's heads refer to themselves
    assert reference_head_indices(annots_head_only) == list(range(len(words)))
    # when providing deps, the head information is actually used
    assert reference_head_indices(annots_head_dep) == heads

View File

@ -436,7 +436,8 @@ def test_gold_ner_missing_tags(en_tokenizer):
def test_projectivize(en_tokenizer): def test_projectivize(en_tokenizer):
doc = en_tokenizer("He pretty quickly walks away") doc = en_tokenizer("He pretty quickly walks away")
heads = [3, 2, 3, 0, 2] heads = [3, 2, 3, 0, 2]
example = Example.from_dict(doc, {"heads": heads}) deps = ["dep"] * len(heads)
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True) proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False) nonproj_heads, nonproj_labels = example.get_aligned_parse(projectivize=False)
assert proj_heads == [3, 2, 3, 0, 3] assert proj_heads == [3, 2, 3, 0, 3]

View File

@ -16,6 +16,7 @@ from thinc.util import copy_array
import warnings import warnings
from .span cimport Span from .span cimport Span
from .token import MISSING_DEP_
from .token cimport Token from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
@ -266,7 +267,9 @@ cdef class Doc:
self.push_back(lexeme, has_space) self.push_back(lexeme, has_space)
if heads is not None: if heads is not None:
heads = [head - i if head is not None else None for i, head in enumerate(heads)] heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
if deps is not None:
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
if deps and not heads: if deps and not heads:
heads = [0] * len(deps) heads = [0] * len(deps)
if sent_starts is not None: if sent_starts is not None:
@ -1040,8 +1043,7 @@ cdef class Doc:
# cast index to signed int # cast index to signed int
abs_head_index = <int32_t>values[col * stride + i] abs_head_index = <int32_t>values[col * stride + i]
abs_head_index += i abs_head_index += i
# abs_head_index -1 refers to missing value if abs_head_index < 0 or abs_head_index >= length:
if abs_head_index < -1 or abs_head_index >= length:
raise ValueError( raise ValueError(
Errors.E190.format( Errors.E190.format(
index=i, index=i,

View File

@ -22,6 +22,8 @@ from .. import parts_of_speech
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
MISSING_DEP_ = ""
cdef class Token: cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, """An individual token i.e. a word, punctuation symbol, whitespace,
@ -638,17 +640,27 @@ cdef class Token:
return False return False
return any(ancestor.i == self.i for ancestor in descendant.ancestors) return any(ancestor.i == self.i for ancestor in descendant.ancestors)
def has_head(self):
    """Tell whether this token carries head annotation.

    The decision is based on the dependency label: the head counts as
    annotated when `dep_` differs from the MISSING_DEP_ placeholder.

    RETURNS (bool): True if the dependency label is not MISSING_DEP_,
        False otherwise.
    """
    dep_is_missing = self.dep_ == MISSING_DEP_
    return not dep_is_missing
property head: property head:
"""The syntactic parent, or "governor", of this token. """The syntactic parent, or "governor", of this token.
If token.has_head() is `False`, this property returns the token itself.
RETURNS (Token): The token predicted by the parser to be the head of RETURNS (Token): The token predicted by the parser to be the head of
the current token. Returns None if unknown. the current token.
""" """
def __get__(self): def __get__(self):
head_i = self.i + self.c.head if not self.has_head():
if head_i == -1: return self
return None else:
return self.doc[head_i] return self.doc[self.i + self.c.head]
def __set__(self, Token new_head): def __set__(self, Token new_head):
# This function sets the head of self to new_head and updates the # This function sets the head of self to new_head and updates the

View File

@ -11,8 +11,8 @@ from .alignment import Alignment
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..symbols import MISSING_LABEL
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..tokens.token import MISSING_DEP_
from ..util import logger from ..util import logger
@ -180,18 +180,15 @@ cdef class Example:
gold_to_cand = self.alignment.y2x gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length aligned_deps = [None] * self.x.length
heads = [token.head.i if token.head is not None else -1 for token in self.y] has_heads = [token.has_head() for token in self.y]
heads = [token.head.i for token in self.y]
deps = [token.dep_ for token in self.y] deps = [token.dep_ for token in self.y]
if projectivize: if projectivize:
proj_heads, proj_deps = nonproj.projectivize(heads, deps) heads, deps = nonproj.projectivize(heads, deps)
# don't touch the missing data
heads = [h if heads[i] != -1 else -1 for i, h in enumerate(proj_heads)]
MISSING = self.x.vocab.strings[MISSING_LABEL]
deps = [d if deps[i] != MISSING else MISSING for i, d in enumerate(proj_deps)]
for cand_i in range(self.x.length): for cand_i in range(self.x.length):
if cand_to_gold.lengths[cand_i] == 1: if cand_to_gold.lengths[cand_i] == 1:
gold_i = cand_to_gold[cand_i].dataXd[0, 0] gold_i = cand_to_gold[cand_i].dataXd[0, 0]
if heads[gold_i] != -1 and gold_to_cand.lengths[heads[gold_i]] == 1: if gold_to_cand.lengths[heads[gold_i]] == 1:
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0]) aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
aligned_deps[cand_i] = deps[gold_i] aligned_deps[cand_i] = deps[gold_i]
return aligned_heads, aligned_deps return aligned_heads, aligned_deps
@ -334,10 +331,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
pass pass
elif key == "HEAD": elif key == "HEAD":
attrs.append(key) attrs.append(key)
values.append([h-i if h is not None else -(i+1) for i, h in enumerate(value)]) values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
elif key == "DEP": elif key == "DEP":
attrs.append(key) attrs.append(key)
values.append([vocab.strings.add(h) if h is not None else MISSING_LABEL for h in value]) value = [v if v is not None else MISSING_DEP_ for v in value]
values.append([vocab.strings.add(h) for h in value])
elif key == "SENT_START": elif key == "SENT_START":
attrs.append(key) attrs.append(key)
values.append(value) values.append(value)