This commit is contained in:
svlandeg 2021-01-13 14:20:05 +01:00
parent 86a4e316b8
commit ed53bb979d
7 changed files with 48 additions and 29 deletions

View File

@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
from ...strings cimport hash_string
from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token import MISSING_DEP_
from ...tokens.token cimport MISSING_DEP
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True)
labels = [label if label is not None else MISSING_DEP_ for label in labels]
labels = [example.x.vocab.strings.add(label) for label in labels]
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
sent_starts = example.get_aligned_sent_starts()
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

View File

@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
from spacy.symbols import VERB
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.tokens.token import MISSING_DEP_
from spacy.training import Example
@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):
def test_missing_head_dep(en_vocab):
heads = [1, 1, 1, 1, 2, None]
deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
""" Check that the Doc constructor and Example.from_dict parse missing information the same"""
heads = [1, 1, 1, 1, 2, None] # element 5 is missing
deps = ["", "ROOT", "dobj", "cc", "conj", None] # element 0 and 5 are missing
words = ["I", "like", "London", "and", "Berlin", "."]
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
pred_has_heads = [t.has_head() for t in doc]
pred_has_deps = [t.has_dep() for t in doc]
pred_heads = [t.head.i for t in doc]
pred_deps = [t.dep_ for t in doc]
pred_sent_starts = [t.is_sent_start for t in doc]
assert pred_has_heads == [True, True, True, True, True, False]
assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
assert pred_has_heads == [False, True, True, True, True, False]
assert pred_has_deps == [False, True, True, True, True, False]
assert pred_heads[1:5] == [1, 1, 1, 2]
assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
assert pred_sent_starts == [True, False, False, False, False, False]
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
ref_has_heads = [t.has_head() for t in example.reference]
ref_has_deps = [t.has_dep() for t in example.reference]
ref_heads = [t.head.i for t in example.reference]
ref_deps = [t.dep_ for t in example.reference]
ref_has_heads = [t.has_head() for t in example.reference]
ref_sent_starts = [t.is_sent_start for t in example.reference]
assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
assert ref_has_heads == [True, True, True, True, True, False]
assert ref_sent_starts == [True, False, False, False, False, False]
assert ref_has_heads == pred_has_heads
assert ref_has_deps == pred_has_heads
assert ref_heads == pred_heads
assert ref_deps == pred_deps
assert ref_sent_starts == pred_sent_starts
# check that the aligned parse preserves the missing information
aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
assert aligned_deps[0] == ref_deps[0]
assert aligned_heads[0] == ref_heads[0]
assert aligned_deps[5] == ref_deps[5]
assert aligned_heads[5] == ref_heads[5]
assert aligned_deps[5] == MISSING_DEP_

View File

@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
parser.add_label(dep)
optimizer = nlp.initialize()
# run overfitting
for i in range(150):
for i in range(200):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses[pipe_name] < 0.0001

View File

@@ -16,7 +16,7 @@ from thinc.util import copy_array
import warnings
from .span cimport Span
from .token import MISSING_DEP_
from .token cimport MISSING_DEP
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
if heads is not None:
heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
if deps is not None:
MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
if deps and not heads:
heads = [0] * len(deps)

View File

@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme
from ..errors import Errors
cdef int MISSING_DEP = 0
cdef class Token:
cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
elif feat_name == SENT_START:
token.sent_start = value
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
@staticmethod
cdef inline int missing_head(const TokenC* token) nogil:
if token.dep == 0:
return 1
else:
return 0
return Token.missing_dep(token)

View File

@@ -22,8 +22,6 @@ from .. import parts_of_speech
from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args
MISSING_DEP_ = ""
cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
return False
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
def has_head(self):
"""Check whether the token has annotated head information.
Return False when the head annotation is unset/missing.
RETURNS (bool): Whether the head annotation is valid or not.
"""
return self.dep_ != MISSING_DEP_
return not Token.missing_head(self.c)
property head:
"""The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
def has_dep(self):
"""Check whether the token has annotated dep information.
Returns False when the dep label is unset/missing.
RETURNS (bool): Whether the dep label is valid or not.
"""
return not Token.missing_dep(self.c)
property dep_:
"""RETURNS (str): The syntactic dependency label."""
def __get__(self):

View File

@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj
from ..tokens.token import MISSING_DEP_
from ..tokens.token cimport MISSING_DEP
from ..util import logger
@@ -180,14 +180,15 @@ cdef class Example:
gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y]
heads = [token.head.i for token in self.y]
deps = [token.dep_ for token in self.y]
if projectivize:
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
# ensure that data that was previously missing, remains missing
# ensure that missing data remains missing
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
for cand_i in range(self.x.length):
if cand_to_gold.lengths[cand_i] == 1:
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
elif key == "DEP":
attrs.append(key)
value = [v if v is not None else MISSING_DEP_ for v in value]
values.append([vocab.strings.add(h) for h in value])
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
elif key == "SENT_START":
attrs.append(key)
values.append(value)