This commit is contained in:
svlandeg 2021-01-13 14:20:05 +01:00
parent 86a4e316b8
commit ed53bb979d
7 changed files with 48 additions and 29 deletions

View File

@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
from ...strings cimport hash_string from ...strings cimport hash_string
from ...structs cimport TokenC from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token import MISSING_DEP_ from ...tokens.token cimport MISSING_DEP
from ...training.example cimport Example from ...training.example cimport Example
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC, ArcC from ._state cimport StateC, ArcC
@ -196,8 +196,7 @@ cdef class ArcEagerGold:
def __init__(self, ArcEager moves, StateClass stcls, Example example): def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool() self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True) heads, labels = example.get_aligned_parse(projectivize=True)
labels = [label if label is not None else MISSING_DEP_ for label in labels] labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
labels = [example.x.vocab.strings.add(label) for label in labels]
sent_starts = example.get_aligned_sent_starts() sent_starts = example.get_aligned_sent_starts()
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

View File

@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
from spacy.symbols import VERB from spacy.symbols import VERB
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.tokens.token import MISSING_DEP_
from spacy.training import Example from spacy.training import Example
@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):
def test_missing_head_dep(en_vocab): def test_missing_head_dep(en_vocab):
heads = [1, 1, 1, 1, 2, None] """ Check that the Doc constructor and Example.from_dict parse missing information the same"""
deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None] heads = [1, 1, 1, 1, 2, None] # element 5 is missing
deps = ["", "ROOT", "dobj", "cc", "conj", None] # element 0 and 5 are missing
words = ["I", "like", "London", "and", "Berlin", "."] words = ["I", "like", "London", "and", "Berlin", "."]
doc = Doc(en_vocab, words=words, heads=heads, deps=deps) doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
pred_has_heads = [t.has_head() for t in doc] pred_has_heads = [t.has_head() for t in doc]
pred_has_deps = [t.has_dep() for t in doc]
pred_heads = [t.head.i for t in doc]
pred_deps = [t.dep_ for t in doc] pred_deps = [t.dep_ for t in doc]
pred_sent_starts = [t.is_sent_start for t in doc] pred_sent_starts = [t.is_sent_start for t in doc]
assert pred_has_heads == [True, True, True, True, True, False] assert pred_has_heads == [False, True, True, True, True, False]
assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_] assert pred_has_deps == [False, True, True, True, True, False]
assert pred_heads[1:5] == [1, 1, 1, 2]
assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
assert pred_sent_starts == [True, False, False, False, False, False] assert pred_sent_starts == [True, False, False, False, False, False]
example = Example.from_dict(doc, {"heads": heads, "deps": deps}) example = Example.from_dict(doc, {"heads": heads, "deps": deps})
ref_has_heads = [t.has_head() for t in example.reference]
ref_has_deps = [t.has_dep() for t in example.reference]
ref_heads = [t.head.i for t in example.reference] ref_heads = [t.head.i for t in example.reference]
ref_deps = [t.dep_ for t in example.reference] ref_deps = [t.dep_ for t in example.reference]
ref_has_heads = [t.has_head() for t in example.reference]
ref_sent_starts = [t.is_sent_start for t in example.reference] ref_sent_starts = [t.is_sent_start for t in example.reference]
assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_] assert ref_has_heads == pred_has_heads
assert ref_has_heads == [True, True, True, True, True, False] assert ref_has_deps == pred_has_heads
assert ref_sent_starts == [True, False, False, False, False, False] assert ref_heads == pred_heads
assert ref_deps == pred_deps
assert ref_sent_starts == pred_sent_starts
# check that the aligned parse preserves the missing information
aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True) aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
assert aligned_deps[0] == ref_deps[0]
assert aligned_heads[0] == ref_heads[0]
assert aligned_deps[5] == ref_deps[5]
assert aligned_heads[5] == ref_heads[5] assert aligned_heads[5] == ref_heads[5]
assert aligned_deps[5] == MISSING_DEP_

View File

@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
parser.add_label(dep) parser.add_label(dep)
optimizer = nlp.initialize() optimizer = nlp.initialize()
# run overfitting # run overfitting
for i in range(150): for i in range(200):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses[pipe_name] < 0.0001 assert losses[pipe_name] < 0.0001

View File

@ -16,7 +16,7 @@ from thinc.util import copy_array
import warnings import warnings
from .span cimport Span from .span cimport Span
from .token import MISSING_DEP_ from .token cimport MISSING_DEP
from .token cimport Token from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
@ -269,6 +269,7 @@ cdef class Doc:
if heads is not None: if heads is not None:
heads = [head - i if head is not None else 0 for i, head in enumerate(heads)] heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
if deps is not None: if deps is not None:
MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps] deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
if deps and not heads: if deps and not heads:
heads = [0] * len(deps) heads = [0] * len(deps)

View File

@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme
from ..errors import Errors from ..errors import Errors
cdef int MISSING_DEP = 0
cdef class Token: cdef class Token:
cdef readonly Vocab vocab cdef readonly Vocab vocab
@ -95,9 +96,12 @@ cdef class Token:
elif feat_name == SENT_START: elif feat_name == SENT_START:
token.sent_start = value token.sent_start = value
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
@staticmethod @staticmethod
cdef inline int missing_head(const TokenC* token) nogil: cdef inline int missing_head(const TokenC* token) nogil:
if token.dep == 0: return Token.missing_dep(token)
return 1
else:
return 0

View File

@ -22,8 +22,6 @@ from .. import parts_of_speech
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
MISSING_DEP_ = ""
cdef class Token: cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, """An individual token i.e. a word, punctuation symbol, whitespace,
@ -640,14 +638,13 @@ cdef class Token:
return False return False
return any(ancestor.i == self.i for ancestor in descendant.ancestors) return any(ancestor.i == self.i for ancestor in descendant.ancestors)
def has_head(self): def has_head(self):
"""Check whether the token has annotated head information. """Check whether the token has annotated head information.
Return False when the head annotation is unset/missing.
RETURNS (bool): Whether the head annotation is valid or not. RETURNS (bool): Whether the head annotation is valid or not.
""" """
return self.dep_ != MISSING_DEP_ return not Token.missing_head(self.c)
property head: property head:
"""The syntactic parent, or "governor", of this token. """The syntactic parent, or "governor", of this token.
@ -873,6 +870,14 @@ cdef class Token:
def __set__(self, tag): def __set__(self, tag):
self.tag = self.vocab.strings.add(tag) self.tag = self.vocab.strings.add(tag)
def has_dep(self):
"""Check whether the token has annotated dep information.
Returns False when the dep label is unset/missing.
RETURNS (bool): Whether the dep label is valid or not.
"""
return not Token.missing_dep(self.c)
property dep_: property dep_:
"""RETURNS (str): The syntactic dependency label.""" """RETURNS (str): The syntactic dependency label."""
def __get__(self): def __get__(self):

View File

@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..tokens.token import MISSING_DEP_ from ..tokens.token cimport MISSING_DEP
from ..util import logger from ..util import logger
@ -180,14 +180,15 @@ cdef class Example:
gold_to_cand = self.alignment.y2x gold_to_cand = self.alignment.y2x
aligned_heads = [None] * self.x.length aligned_heads = [None] * self.x.length
aligned_deps = [None] * self.x.length aligned_deps = [None] * self.x.length
has_deps = [token.has_dep() for token in self.y]
has_heads = [token.has_head() for token in self.y] has_heads = [token.has_head() for token in self.y]
heads = [token.head.i for token in self.y] heads = [token.head.i for token in self.y]
deps = [token.dep_ for token in self.y] deps = [token.dep_ for token in self.y]
if projectivize: if projectivize:
proj_heads, proj_deps = nonproj.projectivize(heads, deps) proj_heads, proj_deps = nonproj.projectivize(heads, deps)
# ensure that data that was previously missing, remains missing # ensure that missing data remains missing
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)] heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)] deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
for cand_i in range(self.x.length): for cand_i in range(self.x.length):
if cand_to_gold.lengths[cand_i] == 1: if cand_to_gold.lengths[cand_i] == 1:
gold_i = cand_to_gold[cand_i].dataXd[0, 0] gold_i = cand_to_gold[cand_i].dataXd[0, 0]
@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
elif key == "DEP": elif key == "DEP":
attrs.append(key) attrs.append(key)
value = [v if v is not None else MISSING_DEP_ for v in value] values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
values.append([vocab.strings.add(h) for h in value])
elif key == "SENT_START": elif key == "SENT_START":
attrs.append(key) attrs.append(key)
values.append(value) values.append(value)