From ed53bb979d2b8e62577ff3d45140ca6a9a6bb1c5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 13 Jan 2021 14:20:05 +0100
Subject: [PATCH] cleanup

---
 .../pipeline/_parser_internals/arc_eager.pyx |  5 ++--
 spacy/tests/doc/test_token_api.py            | 30 ++++++++++++-------
 spacy/tests/parser/test_parse.py             |  2 +-
 spacy/tokens/doc.pyx                         |  3 +-
 spacy/tokens/token.pxd                       | 12 +++++---
 spacy/tokens/token.pyx                       | 15 ++++++----
 spacy/training/example.pyx                   | 10 +++----
 7 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 50b620b7a..f50f91f21 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...tokens.token import MISSING_DEP_
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else MISSING_DEP_ for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 0795080a5..dda28809d 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.tokens.token import MISSING_DEP_
 from spacy.training import Example
 
 
@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):
 
 
 def test_missing_head_dep(en_vocab):
-    heads = [1, 1, 1, 1, 2, None]
-    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    """Check that the Doc constructor and Example.from_dict handle missing information the same way."""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # elements 0 and 5 are missing
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
+    pred_heads = [t.head.i for t in doc]
     pred_deps = [t.dep_ for t in doc]
     pred_sent_starts = [t.is_sent_start for t in doc]
-    assert pred_has_heads == [True, True, True, True, True, False]
-    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
     assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
-    ref_has_heads = [t.has_head() for t in example.reference]
     ref_sent_starts = [t.is_sent_start for t in example.reference]
-    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
-    assert ref_has_heads == [True, True, True, True, True, False]
-    assert ref_sent_starts == [True, False, False, False, False, False]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_deps
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
     assert aligned_heads[5] == ref_heads[5]
-    assert aligned_deps[5] == MISSING_DEP_
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 5b68bbc37..dc878dd7a 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
     parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 221e78b2e..456d0a732 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from thinc.util import copy_array
 import warnings
 
 from .span cimport Span
-from .token import MISSING_DEP_
+from .token cimport MISSING_DEP
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
         if heads is not None:
             heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
         if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 9006c874c..58b727764 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme
 
 from ..errors import Errors
 
+cdef int MISSING_DEP = 0
 
 cdef class Token:
     cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
         elif feat_name == SENT_START:
             token.sent_start = value
 
+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
+    @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
-        if token.dep == 0:
-            return 1
-        else:
-            return 0
+        return Token.missing_dep(token)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a6f9a2a0c..27aa30199 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -22,8 +22,6 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
 
-MISSING_DEP_ = ""
-
 
 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)
 
-
     def has_head(self):
         """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.
 
         RETURNS (bool): Whether the head annotation is valid or not.
         """
-        return self.dep_ != MISSING_DEP_
-
+        return not Token.missing_head(self.c)
 
     property head:
         """The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)
 
+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Return False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3303a8456..fe4ee6ff4 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
-from ..tokens.token import MISSING_DEP_
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger
 
 
@@ -180,14 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
         has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
             proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # ensure that data that was previously missing, remains missing
+            # ensure that missing data remains missing
             heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
-            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
         elif key == "DEP":
             attrs.append(key)
-            value = [v if v is not None else MISSING_DEP_ for v in value]
-            values.append([vocab.strings.add(h) for h in value])
+            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)
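
Usage note (not part of the patch): below is a minimal sketch of the behavior the test above pins down. It assumes a spaCy build that includes this branch; Token.has_dep() and the MISSING_DEP sentinel are introduced here, so the exact API may differ in released versions.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                    # head of token 5 is unset
    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # deps of tokens 0 and 5 are unset

    # The Doc constructor maps unset dep values to the MISSING_DEP sentinel
    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    print([(t.text, t.has_head(), t.has_dep()) for t in doc])
    # Tokens 0 and 5 report False for both checks. Token 0 was given a head
    # index, but missingness is tracked through the dep value: in this branch
    # Token.missing_head() simply delegates to Token.missing_dep().

    # Example.from_dict goes through the same conversion, so the reference
    # parse agrees, and get_aligned_parse() keeps missing entries missing
    # even after projectivization.
    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    assert aligned_heads[5] == example.reference[5].head.i
    assert aligned_deps[5] == example.reference[5].dep_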
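On the design choice: a single sentinel (MISSING_DEP = 0, which the doc.pyx change resolves back to a string via self.vocab.strings[MISSING_DEP]) marks both a missing label and a missing head, which is why an empty dep label like deps[0] above counts as missing even when a head index is supplied: an arc is only treated as annotated when it carries a label.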