mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

commit ed53bb979d
parent 86a4e316b8

cleanup
spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...tokens.token import MISSING_DEP_
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else MISSING_DEP_ for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
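
For orientation, a minimal sketch (not part of the diff) of the encoding idea the hunk above switches to: gold labels are interned to 64-bit hash IDs in one pass, and a missing label maps straight to the C-level MISSING_DEP sentinel (0) instead of first being replaced by the empty string. MISSING_DEP is restated locally here; in this branch it lives in spacy/tokens/token.pxd.

    from spacy.vocab import Vocab

    MISSING_DEP = 0  # sentinel from token.pxd, restated for illustration

    vocab = Vocab()
    labels = ["nsubj", "ROOT", None]  # None marks a missing gold label
    encoded = [vocab.strings.add(label) if label is not None else MISSING_DEP
               for label in labels]
    print(encoded)  # two hash IDs followed by 0 for the missing label
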
spacy/tests/doc/test_token_api.py
@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.tokens.token import MISSING_DEP_
 from spacy.training import Example


@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):


 def test_missing_head_dep(en_vocab):
-    heads = [1, 1, 1, 1, 2, None]
-    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    """ Check that the Doc constructor and Example.from_dict parse missing information the same"""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # element 0 and 5 are missing
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
     pred_heads = [t.head.i for t in doc]
     pred_deps = [t.dep_ for t in doc]
     pred_sent_starts = [t.is_sent_start for t in doc]
-    assert pred_has_heads == [True, True, True, True, True, False]
-    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
     assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
-    ref_has_heads = [t.has_head() for t in example.reference]
     ref_sent_starts = [t.is_sent_start for t in example.reference]
-    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
-    assert ref_has_heads == [True, True, True, True, True, False]
-    assert ref_sent_starts == [True, False, False, False, False, False]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_heads
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
-    assert aligned_deps[5] == MISSING_DEP_
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
+    assert aligned_heads[5] == ref_heads[5]
spacy/tests/parser/test_parse.py
@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
         parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001
spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
-from .token import MISSING_DEP_
+from .token cimport MISSING_DEP
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
         if heads is not None:
             heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
         if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
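
As a usage sketch of what the hunk above enables (mirroring the new test earlier in this diff; assumes spaCy v3 with this branch): `None` entries in `deps` are mapped to the MISSING_DEP sentinel at construction time, and the new predicates report those tokens as missing.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                    # head of token 5 is unknown
    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # deps of tokens 0 and 5 are unknown

    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    print([t.has_dep() for t in doc])   # [False, True, True, True, True, False]
    print([t.has_head() for t in doc])  # [False, True, True, True, True, False]
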
spacy/tokens/token.pxd
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme

 from ..errors import Errors

+cdef int MISSING_DEP = 0

 cdef class Token:
     cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
         elif feat_name == SENT_START:
             token.sent_start = value

+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
     @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
-        if token.dep == 0:
-            return 1
-        else:
-            return 0
+        return Token.missing_dep(token)
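
A pure-Python model of the two helpers above, for illustration only (the real implementations are the nogil Cython functions in the hunk): both predicates reduce to one integer comparison against the sentinel, since a head is only considered annotated when a dependency label is.

    MISSING_DEP = 0  # matches the cdef int sentinel above

    def missing_dep(dep_hash: int) -> bool:
        # the dep is missing iff it still holds the sentinel value
        return dep_hash == MISSING_DEP

    def missing_head(dep_hash: int) -> bool:
        # head annotation is treated as missing exactly when the dep is
        return missing_dep(dep_hash)

    assert missing_head(MISSING_DEP)
    assert not missing_dep(12345)
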
spacy/tokens/token.pyx
@@ -22,8 +22,6 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args

-MISSING_DEP_ = ""
-

 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

     def has_head(self):
         """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.

         RETURNS (bool): Whether the head annotation is valid or not.
         """
-        return self.dep_ != MISSING_DEP_
-
+        return not Token.missing_head(self.c)

     property head:
         """The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)

+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Returns False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):
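
Behaviorally, `has_dep()` matches the old `self.dep_ != MISSING_DEP_` string comparison, because the sentinel hash 0 interns to the empty string; the cleanup just turns it into an integer check in C. A small sketch (assumes spaCy v3 with this branch):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=["hello", "world"], heads=[1, None], deps=["intj", None])
    token = doc[1]
    print(token.dep_)       # '' – the sentinel read back through the StringStore
    print(token.has_dep())  # False
    print(token.has_head()) # False: head and dep are missing together
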
spacy/training/example.pyx
@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
-from ..tokens.token import MISSING_DEP_
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger


@@ -180,14 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
         has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
             proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # ensure that data that was previously missing, remains missing
+            # ensure that missing data remains missing
             heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
-            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
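
A sketch of the round trip the has_deps/has_heads guards protect (assumes spaCy v3 with this branch): projectivization may rewrite heads and labels, but entries that were missing in the reference stay missing in the aligned parse.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]
    deps = ["", "ROOT", "dobj", "cc", "conj", None]
    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    example = Example.from_dict(doc, {"heads": heads, "deps": deps})

    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    # tokens 0 and 5 were missing in the reference and are still missing here
    assert aligned_deps[0] == example.reference[0].dep_
    assert aligned_deps[5] == example.reference[5].dep_
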
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
         elif key == "DEP":
             attrs.append(key)
-            value = [v if v is not None else MISSING_DEP_ for v in value]
-            values.append([vocab.strings.add(h) for h in value])
+            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)