From ed53bb979d2b8e62577ff3d45140ca6a9a6bb1c5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 13 Jan 2021 14:20:05 +0100
Subject: [PATCH] cleanup

---
 .../pipeline/_parser_internals/arc_eager.pyx |  5 ++--
 spacy/tests/doc/test_token_api.py            | 30 ++++++++++++-------
 spacy/tests/parser/test_parse.py             |  2 +-
 spacy/tokens/doc.pyx                         |  3 +-
 spacy/tokens/token.pxd                       | 12 +++++---
 spacy/tokens/token.pyx                       | 15 ++++++----
 spacy/training/example.pyx                   | 10 +++----
 7 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 50b620b7a..f50f91f21 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...tokens.token import MISSING_DEP_
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else MISSING_DEP_ for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 0795080a5..dda28809d 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.tokens.token import MISSING_DEP_
 from spacy.training import Example
 
 
@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):
 
 
 def test_missing_head_dep(en_vocab):
-    heads = [1, 1, 1, 1, 2, None]
-    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    """Check that the Doc constructor and Example.from_dict handle missing information the same way."""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # elements 0 and 5 are missing
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
+    pred_heads = [t.head.i for t in doc]
     pred_deps = [t.dep_ for t in doc]
     pred_sent_starts = [t.is_sent_start for t in doc]
-    assert pred_has_heads == [True, True, True, True, True, False]
-    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
     assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
-    ref_has_heads = [t.has_head() for t in example.reference]
     ref_sent_starts = [t.is_sent_start for t in example.reference]
-    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
-    assert ref_has_heads == [True, True, True, True, True, False]
-    assert ref_sent_starts == [True, False, False, False, False, False]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_deps
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
     assert aligned_heads[5] == ref_heads[5]
-    assert aligned_deps[5] == MISSING_DEP_
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 5b68bbc37..dc878dd7a 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
     parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 221e78b2e..456d0a732 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from thinc.util import copy_array
 import warnings
 
 from .span cimport Span
-from .token import MISSING_DEP_
+from .token cimport MISSING_DEP
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
         if heads is not None:
             heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
         if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
            deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index 9006c874c..58b727764 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme
 
 from ..errors import Errors
 
+cdef int MISSING_DEP = 0
 
 cdef class Token:
     cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
         elif feat_name == SENT_START:
             token.sent_start = value
 
+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
+    @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
-        if token.dep == 0:
-            return 1
-        else:
-            return 0
+        return Token.missing_dep(token)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a6f9a2a0c..27aa30199 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -22,8 +22,6 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args
 
-MISSING_DEP_ = ""
-
 
 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)
 
-
     def has_head(self):
         """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.
 
         RETURNS (bool): Whether the head annotation is valid or not.
         """
-        return self.dep_ != MISSING_DEP_
-
+        return not Token.missing_head(self.c)
 
     property head:
         """The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)
 
+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Return False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 3303a8456..fe4ee6ff4 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
-from ..tokens.token import MISSING_DEP_
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger
 
 
@@ -180,14 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
         has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
             proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # ensure that data that was previously missing, remains missing
+            # ensure that missing data remains missing
             heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
-            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
         elif key == "DEP":
             attrs.append(key)
-            value = [v if v is not None else MISSING_DEP_ for v in value]
-            values.append([vocab.strings.add(h) for h in value])
+            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)
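
Usage note (not part of the patch): below is a minimal sketch of the behavior the test above pins down. It assumes a spaCy build that includes this branch; Token.has_dep() and the MISSING_DEP sentinel are introduced here, so the exact API may differ in released versions.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                    # head of token 5 is unset
    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # deps of tokens 0 and 5 are unset

    # The Doc constructor maps unset dep values to the MISSING_DEP sentinel
    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    print([(t.text, t.has_head(), t.has_dep()) for t in doc])
    # Tokens 0 and 5 report False for both checks. Token 0 was given a head
    # index, but missingness is tracked through the dep value: in this branch
    # Token.missing_head() simply delegates to Token.missing_dep().

    # Example.from_dict goes through the same conversion, so the reference
    # parse agrees, and get_aligned_parse() keeps missing entries missing
    # even after projectivization.
    example = Example.from_dict(doc, {"heads": heads, "deps": deps})
    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    assert aligned_heads[5] == example.reference[5].head.i
    assert aligned_deps[5] == example.reference[5].dep_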
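On the design choice: a single sentinel (MISSING_DEP = 0, which the doc.pyx change resolves back to a string via self.vocab.strings[MISSING_DEP]) marks both a missing label and a missing head, which is why an empty dep label like deps[0] above counts as missing even when a head index is supplied: an arc is only treated as annotated when it carries a label.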