From c6b12ab02adcdfe760bc10e249924553cb826410 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Mar 2020 11:49:28 +0100 Subject: [PATCH] Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method --- spacy/errors.py | 4 ++ spacy/pipeline/pipes.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 9 +-- spacy/tests/doc/test_token_api.py | 2 +- spacy/tests/parser/test_parse_navigate.py | 32 +++++----- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue4590.py | 2 +- spacy/tests/regression/test_issue5048.py | 35 +++++++++++ spacy/tests/test_displacy.py | 10 ++-- spacy/tests/util.py | 58 ++++++++++++++----- spacy/tokens/doc.pyx | 4 +- 12 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 spacy/tests/regression/test_issue5048.py diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..5957c5ecd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -107,6 +107,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Doc.from_array was called with a vector of type '{type}', " + "but is expecting one of type 'uint64' instead. This may result " + "in problems with the vocab further on in the pipeline.") @@ -541,6 +544,7 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + E189 = ("Each argument to `get_doc` should be of equal length.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3b190debe..a20c9b6df 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -367,7 +367,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
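The W028 warning added in `spacy/errors.py` above is emitted by `Doc.from_array` (see the `doc.pyx` hunk below) whenever the supplied array is not of dtype `uint64`, since `StringStore` hashes are unsigned 64-bit integers and any other dtype can corrupt them before they reach the vocab. A minimal sketch of the dtype contract, not part of the patch; the words and tags are illustrative and any `Vocab` will do:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["This", "is", "a", "sentence"])
# StringStore hashes are unsigned 64-bit integers.
tag_ids = [vocab.strings.add(tag) for tag in ["DT", "VBZ", "DT", "NN"]]
# uint64 round-trips the hashes exactly, so no warning is raised.
doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
assert [t.tag_ for t in doc] == ["DT", "VBZ", "DT", "NN"]
# An array built with e.g. dtype="float32" would now emit W028 instead
# of silently mangling the hash values further down the pipeline.
```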
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 52f856d3e..19d908529 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", - "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", - "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", - "ROOT", "amod", "dobj"] + deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det", + "amod", "pobj", "acl", "prep", "prep", "pobj", + "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) @@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + # fmt: off deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + # fmt: on doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..b7522bb98 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab): def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] - deps = ["nsubj", "ROOT", "cc", "conj"] + deps = ["nsubj", "ROOT", "cc", "conj", "dep"] doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..41524d45e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, - -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6, - 9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1, - 2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2, - 1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, + return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, + -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, + -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, + 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, + 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, + 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, + 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, + 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, + -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, + -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, + -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, + 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, + 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, + -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, - -1, -8, -9, -1] + 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, + -1, 0, -1, -1] # fmt: on diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..01f0f905c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -48,7 +48,7 @@ def test_issue2203(en_vocab): tag_ids = [en_vocab.strings.add(tag) for tag in tags] lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] doc = Doc(en_vocab, words=words) - # Work around lemma corrpution problem and set lemmas after tags + # Work around lemma corruption problem and set lemmas after tags doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) assert [t.tag_ for t in doc] == tags diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..1f5e44499 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -124,7 +124,7 @@ def test_issue2772(en_vocab): words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py
index 8ec9a0bd1..3d01cd487 100644
--- a/spacy/tests/regression/test_issue4590.py
+++ b/spacy/tests/regression/test_issue4590.py
@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):
     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py
new file mode 100644
index 000000000..228322493
--- /dev/null
+++ b/spacy/tests/regression/test_issue5048.py
@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index d04c0506f..539714e0c 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 9ee5b89f8..52768dd41 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path
+
+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str
@@ -26,30 +28,54 @@ def make_tempdir():
     shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)
     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
-    if tags:
-        for token in doc:
-            token.tag_ = tags[token.i]
     return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 63495ec86..11f1ddf5f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -785,6 +785,8 @@ cdef class Doc:
         # Allow strings, e.g. 'lemma' or 'LEMMA'
         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                  for id_ in attrs]
+        if array.dtype != numpy.uint64:
+            user_warning(Warnings.W028.format(type=array.dtype))
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(Errors.E032)
@@ -872,7 +874,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
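For context on the `get_doc` rework above: the helper now writes every annotation it is given, including the new `lemmas` argument, through a single `Doc.from_array` call, and raises `ValueError(Errors.E189)` instead of silently misaligning annotation lists of the wrong length. A minimal usage sketch, not part of the patch; it assumes spaCy is installed with its test suite importable, and the words and labels are illustrative:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc

vocab = Vocab()
words = ["She", "ran", "home"]

# Each annotation list must match len(words); a mismatch now raises E189.
doc = get_doc(
    vocab,
    words=words,
    tags=["PRP", "VBD", "NN"],
    lemmas=["she", "run", "home"],  # newly supported argument
)
assert [t.tag_ for t in doc] == ["PRP", "VBD", "NN"]
assert [t.lemma_ for t in doc] == ["she", "run", "home"]
```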