Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Bugfix/get doc (#5049)

* new (broken) unit test
* fixing get_doc method

This commit is contained in:
parent 65d7bab10f
commit c6b12ab02a
@@ -107,6 +107,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
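For context, the new W028 warning targets calls to `Doc.from_array` that pass an array whose dtype is not `uint64`. A minimal sketch of such a call, with made-up words and POS values:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import POS

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
pos_ids = [vocab.strings.add(p) for p in ["INTJ", "NOUN"]]
# numpy defaults to a signed integer dtype here, not uint64,
# so this call should emit W028 (the reported type is platform-dependent)
doc.from_array([POS], numpy.array(pos_ids))
```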
@@ -541,6 +544,7 @@ class Errors(object):
     E188 = ("Could not match the gold entity links to entities in the doc - "
             "make sure the gold EL data refers to valid results of the "
             "named entity recognizer in the `nlp` pipeline.")
+    E189 = ("Each argument to `get_doc` should be of equal length.")


 @add_codes
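E189 backs the length check added to the `get_doc` test helper (rewritten further down in this commit): every annotation list passed to it now has to line up with `words`. A rough sketch of the failure mode, assuming `spacy.tests.util` is importable in your installation:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, not public API

vocab = Vocab()
# Three words but only two dependency labels -> ValueError carrying Errors.E189
get_doc(vocab, words=["This", "is", "fine"], deps=["nsubj", "ROOT"])
```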
@@ -367,7 +367,7 @@ class Tensorizer(Pipe):
         return sgd


-@component("tagger", assigns=["token.tag", "token.pos"])
+@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
 class Tagger(Pipe):
     """Pipeline component for part-of-speech tagging.
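The updated `assigns` metadata only declares what the component already does: in a trained English pipeline the tagger's morphology rules also fill in lemmas. A quick way to see both attributes populated (assumes the `en_core_web_sm` model is installed; output values are illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # the tagger runs as part of this pipeline
doc = nlp("The cats were running")
# token.lemma_ is set alongside token.tag_ and token.pos_
print([(t.text, t.tag_, t.pos_, t.lemma_) for t in doc])
```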
@@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
     # fmt: off
     text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
-    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
-            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
-            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
-            "ROOT", "amod", "dobj"]
+    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
+            "amod", "pobj", "acl", "prep", "prep", "pobj",
+            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
@@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab):
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
     heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    # fmt: off
     deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    # fmt: on
     doc = Doc(en_vocab, words=words)
     for i, (dep, head) in enumerate(zip(deps, heads)):
         doc[i].dep_ = dep
@@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab):
 def test_token_api_conjuncts_simple(en_vocab):
     words = "They came and went .".split()
     heads = [1, 0, -1, -2, -1]
-    deps = ["nsubj", "ROOT", "cc", "conj"]
+    deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["went"]
     assert [w.text for w in doc[3].conjuncts] == ["came"]
@@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
 @pytest.fixture
 def heads():
     # fmt: off
-    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-            -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
-            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
-            0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
-            9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
-            2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
-            3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-            -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-            -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
-            1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
-            1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-            -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
+    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
+            -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
+            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
+            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
+            0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
+            9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
+            2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
+            3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
+            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
+            -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
+            -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
+            1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
+            1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
+            -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
             0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
-            1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-            -1, -8, -9, -1]
+            1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
+            -1, 0, -1, -1]
     # fmt: on
@@ -48,7 +48,7 @@ def test_issue2203(en_vocab):
     tag_ids = [en_vocab.strings.add(tag) for tag in tags]
     lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
     doc = Doc(en_vocab, words=words)
-    # Work around lemma corrpution problem and set lemmas after tags
+    # Work around lemma corruption problem and set lemmas after tags
     doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
     doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
     assert [t.tag_ for t in doc] == tags
@@ -124,7 +124,7 @@ def test_issue2772(en_vocab):
     words = "When we write or communicate virtually , we can hide our true feelings .".split()
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):

     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)


spacy/tests/regression/test_issue5048.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
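The new regression test asserts that setting annotations via `Doc.from_array` and via the `get_doc` helper produce identical per-token `(text, pos_, tag_)` triples. For the inputs above, both `v1` and `v2` should come out as the list below (written out by hand from the test data, not copied from a test run):

```python
expected = [
    ("This", "DET", "DT"),
    ("is", "VERB", "VBZ"),
    ("a", "DET", "DT"),
    ("sentence", "NOUN", "NN"),
]
```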
@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
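`displacy.parse_deps` converts a parsed `Doc` into the `"words"`/`"arcs"` dict that the dependency visualizer renders; the test now derives its expected `"words"` entries from the same `words`/`pos` lists it builds the doc from instead of hard-coding them. A standalone sketch of the call, using the test helper and illustrative labels:

```python
from spacy import displacy
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, assumed importable

vocab = Vocab()
words = ["This", "is", "a", "sentence"]
pos = ["DET", "VERB", "DET", "NOUN"]
doc = get_doc(vocab, words=words, pos=pos, tags=pos,
              heads=[1, 0, 1, -2], deps=["nsubj", "ROOT", "det", "attr"])
parsed = displacy.parse_deps(doc)
print(parsed["words"])  # e.g. [{'text': 'This', 'tag': 'DET', 'lemma': None}, ...]
print(parsed["arcs"])   # arcs as {'start', 'end', 'label', 'dir'} dicts
```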
@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path

+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str

@@ -26,30 +28,54 @@ def make_tempdir():
         shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)

     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
-    if tags:
-        for token in doc:
-            token.tag_ = tags[token.i]
     return doc
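With the rewrite, every annotation layer that is passed (`pos`, `heads`, `deps`, `lemmas`, `tags`) must match the number of words, gets its strings interned, and is written through a single `Doc.from_array` call (the old helper special-cased tags and set them token by token). A hedged usage sketch with illustrative values, assuming the tests package is importable:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, not public API

doc = get_doc(
    Vocab(),
    words=["Cats", "purred", "."],
    tags=["NNS", "VBD", "."],
    pos=["NOUN", "VERB", "PUNCT"],
    heads=[1, 0, -1],  # head offsets relative to each token, as in the test suite
    deps=["nsubj", "ROOT", "punct"],
    lemmas=["cat", "purr", "."],
)
print([(t.text, t.lemma_, t.tag_, t.dep_) for t in doc])
```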
@ -785,6 +785,8 @@ cdef class Doc:
|
|||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in attrs]
|
||||
if array.dtype != numpy.uint64:
|
||||
user_warning(Warnings.W028.format(type=array.dtype))
|
||||
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(Errors.E032)
|
||||
|
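To stay on the warning-free path, callers can build the attribute array with an explicit `uint64` dtype (as the regression tests above already do) or reuse the output of `doc.to_array`, which has that dtype. A small sketch:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import TAG

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
tag_ids = [vocab.strings.add(t) for t in ["UH", "NN"]]
# dtype="uint64" matches what from_array expects, so W028 is not emitted
doc.from_array([TAG], numpy.array(tag_ids, dtype="uint64"))
print([t.tag_ for t in doc])
```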
@@ -872,7 +874,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
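Adding NORM to `array_head` means the byte serialization of a `Doc` now also carries each token's norm. A quick round-trip sketch (the custom norm value is made up):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["don't", "worry"])
doc[0].norm_ = "do not"  # custom norm to carry through serialization
doc2 = Doc(vocab).from_bytes(doc.to_bytes())
print([t.norm_ for t in doc2])  # the norm should survive the round trip
```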