From c6b12ab02adcdfe760bc10e249924553cb826410 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Mar 2020 11:49:28 +0100 Subject: [PATCH] Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method --- spacy/errors.py | 4 ++ spacy/pipeline/pipes.pyx | 2 +- spacy/tests/doc/test_doc_api.py | 9 +-- spacy/tests/doc/test_token_api.py | 2 +- spacy/tests/parser/test_parse_navigate.py | 32 +++++----- spacy/tests/regression/test_issue2001-2500.py | 2 +- spacy/tests/regression/test_issue2501-3000.py | 2 +- spacy/tests/regression/test_issue4590.py | 2 +- spacy/tests/regression/test_issue5048.py | 35 +++++++++++ spacy/tests/test_displacy.py | 10 ++-- spacy/tests/util.py | 58 ++++++++++++++----- spacy/tokens/doc.pyx | 4 +- 12 files changed, 115 insertions(+), 47 deletions(-) create mode 100644 spacy/tests/regression/test_issue5048.py diff --git a/spacy/errors.py b/spacy/errors.py index 2f0a8a2ad..5957c5ecd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -107,6 +107,9 @@ class Warnings(object): W027 = ("Found a large training file of {size} bytes. Note that it may " "be more efficient to split your training data into multiple " "smaller JSON files instead.") + W028 = ("Doc.from_array was called with a vector of type '{type}', " + "but is expecting one of type 'uint64' instead. This may result " + "in problems with the vocab further on in the pipeline.") @@ -541,6 +544,7 @@ class Errors(object): E188 = ("Could not match the gold entity links to entities in the doc - " "make sure the gold EL data refers to valid results of the " "named entity recognizer in the `nlp` pipeline.") + E189 = ("Each argument to `get_doc` should be of equal length.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 3b190debe..a20c9b6df 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -367,7 +367,7 @@ class Tensorizer(Pipe): return sgd -@component("tagger", assigns=["token.tag", "token.pos"]) +@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
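The W028 warning added in `spacy/errors.py` above is emitted by `Doc.from_array` (see the `doc.pyx` hunk below) whenever the supplied array is not of dtype `uint64`, since `StringStore` hashes are unsigned 64-bit integers and any other dtype can corrupt them before they reach the vocab. A minimal sketch of the dtype contract, not part of the patch; the words and tags are illustrative and any `Vocab` will do:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["This", "is", "a", "sentence"])
# StringStore hashes are unsigned 64-bit integers.
tag_ids = [vocab.strings.add(tag) for tag in ["DT", "VBZ", "DT", "NN"]]
# uint64 round-trips the hashes exactly, so no warning is raised.
doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
assert [t.tag_ for t in doc] == ["DT", "VBZ", "DT", "NN"]
# An array built with e.g. dtype="float32" would now emit W028 instead
# of silently mangling the hash values further down the pipeline.
```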
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 52f856d3e..19d908529 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", - "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", - "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", - "ROOT", "amod", "dobj"] + deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det", + "amod", "pobj", "acl", "prep", "prep", "pobj", + "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"] # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) @@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab): def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + # fmt: off deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + # fmt: on doc = Doc(en_vocab, words=words) for i, (dep, head) in enumerate(zip(deps, heads)): doc[i].dep_ = dep diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..b7522bb98 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab): def test_token_api_conjuncts_simple(en_vocab): words = "They came and went .".split() heads = [1, 0, -1, -2, -1] - deps = ["nsubj", "ROOT", "cc", "conj"] + deps = ["nsubj", "ROOT", "cc", "conj", "dep"] doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert [w.text for w in doc[1].conjuncts] == ["went"] assert [w.text for w in doc[3].conjuncts] == ["came"] diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..41524d45e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): # fmt: off - return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, - -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, - -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, - 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1, - 0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6, - 9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1, - 2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1, - 3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0, - -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1, - -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1, - -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1, - 1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2, - 1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2, - -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, + return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2, + -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, + -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14, + 1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1, + 0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10, + 9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1, + 2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1, + 3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0, + -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1, + -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1, + -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1, + 1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2, + 1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2, + -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3, 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, - 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, - -1, -8, -9, -1] + 1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1, + -1, 0, -1, -1] # fmt: on diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..01f0f905c 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -48,7 +48,7 @@ def test_issue2203(en_vocab): tag_ids = [en_vocab.strings.add(tag) for tag in tags] lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] doc = Doc(en_vocab, words=words) - # Work around lemma corrpution problem and set lemmas after tags + # Work around lemma corruption problem and set lemmas after tags doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) assert [t.tag_ for t in doc] == tags diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 73ff7376a..1f5e44499 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -124,7 +124,7 @@ def test_issue2772(en_vocab): words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. crossing) arc # The arcs (0, 4) and (2, 9) cross. 
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py
index 8ec9a0bd1..3d01cd487 100644
--- a/spacy/tests/regression/test_issue4590.py
+++ b/spacy/tests/regression/test_issue4590.py
@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):
     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
diff --git a/spacy/tests/regression/test_issue5048.py b/spacy/tests/regression/test_issue5048.py
new file mode 100644
index 000000000..228322493
--- /dev/null
+++ b/spacy/tests/regression/test_issue5048.py
@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index d04c0506f..539714e0c 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 9ee5b89f8..52768dd41 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path
+
+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str
@@ -26,30 +28,54 @@ def make_tempdir():
     shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)
     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
-    if tags:
-        for token in doc:
-            token.tag_ = tags[token.i]
     return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 63495ec86..11f1ddf5f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -785,6 +785,8 @@ cdef class Doc:
         # Allow strings, e.g. 'lemma' or 'LEMMA'
         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                  for id_ in attrs]
+        if array.dtype != numpy.uint64:
+            user_warning(Warnings.W028.format(type=array.dtype))
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(Errors.E032)
@@ -872,7 +874,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
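For context on the `get_doc` rework above: the helper now writes every annotation it is given, including the new `lemmas` argument, through a single `Doc.from_array` call, and raises `ValueError(Errors.E189)` instead of silently misaligning annotation lists of the wrong length. A minimal usage sketch, not part of the patch; it assumes spaCy is installed with its test suite importable, and the words and labels are illustrative:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc

vocab = Vocab()
words = ["She", "ran", "home"]

# Each annotation list must match len(words); a mismatch now raises E189.
doc = get_doc(
    vocab,
    words=words,
    tags=["PRP", "VBD", "NN"],
    lemmas=["she", "run", "home"],  # newly supported argument
)
assert [t.tag_ for t in doc] == ["PRP", "VBD", "NN"]
assert [t.lemma_ for t in doc] == ["she", "run", "home"]
```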