Bugfix/get doc (#5049)

* new (broken) unit test

* fixing get_doc method
This commit is contained in:
Sofie Van Landeghem 2020-03-02 11:49:28 +01:00 committed by GitHub
parent 65d7bab10f
commit c6b12ab02a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 115 additions and 47 deletions

View File

@ -107,6 +107,9 @@ class Warnings(object):
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
W028 = ("Doc.from_array was called with a vector of type '{type}', "
"but is expecting one of type 'uint64' instead. This may result "
"in problems with the vocab further on in the pipeline.")
@ -541,6 +544,7 @@ class Errors(object):
E188 = ("Could not match the gold entity links to entities in the doc - "
"make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.")
E189 = ("Each argument to `get_doc` should be of equal length.")
@add_codes

View File

@ -367,7 +367,7 @@ class Tensorizer(Pipe):
return sgd
@component("tagger", assigns=["token.tag", "token.pos"])
@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.

View File

@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer):
# Example that caused run-time error while parsing Reddit
# fmt: off
text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
"nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
"ROOT", "amod", "dobj"]
deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
"amod", "pobj", "acl", "prep", "prep", "pobj",
"", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab):
def test_doc_from_array_sent_starts(en_vocab):
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
# fmt: off
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
# fmt: on
doc = Doc(en_vocab, words=words)
for i, (dep, head) in enumerate(zip(deps, heads)):
doc[i].dep_ = dep

View File

@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab):
def test_token_api_conjuncts_simple(en_vocab):
words = "They came and went .".split()
heads = [1, 0, -1, -2, -1]
deps = ["nsubj", "ROOT", "cc", "conj"]
deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
assert [w.text for w in doc[1].conjuncts] == ["went"]
assert [w.text for w in doc[3].conjuncts] == ["came"]

View File

@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
@pytest.fixture
def heads():
# fmt: off
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
-1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
-1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
-1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
-2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-1, -8, -9, -1]
1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
-1, 0, -1, -1]
# fmt: on

View File

@ -48,7 +48,7 @@ def test_issue2203(en_vocab):
tag_ids = [en_vocab.strings.add(tag) for tag in tags]
lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
doc = Doc(en_vocab, words=words)
# Work around lemma corrpution problem and set lemmas after tags
# Work around lemma corruption problem and set lemmas after tags
doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
assert [t.tag_ for t in doc] == tags

View File

@ -124,7 +124,7 @@ def test_issue2772(en_vocab):
words = "When we write or communicate virtually , we can hide our true feelings .".split()
# A tree with a non-projective (i.e. crossing) arc
# The arcs (0, 4) and (2, 9) cross.
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
deps = ["dep"] * len(heads)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
assert doc[1].is_sent_start is None

View File

@ -27,7 +27,7 @@ def test_issue4590(en_vocab):
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

View File

@ -0,0 +1,35 @@
# coding: utf8
from __future__ import unicode_literals
import numpy
from spacy.tokens import Doc
from spacy.attrs import DEP, POS, TAG
from ..util import get_doc
def test_issue5048(en_vocab):
words = ["This", "is", "a", "sentence"]
pos_s = ["DET", "VERB", "DET", "NOUN"]
spaces = [" ", " ", " ", ""]
deps_s = ["dep", "adj", "nn", "atm"]
tags_s = ["DT", "VBZ", "DT", "NN"]
strings = en_vocab.strings
for w in words:
strings.add(w)
deps = [strings.add(d) for d in deps_s]
pos = [strings.add(p) for p in pos_s]
tags = [strings.add(t) for t in tags_s]
attrs = [POS, DEP, TAG]
array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
doc = Doc(en_vocab, words=words, spaces=spaces)
doc.from_array(attrs, array)
v1 = [(token.text, token.pos_, token.tag_) for token in doc]
doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
assert v1 == v2

View File

@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
deps = displacy.parse_deps(doc)
assert isinstance(deps, dict)
assert deps["words"] == [
{"lemma": None, "text": "This", "tag": "DET"},
{"lemma": None, "text": "is", "tag": "AUX"},
{"lemma": None, "text": "a", "tag": "DET"},
{"lemma": None, "text": "sentence", "tag": "NOUN"},
{"lemma": None, "text": words[0], "tag": pos[0]},
{"lemma": None, "text": words[1], "tag": pos[1]},
{"lemma": None, "text": words[2], "tag": pos[2]},
{"lemma": None, "text": words[3], "tag": pos[3]},
]
assert deps["arcs"] == [
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
@ -75,7 +75,7 @@ def test_displacy_rtl():
deps = ["foo", "bar", "foo", "baz"]
heads = [1, 0, 1, -2]
nlp = Persian()
doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
doc.ents = [Span(doc, 1, 3, label="TEST")]
html = displacy.render(doc, page=True, style="dep")
assert "direction: rtl" in html

View File

@ -7,8 +7,10 @@ import shutil
import contextlib
import srsly
from pathlib import Path
from spacy import Errors
from spacy.tokens import Doc, Span
from spacy.attrs import POS, HEAD, DEP
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
from spacy.compat import path2str
@ -26,30 +28,54 @@ def make_tempdir():
shutil.rmtree(path2str(d))
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
"""Create Doc object from given vocab, words and annotations."""
pos = pos or [""] * len(words)
tags = tags or [""] * len(words)
heads = heads or [0] * len(words)
deps = deps or [""] * len(words)
for value in deps + tags + pos:
if deps and not heads:
heads = [0] * len(deps)
headings = []
values = []
annotations = [pos, heads, deps, lemmas, tags]
possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
for a, annot in enumerate(annotations):
if annot is not None:
if len(annot) != len(words):
raise ValueError(Errors.E189)
headings.append(possible_headings[a])
if annot is not heads:
values.extend(annot)
for value in values:
vocab.strings.add(value)
doc = Doc(vocab, words=words)
attrs = doc.to_array([POS, HEAD, DEP])
for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
attrs[i, 0] = doc.vocab.strings[p]
attrs[i, 1] = head
attrs[i, 2] = doc.vocab.strings[dep]
doc.from_array([POS, HEAD, DEP], attrs)
# if there are any other annotations, set them
if headings:
attrs = doc.to_array(headings)
j = 0
for annot in annotations:
if annot:
if annot is heads:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = heads[i]
else:
attrs[i,j] = heads[i]
else:
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = doc.vocab.strings[annot[i]]
else:
attrs[i, j] = doc.vocab.strings[annot[i]]
j += 1
doc.from_array(headings, attrs)
# finally, set the entities
if ents:
doc.ents = [
Span(doc, start, end, label=doc.vocab.strings[label])
for start, end, label in ents
]
if tags:
for token in doc:
token.tag_ = tags[token.i]
return doc

View File

@ -785,6 +785,8 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA'
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs]
if array.dtype != numpy.uint64:
user_warning(Warnings.W028.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032)
@ -872,7 +874,7 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#to_bytes
"""
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID] # TODO: ENT_KB_ID ?
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM] # TODO: ENT_KB_ID ?
if self.is_tagged:
array_head.extend([TAG, POS])
# If doc parsed add head and dep attribute