Bugfix/get doc (#5049)

* new (broken) unit test

* fixing get_doc method
Sofie Van Landeghem 2020-03-02 11:49:28 +01:00 committed by GitHub
parent 65d7bab10f
commit c6b12ab02a
12 changed files with 115 additions and 47 deletions

spacy/errors.py

@@ -107,6 +107,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")

@@ -541,6 +544,7 @@ class Errors(object):
     E188 = ("Could not match the gold entity links to entities in the doc - "
             "make sure the gold EL data refers to valid results of the "
             "named entity recognizer in the `nlp` pipeline.")
+    E189 = ("Each argument to `get_doc` should be of equal length.")

 @add_codes
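Note (not part of the diff): a minimal sketch of how the two new codes surface at runtime. The @add_codes decorator prefixes each message with its code, and W028 is formatted with the offending dtype, just as doc.pyx does further down:

    from spacy.errors import Errors, Warnings

    print(Warnings.W028.format(type="float64"))
    # [W028] Doc.from_array was called with a vector of type 'float64', but is
    # expecting one of type 'uint64' instead. ...
    print(Errors.E189)
    # [E189] Each argument to `get_doc` should be of equal length.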

spacy/pipeline/pipes.pyx

@@ -367,7 +367,7 @@ class Tensorizer(Pipe):
         return sgd


-@component("tagger", assigns=["token.tag", "token.pos"])
+@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
 class Tagger(Pipe):
     """Pipeline component for part-of-speech tagging.

spacy/tests/doc/test_doc_api.py

@@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
     # fmt: off
     text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
-    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
-            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
-            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
-            "ROOT", "amod", "dobj"]
+    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
+            "amod", "pobj", "acl", "prep", "prep", "pobj",
+            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)

@@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab):
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
     heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    # fmt: off
     deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    # fmt: on
     doc = Doc(en_vocab, words=words)
     for i, (dep, head) in enumerate(zip(deps, heads)):
         doc[i].dep_ = dep

spacy/tests/doc/test_token_api.py

@@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab):
 def test_token_api_conjuncts_simple(en_vocab):
     words = "They came and went .".split()
     heads = [1, 0, -1, -2, -1]
-    deps = ["nsubj", "ROOT", "cc", "conj"]
+    deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["went"]
     assert [w.text for w in doc[3].conjuncts] == ["came"]
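Note (not part of the diff): all heads lists in these fixtures use relative offsets — token i attaches to token i + heads[i], and 0 marks a root. A sketch for the list above:

    words = "They came and went .".split()
    heads = [1, 0, -1, -2, -1]
    absolute = [i + h for i, h in enumerate(heads)]
    # "They" -> "came", "and" -> "came", "went" -> "came", "." -> "went"
    assert absolute == [1, 1, 1, 1, 3]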

spacy/tests/parser/test_parse_navigate.py

@@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
 @pytest.fixture
 def heads():
     # fmt: off
-    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-            -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
-            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
-            0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
-            9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
-            2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
-            3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-            -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-            -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
-            1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
-            1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-            -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
+    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
+            -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
+            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
+            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
+            0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
+            9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
+            2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
+            3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
+            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
+            -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
+            -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
+            1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
+            1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
+            -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
             0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
-            1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-            -1, -8, -9, -1]
+            1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
+            -1, 0, -1, -1]
     # fmt: on

spacy/tests/regression/test_issue2001-2500.py

@@ -48,7 +48,7 @@ def test_issue2203(en_vocab):
     tag_ids = [en_vocab.strings.add(tag) for tag in tags]
     lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
     doc = Doc(en_vocab, words=words)
-    # Work around lemma corrpution problem and set lemmas after tags
+    # Work around lemma corruption problem and set lemmas after tags
     doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
     doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
     assert [t.tag_ for t in doc] == tags

spacy/tests/regression/test_issue2501-3000.py

@@ -124,7 +124,7 @@ def test_issue2772(en_vocab):
     words = "When we write or communicate virtually , we can hide our true feelings .".split()
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
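Note (not part of the diff): the old list had 13 offsets for a 14-token sentence, a mismatch the rewritten get_doc now rejects with E189. A quick count, plus the two crossing arcs the comment describes:

    words = "When we write or communicate virtually , we can hide our true feelings .".split()
    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
    assert len(words) == len(heads) == 14
    assert 0 + heads[0] == 4 and 2 + heads[2] == 9  # the arcs (0, 4) and (2, 9) cross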

spacy/tests/regression/test_issue4501-5000.py

@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):
     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

spacy/tests/regression/test_issue5048.py (new file)

@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
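Note (not part of the diff): this test pins down the regression — the old get_doc wrote only POS, HEAD and DEP into the array, so the tags passed in were silently dropped and v1 != v2. The core check as a self-contained sketch (using a bare Vocab instead of the en_vocab fixture, and assuming the packaged test utilities are importable):

    from spacy.vocab import Vocab
    from spacy.tests.util import get_doc

    doc = get_doc(Vocab(), words=["This", "is", "a", "sentence"],
                  pos=["DET", "VERB", "DET", "NOUN"], tags=["DT", "VBZ", "DT", "NN"])
    assert [t.tag_ for t in doc] == ["DT", "VBZ", "DT", "NN"]  # failed before this fix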

spacy/tests/test_displacy.py

@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},

@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html

spacy/tests/util.py

@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path
+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str

@@ -26,30 +28,54 @@ def make_tempdir():
     shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)

     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
+    if tags:
+        for token in doc:
+            token.tag_ = tags[token.i]
     return doc
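Note (not part of the diff): a usage sketch for the rewritten helper, again assuming it is importable as spacy.tests.util. Every annotation must match len(words) or E189 is raised, and heads now default to all-roots when only deps are given:

    from spacy.vocab import Vocab
    from spacy.tests.util import get_doc

    doc = get_doc(Vocab(), words=["They", "came"], deps=["nsubj", "ROOT"])  # heads -> [0, 0]
    assert [t.dep_ for t in doc] == ["nsubj", "ROOT"]

    try:
        get_doc(Vocab(), words=["They", "came"], deps=["nsubj"])  # one dep too few
    except ValueError as err:
        print(err)  # [E189] Each argument to `get_doc` should be of equal length.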

spacy/tokens/doc.pyx

@@ -785,6 +785,8 @@ cdef class Doc:
         # Allow strings, e.g. 'lemma' or 'LEMMA'
        attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                  for id_ in attrs]
+        if array.dtype != numpy.uint64:
+            user_warning(Warnings.W028.format(type=array.dtype))
         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(Errors.E032)

@@ -872,7 +874,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
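Note (not part of the diff): a sketch of what the new dtype guard catches — attribute values passed to from_array are 64-bit hash IDs, so any other dtype risks corrupting them, which is exactly the failure mode behind this PR:

    import numpy
    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.attrs import TAG

    vocab = Vocab()
    doc = Doc(vocab, words=["a", "b"])
    tag_ids = numpy.array([vocab.strings.add("DT"), vocab.strings.add("NN")], dtype="uint64")
    doc.from_array([TAG], tag_ids)  # correct dtype: no warning
    assert [t.tag_ for t in doc] == ["DT", "NN"]
    # doc.from_array([TAG], tag_ids.astype("float64"))  # would now emit W028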