Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)
Bugfix/get doc (#5049)

* new (broken) unit test
* fixing get_doc method

This commit is contained in:
parent 65d7bab10f
commit c6b12ab02a
@@ -107,6 +107,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
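For context, the new W028 warning targets calls to `Doc.from_array` that pass an array whose dtype is not `uint64`. A minimal sketch of such a call, with made-up words and POS values:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import POS

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
pos_ids = [vocab.strings.add(p) for p in ["INTJ", "NOUN"]]
# numpy defaults to a signed integer dtype here, not uint64,
# so this call should emit W028 (the reported type is platform-dependent)
doc.from_array([POS], numpy.array(pos_ids))
```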
@@ -541,6 +544,7 @@ class Errors(object):
     E188 = ("Could not match the gold entity links to entities in the doc - "
             "make sure the gold EL data refers to valid results of the "
             "named entity recognizer in the `nlp` pipeline.")
+    E189 = ("Each argument to `get_doc` should be of equal length.")


 @add_codes
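E189 backs the length check added to the `get_doc` test helper (rewritten further down in this commit): every annotation list passed to it now has to line up with `words`. A rough sketch of the failure mode, assuming `spacy.tests.util` is importable in your installation:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, not public API

vocab = Vocab()
# Three words but only two dependency labels -> ValueError carrying Errors.E189
get_doc(vocab, words=["This", "is", "fine"], deps=["nsubj", "ROOT"])
```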
@@ -367,7 +367,7 @@ class Tensorizer(Pipe):
         return sgd


-@component("tagger", assigns=["token.tag", "token.pos"])
+@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
 class Tagger(Pipe):
     """Pipeline component for part-of-speech tagging.
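The updated `assigns` metadata only declares what the component already does: in a trained English pipeline the tagger's morphology rules also fill in lemmas. A quick way to see both attributes populated (assumes the `en_core_web_sm` model is installed; output values are illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # the tagger runs as part of this pipeline
doc = nlp("The cats were running")
# token.lemma_ is set alongside token.tag_ and token.pos_
print([(t.text, t.tag_, t.pos_, t.lemma_) for t in doc])
```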
@@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
     # fmt: off
     text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
-    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
-            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
-            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
-            "ROOT", "amod", "dobj"]
+    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
+            "amod", "pobj", "acl", "prep", "prep", "pobj",
+            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
@@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab):
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
     heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    # fmt: off
     deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    # fmt: on
     doc = Doc(en_vocab, words=words)
     for i, (dep, head) in enumerate(zip(deps, heads)):
         doc[i].dep_ = dep
@@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab):
 def test_token_api_conjuncts_simple(en_vocab):
     words = "They came and went .".split()
     heads = [1, 0, -1, -2, -1]
-    deps = ["nsubj", "ROOT", "cc", "conj"]
+    deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["went"]
     assert [w.text for w in doc[3].conjuncts] == ["came"]
@@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
 @pytest.fixture
 def heads():
     # fmt: off
-    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-            -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
-            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
-            0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
-            9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
-            2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
-            3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-            -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-            -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
-            1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
-            1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-            -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
+    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
+            -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
+            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
+            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
+            0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
+            9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
+            2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
+            3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
+            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
+            -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
+            -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
+            1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
+            1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
+            -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
             0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
-            1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-            -1, -8, -9, -1]
+            1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
+            -1, 0, -1, -1]
     # fmt: on
@@ -48,7 +48,7 @@ def test_issue2203(en_vocab):
     tag_ids = [en_vocab.strings.add(tag) for tag in tags]
     lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
     doc = Doc(en_vocab, words=words)
-    # Work around lemma corrpution problem and set lemmas after tags
+    # Work around lemma corruption problem and set lemmas after tags
     doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
     doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
     assert [t.tag_ for t in doc] == tags
@@ -124,7 +124,7 @@ def test_issue2772(en_vocab):
     words = "When we write or communicate virtually , we can hide our true feelings .".split()
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):

     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)


spacy/tests/regression/test_issue5048.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
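The new regression test asserts that setting annotations via `Doc.from_array` and via the `get_doc` helper produce identical per-token `(text, pos_, tag_)` triples. For the inputs above, both `v1` and `v2` should come out as the list below (written out by hand from the test data, not copied from a test run):

```python
expected = [
    ("This", "DET", "DT"),
    ("is", "VERB", "VBZ"),
    ("a", "DET", "DT"),
    ("sentence", "NOUN", "NN"),
]
```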
@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
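`displacy.parse_deps` converts a parsed `Doc` into the `"words"`/`"arcs"` dict that the dependency visualizer renders; the test now derives its expected `"words"` entries from the same `words`/`pos` lists it builds the doc from instead of hard-coding them. A standalone sketch of the call, using the test helper and illustrative labels:

```python
from spacy import displacy
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, assumed importable

vocab = Vocab()
words = ["This", "is", "a", "sentence"]
pos = ["DET", "VERB", "DET", "NOUN"]
doc = get_doc(vocab, words=words, pos=pos, tags=pos,
              heads=[1, 0, 1, -2], deps=["nsubj", "ROOT", "det", "attr"])
parsed = displacy.parse_deps(doc)
print(parsed["words"])  # e.g. [{'text': 'This', 'tag': 'DET', 'lemma': None}, ...]
print(parsed["arcs"])   # arcs as {'start', 'end', 'label', 'dir'} dicts
```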
@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path

+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str

@@ -26,30 +28,54 @@ def make_tempdir():
         shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)

     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
-    if tags:
-        for token in doc:
-            token.tag_ = tags[token.i]
     return doc
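With the rewrite, every annotation layer that is passed (`pos`, `heads`, `deps`, `lemmas`, `tags`) must match the number of words, gets its strings interned, and is written through a single `Doc.from_array` call (the old helper special-cased tags and set them token by token). A hedged usage sketch with illustrative values, assuming the tests package is importable:

```python
from spacy.vocab import Vocab
from spacy.tests.util import get_doc  # test helper, not public API

doc = get_doc(
    Vocab(),
    words=["Cats", "purred", "."],
    tags=["NNS", "VBD", "."],
    pos=["NOUN", "VERB", "PUNCT"],
    heads=[1, 0, -1],  # head offsets relative to each token, as in the test suite
    deps=["nsubj", "ROOT", "punct"],
    lemmas=["cat", "purr", "."],
)
print([(t.text, t.lemma_, t.tag_, t.dep_) for t in doc])
```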
@ -785,6 +785,8 @@ cdef class Doc:
|
|||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in attrs]
|
||||
if array.dtype != numpy.uint64:
|
||||
user_warning(Warnings.W028.format(type=array.dtype))
|
||||
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(Errors.E032)
|
||||
|
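To stay on the warning-free path, callers can build the attribute array with an explicit `uint64` dtype (as the regression tests above already do) or reuse the output of `doc.to_array`, which has that dtype. A small sketch:

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.attrs import TAG

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
tag_ids = [vocab.strings.add(t) for t in ["UH", "NN"]]
# dtype="uint64" matches what from_array expects, so W028 is not emitted
doc.from_array([TAG], numpy.array(tag_ids, dtype="uint64"))
print([t.tag_ for t in doc])
```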
@@ -872,7 +874,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
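Adding NORM to `array_head` means the byte serialization of a `Doc` now also carries each token's norm. A quick round-trip sketch (the custom norm value is made up):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
doc = Doc(vocab, words=["don't", "worry"])
doc[0].norm_ = "do not"  # custom norm to carry through serialization
doc2 = Doc(vocab).from_bytes(doc.to_bytes())
print([t.norm_ for t in doc2])  # the norm should survive the round trip
```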