Bugfix/get doc (#5049)
* new (broken) unit test
* fixing get_doc method
This commit is contained in:
parent 65d7bab10f
commit c6b12ab02a
@@ -107,6 +107,9 @@ class Warnings(object):
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
             "smaller JSON files instead.")
+    W028 = ("Doc.from_array was called with a vector of type '{type}', "
+            "but is expecting one of type 'uint64' instead. This may result "
+            "in problems with the vocab further on in the pipeline.")
@@ -541,6 +544,7 @@ class Errors(object):
     E188 = ("Could not match the gold entity links to entities in the doc - "
             "make sure the gold EL data refers to valid results of the "
             "named entity recognizer in the `nlp` pipeline.")
+    E189 = ("Each argument to `get_doc` should be of equal length.")


 @add_codes
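The new E189 is raised by the reworked `get_doc` test helper (see spacy/tests/util.py below) whenever an annotation list doesn't line up with `words`. A minimal sketch of how it surfaces (hypothetical values; assumes a spaCy 2.x install where the test helpers ship with the package):

    import pytest
    from spacy.vocab import Vocab
    from spacy.tests.util import get_doc

    vocab = Vocab()
    # deps has one entry fewer than words, so the length check fails
    with pytest.raises(ValueError):
        get_doc(vocab, words=["short", "sentence"], heads=[1, 0], deps=["nsubj"])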
@@ -367,7 +367,7 @@ class Tensorizer(Pipe):
         return sgd


-@component("tagger", assigns=["token.tag", "token.pos"])
+@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"])
 class Tagger(Pipe):
     """Pipeline component for part-of-speech tagging.
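The extra "token.lemma" entry reflects that in spaCy v2 assigning a tag can also fill in the lemma through the tag map and rule-based lemmatization, so the tagger now declares that attribute too. A minimal sketch (illustrative only; the exact lemma depends on which lemmatizer data is installed):

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("cats")
    doc[0].tag_ = "NNS"   # tagging may trigger rule-based lemmatization
    print(doc[0].lemma_)  # may now be "cat" rather than the surface form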
@@ -150,10 +150,9 @@ def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
     # fmt: off
     text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
-    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
-            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
-            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
-            "ROOT", "amod", "dobj"]
+    deps = ["nummod", "nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", "nummod", "appos", "prep", "det",
+            "amod", "pobj", "acl", "prep", "prep", "pobj",
+            "", "nummod", "nsubj", "prep", "det", "amod", "pobj", "aux", "neg", "ccomp", "amod", "dobj"]
     # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
@@ -277,7 +276,9 @@ def test_doc_is_nered(en_vocab):
 def test_doc_from_array_sent_starts(en_vocab):
     words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
     heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    # fmt: off
     deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    # fmt: on
     doc = Doc(en_vocab, words=words)
     for i, (dep, head) in enumerate(zip(deps, heads)):
         doc[i].dep_ = dep
@@ -214,7 +214,7 @@ def test_token_api_conjuncts_chain(en_vocab):
 def test_token_api_conjuncts_simple(en_vocab):
     words = "They came and went .".split()
     heads = [1, 0, -1, -2, -1]
-    deps = ["nsubj", "ROOT", "cc", "conj"]
+    deps = ["nsubj", "ROOT", "cc", "conj", "dep"]
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert [w.text for w in doc[1].conjuncts] == ["went"]
     assert [w.text for w in doc[3].conjuncts] == ["came"]
@@ -34,23 +34,23 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
 @pytest.fixture
 def heads():
     # fmt: off
-    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-            -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
-            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 2, 1,
-            0, -1, 1, -2, -1, 2, 1, -4, -8, 0, 1, -2, -1, -1, 3, -1, 1, -6,
-            9, 1, 7, -1, 1, -2, 3, 2, 1, -10, -1, 1, -2, -22, -1, 1, 0, -1,
-            2, 1, -4, -1, -2, -1, 1, -2, -6, -7, 1, -9, -1, 2, -1, -3, -1,
-            3, 2, 1, -4, -19, -24, 3, 2, 1, -4, -1, 1, 2, -1, -5, -34, 1, 0,
-            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, -3, -1,
-            -1, 3, 2, 1, 0, -1, -2, 7, -1, 5, 1, 3, -1, 1, -10, -1, -2, 1,
-            -2, -15, 1, 0, -1, -1, 2, 1, -3, -1, -1, -2, -1, 1, -2, -12, 1,
-            1, 0, 1, -2, -1, -2, -3, 9, -1, 2, -1, -4, 2, 1, -3, -4, -15, 2,
-            1, -3, -1, 2, 1, -3, -8, -9, -1, -2, -1, -4, 1, -2, -3, 1, -2,
-            -19, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
+    return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
+            -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
+            -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
+            1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
+            0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
+            9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
+            2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
+            3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
+            -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
+            -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
+            -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
+            1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
+            1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
+            -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
             0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
-            1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-            -1, -8, -9, -1]
+            1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
+            -1, 0, -1, -1]
     # fmt: on
@@ -48,7 +48,7 @@ def test_issue2203(en_vocab):
     tag_ids = [en_vocab.strings.add(tag) for tag in tags]
     lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas]
     doc = Doc(en_vocab, words=words)
-    # Work around lemma corrpution problem and set lemmas after tags
+    # Work around lemma corruption problem and set lemmas after tags
     doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64"))
     doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64"))
     assert [t.tag_ for t in doc] == tags
@@ -124,7 +124,7 @@ def test_issue2772(en_vocab):
     words = "When we write or communicate virtually , we can hide our true feelings .".split()
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
-    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
+    heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
     deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
@@ -27,7 +27,7 @@ def test_issue4590(en_vocab):

     text = "The quick brown fox jumped over the lazy fox"
     heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
-    deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

     doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
spacy/tests/regression/test_issue5048.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import numpy
+from spacy.tokens import Doc
+from spacy.attrs import DEP, POS, TAG
+
+from ..util import get_doc
+
+
+def test_issue5048(en_vocab):
+    words = ["This", "is", "a", "sentence"]
+    pos_s = ["DET", "VERB", "DET", "NOUN"]
+    spaces = [" ", " ", " ", ""]
+    deps_s = ["dep", "adj", "nn", "atm"]
+    tags_s = ["DT", "VBZ", "DT", "NN"]
+
+    strings = en_vocab.strings
+
+    for w in words:
+        strings.add(w)
+    deps = [strings.add(d) for d in deps_s]
+    pos = [strings.add(p) for p in pos_s]
+    tags = [strings.add(t) for t in tags_s]
+
+    attrs = [POS, DEP, TAG]
+    array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64")
+
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    doc.from_array(attrs, array)
+    v1 = [(token.text, token.pos_, token.tag_) for token in doc]
+
+    doc2 = get_doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s)
+    v2 = [(token.text, token.pos_, token.tag_) for token in doc2]
+    assert v1 == v2
@@ -31,10 +31,10 @@ def test_displacy_parse_deps(en_vocab):
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
     assert deps["words"] == [
-        {"lemma": None, "text": "This", "tag": "DET"},
-        {"lemma": None, "text": "is", "tag": "AUX"},
-        {"lemma": None, "text": "a", "tag": "DET"},
-        {"lemma": None, "text": "sentence", "tag": "NOUN"},
+        {"lemma": None, "text": words[0], "tag": pos[0]},
+        {"lemma": None, "text": words[1], "tag": pos[1]},
+        {"lemma": None, "text": words[2], "tag": pos[2]},
+        {"lemma": None, "text": words[3], "tag": pos[3]},
     ]
     assert deps["arcs"] == [
         {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
@@ -75,7 +75,7 @@ def test_displacy_rtl():
     deps = ["foo", "bar", "foo", "baz"]
     heads = [1, 0, 1, -2]
     nlp = Persian()
-    doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps)
+    doc = get_doc(nlp.vocab, words=words, tags=pos, heads=heads, deps=deps)
     doc.ents = [Span(doc, 1, 3, label="TEST")]
     html = displacy.render(doc, page=True, style="dep")
     assert "direction: rtl" in html
@@ -7,8 +7,10 @@ import shutil
 import contextlib
 import srsly
 from pathlib import Path

+from spacy import Errors
 from spacy.tokens import Doc, Span
-from spacy.attrs import POS, HEAD, DEP
+from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
 from spacy.compat import path2str
@@ -26,30 +28,54 @@ def make_tempdir():
     shutil.rmtree(path2str(d))


-def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
+def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
     """Create Doc object from given vocab, words and annotations."""
-    pos = pos or [""] * len(words)
-    tags = tags or [""] * len(words)
-    heads = heads or [0] * len(words)
-    deps = deps or [""] * len(words)
-    for value in deps + tags + pos:
+    if deps and not heads:
+        heads = [0] * len(deps)
+    headings = []
+    values = []
+    annotations = [pos, heads, deps, lemmas, tags]
+    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
+    for a, annot in enumerate(annotations):
+        if annot is not None:
+            if len(annot) != len(words):
+                raise ValueError(Errors.E189)
+            headings.append(possible_headings[a])
+            if annot is not heads:
+                values.extend(annot)
+    for value in values:
         vocab.strings.add(value)

     doc = Doc(vocab, words=words)
-    attrs = doc.to_array([POS, HEAD, DEP])
-    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
-        attrs[i, 0] = doc.vocab.strings[p]
-        attrs[i, 1] = head
-        attrs[i, 2] = doc.vocab.strings[dep]
-    doc.from_array([POS, HEAD, DEP], attrs)
+
+    # if there are any other annotations, set them
+    if headings:
+        attrs = doc.to_array(headings)
+
+        j = 0
+        for annot in annotations:
+            if annot:
+                if annot is heads:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = heads[i]
+                        else:
+                            attrs[i, j] = heads[i]
+                else:
+                    for i in range(len(words)):
+                        if attrs.ndim == 1:
+                            attrs[i] = doc.vocab.strings[annot[i]]
+                        else:
+                            attrs[i, j] = doc.vocab.strings[annot[i]]
+                j += 1
+        doc.from_array(headings, attrs)
+
+    # finally, set the entities
     if ents:
         doc.ents = [
             Span(doc, start, end, label=doc.vocab.strings[label])
             for start, end, label in ents
         ]
-    if tags:
-        for token in doc:
-            token.tag_ = tags[token.i]
     return doc
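With the rewrite, all requested annotations go through a single `Doc.to_array`/`from_array` round-trip, and `lemmas` becomes a supported keyword. A minimal usage sketch (hypothetical values; assumes the in-repo test helper is importable):

    from spacy.vocab import Vocab
    from spacy.tests.util import get_doc

    vocab = Vocab()
    doc = get_doc(
        vocab,
        words=["Apples", "are", "red"],
        heads=[1, 0, -1],               # relative offsets, as in the tests above
        deps=["nsubj", "ROOT", "acomp"],
        lemmas=["apple", "be", "red"],  # newly supported by this commit
    )
    assert [t.lemma_ for t in doc] == ["apple", "be", "red"]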
@@ -785,6 +785,8 @@ cdef class Doc:
         # Allow strings, e.g. 'lemma' or 'LEMMA'
         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
                  for id_ in attrs]
+        if array.dtype != numpy.uint64:
+            user_warning(Warnings.W028.format(type=array.dtype))

         if SENT_START in attrs and HEAD in attrs:
             raise ValueError(Errors.E032)
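A minimal sketch of the new check (hypothetical values): handing `Doc.from_array` anything but a uint64 array now emits W028 instead of silently reinterpreting the values.

    import numpy
    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.attrs import POS

    doc = Doc(Vocab(), words=["a", "b"])
    bad = doc.to_array([POS]).astype("float64")  # wrong dtype on purpose
    doc.from_array([POS], bad)  # warns: W028 ... vector of type 'float64'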
@@ -872,7 +874,7 @@ cdef class Doc:

         DOCS: https://spacy.io/api/doc#to_bytes
         """
-        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID]  # TODO: ENT_KB_ID ?
+        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
             array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
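Adding NORM to the serialized array head means token norms should now survive a `to_bytes`/`from_bytes` round-trip. A minimal sketch (illustrative only):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    doc = Doc(vocab, words=["I", "can't"])
    doc2 = Doc(vocab).from_bytes(doc.to_bytes())
    assert [t.norm_ for t in doc2] == [t.norm_ for t in doc]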