mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 21:57:15 +03:00
0426689db8
* Use default return instead of else * Add Doc.is_nered to indicate if entities have been set * Add properties in Doc.to_json if they were set, not if they're available This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training.
275 lines
8.9 KiB
Python
275 lines
8.9 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
import pytest
|
|
import numpy
|
|
from spacy.tokens import Doc, Span
|
|
from spacy.vocab import Vocab
|
|
from spacy.errors import ModelsWarning
|
|
from spacy.attrs import ENT_TYPE, ENT_IOB
|
|
|
|
from ..util import get_doc
|
|
|
|
|
|
@pytest.mark.parametrize("text", [["one", "two", "three"]])
def test_doc_api_compare_by_string_position(en_vocab, text):
    """Check that tokens of a doc compare according to their position.

    Builds a Doc from the parametrized word list and asserts that the rich
    comparison operators between its three tokens follow document order.
    """
    doc = Doc(en_vocab, words=text)
    # Get the tokens in this order, so their ID ordering doesn't match the idx.
    # NOTE(review): this presumably relies on Doc handing back the same Token
    # objects when it is unpacked below -- confirm against Doc.__getitem__.
    token3 = doc[-1]
    token2 = doc[-2]
    token1 = doc[-3]  # was doc[-1], which fetched the last token a second time
    token1, token2, token3 = doc
    assert token1 < token2 < token3
    assert not token1 > token2
    assert token2 > token1
    assert token2 <= token3
    assert token3 >= token1
|
|
|
|
|
|
def test_doc_api_getitem(en_tokenizer):
    """Exercise integer indexing and slicing on a tokenized doc and on a span.

    The sample text tokenizes into seven tokens
    ("Give/it/back/!/He/pleaded/."), which the slice expectations below
    are written against.
    """
    doc = en_tokenizer("Give it back! He pleaded.")

    def joined(tokens):
        # Render a token sequence as "a/b/c" so slices are easy to compare.
        return "/".join(tok.text for tok in tokens)

    # Plain integer indexing, including one position past the end.
    assert doc[0].text == "Give"
    assert doc[-1].text == "."
    with pytest.raises(IndexError):
        doc[len(doc)]

    # Basic slices; steps other than 1 are rejected with ValueError.
    assert not joined(doc[1:1])
    assert joined(doc[1:4]) == "it/back/!"
    assert joined(doc[1:4:1]) == "it/back/!"
    with pytest.raises(ValueError):
        doc[1:4:2]
    with pytest.raises(ValueError):
        doc[1:4:-1]

    # Negative bounds, and bounds where start lies after end.
    assert joined(doc[-3:6]) == "He/pleaded"
    assert joined(doc[4:-1]) == "He/pleaded"
    assert joined(doc[-5:-3]) == "back/!"
    crossed = doc[5:4]
    assert crossed.start == crossed.end == 5 and not joined(crossed)
    crossed = doc[4:-3]
    assert crossed.start == crossed.end == 4 and not joined(crossed)

    # Open-ended slices.
    assert joined(doc[:]) == "Give/it/back/!/He/pleaded/."
    assert joined(doc[4:]) == "He/pleaded/."
    assert joined(doc[:4]) == "Give/it/back/!"
    assert joined(doc[:-3]) == "Give/it/back/!"
    assert joined(doc[-3:]) == "He/pleaded/."

    # Bounds far outside the document.
    assert joined(doc[4:50]) == "He/pleaded/."
    assert joined(doc[-50:4]) == "Give/it/back/!"
    outside = doc[-50:-40]
    assert outside.start == outside.end == 0 and not joined(outside)
    outside = doc[40:50]
    assert outside.start == outside.end == 7 and not joined(outside)

    # The same operations applied to a span taken from the doc.
    span = doc[1:4]
    assert span[0].orth_ == "it"
    assert joined(span[:]) == "it/back/!"
    assert joined(span[:2]) == "it/back"
    assert joined(span[1:]) == "back/!"
    assert joined(span[:-1]) == "it/back"
    assert joined(span[-2:]) == "back/!"
    assert joined(span[1:2]) == "back"
    assert joined(span[-2:-1]) == "back"
    assert joined(span[-50:50]) == "it/back/!"
    empty = span[50:-50]
    assert empty.start == empty.end == 4 and not joined(empty)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text", ["Give it back! He pleaded.", " Give it back! He pleaded. "]
)
def test_doc_api_serialize(en_tokenizer, text):
    """Round-trip a doc through to_bytes/from_bytes and compare the tokens.

    The round trip is done three times: with default arguments, with
    ``tensor=False`` and with ``sentiment=False`` passed to both calls.
    """
    doc = en_tokenizer(text)

    def assert_same_tokens(restored):
        # The restored doc must match the source doc in text and per token.
        assert doc.text == restored.text
        assert [t.text for t in doc] == [t.text for t in restored]
        assert [t.orth for t in doc] == [t.orth for t in restored]

    payload = doc.to_bytes()
    assert_same_tokens(Doc(doc.vocab).from_bytes(payload))

    payload = doc.to_bytes(tensor=False)
    assert_same_tokens(Doc(doc.vocab).from_bytes(payload, tensor=False))

    payload = doc.to_bytes(sentiment=False)
    assert_same_tokens(Doc(doc.vocab).from_bytes(payload, sentiment=False))
|
|
|
|
|
|
def test_doc_api_set_ents(en_tokenizer):
    """Assign one entity via a (label, start, end) tuple and read it back."""
    doc = en_tokenizer("I use goggle chrone to surf the web")
    assert len(doc.ents) == 0

    product_label = doc.vocab.strings["PRODUCT"]
    doc.ents = [(product_label, 2, 4)]

    assert len(list(doc.ents)) == 1
    # Per-token IOB codes expected after setting the entity on tokens 2-3.
    assert [t.ent_iob for t in doc] == [0, 0, 3, 1, 0, 0, 0, 0]
    entity = doc.ents[0]
    assert entity.label_ == "PRODUCT"
    assert entity.start == 2
    assert entity.end == 4
|
|
|
|
|
|
def test_doc_api_sents_empty_string(en_tokenizer):
    """An empty doc flagged as parsed must yield no sentences."""
    empty_doc = en_tokenizer("")
    empty_doc.is_parsed = True
    assert len(list(empty_doc.sents)) == 0
|
|
|
|
|
|
def test_doc_api_runtime_error(en_tokenizer):
    """Merge trimmed noun chunks of a Reddit sample without raising.

    The test has no assertions; it passes as long as collecting the noun
    chunks and merging them through the retokenizer completes.
    """
    # Example that caused run-time error while parsing Reddit
    # fmt: off
    text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
            "ROOT", "amod", "dobj"]
    # fmt: on
    tokenized = en_tokenizer(text)
    words = [t.text for t in tokenized]
    doc = get_doc(tokenized.vocab, words=words, deps=deps)

    keep_deps = ("advmod", "amod", "compound")
    to_merge = []
    for chunk in doc.noun_chunks:
        # Drop leading tokens until the chunk starts with a kept dependency
        # label or only a single token is left.
        while len(chunk) > 1 and chunk[0].dep_ not in keep_deps:
            chunk = chunk[1:]
        if len(chunk) > 1:
            to_merge.append(chunk)

    with doc.retokenize() as retokenizer:
        for chunk in to_merge:
            root = chunk.root
            attrs = {
                "tag": root.tag_,
                "lemma": chunk.text,
                "ent_type": root.ent_type_,
            }
            retokenizer.merge(chunk, attrs=attrs)
|
|
|
|
|
|
def test_doc_api_right_edge(en_tokenizer):
    """Regression test: an Unshift action used to cause an incorrect right edge."""
    # fmt: off
    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
    expected_subtree = ["for", "the", "sake", "of", "such", "as", "live",
                        "under", "the", "government", "of", "the", "Romans",
                        ","]
    # fmt: on

    tokenized = en_tokenizer(text)
    words = [t.text for t in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=heads)

    anchor = doc[6]
    assert anchor.text == "for"
    assert [w.text for w in anchor.subtree] == expected_subtree
    assert doc[6].right_edge.text == ","
|
|
|
|
|
|
def test_doc_api_has_vector():
    """A doc whose only word has a vector in the vocab reports has_vector."""
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    kitten_vector = numpy.asarray([0.0, 2.0], dtype="f")
    vocab.set_vector("kitten", vector=kitten_vector)
    kitten_doc = Doc(vocab, words=["kitten"])
    assert kitten_doc.has_vector
|
|
|
|
|
|
def test_doc_api_similarity_match():
    """Check Doc.similarity against a token, a lexeme, a span and another doc."""
    single = Doc(Vocab(), words=["a"])
    first_token = single[0]
    lexeme = single.vocab["a"]
    assert single.similarity(first_token) == 1.0
    assert single.similarity(lexeme) == 1.0

    longer = Doc(single.vocab, words=["a", "b", "c"])
    # The comparisons below are expected to emit a ModelsWarning.
    with pytest.warns(ModelsWarning):
        assert single.similarity(longer[:1]) == 1.0
        assert single.similarity(longer) == 0.0
|
|
|
|
|
|
@pytest.mark.parametrize(
    "sentence,heads,lca_matrix",
    [
        (
            "the lazy dog slept",
            [2, 1, 1, 0],
            numpy.array([[0, 2, 2, 3], [2, 1, 2, 3], [2, 2, 2, 3], [3, 3, 3, 3]]),
        ),
        (
            "The lazy dog slept. The quick fox jumped",
            [2, 1, 1, 0, -1, 2, 1, 1, 0],
            numpy.array(
                [
                    [0, 2, 2, 3, 3, -1, -1, -1, -1],
                    [2, 1, 2, 3, 3, -1, -1, -1, -1],
                    [2, 2, 2, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 3, -1, -1, -1, -1],
                    [3, 3, 3, 3, 4, -1, -1, -1, -1],
                    [-1, -1, -1, -1, -1, 5, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 6, 7, 8],
                    [-1, -1, -1, -1, -1, 7, 7, 7, 8],
                    [-1, -1, -1, -1, -1, 8, 8, 8, 8],
                ]
            ),
        ),
    ],
)
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    """Compare Doc.get_lca_matrix() with the expected matrix for each case."""
    tokenized = en_tokenizer(sentence)
    words = [t.text for t in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=heads)

    computed = doc.get_lca_matrix()
    assert (computed == lca_matrix).all()
    # Spot checks on entries that are identical in both parametrized cases.
    assert computed[1, 1] == 1
    assert computed[0, 1] == 2
    assert computed[1, 2] == 2
|
|
|
|
|
|
def test_doc_is_nered(en_vocab):
    """Check Doc.is_nered before and after entities are set in three ways.

    Entities are set by assigning ``doc.ents``, by loading ENT_TYPE/ENT_IOB
    columns with ``from_array``, and by a ``to_bytes``/``from_bytes`` round
    trip.
    """
    words = ["I", "live", "in", "New", "York"]

    plain = Doc(en_vocab, words=words)
    assert not plain.is_nered
    plain.ents = [Span(plain, 3, 5, label="GPE")]
    assert plain.is_nered

    # Test creating doc from array with unknown values
    ent_rows = [[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]]
    arr = numpy.array(ent_rows, dtype="uint64")
    from_arr = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert from_arr.is_nered

    # Test serialization
    restored = Doc(en_vocab).from_bytes(from_arr.to_bytes())
    assert restored.is_nered
|