spaCy/spacy/tests/doc/test_span.py
Adriane Boyd 5d0f48fe69
Enforce that Span.start/end(_char) remain valid and in sync (#12268)
* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
2023-04-06 16:01:59 +02:00

757 lines
25 KiB
Python

import pytest
import numpy
from numpy.testing import assert_array_equal
from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.vocab import Vocab
from spacy.util import filter_spans
from thinc.api import get_current_ops
from ..util import add_vecs_to_vocab
from .test_underscore import clean_underscore # noqa: F401
@pytest.fixture
def doc(en_tokenizer):
# fmt: off
text = "This is a sentence. This is another sentence. And a third."
heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
"O", "O", "O", "O", "O"]
# fmt: on
tokens = en_tokenizer(text)
lemmas = [t.text for t in tokens] # this is not correct, just a placeholder
spaces = [bool(t.whitespace_) for t in tokens]
return Doc(
tokens.vocab,
words=[t.text for t in tokens],
spaces=spaces,
heads=heads,
deps=deps,
ents=ents,
lemmas=lemmas,
)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
return doc
@pytest.mark.issue(1537)
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = "The sky is blue . The man is pink . The dog is purple ."
doc = Doc(Vocab(), words=string.split())
doc[0].sent_start = True
for word in doc[1:]:
if word.nbor(-1).text == ".":
word.sent_start = True
else:
word.sent_start = False
sents = list(doc.sents)
sent0 = sents[0].as_doc()
sent1 = sents[1].as_doc()
assert isinstance(sent0, Doc)
assert isinstance(sent1, Doc)
@pytest.mark.issue(1612)
def test_issue1612(en_tokenizer):
"""Test that span.orth_ is identical to span.text"""
doc = en_tokenizer("The black cat purrs.")
span = doc[1:3]
assert span.orth_ == span.text
@pytest.mark.issue(3199)
def test_issue3199():
"""Test that Span.noun_chunks works correctly if no noun chunks iterator
is available. To make this test future-proof, we're constructing a Doc
with a new Vocab here and a parse tree to make sure the noun chunks run.
"""
words = ["This", "is", "a", "sentence"]
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
with pytest.raises(NotImplementedError):
list(doc[0:3].noun_chunks)
@pytest.mark.issue(5152)
def test_issue5152():
# Test that the comparison between a Span and a Token, goes well
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
nlp = English()
text = nlp("Talk about being boring!")
text_var = nlp("Talk of being boring!")
y = nlp("Let")
span = text[0:3] # Talk about being
span_2 = text[0:3] # Talk about being
span_3 = text_var[0:3] # Talk of being
token = y[0] # Let
with pytest.warns(UserWarning):
assert span.similarity(token) == 0.0
assert span.similarity(span_2) == 1.0
with pytest.warns(UserWarning):
assert span_2.similarity(span_3) < 1.0
@pytest.mark.issue(6755)
def test_issue6755(en_tokenizer):
doc = en_tokenizer("This is a magnificent sentence.")
span = doc[:0]
assert span.text_with_ws == ""
assert span.text == ""
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,label",
[("Welcome to Mumbai, my friend", 11, 17, "GPE")],
)
@pytest.mark.issue(6815)
def test_issue6815_1(sentence, start_idx, end_idx, label):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, label=label)
assert span.label_ == label
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)]
)
@pytest.mark.issue(6815)
def test_issue6815_2(sentence, start_idx, end_idx, kb_id):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id)
assert span.kb_id == kb_id
@pytest.mark.parametrize(
"sentence, start_idx,end_idx,vector",
[("Welcome to Mumbai, my friend", 11, 17, numpy.array([0.1, 0.2, 0.3]))],
)
@pytest.mark.issue(6815)
def test_issue6815_3(sentence, start_idx, end_idx, vector):
nlp = English()
doc = nlp(sentence)
span = doc[:].char_span(start_idx, end_idx, vector=vector)
assert (span.vector == vector).all()
@pytest.mark.parametrize(
"i_sent,i,j,text",
[
(0, 0, len("This is a"), "This is a"),
(1, 0, len("This is another"), "This is another"),
(2, len("And "), len("And ") + len("a third"), "a third"),
(0, 1, 2, None),
],
)
def test_char_span(doc, i_sent, i, j, text):
sents = list(doc.sents)
span = sents[i_sent].char_span(i, j)
if not text:
assert not span
else:
assert span.text == text
@pytest.mark.issue(9556)
def test_modify_span_group(doc):
group = SpanGroup(doc, spans=doc.ents)
for span in group:
span.start = 0
span.label = doc.vocab.strings["TEST"]
# Span changes must be reflected in the span group
assert group[0].start == 0
assert group[0].label == doc.vocab.strings["TEST"]
def test_char_span_attributes(doc):
label = "LABEL"
kb_id = "KB_ID"
span_id = "SPAN_ID"
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
assert span1.text == span2.text
assert span1.label_ == span2.label_ == label
assert span1.kb_id_ == span2.kb_id_ == kb_id
assert span1.id_ == span2.id_ == span_id
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
assert sents[0].end == 5
assert len(sents) == 3
assert sum(len(sent) for sent in sents) == len(doc)
def test_spans_root(doc):
span = doc[2:4]
assert len(span) == 2
assert span.text == "a sentence"
assert span.root.text == "sentence"
assert span.root.head.text == "is"
def test_spans_string_fn(doc):
span = doc[0:4]
assert len(span) == 4
assert span.text == "This is a sentence"
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"
heads = [0, 4, 1, 1, 0]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text)
doc = Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert doc[-2:].root.text == "Carolina"
def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == "is"
assert doc[:2].sent.text == "This is a sentence."
assert doc[6:7].sent.root.left_edge.text == "This"
assert doc[0 : len(doc)].sent == list(doc.sents)[0]
assert list(doc[0 : len(doc)].sents) == list(doc.sents)
with pytest.raises(ValueError):
doc_not_parsed[:2].sent
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
@pytest.mark.parametrize(
"start,end,expected_sentence",
[
(0, 14, "This is"), # Entire doc
(1, 4, "This is"), # Overlapping with 2 sentences
(0, 2, "This is"), # Beginning of the Doc. Full sentence
(0, 1, "This is"), # Beginning of the Doc. Part of a sentence
(10, 14, "And a"), # End of the Doc. Overlapping with 2 senteces
(12, 14, "third."), # End of the Doc. Full sentence
(1, 1, "This is"), # Empty Span
],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
# Doc-level sents hook
def user_hook(doc):
return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
doc.user_hooks["sents"] = user_hook
# Make sure doc-level sents hook works
assert doc[start:end].sent.text == expected_sentence
# Span-level sent hook
doc.user_span_hooks["sent"] = lambda x: x
# Now, span=level sent hook overrides the doc-level sents hook
assert doc[start:end].sent == doc[start:end]
def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation"""
tokens = en_tokenizer("the lazy dog slept")
doc = Doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=[2, 2, 3, 3],
deps=["dep"] * 4,
)
lca = doc[:2].get_lca_matrix()
assert lca.shape == (2, 2)
assert lca[0, 0] == 0 # the & the -> the
assert lca[0, 1] == -1 # the & lazy -> dog (out of span)
assert lca[1, 0] == -1 # lazy & the -> dog (out of span)
assert lca[1, 1] == 1 # lazy & lazy -> lazy
lca = doc[1:].get_lca_matrix()
assert lca.shape == (3, 3)
assert lca[0, 0] == 0 # lazy & lazy -> lazy
assert lca[0, 1] == 1 # lazy & dog -> dog
assert lca[0, 2] == 2 # lazy & slept -> slept
lca = doc[2:].get_lca_matrix()
assert lca.shape == (2, 2)
assert lca[0, 0] == 0 # dog & dog -> dog
assert lca[0, 1] == 1 # dog & slept -> slept
assert lca[1, 0] == 1 # slept & dog -> slept
assert lca[1, 1] == 1 # slept & slept -> slept
# example from Span API docs
tokens = en_tokenizer("I like New York in Autumn")
doc = Doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=[1, 1, 3, 1, 3, 4],
deps=["dep"] * len(tokens),
)
lca = doc[1:4].get_lca_matrix()
assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]]))
def test_span_similarity_match():
doc = Doc(Vocab(), words=["a", "b", "a", "b"])
span1 = doc[:2]
span2 = doc[2:]
with pytest.warns(UserWarning):
assert span1.similarity(span2) == 1.0
assert span1.similarity(doc) == 0.0
assert span1[:1].similarity(doc.vocab["a"]) == 1.0
def test_spans_are_hashable(en_tokenizer):
"""Test spans can be hashed."""
text = "good stuff bad stuff"
tokens = en_tokenizer(text)
span1 = tokens[:2]
span2 = tokens[2:4]
assert hash(span1) != hash(span2)
span3 = tokens[0:2]
assert hash(span3) == hash(span1)
def test_spans_by_character(doc):
span1 = doc[1:-2]
# default and specified alignment mode "strict"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
span2 = doc.char_span(
span1.start_char, span1.end_char, label="GPE", alignment_mode="strict"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "contract"
span2 = doc.char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# alignment mode "expand"
span2 = doc.char_span(
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# unsupported alignment mode
with pytest.raises(ValueError):
span2 = doc.char_span(
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
)
# Span.char_span + alignment mode "contract"
span2 = doc[0:2].char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc):
span = doc[1:-2]
arr = span.to_array([ORTH, LENGTH])
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth
assert arr[0, 1] == len(span[0])
def test_span_as_doc(doc):
span = doc[4:10]
span_doc = span.as_doc()
assert span.text == span_doc.text.strip()
assert isinstance(span_doc, doc.__class__)
assert span_doc is not doc
assert span_doc[0].idx == 0
# partial initial entity is removed
assert len(span_doc.ents) == 0
# full entity is preserved
span_doc = doc[2:10].as_doc()
assert len(span_doc.ents) == 1
# partial final entity is removed
span_doc = doc[0:5].as_doc()
assert len(span_doc.ents) == 0
@pytest.mark.usefixtures("clean_underscore")
def test_span_as_doc_user_data(doc):
"""Test that the user_data can be preserved (but not by default)."""
my_key = "my_info"
my_value = 342
doc.user_data[my_key] = my_value
Token.set_extension("is_x", default=False)
doc[7]._.is_x = True
span = doc[4:10]
span_doc_with = span.as_doc(copy_user_data=True)
span_doc_without = span.as_doc()
assert doc.user_data.get(my_key, None) is my_value
assert span_doc_with.user_data.get(my_key, None) is my_value
assert span_doc_without.user_data.get(my_key, None) is None
for i in range(len(span_doc_with)):
if i != 3:
assert span_doc_with[i]._.is_x is False
else:
assert span_doc_with[i]._.is_x is True
assert not any([t._.is_x for t in span_doc_without])
def test_span_string_label_kb_id(doc):
span = Span(doc, 0, 1, label="hello", kb_id="Q342")
assert span.label_ == "hello"
assert span.label == doc.vocab.strings["hello"]
assert span.kb_id_ == "Q342"
assert span.kb_id == doc.vocab.strings["Q342"]
def test_span_string_label_id(doc):
span = Span(doc, 0, 1, label="hello", span_id="Q342")
assert span.label_ == "hello"
assert span.label == doc.vocab.strings["hello"]
assert span.id_ == "Q342"
assert span.id == doc.vocab.strings["Q342"]
def test_span_attrs_writable(doc):
span = Span(doc, 0, 1)
span.label_ = "label"
span.kb_id_ = "kb_id"
span.id_ = "id"
def test_span_ents_property(doc):
doc.ents = [
(doc.vocab.strings["PRODUCT"], 0, 1),
(doc.vocab.strings["PRODUCT"], 7, 8),
(doc.vocab.strings["PRODUCT"], 11, 14),
]
assert len(list(doc.ents)) == 3
sentences = list(doc.sents)
assert len(sentences) == 3
assert len(sentences[0].ents) == 1
# First sentence, also tests start of sentence
assert sentences[0].ents[0].text == "This"
assert sentences[0].ents[0].label_ == "PRODUCT"
assert sentences[0].ents[0].start == 0
assert sentences[0].ents[0].end == 1
# Second sentence
assert len(sentences[1].ents) == 1
assert sentences[1].ents[0].text == "another"
assert sentences[1].ents[0].label_ == "PRODUCT"
assert sentences[1].ents[0].start == 7
assert sentences[1].ents[0].end == 8
# Third sentence ents, Also tests end of sentence
assert sentences[2].ents[0].text == "a third."
assert sentences[2].ents[0].label_ == "PRODUCT"
assert sentences[2].ents[0].start == 11
assert sentences[2].ents[0].end == 14
def test_filter_spans(doc):
# Test filtering duplicates
spans = [doc[1:4], doc[6:8], doc[1:4], doc[10:14]]
filtered = filter_spans(spans)
assert len(filtered) == 3
assert filtered[0].start == 1 and filtered[0].end == 4
assert filtered[1].start == 6 and filtered[1].end == 8
assert filtered[2].start == 10 and filtered[2].end == 14
# Test filtering overlaps with longest preference
spans = [doc[1:4], doc[1:3], doc[5:10], doc[7:9], doc[1:4]]
filtered = filter_spans(spans)
assert len(filtered) == 2
assert len(filtered[0]) == 3
assert len(filtered[1]) == 5
assert filtered[0].start == 1 and filtered[0].end == 4
assert filtered[1].start == 5 and filtered[1].end == 10
# Test filtering overlaps with earlier preference for identical length
spans = [doc[1:4], doc[2:5], doc[5:10], doc[7:9], doc[1:4]]
filtered = filter_spans(spans)
assert len(filtered) == 2
assert len(filtered[0]) == 3
assert len(filtered[1]) == 5
assert filtered[0].start == 1 and filtered[0].end == 4
assert filtered[1].start == 5 and filtered[1].end == 10
def test_span_eq_hash(doc, doc_not_parsed):
assert doc[0:2] == doc[0:2]
assert doc[0:2] != doc[1:3]
assert doc[0:2] != doc_not_parsed[0:2]
assert hash(doc[0:2]) == hash(doc[0:2])
assert hash(doc[0:2]) != hash(doc[1:3])
assert hash(doc[0:2]) != hash(doc_not_parsed[0:2])
# check that an out-of-bounds is not equivalent to the span of the full doc
assert doc[0 : len(doc)] != doc[len(doc) : len(doc) + 1]
def test_span_boundaries(doc):
start = 1
end = 5
span = doc[start:end]
for i in range(start, end):
assert span[i - start] == doc[i]
with pytest.raises(IndexError):
span[-5]
with pytest.raises(IndexError):
span[5]
empty_span_0 = doc[0:0]
assert empty_span_0.text == ""
assert empty_span_0.start == 0
assert empty_span_0.end == 0
assert empty_span_0.start_char == 0
assert empty_span_0.end_char == 0
empty_span_1 = doc[1:1]
assert empty_span_1.text == ""
assert empty_span_1.start == 1
assert empty_span_1.end == 1
assert empty_span_1.start_char == empty_span_1.end_char
oob_span_start = doc[-len(doc) - 1 : -len(doc) - 10]
assert oob_span_start.text == ""
assert oob_span_start.start == 0
assert oob_span_start.end == 0
assert oob_span_start.start_char == 0
assert oob_span_start.end_char == 0
oob_span_end = doc[len(doc) + 1 : len(doc) + 10]
assert oob_span_end.text == ""
assert oob_span_end.start == len(doc)
assert oob_span_end.end == len(doc)
assert oob_span_end.start_char == len(doc.text)
assert oob_span_end.end_char == len(doc.text)
def test_span_lemma(doc):
# span lemmas should have the same number of spaces as the span
sp = doc[1:5]
assert len(sp.text.split(" ")) == len(sp.lemma_.split(" "))
def test_sent(en_tokenizer):
doc = en_tokenizer("Check span.sent raises error if doc is not sentencized.")
span = doc[1:3]
assert not span.doc.has_annotation("SENT_START")
with pytest.raises(ValueError):
span.sent
def test_span_with_vectors(doc):
ops = get_current_ops()
prev_vectors = doc.vocab.vectors
vectors = [
("apple", ops.asarray([1, 2, 3])),
("orange", ops.asarray([-1, -2, -3])),
("And", ops.asarray([-1, -1, -1])),
("juice", ops.asarray([5, 5, 10])),
("pie", ops.asarray([7, 6.3, 8.9])),
]
add_vecs_to_vocab(doc.vocab, vectors)
# 0-length span
assert_array_equal(ops.to_numpy(doc[0:0].vector), numpy.zeros((3,)))
# longer span with no vector
assert_array_equal(ops.to_numpy(doc[0:4].vector), numpy.zeros((3,)))
# single-token span with vector
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
doc.vocab.vectors = prev_vectors
# fmt: off
def test_span_comparison(doc):
# Identical start, end, only differ in label and kb_id
assert Span(doc, 0, 3) == Span(doc, 0, 3)
assert Span(doc, 0, 3, "LABEL") == Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") == Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3) != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL") != Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3) <= Span(doc, 0, 3) and Span(doc, 0, 3) >= Span(doc, 0, 3)
assert Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL") and Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "LABEL")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert (Span(doc, 0, 3) < Span(doc, 0, 3, "", kb_id="KB_ID") < Span(doc, 0, 3, "LABEL") < Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
assert (Span(doc, 0, 3) <= Span(doc, 0, 3, "", kb_id="KB_ID") <= Span(doc, 0, 3, "LABEL") <= Span(doc, 0, 3, "LABEL", kb_id="KB_ID"))
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") > Span(doc, 0, 3, "LABEL") > Span(doc, 0, 3, "", kb_id="KB_ID") > Span(doc, 0, 3))
assert (Span(doc, 0, 3, "LABEL", kb_id="KB_ID") >= Span(doc, 0, 3, "LABEL") >= Span(doc, 0, 3, "", kb_id="KB_ID") >= Span(doc, 0, 3))
# Different end
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 0, 4)
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 0, 4)
assert Span(doc, 0, 4) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 4) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
# Different start
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
assert Span(doc, 0, 3, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
assert Span(doc, 1, 3) > Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 1, 3) >= Span(doc, 0, 3, "LABEL", kb_id="KB_ID")
# Different start & different end
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") != Span(doc, 1, 3, "LABEL", kb_id="KB_ID")
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") < Span(doc, 1, 3)
assert Span(doc, 0, 4, "LABEL", kb_id="KB_ID") <= Span(doc, 1, 3)
assert Span(doc, 1, 3) > Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
assert Span(doc, 1, 3) >= Span(doc, 0, 4, "LABEL", kb_id="KB_ID")
# Different id
assert Span(doc, 1, 3, span_id="AAA") < Span(doc, 1, 3, span_id="BBB")
# fmt: on
@pytest.mark.parametrize(
"start,end,expected_sentences,expected_sentences_with_hook",
[
(0, 14, 3, 7), # Entire doc
(3, 6, 2, 2), # Overlapping with 2 sentences
(0, 4, 1, 2), # Beginning of the Doc. Full sentence
(0, 3, 1, 2), # Beginning of the Doc. Part of a sentence
(9, 14, 2, 3), # End of the Doc. Overlapping with 2 senteces
(10, 14, 1, 2), # End of the Doc. Full sentence
(11, 14, 1, 2), # End of the Doc. Partial sentence
(0, 0, 1, 1), # Empty Span
],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
assert len(list(doc[start:end].sents)) == expected_sentences
def user_hook(doc):
return [doc[ii : ii + 2] for ii in range(0, len(doc), 2)]
doc.user_hooks["sents"] = user_hook
assert len(list(doc[start:end].sents)) == expected_sentences_with_hook
doc.user_span_hooks["sents"] = lambda x: [x]
assert list(doc[start:end].sents)[0] == doc[start:end]
assert len(list(doc[start:end].sents)) == 1
def test_span_sents_not_parsed(doc_not_parsed):
with pytest.raises(ValueError):
list(Span(doc_not_parsed, 0, 3).sents)
def test_span_group_copy(doc):
doc.spans["test"] = [doc[0:1], doc[2:4]]
assert len(doc.spans["test"]) == 2
doc_copy = doc.copy()
# check that the spans were indeed copied
assert len(doc_copy.spans["test"]) == 2
# add a new span to the original doc
doc.spans["test"].append(doc[3:4])
assert len(doc.spans["test"]) == 3
# check that the copy spans were not modified and this is an isolated doc
assert len(doc_copy.spans["test"]) == 2
@pytest.mark.issue(11113)
def test_span_ent_id(en_tokenizer):
doc = en_tokenizer("a b c d")
doc.ents = [Span(doc, 1, 3, label="A", span_id="ID0")]
span = doc.ents[0]
assert doc[1].ent_id_ == "ID0"
# setting Span.id sets Token.ent_id
span.id_ = "ID1"
doc.ents = [span]
assert doc.ents[0].ent_id_ == "ID1"
assert doc[1].ent_id_ == "ID1"
# Span.ent_id is an alias of Span.id
span.ent_id_ = "ID2"
doc.ents = [span]
assert doc.ents[0].ent_id_ == "ID2"
assert doc[1].ent_id_ == "ID2"
def test_span_start_end_sync(en_tokenizer):
doc = en_tokenizer("a bc def e fghij kl")
# can create and edit span starts/ends
span = doc[2:4]
span.start_char = 2
span.end = 5
assert span == doc[span.start : span.end]
assert span == doc.char_span(span.start_char, span.end_char)
# cannot set completely out of bounds starts/ends
with pytest.raises(IndexError):
span.start = -1
with pytest.raises(IndexError):
span.end = -1
with pytest.raises(IndexError):
span.start_char = len(doc.text) + 1
with pytest.raises(IndexError):
span.end = len(doc.text) + 1
# test all possible char starts/ends
span = doc[0 : len(doc)]
token_char_starts = [token.idx for token in doc]
token_char_ends = [token.idx + len(token.text) for token in doc]
for i in range(len(doc.text)):
if i not in token_char_starts:
with pytest.raises(ValueError):
span.start_char = i
else:
span.start_char = i
span = doc[0 : len(doc)]
for i in range(len(doc.text)):
if i not in token_char_ends:
with pytest.raises(ValueError):
span.end_char = i
else:
span.end_char = i
# start must be <= end
span = doc[1:3]
with pytest.raises(ValueError):
span.start = 4
with pytest.raises(ValueError):
span.end = 0
span = doc.char_span(2, 8)
with pytest.raises(ValueError):
span.start_char = 9
with pytest.raises(ValueError):
span.end_char = 1