Added sents property to Span for Spans spanning over several sentences (#9699)

* Added sents property to Span class that returns a generator of sentences the Span belongs to

* Added description to Span.sents property

* Update test_span to clarify the difference between span.sent and span.sents

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/tests/doc/test_span.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Fix documentation typos in spacy/tokens/span.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update Span.sents doc string in spacy/tokens/span.pyx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Parametrized test_span_spans

* Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided

* Corrected Span documentation copy/paste issue

* Put back accidentally deleted lines

* Fixed formatting in span.pyx

* Moved check for SENT_START annotation after user hooks in Span.sents

* add version where the property was introduced

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Natalia Rodnova 2021-12-06 01:58:01 -07:00 committed by GitHub
parent 7d50804644
commit 472740d613
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 136 additions and 0 deletions

View File

@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc[:2].sent.root.text == "is" assert doc[:2].sent.root.text == "is"
assert doc[:2].sent.text == "This is a sentence." assert doc[:2].sent.text == "This is a sentence."
assert doc[6:7].sent.root.left_edge.text == "This" assert doc[6:7].sent.root.left_edge.text == "This"
assert doc[0 : len(doc)].sent == list(doc.sents)[0]
assert list(doc[0 : len(doc)].sents) == list(doc.sents)
with pytest.raises(ValueError):
doc_not_parsed[:2].sent
# test on manual sbd # test on manual sbd
doc_not_parsed[0].is_sent_start = True doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True doc_not_parsed[5].is_sent_start = True
@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:] assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
        (0, 14, "This is"),  # Entire doc
        (1, 4, "This is"),  # Overlapping with 2 sentences
        (0, 2, "This is"),  # Beginning of the Doc. Full sentence
        (0, 1, "This is"),  # Beginning of the Doc. Part of a sentence
        (10, 14, "And a"),  # End of the Doc. Overlapping with 2 sentences
        (12, 14, "third."),  # End of the Doc. Full sentence
        (1, 1, "This is"),  # Empty Span
    ],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
    """Check which user hook `Span.sent` consults.

    A doc-level "sents" hook is installed first and must drive `Span.sent`;
    a span-level "sent" hook installed afterwards must take precedence.
    """

    def two_token_sentences(doc):
        # Doc-level sents hook: chop the doc into consecutive 2-token chunks
        chunks = []
        for offset in range(0, len(doc), 2):
            chunks.append(doc[offset : offset + 2])
        return chunks

    def whole_span(span):
        # Span-level sent hook: the span is its own sentence
        return span

    doc.user_hooks["sents"] = two_token_sentences
    # With only the doc-level hook set, Span.sent picks the matching chunk
    assert doc[start:end].sent.text == expected_sentence

    doc.user_span_hooks["sent"] = whole_span
    # Now the span-level sent hook overrides the doc-level sents hook
    assert doc[start:end].sent == doc[start:end]
def test_spans_lca_matrix(en_tokenizer): def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation""" """Test span's lca matrix generation"""
tokens = en_tokenizer("the lazy dog slept") tokens = en_tokenizer("the lazy dog slept")
@ -536,3 +571,38 @@ def test_span_with_vectors(doc):
# single-token span with vector # single-token span with vector
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1]) assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
doc.vocab.vectors = prev_vectors doc.vocab.vectors = prev_vectors
@pytest.mark.parametrize(
    "start,end,expected_sentences,expected_sentences_with_hook",
    [
        (0, 14, 3, 7),  # Entire doc
        (3, 6, 2, 2),  # Overlapping with 2 sentences
        (0, 4, 1, 2),  # Beginning of the Doc. Full sentence
        (0, 3, 1, 2),  # Beginning of the Doc. Part of a sentence
        (9, 14, 2, 3),  # End of the Doc. Overlapping with 2 sentences
        (10, 14, 1, 2),  # End of the Doc. Full sentence
        (11, 14, 1, 2),  # End of the Doc. Partial sentence
        (0, 0, 1, 1),  # Empty Span
    ],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
    """Check the number of sentences `Span.sents` yields.

    Three stages are exercised in order: no hooks, a doc-level "sents" hook,
    and finally a span-level "sents" hook that overrides the doc-level one.
    """
    # No hooks installed yet: count sentences from the doc's own boundaries
    found = list(doc[start:end].sents)
    assert len(found) == expected_sentences

    def two_token_sentences(doc):
        # Doc-level sents hook: consecutive 2-token chunks
        chunks = []
        for offset in range(0, len(doc), 2):
            chunks.append(doc[offset : offset + 2])
        return chunks

    doc.user_hooks["sents"] = two_token_sentences
    found = list(doc[start:end].sents)
    assert len(found) == expected_sentences_with_hook

    def span_as_only_sentence(span):
        # Span-level sents hook: the span itself is the single sentence
        return [span]

    doc.user_span_hooks["sents"] = span_as_only_sentence
    found = list(doc[start:end].sents)
    assert found[0] == doc[start:end]
    assert len(found) == 1
def test_span_sents_not_parsed(doc_not_parsed):
    """Consuming `Span.sents` on a doc without sentence boundaries must raise."""
    with pytest.raises(ValueError):
        # The property is a generator, so the error only surfaces on iteration
        tuple(Span(doc_not_parsed, 0, 3).sents)

View File

@ -404,6 +404,10 @@ cdef class Span:
""" """
if "sent" in self.doc.user_span_hooks: if "sent" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["sent"](self) return self.doc.user_span_hooks["sent"](self)
elif "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.start <= self.start < sentence.end:
return sentence
# Use `sent_start` token attribute to find sentence boundaries # Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0 cdef int n = 0
if self.doc.has_annotation("SENT_START"): if self.doc.has_annotation("SENT_START"):
@ -422,6 +426,47 @@ cdef class Span:
else: else:
raise ValueError(Errors.E030) raise ValueError(Errors.E030)
@property
def sents(self):
    """Obtain the sentences that contain this span. If the given span
    crosses sentence boundaries, return all sentences it is a part of.

    Lookup order: a span-level "sents" hook in `doc.user_span_hooks`, then
    a doc-level "sents" hook in `doc.user_hooks`, and only then the
    `sent_start` token annotation.

    RETURNS (Iterable[Span]): All sentences that the span is a part of.
    RAISES (ValueError): E030, on first iteration, if no hook is set and
        the doc has no SENT_START annotation.

    DOCS: https://spacy.io/api/span#sents
    """
    cdef int start
    cdef int i

    if "sents" in self.doc.user_span_hooks:
        yield from self.doc.user_span_hooks["sents"](self)
    elif "sents" in self.doc.user_hooks:
        for sentence in self.doc.user_hooks["sents"](self.doc):
            # Skip sentences that end before the span starts
            if sentence.end > self.start:
                # Overlaps the span, or is the sentence an empty span sits at
                if sentence.start < self.end or sentence.start == self.start == self.end:
                    yield sentence
                else:
                    # First sentence starting at/after the span end: done
                    break
    else:
        if not self.doc.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        # Use `sent_start` token attribute to find sentence boundaries
        # Find start of the 1st sentence of the Span
        start = self.start
        while self.doc.c[start].sent_start != 1 and start > 0:
            start -= 1

        # Now, find all the sentences in the span
        for i in range(start + 1, self.doc.length):
            if self.doc.c[i].sent_start == 1:
                yield Span(self.doc, start, i)
                start = i
                if start >= self.end:
                    break
        # After a `break`, start >= self.end, so this is only reached when
        # no further sentence start exists: the last sentence therefore runs
        # to the end of the doc. Yield the whole sentence rather than cutting
        # it off at the span end.
        if start < self.end:
            yield Span(self.doc, start, self.doc.length)
@property @property
def ents(self): def ents(self):
"""The named entities in the span. Returns a tuple of named entity """The named entities in the span. Returns a tuple of named entity

View File

@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
| ----------- | ------------------------------------------------------- | | ----------- | ------------------------------------------------------- |
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ | | **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
Returns a generator over the sentences the span belongs to. This property is only available
when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
document by the `parser`, `senter`, `sentencizer` or some custom function. It
will raise an error otherwise.
If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> span = doc[2:5]
> assert len(list(span.sents)) == 2
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Description | | Name | Description |