mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Added sents property to Span for Spans spanning over several sentences (#9699)
* Added sents property to Span class that returns a generator of sentences the Span belongs to * Added description to Span.sents property * Update test_span to clarify the difference between span.sent and span.sents Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/doc/test_span.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix documentation typos in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update Span.sents doc string in spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Parametrized test_span_spans * Corrected Span.sents to check for span-level hook first. Also, made Span.sent respect doc-level sents hook if no span-level hook is provided * Corrected Span ocumentation copy/paste issue * Put back accidentally deleted lines * Fixed formatting in span.pyx * Moved check for SENT_START annotation after user hooks in Span.sents * add version where the property was introduced Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
7d50804644
commit
472740d613
|
@ -200,6 +200,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
assert doc[:2].sent.root.text == "is"
|
||||
assert doc[:2].sent.text == "This is a sentence."
|
||||
assert doc[6:7].sent.root.left_edge.text == "This"
|
||||
assert doc[0 : len(doc)].sent == list(doc.sents)[0]
|
||||
assert list(doc[0 : len(doc)].sents) == list(doc.sents)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
doc_not_parsed[:2].sent
|
||||
|
||||
# test on manual sbd
|
||||
doc_not_parsed[0].is_sent_start = True
|
||||
doc_not_parsed[5].is_sent_start = True
|
||||
|
@ -207,6 +213,35 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start,end,expected_sentence",
    [
        (0, 14, "This is"),  # entire doc
        (1, 4, "This is"),  # overlapping with 2 sentences
        (0, 2, "This is"),  # beginning of the doc, full sentence
        (0, 1, "This is"),  # beginning of the doc, part of a sentence
        (10, 14, "And a"),  # end of the doc, overlapping with 2 sentences
        (12, 14, "third."),  # end of the doc, full sentence
        (1, 1, "This is"),  # empty span
    ],
)
def test_spans_span_sent_user_hooks(doc, start, end, expected_sentence):
    """Span.sent consults a doc-level "sents" hook when present, and a
    span-level "sent" hook takes precedence over it."""
    span = doc[start:end]

    # Install a doc-level sents hook that chops the doc into 2-token pieces.
    doc.user_hooks["sents"] = lambda d: [d[i : i + 2] for i in range(0, len(d), 2)]

    # The doc-level hook now determines which sentence the span belongs to.
    assert span.sent.text == expected_sentence

    # A span-level sent hook overrides the doc-level sents hook.
    doc.user_span_hooks["sent"] = lambda s: s
    assert span.sent == span
|
||||
|
||||
|
||||
def test_spans_lca_matrix(en_tokenizer):
|
||||
"""Test span's lca matrix generation"""
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
|
@ -536,3 +571,38 @@ def test_span_with_vectors(doc):
|
|||
# single-token span with vector
|
||||
assert_array_equal(ops.to_numpy(doc[10:11].vector), [-1, -1, -1])
|
||||
doc.vocab.vectors = prev_vectors
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start,end,expected_sentences,expected_sentences_with_hook",
    [
        (0, 14, 3, 7),  # entire doc
        (3, 6, 2, 2),  # overlapping with 2 sentences
        (0, 4, 1, 2),  # beginning of the doc, full sentence
        (0, 3, 1, 2),  # beginning of the doc, part of a sentence
        (9, 14, 2, 3),  # end of the doc, overlapping with 2 sentences
        (10, 14, 1, 2),  # end of the doc, full sentence
        (11, 14, 1, 2),  # end of the doc, partial sentence
        (0, 0, 1, 1),  # empty span
    ],
)
def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with_hook):
    """Span.sents yields the parsed sentences by default, the doc-level
    "sents" hook result when one is installed, and the span-level "sents"
    hook result above all."""
    span = doc[start:end]

    # Sentence boundaries coming from the parse.
    assert len(list(span.sents)) == expected_sentences

    # Doc-level hook: chop the doc into 2-token pieces.
    doc.user_hooks["sents"] = lambda d: [d[i : i + 2] for i in range(0, len(d), 2)]

    assert len(list(span.sents)) == expected_sentences_with_hook

    # Span-level hook wins over the doc-level one.
    doc.user_span_hooks["sents"] = lambda s: [s]

    sentences = list(span.sents)
    assert sentences[0] == span
    assert len(sentences) == 1
|
||||
|
||||
|
||||
def test_span_sents_not_parsed(doc_not_parsed):
    """Span.sents raises when no sentence boundaries are available."""
    unparsed_span = Span(doc_not_parsed, 0, 3)
    with pytest.raises(ValueError):
        list(unparsed_span.sents)
|
||||
|
|
|
@ -404,6 +404,10 @@ cdef class Span:
|
|||
"""
|
||||
if "sent" in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks["sent"](self)
|
||||
elif "sents" in self.doc.user_hooks:
|
||||
for sentence in self.doc.user_hooks["sents"](self.doc):
|
||||
if sentence.start <= self.start < sentence.end:
|
||||
return sentence
|
||||
# Use `sent_start` token attribute to find sentence boundaries
|
||||
cdef int n = 0
|
||||
if self.doc.has_annotation("SENT_START"):
|
||||
|
@ -422,6 +426,47 @@ cdef class Span:
|
|||
else:
|
||||
raise ValueError(Errors.E030)
|
||||
|
||||
@property
def sents(self):
    """Obtain the sentences that contain this span. If the given span
    crosses sentence boundaries, return all sentences it is a part of.

    RETURNS (Iterable[Span]): All sentences that the span is a part of.

    DOCS: https://spacy.io/api/span#sents
    """
    cdef int start
    cdef int i

    if "sents" in self.doc.user_span_hooks:
        # A span-level hook takes precedence over any doc-level hook.
        yield from self.doc.user_span_hooks["sents"](self)
    elif "sents" in self.doc.user_hooks:
        # Doc-level hook: keep only the hook's sentences that overlap this
        # span (or that coincide with an empty span); hook output is
        # assumed ordered, so stop at the first sentence past the span.
        for sentence in self.doc.user_hooks["sents"](self.doc):
            if sentence.end > self.start:
                if sentence.start < self.end or sentence.start == self.start == self.end:
                    yield sentence
                else:
                    break
    else:
        if not self.doc.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        # Use `sent_start` token attribute to find sentence boundaries
        # Find start of the 1st sentence of the Span
        start = self.start
        while self.doc.c[start].sent_start != 1 and start > 0:
            start -= 1

        # Now, find all the sentences in the span: each token flagged as a
        # sentence start closes off the previous sentence.
        for i in range(start + 1, self.doc.length):
            if self.doc.c[i].sent_start == 1:
                yield Span(self.doc, start, i)
                start = i
                if start >= self.end:
                    break
        # Trailing sentence that runs to the end of the span/doc.
        if start < self.end:
            yield Span(self.doc, start, self.end)
|
||||
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
"""The named entities in the span. Returns a tuple of named entity
|
||||
|
|
|
@ -518,6 +518,27 @@ sent = doc[sent.start : max(sent.end, span.end)]
|
|||
| ----------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | The sentence span that this span is a part of. ~~Span~~ |
|
||||
|
||||
## Span.sents {#sents tag="property" model="sentences" new="3.2.1"}
|
||||
|
||||
Returns a generator over the sentences the span belongs to. This property is only available
|
||||
when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
document by the `parser`, `senter`, `sentencizer` or some custom function. It
|
||||
will raise an error otherwise.
|
||||
|
||||
If the span happens to cross sentence boundaries, all sentences the span overlaps with will be returned.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Give it back! He pleaded.")
|
||||
> span = doc[2:4]
|
||||
> assert len(list(span.sents)) == 2
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A generator yielding sentences this `Span` is a part of. ~~Iterable[Span]~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Description |
|
||||
|
|
Loading…
Reference in New Issue
Block a user